mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-25 13:17:41 +03:00 
			
		
		
		
	Add support for Daitch-Mokotoff Soundex in contrib/fuzzystrmatch.
This modernized version of Soundex works significantly better than the original, particularly for non-English names. Dag Lem, reviewed by quite a few people along the way. Discussion: https://postgr.es/m/yger1atbgfy.fsf@sid.nimrod.no
This commit is contained in:
		
							
								
								
									
										2
									
								
								contrib/fuzzystrmatch/.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								contrib/fuzzystrmatch/.gitignore
									
									
									
									
										vendored
									
									
								
							| @@ -1,3 +1,5 @@ | |||||||
|  | # Generated files | ||||||
|  | /daitch_mokotoff.h | ||||||
| # Generated subdirectories | # Generated subdirectories | ||||||
| /log/ | /log/ | ||||||
| /results/ | /results/ | ||||||
|   | |||||||
| @@ -3,14 +3,17 @@ | |||||||
| MODULE_big = fuzzystrmatch | MODULE_big = fuzzystrmatch | ||||||
| OBJS = \ | OBJS = \ | ||||||
| 	$(WIN32RES) \ | 	$(WIN32RES) \ | ||||||
|  | 	daitch_mokotoff.o \ | ||||||
| 	dmetaphone.o \ | 	dmetaphone.o \ | ||||||
| 	fuzzystrmatch.o | 	fuzzystrmatch.o | ||||||
|  |  | ||||||
| EXTENSION = fuzzystrmatch | EXTENSION = fuzzystrmatch | ||||||
| DATA = fuzzystrmatch--1.1.sql fuzzystrmatch--1.0--1.1.sql | DATA = fuzzystrmatch--1.1.sql fuzzystrmatch--1.1--1.2.sql \ | ||||||
|  | 	fuzzystrmatch--1.0--1.1.sql | ||||||
|  |  | ||||||
| PGFILEDESC = "fuzzystrmatch - similarities and distance between strings" | PGFILEDESC = "fuzzystrmatch - similarities and distance between strings" | ||||||
|  |  | ||||||
| REGRESS = fuzzystrmatch | REGRESS = fuzzystrmatch fuzzystrmatch_utf8 | ||||||
|  |  | ||||||
| ifdef USE_PGXS | ifdef USE_PGXS | ||||||
| PG_CONFIG = pg_config | PG_CONFIG = pg_config | ||||||
| @@ -22,3 +25,16 @@ top_builddir = ../.. | |||||||
| include $(top_builddir)/src/Makefile.global | include $(top_builddir)/src/Makefile.global | ||||||
| include $(top_srcdir)/contrib/contrib-global.mk | include $(top_srcdir)/contrib/contrib-global.mk | ||||||
| endif | endif | ||||||
|  |  | ||||||
|  | # Force this dependency to be known even without dependency info built: | ||||||
|  | daitch_mokotoff.o: daitch_mokotoff.h | ||||||
|  |  | ||||||
|  | daitch_mokotoff.h: daitch_mokotoff_header.pl | ||||||
|  | 	$(PERL) $< $@ | ||||||
|  |  | ||||||
|  | # daitch_mokotoff.h is included in tarballs, so it has to be made by | ||||||
|  | # "distprep" and not cleaned except by "maintainer-clean". | ||||||
|  | distprep: daitch_mokotoff.h | ||||||
|  |  | ||||||
|  | maintainer-clean: | ||||||
|  | 	rm -f daitch_mokotoff.h | ||||||
|   | |||||||
							
								
								
									
										577
									
								
								contrib/fuzzystrmatch/daitch_mokotoff.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										577
									
								
								contrib/fuzzystrmatch/daitch_mokotoff.c
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,577 @@ | |||||||
|  | /* | ||||||
|  |  * Daitch-Mokotoff Soundex | ||||||
|  |  * | ||||||
|  |  * Copyright (c) 2023, PostgreSQL Global Development Group | ||||||
|  |  * | ||||||
|  |  * This module was originally sponsored by Finance Norway / | ||||||
|  |  * Trafikkforsikringsforeningen, and implemented by Dag Lem <dag@nimrod.no> | ||||||
|  |  * | ||||||
|  |  * The implementation of the Daitch-Mokotoff Soundex System aims at correctness | ||||||
|  |  * and high performance, and can be summarized as follows: | ||||||
|  |  * | ||||||
|  |  * - The processing of each phoneme is initiated by an O(1) table lookup. | ||||||
|  |  * - For phonemes containing more than one character, a coding tree is traversed | ||||||
|  |  *   to process the complete phoneme. | ||||||
|  |  * - The (alternate) soundex codes are produced digit by digit in-place in | ||||||
|  |  *   another tree structure. | ||||||
|  |  * | ||||||
|  |  * References: | ||||||
|  |  * | ||||||
|  |  * https://www.avotaynu.com/soundex.htm | ||||||
|  |  * https://www.jewishgen.org/InfoFiles/Soundex.html | ||||||
|  |  * https://familypedia.fandom.com/wiki/Daitch-Mokotoff_Soundex | ||||||
|  |  * https://stevemorse.org/census/soundex.html (dmlat.php, dmsoundex.php) | ||||||
|  |  * https://github.com/apache/commons-codec/ (dmrules.txt, DaitchMokotoffSoundex.java) | ||||||
|  |  * https://metacpan.org/pod/Text::Phonetic (DaitchMokotoff.pm) | ||||||
|  |  * | ||||||
|  |  * A few notes on other implementations: | ||||||
|  |  * | ||||||
|  |  * - All other known implementations have the same unofficial rules for "UE"; | ||||||
|  |  *   these are also adopted by this implementation (0, 1, NC). | ||||||
|  |  * - The only other known implementation which is capable of generating all | ||||||
|  |  *   correct soundex codes in all cases is the JOS Soundex Calculator at | ||||||
|  |  *   https://www.jewishgen.org/jos/jossound.htm | ||||||
|  |  * - "J" is considered (only) a vowel in dmlat.php | ||||||
|  |  * - The official rules for "RS" are commented out in dmlat.php | ||||||
|  |  * - Identical code digits for adjacent letters are not collapsed correctly in | ||||||
|  |  *   dmsoundex.php when double digit codes are involved. E.g. "BESST" yields | ||||||
|  |  *   744300 instead of 743000 as for "BEST". | ||||||
|  |  * - "J" is considered (only) a consonant in DaitchMokotoffSoundex.java | ||||||
|  |  * - "Y" is not considered a vowel in DaitchMokotoffSoundex.java | ||||||
|  | */ | ||||||
|  |  | ||||||
|  | #include "postgres.h" | ||||||
|  |  | ||||||
|  | #include "catalog/pg_type.h" | ||||||
|  | #include "mb/pg_wchar.h" | ||||||
|  | #include "utils/array.h" | ||||||
|  | #include "utils/builtins.h" | ||||||
|  | #include "utils/memutils.h" | ||||||
|  |  | ||||||
|  |  | ||||||
|  | /* | ||||||
|  |  * The soundex coding chart table is adapted from | ||||||
|  |  * https://www.jewishgen.org/InfoFiles/Soundex.html | ||||||
|  |  * See daitch_mokotoff_header.pl for details. | ||||||
|  | */ | ||||||
|  |  | ||||||
|  | /* Generated coding chart table */ | ||||||
|  | #include "daitch_mokotoff.h" | ||||||
|  |  | ||||||
|  | #define DM_CODE_DIGITS 6 | ||||||
|  |  | ||||||
|  | /* | ||||||
|  |  * Node in soundex code tree. | ||||||
|  |  * | ||||||
|  |  * A node holds the soundex code built so far on the path leading to it, | ||||||
|  |  * plus per-letter bookkeeping (the prev_* and next_* fields) that | ||||||
|  |  * update_node() consults to decide whether the next code digit is added as | ||||||
|  |  * a child node or collapsed into this node. | ||||||
|  |  */ | ||||||
|  | typedef struct dm_node | ||||||
|  | { | ||||||
|  | 	int			soundex_length; /* Length of generated soundex code */ | ||||||
|  | 	char		soundex[DM_CODE_DIGITS];	/* Soundex code, fixed width, not NUL-terminated */ | ||||||
|  | 	int			is_leaf;		/* Candidate for complete soundex code */ | ||||||
|  | 	int			last_update;	/* Letter number for last update of node */ | ||||||
|  | 	char		code_digit;		/* Last code digit, stored as character '0' - '9' */ | ||||||
|  |  | ||||||
|  | 	/* | ||||||
|  | 	 * One or two alternate code digits leading to this node. If there are two | ||||||
|  | 	 * digits, one of them is always an 'X'. Repeated code digits and 'X' lead | ||||||
|  | 	 * back to the same node. | ||||||
|  | 	 */ | ||||||
|  | 	char		prev_code_digits[2]; | ||||||
|  | 	/* One or two alternate code digits moving forward. */ | ||||||
|  | 	char		next_code_digits[2]; | ||||||
|  | 	/* ORed together code index(es) used to reach current node. */ | ||||||
|  | 	int			prev_code_index; | ||||||
|  | 	int			next_code_index; | ||||||
|  | 	/* Possible nodes branching out from this node - digits 0-9. */ | ||||||
|  | 	struct dm_node *children[10]; | ||||||
|  | 	/* Next node in linked list. Alternating index for each iteration. */ | ||||||
|  | 	struct dm_node *next[2]; | ||||||
|  | } dm_node; | ||||||
|  |  | ||||||
|  | /* | ||||||
|  |  * Template for new node in soundex code tree. | ||||||
|  |  * | ||||||
|  |  * Every new node is first assigned from this template. The soundex field is | ||||||
|  |  * pre-filled with six '0' characters, so a code shorter than DM_CODE_DIGITS | ||||||
|  |  * is already zero-padded when it is emitted with its full fixed width. | ||||||
|  |  */ | ||||||
|  | static const dm_node start_node = { | ||||||
|  | 	.soundex_length = 0, | ||||||
|  | 	.soundex = "000000",		/* Six digits */ | ||||||
|  | 	.is_leaf = 0, | ||||||
|  | 	.last_update = 0, | ||||||
|  | 	.code_digit = '\0', | ||||||
|  | 	.prev_code_digits = {'\0', '\0'}, | ||||||
|  | 	.next_code_digits = {'\0', '\0'}, | ||||||
|  | 	.prev_code_index = 0, | ||||||
|  | 	.next_code_index = 0, | ||||||
|  | 	.children = {NULL}, | ||||||
|  | 	.next = {NULL} | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | /* | ||||||
|  |  * Dummy soundex codes at end of input. | ||||||
|  |  * | ||||||
|  |  * Passed to update_leaves() as next_codes when no further letter could be | ||||||
|  |  * read. Only the first alternative is populated ("X" in all three | ||||||
|  |  * positions); the second element is left zero-initialized, which ends the | ||||||
|  |  * loop over alternatives in update_leaves(). | ||||||
|  |  */ | ||||||
|  | static const dm_codes end_codes[2] = | ||||||
|  | { | ||||||
|  | 	{ | ||||||
|  | 		"X", "X", "X" | ||||||
|  | 	} | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | /* Mapping from ISO8859-1 to upper-case ASCII, covering the range 0x60..0xFF. */ | ||||||
|  | static const char iso8859_1_to_ascii_upper[] = | ||||||
|  | /* | ||||||
|  | "`abcdefghijklmnopqrstuvwxyz{|}~                                  ¡¢£¤¥¦§¨©ª«¬ ®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ" | ||||||
|  | */ | ||||||
|  | "`ABCDEFGHIJKLMNOPQRSTUVWXYZ{|}~                                  !                             ?AAAAAAECEEEEIIIIDNOOOOO*OUUUUYDSAAAAAAECEEEEIIIIDNOOOOO/OUUUUYDY"; | ||||||
|  |  | ||||||
|  | /* Internal C implementation */ | ||||||
|  | static bool daitch_mokotoff_coding(const char *word, ArrayBuildState *soundex); | ||||||
|  |  | ||||||
|  |  | ||||||
|  | PG_FUNCTION_INFO_V1(daitch_mokotoff); | ||||||
|  |  | ||||||
|  | Datum | ||||||
|  | daitch_mokotoff(PG_FUNCTION_ARGS) | ||||||
|  | { | ||||||
|  | 	text	   *arg = PG_GETARG_TEXT_PP(0); | ||||||
|  | 	Datum		retval; | ||||||
|  | 	char	   *string; | ||||||
|  | 	ArrayBuildState *soundex; | ||||||
|  | 	MemoryContext old_ctx, | ||||||
|  | 				tmp_ctx; | ||||||
|  |  | ||||||
|  | 	/* | ||||||
|  | 	 * Work in a temporary context to simplify cleanup: everything allocated | ||||||
|  | 	 * while tmp_ctx is current is released by a single MemoryContextDelete() | ||||||
|  | 	 * on each exit path below. | ||||||
|  | 	 */ | ||||||
|  | 	tmp_ctx = AllocSetContextCreate(CurrentMemoryContext, | ||||||
|  | 									"daitch_mokotoff temporary context", | ||||||
|  | 									ALLOCSET_DEFAULT_SIZES); | ||||||
|  | 	old_ctx = MemoryContextSwitchTo(tmp_ctx); | ||||||
|  |  | ||||||
|  | 	/* | ||||||
|  | 	 * We must convert the string to UTF-8 if it isn't already, because | ||||||
|  | 	 * daitch_mokotoff_coding() decodes its input as UTF-8. | ||||||
|  | 	 */ | ||||||
|  | 	string = pg_server_to_any(text_to_cstring(arg), VARSIZE_ANY_EXHDR(arg), | ||||||
|  | 							  PG_UTF8); | ||||||
|  |  | ||||||
|  | 	/* The result is built in this ArrayBuildState, one text element per code. */ | ||||||
|  | 	soundex = initArrayResult(TEXTOID, tmp_ctx, false); | ||||||
|  |  | ||||||
|  | 	if (!daitch_mokotoff_coding(string, soundex)) | ||||||
|  | 	{ | ||||||
|  | 		/* No encodable characters in input: clean up and return SQL NULL. */ | ||||||
|  | 		MemoryContextSwitchTo(old_ctx); | ||||||
|  | 		MemoryContextDelete(tmp_ctx); | ||||||
|  | 		PG_RETURN_NULL(); | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	retval = makeArrayResult(soundex, old_ctx);	/* built in the caller's context, not tmp_ctx */ | ||||||
|  |  | ||||||
|  | 	MemoryContextSwitchTo(old_ctx); | ||||||
|  | 	MemoryContextDelete(tmp_ctx); | ||||||
|  |  | ||||||
|  | 	PG_RETURN_DATUM(retval); | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | /* | ||||||
|  |  * Initialize soundex code tree node for next code digit. | ||||||
|  |  * | ||||||
|  |  * This takes effect at most once per letter: only when the node was last | ||||||
|  |  * updated for an earlier letter number are the collected "next" digits and | ||||||
|  |  * code index moved into the "prev" fields, the "next" fields cleared, and | ||||||
|  |  * the leaf flag reset. last_update is the current letter number. | ||||||
|  |  */ | ||||||
|  | static void | ||||||
|  | initialize_node(dm_node *node, int last_update) | ||||||
|  | { | ||||||
|  | 	if (node->last_update < last_update) | ||||||
|  | 	{ | ||||||
|  | 		node->prev_code_digits[0] = node->next_code_digits[0]; | ||||||
|  | 		node->prev_code_digits[1] = node->next_code_digits[1]; | ||||||
|  | 		node->next_code_digits[0] = '\0'; | ||||||
|  | 		node->next_code_digits[1] = '\0'; | ||||||
|  | 		node->prev_code_index = node->next_code_index; | ||||||
|  | 		node->next_code_index = 0; | ||||||
|  | 		node->is_leaf = 0; | ||||||
|  | 		node->last_update = last_update; | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | /* | ||||||
|  |  * Update soundex code tree node with next code digit. | ||||||
|  |  * | ||||||
|  |  * code_index is ORed into the node's next_code_index. code_digit is stored | ||||||
|  |  * in next_code_digits[0] if that slot is still empty; otherwise it goes to | ||||||
|  |  * next_code_digits[1], unless it equals the digit already in slot 0. | ||||||
|  |  */ | ||||||
|  | static void | ||||||
|  | add_next_code_digit(dm_node *node, int code_index, char code_digit) | ||||||
|  | { | ||||||
|  | 	/* OR in index 1 or 2. */ | ||||||
|  | 	node->next_code_index |= code_index; | ||||||
|  |  | ||||||
|  | 	if (!node->next_code_digits[0]) | ||||||
|  | 		node->next_code_digits[0] = code_digit; | ||||||
|  | 	else if (node->next_code_digits[0] != code_digit) | ||||||
|  | 		node->next_code_digits[1] = code_digit; | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | /* | ||||||
|  |  * Mark soundex code tree node as leaf. | ||||||
|  |  * | ||||||
|  |  * A node not yet flagged as leaf is flagged and appended to the linked list | ||||||
|  |  * selected by ix_node; first_node[ix_node] and last_node[ix_node] are that | ||||||
|  |  * list's head and tail. A node that is already flagged is left untouched, | ||||||
|  |  * so repeated calls do not link it again. | ||||||
|  |  */ | ||||||
|  | static void | ||||||
|  | set_leaf(dm_node *first_node[2], dm_node *last_node[2], | ||||||
|  | 		 dm_node *node, int ix_node) | ||||||
|  | { | ||||||
|  | 	if (!node->is_leaf) | ||||||
|  | 	{ | ||||||
|  | 		node->is_leaf = 1; | ||||||
|  |  | ||||||
|  | 		if (first_node[ix_node] == NULL) | ||||||
|  | 			first_node[ix_node] = node; | ||||||
|  | 		else | ||||||
|  | 			last_node[ix_node]->next[ix_node] = node; | ||||||
|  |  | ||||||
|  | 		last_node[ix_node] = node; | ||||||
|  | 		node->next[ix_node] = NULL; | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | /* | ||||||
|  |  * Find next node corresponding to code digit, or create a new node. | ||||||
|  |  * | ||||||
|  |  * code_digit is a character '0' - '9' and selects the slot in the parent's | ||||||
|  |  * children[] array. Returns the child node while its code is still shorter | ||||||
|  |  * than DM_CODE_DIGITS, otherwise NULL. When a newly created child reaches | ||||||
|  |  * DM_CODE_DIGITS digits, its code is appended to the soundex array before | ||||||
|  |  * NULL is returned. | ||||||
|  |  */ | ||||||
|  | static dm_node * | ||||||
|  | find_or_create_child_node(dm_node *parent, char code_digit, | ||||||
|  | 						  ArrayBuildState *soundex) | ||||||
|  | { | ||||||
|  | 	int			i = code_digit - '0'; | ||||||
|  | 	dm_node   **nodes = parent->children; | ||||||
|  | 	dm_node    *node = nodes[i]; | ||||||
|  |  | ||||||
|  | 	if (node) | ||||||
|  | 	{ | ||||||
|  | 		/* Found existing child node. Skip completed nodes. */ | ||||||
|  | 		return node->soundex_length < DM_CODE_DIGITS ? node : NULL; | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	/* Create new child node, starting from the template and the parent's code. */ | ||||||
|  | 	node = palloc_object(dm_node); | ||||||
|  | 	nodes[i] = node; | ||||||
|  |  | ||||||
|  | 	*node = start_node; | ||||||
|  | 	memcpy(node->soundex, parent->soundex, sizeof(parent->soundex)); | ||||||
|  | 	node->soundex_length = parent->soundex_length; | ||||||
|  | 	node->soundex[node->soundex_length++] = code_digit; | ||||||
|  | 	node->code_digit = code_digit; | ||||||
|  | 	node->next_code_index = node->prev_code_index; | ||||||
|  |  | ||||||
|  | 	if (node->soundex_length < DM_CODE_DIGITS) | ||||||
|  | 	{ | ||||||
|  | 		return node; | ||||||
|  | 	} | ||||||
|  | 	else | ||||||
|  | 	{ | ||||||
|  | 		/* Append completed soundex code to output array. */ | ||||||
|  | 		text	   *out = cstring_to_text_with_len(node->soundex, | ||||||
|  | 												   DM_CODE_DIGITS); | ||||||
|  |  | ||||||
|  | 		accumArrayResult(soundex, | ||||||
|  | 						 PointerGetDatum(out), | ||||||
|  | 						 false, | ||||||
|  | 						 TEXTOID, | ||||||
|  | 						 CurrentMemoryContext); | ||||||
|  | 		return NULL; | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | /* | ||||||
|  |  * Update node for next code digit(s). | ||||||
|  |  * | ||||||
|  |  * next_code_digits holds the one or two sequential code digits for the | ||||||
|  |  * current letter, and digit_no selects the digit handled by this call. The | ||||||
|  |  * digit either collapses into the given node (it is 'X', or it is the first | ||||||
|  |  * digit and repeats one of the node's previous digits), or leads to a child | ||||||
|  |  * node, or both when the node has two alternate previous digits. Each node | ||||||
|  |  * touched this way is then advanced to the following digit, or, when there | ||||||
|  |  * is no further digit, added to the leaf list selected by ix_node. | ||||||
|  |  */ | ||||||
|  | static void | ||||||
|  | update_node(dm_node *first_node[2], dm_node *last_node[2], | ||||||
|  | 			dm_node *node, int ix_node, | ||||||
|  | 			int letter_no, int prev_code_index, int next_code_index, | ||||||
|  | 			const char *next_code_digits, int digit_no, | ||||||
|  | 			ArrayBuildState *soundex) | ||||||
|  | { | ||||||
|  | 	int			i; | ||||||
|  | 	char		next_code_digit = next_code_digits[digit_no]; | ||||||
|  | 	int			next_digit_no = digit_no + 1; | ||||||
|  | 	int			num_dirty_nodes = 0; | ||||||
|  | 	dm_node    *dirty_nodes[2]; | ||||||
|  |  | ||||||
|  | 	initialize_node(node, letter_no); | ||||||
|  |  | ||||||
|  | 	if (node->prev_code_index && !(node->prev_code_index & prev_code_index)) | ||||||
|  | 	{ | ||||||
|  | 		/* | ||||||
|  | 		 * If the sound (vowel / consonant) of this letter encoding doesn't | ||||||
|  | 		 * correspond to the coding index of the previous letter, we skip this | ||||||
|  | 		 * letter encoding. Note that currently, only "J" can be either a | ||||||
|  | 		 * vowel or a consonant. | ||||||
|  | 		 */ | ||||||
|  | 		return; | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	if (next_code_digit == 'X' || | ||||||
|  | 		(digit_no == 0 && | ||||||
|  | 		 (node->prev_code_digits[0] == next_code_digit || | ||||||
|  | 		  node->prev_code_digits[1] == next_code_digit))) | ||||||
|  | 	{ | ||||||
|  | 		/* The code digit is the same as one of the previous (i.e. not added). */ | ||||||
|  | 		dirty_nodes[num_dirty_nodes++] = node; | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	if (next_code_digit != 'X' && | ||||||
|  | 		(digit_no > 0 || | ||||||
|  | 		 node->prev_code_digits[0] != next_code_digit || | ||||||
|  | 		 node->prev_code_digits[1])) | ||||||
|  | 	{ | ||||||
|  | 		/* The code digit is different from one of the previous (i.e. added). */ | ||||||
|  | 		node = find_or_create_child_node(node, next_code_digit, soundex); | ||||||
|  | 		if (node) | ||||||
|  | 		{ | ||||||
|  | 			initialize_node(node, letter_no); | ||||||
|  | 			dirty_nodes[num_dirty_nodes++] = node; | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	for (i = 0; i < num_dirty_nodes; i++) | ||||||
|  | 	{ | ||||||
|  | 		/* Add code digit leading to the current node. */ | ||||||
|  | 		add_next_code_digit(dirty_nodes[i], next_code_index, next_code_digit); | ||||||
|  |  | ||||||
|  | 		/* | ||||||
|  | 		 * Every dirty node continues with the same following digit. The | ||||||
|  | 		 * index must not be advanced once per loop iteration; otherwise the | ||||||
|  | 		 * second dirty node would skip the second digit of a two-digit code | ||||||
|  | 		 * and be made a leaf too early. | ||||||
|  | 		 */ | ||||||
|  | 		if (next_code_digits[next_digit_no]) | ||||||
|  | 		{ | ||||||
|  | 			update_node(first_node, last_node, dirty_nodes[i], ix_node, | ||||||
|  | 						letter_no, prev_code_index, next_code_index, | ||||||
|  | 						next_code_digits, next_digit_no, | ||||||
|  | 						soundex); | ||||||
|  | 		} | ||||||
|  | 		else | ||||||
|  | 		{ | ||||||
|  | 			/* Add incomplete leaf node to linked list. */ | ||||||
|  | 			set_leaf(first_node, last_node, dirty_nodes[i], ix_node); | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | /* | ||||||
|  |  * Update soundex tree leaf nodes. | ||||||
|  |  * | ||||||
|  |  * For every node in the current leaf list (first_node[*ix_node]), each | ||||||
|  |  * alternate coding of the current letter (codes) is combined with each | ||||||
|  |  * alternate coding of the following letter (next_codes), and update_node() | ||||||
|  |  * is called with the code selected for that combination. New leaves are | ||||||
|  |  * collected in the other list, and *ix_node is switched to that list | ||||||
|  |  * before returning. | ||||||
|  |  */ | ||||||
|  | static void | ||||||
|  | update_leaves(dm_node *first_node[2], int *ix_node, int letter_no, | ||||||
|  | 			  const dm_codes *codes, const dm_codes *next_codes, | ||||||
|  | 			  ArrayBuildState *soundex) | ||||||
|  | { | ||||||
|  | 	int			i, | ||||||
|  | 				j, | ||||||
|  | 				code_index; | ||||||
|  | 	dm_node    *node, | ||||||
|  | 			   *last_node[2]; | ||||||
|  | 	const dm_code *code, | ||||||
|  | 			   *next_code; | ||||||
|  | 	int			ix_node_next = (*ix_node + 1) & 1;	/* Alternating index: 0, 1 */ | ||||||
|  |  | ||||||
|  | 	/* Initialize for new linked list of leaves. */ | ||||||
|  | 	first_node[ix_node_next] = NULL; | ||||||
|  | 	last_node[ix_node_next] = NULL; | ||||||
|  |  | ||||||
|  | 	/* Process all nodes in the current list of leaves. */ | ||||||
|  | 	for (node = first_node[*ix_node]; node; node = node->next[*ix_node]) | ||||||
|  | 	{ | ||||||
|  | 		/* One or two alternate code sequences; an empty first code ends the list. */ | ||||||
|  | 		for (i = 0; i < 2 && (code = codes[i]) && code[0][0]; i++) | ||||||
|  | 		{ | ||||||
|  | 			/* Coding for previous letter - before vowel: 1, all other: 2 */ | ||||||
|  | 			int			prev_code_index = (code[0][0] > '1') + 1; | ||||||
|  |  | ||||||
|  | 			/* One or two alternate next code sequences. */ | ||||||
|  | 			for (j = 0; j < 2 && (next_code = next_codes[j]) && next_code[0][0]; j++) | ||||||
|  | 			{ | ||||||
|  | 				/* Determine which code to use. */ | ||||||
|  | 				if (letter_no == 0) | ||||||
|  | 				{ | ||||||
|  | 					/* This is the first letter. */ | ||||||
|  | 					code_index = 0; | ||||||
|  | 				} | ||||||
|  | 				else if (next_code[0][0] <= '1') | ||||||
|  | 				{ | ||||||
|  | 					/* The next letter is a vowel. */ | ||||||
|  | 					code_index = 1; | ||||||
|  | 				} | ||||||
|  | 				else | ||||||
|  | 				{ | ||||||
|  | 					/* All other cases. */ | ||||||
|  | 					code_index = 2; | ||||||
|  | 				} | ||||||
|  |  | ||||||
|  | 				/* One or two sequential code digits, starting at digit 0. */ | ||||||
|  | 				update_node(first_node, last_node, node, ix_node_next, | ||||||
|  | 							letter_no, prev_code_index, code_index, | ||||||
|  | 							code[code_index], 0, | ||||||
|  | 							soundex); | ||||||
|  | 			} | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	*ix_node = ix_node_next; | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | /* | ||||||
|  |  * Return next character, converted from UTF-8 to uppercase ASCII. | ||||||
|  |  * *ix is the current string index and is incremented by the character length. | ||||||
|  |  * | ||||||
|  |  * Returns '\0' at end of string (in which case *ix is not advanced), and the | ||||||
|  |  * substitute character 0x1a for code points that have no mapping. The ASCII | ||||||
|  |  * characters '[', '\' and ']' are only ever returned as stand-ins for the | ||||||
|  |  * non-ASCII letters of the coding chart. | ||||||
|  |  */ | ||||||
|  | static char | ||||||
|  | read_char(const unsigned char *str, int *ix) | ||||||
|  | { | ||||||
|  | 	/* Substitute character for skipped code points. */ | ||||||
|  | 	const char	na = '\x1a'; | ||||||
|  | 	pg_wchar	c; | ||||||
|  |  | ||||||
|  | 	/* Decode UTF-8 character to ISO 10646 code point. */ | ||||||
|  | 	str += *ix; | ||||||
|  | 	c = utf8_to_unicode(str); | ||||||
|  |  | ||||||
|  | 	/* Advance *ix, but (for safety) not if we've reached end of string. */ | ||||||
|  | 	if (c) | ||||||
|  | 		*ix += pg_utf_mblen(str); | ||||||
|  |  | ||||||
|  | 	/* Convert. */ | ||||||
|  | 	if (c >= (unsigned char) '[' && c <= (unsigned char) ']') | ||||||
|  | 	{ | ||||||
|  | 		/* ASCII characters [, \, and ] are reserved for Ą, Ę, and Ţ/Ț. */ | ||||||
|  | 		return na; | ||||||
|  | 	} | ||||||
|  | 	else if (c < 0x60) | ||||||
|  | 	{ | ||||||
|  | 		/* Other non-lowercase ASCII characters (including NUL) can be used as-is. */ | ||||||
|  | 		return (char) c; | ||||||
|  | 	} | ||||||
|  | 	else if (c < 0x100) | ||||||
|  | 	{ | ||||||
|  | 		/* ISO-8859-1 code point; convert to upper-case ASCII via table. */ | ||||||
|  | 		return iso8859_1_to_ascii_upper[c - 0x60]; | ||||||
|  | 	} | ||||||
|  | 	else | ||||||
|  | 	{ | ||||||
|  | 		/* Conversion of non-ASCII characters in the coding chart. */ | ||||||
|  | 		switch (c) | ||||||
|  | 		{ | ||||||
|  | 			case 0x0104: | ||||||
|  | 			case 0x0105: | ||||||
|  | 				/* Ą/ą */ | ||||||
|  | 				return '['; | ||||||
|  | 			case 0x0118: | ||||||
|  | 			case 0x0119: | ||||||
|  | 				/* Ę/ę */ | ||||||
|  | 				return '\\'; | ||||||
|  | 			case 0x0162: | ||||||
|  | 			case 0x0163: | ||||||
|  | 			case 0x021A: | ||||||
|  | 			case 0x021B: | ||||||
|  | 				/* Ţ/ţ or Ț/ț */ | ||||||
|  | 				return ']'; | ||||||
|  | 			default: | ||||||
|  | 				return na; | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | /* | ||||||
|  |  * Read next ASCII character, skipping any characters not in [A-\]]. | ||||||
|  |  * | ||||||
|  |  * *ix is advanced past every character consumed, including skipped ones. | ||||||
|  |  * Returns '\0' when the end of the string is reached. | ||||||
|  |  */ | ||||||
|  | static char | ||||||
|  | read_valid_char(const char *str, int *ix) | ||||||
|  | { | ||||||
|  | 	char		c; | ||||||
|  |  | ||||||
|  | 	while ((c = read_char((const unsigned char *) str, ix)) != '\0') | ||||||
|  | 	{ | ||||||
|  | 		if (c >= 'A' && c <= ']') | ||||||
|  | 			break; | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	return c; | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | /* | ||||||
|  |  * Return sound coding for "letter" (letter sequence). | ||||||
|  |  * | ||||||
|  |  * Starting at *ix, the longest letter sequence that has a coding in the | ||||||
|  |  * generated lookup tables is matched. *ix is left just past that sequence; | ||||||
|  |  * characters that were only read ahead while probing for a longer sequence | ||||||
|  |  * are not consumed. Returns NULL if no valid character is left in str. | ||||||
|  |  */ | ||||||
|  | static const dm_codes * | ||||||
|  | read_letter(const char *str, int *ix) | ||||||
|  | { | ||||||
|  | 	char		c, | ||||||
|  | 				cmp; | ||||||
|  | 	int			i, | ||||||
|  | 				j; | ||||||
|  | 	const dm_letter *letters; | ||||||
|  | 	const dm_codes *codes; | ||||||
|  |  | ||||||
|  | 	/* First letter in sequence. */ | ||||||
|  | 	if ((c = read_valid_char(str, ix)) == '\0') | ||||||
|  | 		return NULL; | ||||||
|  |  | ||||||
|  | 	letters = &letter_[c - 'A']; | ||||||
|  | 	codes = letters->codes; | ||||||
|  | 	i = *ix; | ||||||
|  |  | ||||||
|  | 	/* Any subsequent letters in sequence; i is the look-ahead index. */ | ||||||
|  | 	while ((letters = letters->letters) && (c = read_valid_char(str, &i))) | ||||||
|  | 	{ | ||||||
|  | 		for (j = 0; (cmp = letters[j].letter); j++) | ||||||
|  | 		{ | ||||||
|  | 			if (cmp == c) | ||||||
|  | 			{ | ||||||
|  | 				/* Letter found. */ | ||||||
|  | 				letters = &letters[j]; | ||||||
|  | 				if (letters->codes) | ||||||
|  | 				{ | ||||||
|  | 					/* Coding for letter sequence found; commit the look-ahead. */ | ||||||
|  | 					codes = letters->codes; | ||||||
|  | 					*ix = i; | ||||||
|  | 				} | ||||||
|  | 				break; | ||||||
|  | 			} | ||||||
|  | 		} | ||||||
|  | 		if (!cmp) | ||||||
|  | 		{ | ||||||
|  | 			/* The sequence of letters has no coding. */ | ||||||
|  | 			break; | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	return codes; | ||||||
|  | } | ||||||
|  |  | ||||||
|  |  | ||||||
|  | /* | ||||||
|  |  * Generate all Daitch-Mokotoff soundex codes for word, | ||||||
|  |  * adding them to the "soundex" ArrayBuildState. | ||||||
|  |  * Returns false if string has no encodable characters, else true. | ||||||
|  |  * | ||||||
|  |  * The word is processed one letter sequence at a time with one sequence of | ||||||
|  |  * look-ahead. Codes that reach DM_CODE_DIGITS digits are emitted as soon as | ||||||
|  |  * they are completed; codes still incomplete when the loop ends are emitted | ||||||
|  |  * afterwards at full width. | ||||||
|  |  */ | ||||||
|  | static bool | ||||||
|  | daitch_mokotoff_coding(const char *word, ArrayBuildState *soundex) | ||||||
|  | { | ||||||
|  | 	int			i = 0; | ||||||
|  | 	int			letter_no = 0; | ||||||
|  | 	int			ix_node = 0; | ||||||
|  | 	const dm_codes *codes, | ||||||
|  | 			   *next_codes; | ||||||
|  | 	dm_node    *first_node[2], | ||||||
|  | 			   *node; | ||||||
|  |  | ||||||
|  | 	/* First letter. */ | ||||||
|  | 	if (!(codes = read_letter(word, &i))) | ||||||
|  | 	{ | ||||||
|  | 		/* No encodable character in input. */ | ||||||
|  | 		return false; | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	/* Starting point: a single root node, which is also the only leaf. */ | ||||||
|  | 	first_node[ix_node] = palloc_object(dm_node); | ||||||
|  | 	*first_node[ix_node] = start_node; | ||||||
|  |  | ||||||
|  | 	/* | ||||||
|  | 	 * Loop until either the word input is exhausted, or all generated soundex | ||||||
|  | 	 * codes are completed to six digits. | ||||||
|  | 	 */ | ||||||
|  | 	while (codes && first_node[ix_node]) | ||||||
|  | 	{ | ||||||
|  | 		next_codes = read_letter(word, &i); | ||||||
|  |  | ||||||
|  | 		/* Update leaf nodes; end_codes stands in when there is no next letter. */ | ||||||
|  | 		update_leaves(first_node, &ix_node, letter_no, | ||||||
|  | 					  codes, next_codes ? next_codes : end_codes, | ||||||
|  | 					  soundex); | ||||||
|  |  | ||||||
|  | 		codes = next_codes; | ||||||
|  | 		letter_no++; | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	/* Append all remaining (incomplete) soundex codes to output array. */ | ||||||
|  | 	for (node = first_node[ix_node]; node; node = node->next[ix_node]) | ||||||
|  | 	{ | ||||||
|  | 		text	   *out = cstring_to_text_with_len(node->soundex, | ||||||
|  | 												   DM_CODE_DIGITS); | ||||||
|  |  | ||||||
|  | 		accumArrayResult(soundex, | ||||||
|  | 						 PointerGetDatum(out), | ||||||
|  | 						 false, | ||||||
|  | 						 TEXTOID, | ||||||
|  | 						 CurrentMemoryContext); | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	return true; | ||||||
|  | } | ||||||
							
								
								
									
										223
									
								
								contrib/fuzzystrmatch/daitch_mokotoff_header.pl
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										223
									
								
								contrib/fuzzystrmatch/daitch_mokotoff_header.pl
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,223 @@ | |||||||
|  | #!/usr/bin/perl | ||||||
|  | # | ||||||
|  | # Generation of types and lookup tables for Daitch-Mokotoff soundex. | ||||||
|  | # | ||||||
|  | # Copyright (c) 2023, PostgreSQL Global Development Group | ||||||
|  | # | ||||||
|  | # This module was originally sponsored by Finance Norway / | ||||||
|  | # Trafikkforsikringsforeningen, and implemented by Dag Lem <dag@nimrod.no> | ||||||
|  | # | ||||||
|  |  | ||||||
|  | use strict; | ||||||
|  | use warnings; | ||||||
|  |  | ||||||
|  | use utf8; | ||||||
|  | use open IO => ':utf8', ':std'; | ||||||
|  | use Data::Dumper; | ||||||
|  |  | ||||||
|  | die "Usage: $0 OUTPUT_FILE\n" if @ARGV != 1; | ||||||
|  | my $output_file = $ARGV[0]; | ||||||
|  |  | ||||||
|  | # Open the output file | ||||||
|  | open my $OUTPUT, '>', $output_file | ||||||
|  |   or die "Could not open output file $output_file: $!\n"; | ||||||
|  |  | ||||||
|  | # Parse code table and generate tree for letter transitions. | ||||||
|  | my %codes; | ||||||
|  | my $table = [ {}, [ [ "", "", "" ] ] ]; | ||||||
|  | while (<DATA>) | ||||||
|  | { | ||||||
|  | 	chomp; | ||||||
|  | 	my ($letters, $codes) = split(/\s+/); | ||||||
|  | 	my @codes = map { [ split(/,/) ] } split(/\|/, $codes); | ||||||
|  |  | ||||||
|  | 	my $key = "codes_" . join("_or_", map { join("_", @$_) } @codes); | ||||||
|  | 	my $val = join( | ||||||
|  | 		",\n", | ||||||
|  | 		map { | ||||||
|  | 			"\t{\n\t\t" | ||||||
|  | 			  . join(", ", map { "\"$_\"" } @$_) . "\n\t}" | ||||||
|  | 		} @codes); | ||||||
|  | 	$codes{$key} = $val; | ||||||
|  |  | ||||||
|  | 	for my $letter (split(/,/, $letters)) | ||||||
|  | 	{ | ||||||
|  | 		my $ref = $table->[0]; | ||||||
|  | 		# Link each character to the next in the letter combination. | ||||||
|  | 		my @c = split(//, $letter); | ||||||
|  | 		my $last_c = pop(@c); | ||||||
|  | 		for my $c (@c) | ||||||
|  | 		{ | ||||||
|  | 			$ref->{$c} //= [ {}, undef ]; | ||||||
|  | 			$ref->{$c}[0] //= {}; | ||||||
|  | 			$ref = $ref->{$c}[0]; | ||||||
|  | 		} | ||||||
|  | 		# The sound code for the letter combination is stored at the last character. | ||||||
|  | 		$ref->{$last_c}[1] = $key; | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  | close(DATA); | ||||||
|  |  | ||||||
|  | print $OUTPUT <<EOF; | ||||||
|  | /* | ||||||
|  |  * Constants and lookup tables for Daitch-Mokotoff Soundex | ||||||
|  |  * | ||||||
|  |  * Copyright (c) 2023, PostgreSQL Global Development Group | ||||||
|  |  * | ||||||
|  |  * This file is generated by daitch_mokotoff_header.pl | ||||||
|  |  */ | ||||||
|  |  | ||||||
|  | /* Coding chart table: Soundex codes */ | ||||||
|  | typedef char dm_code[2 + 1];	/* One or two sequential code digits + NUL */ | ||||||
|  | typedef dm_code dm_codes[3];	/* Start of name, before a vowel, any other */ | ||||||
|  |  | ||||||
|  | /* Coding chart table: Letter in input sequence */ | ||||||
|  | struct dm_letter | ||||||
|  | { | ||||||
|  | 	char		letter;			/* Present letter in sequence */ | ||||||
|  | 	const struct dm_letter *letters;	/* List of possible successive letters */ | ||||||
|  | 	const dm_codes *codes;		/* Code sequence(s) for complete sequence */ | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | typedef struct dm_letter dm_letter; | ||||||
|  |  | ||||||
|  | /* Codes for letter sequence at start of name, before a vowel, and any other. */ | ||||||
|  | EOF | ||||||
|  |  | ||||||
|  | for my $key (sort keys %codes) | ||||||
|  | { | ||||||
|  | 	print $OUTPUT "static const dm_codes $key\[2\] =\n{\n" | ||||||
|  | 	  . $codes{$key} | ||||||
|  | 	  . "\n};\n"; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | print $OUTPUT <<EOF; | ||||||
|  |  | ||||||
|  | /* Coding for alternative following letters in sequence. */ | ||||||
|  | EOF | ||||||
|  |  | ||||||
|  | # Recursively print the C lookup table "letter_<prefix>" for one tree level. | ||||||
|  | # | ||||||
|  | # $ref    - tree node: [ hash of next character => child node, codes key ] | ||||||
|  | # $letter - letter prefix leading to this node ('' for the top level) | ||||||
|  | # | ||||||
|  | # Tables for child nodes are printed before the table that refers to them, | ||||||
|  | # and every table ends with an entry whose letter is '\0'. | ||||||
|  | sub hash2code | ||||||
|  | { | ||||||
|  | 	my ($ref, $letter) = @_; | ||||||
|  |  | ||||||
|  | 	my @letters = (); | ||||||
|  |  | ||||||
|  | 	my $h = $ref->[0]; | ||||||
|  | 	for my $key (sort keys %$h) | ||||||
|  | 	{ | ||||||
|  | 		$ref = $h->{$key}; | ||||||
|  | 		my $children = "NULL"; | ||||||
|  | 		if (defined $ref->[0]) | ||||||
|  | 		{ | ||||||
|  | 			$children = "letter_$letter$key"; | ||||||
|  | 			hash2code($ref, "$letter$key"); | ||||||
|  | 		} | ||||||
|  | 		my $codes = $ref->[1] // "NULL"; | ||||||
|  | 		push(@letters, "\t{\n\t\t'$key', $children, $codes\n\t}"); | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	print $OUTPUT "static const dm_letter letter_$letter\[\] =\n{\n"; | ||||||
|  | 	for (@letters) | ||||||
|  | 	{ | ||||||
|  | 		print $OUTPUT "$_,\n"; | ||||||
|  | 	} | ||||||
|  | 	print $OUTPUT "\t{\n\t\t'\\0'\n\t}\n"; | ||||||
|  | 	print $OUTPUT "};\n"; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | hash2code($table, ''); | ||||||
|  |  | ||||||
|  | close $OUTPUT; | ||||||
|  |  | ||||||
|  | # Table adapted from https://www.jewishgen.org/InfoFiles/Soundex.html | ||||||
|  | # | ||||||
|  | # The conversion from the coding chart to the table should be self | ||||||
|  | # explanatory, but note the differences stated below. | ||||||
|  | # | ||||||
|  | # X = NC (not coded) | ||||||
|  | # | ||||||
|  | # The non-ASCII letters in the coding chart are coded with substitute | ||||||
|  | # lowercase ASCII letters, which sort after the uppercase ASCII letters: | ||||||
|  | # | ||||||
|  | # Ą => a (use '[' for table lookup) | ||||||
|  | # Ę => e (use '\\' for table lookup) | ||||||
|  | # Ţ => t (use ']' for table lookup) | ||||||
|  | # | ||||||
|  | # The rule for "UE" does not correspond to the coding chart, however | ||||||
|  | # it is used by all other known implementations, including the one at | ||||||
|  | # https://www.jewishgen.org/jos/jossound.htm (try e.g. "bouey"). | ||||||
|  | # | ||||||
|  | # Note that the implementation assumes that vowels are assigned code | ||||||
|  | # 0 or 1. "J" can be either a vowel or a consonant. | ||||||
|  | # | ||||||
|  |  | ||||||
|  | __DATA__ | ||||||
|  | AI,AJ,AY				0,1,X | ||||||
|  | AU						0,7,X | ||||||
|  | a						X,X,6|X,X,X | ||||||
|  | A						0,X,X | ||||||
|  | B						7,7,7 | ||||||
|  | CHS						5,54,54 | ||||||
|  | CH						5,5,5|4,4,4 | ||||||
|  | CK						5,5,5|45,45,45 | ||||||
|  | CZ,CS,CSZ,CZS			4,4,4 | ||||||
|  | C						5,5,5|4,4,4 | ||||||
|  | DRZ,DRS					4,4,4 | ||||||
|  | DS,DSH,DSZ				4,4,4 | ||||||
|  | DZ,DZH,DZS				4,4,4 | ||||||
|  | D,DT					3,3,3 | ||||||
|  | EI,EJ,EY				0,1,X | ||||||
|  | EU						1,1,X | ||||||
|  | e						X,X,6|X,X,X | ||||||
|  | E						0,X,X | ||||||
|  | FB						7,7,7 | ||||||
|  | F						7,7,7 | ||||||
|  | G						5,5,5 | ||||||
|  | H						5,5,X | ||||||
|  | IA,IE,IO,IU				1,X,X | ||||||
|  | I						0,X,X | ||||||
|  | J						1,X,X|4,4,4 | ||||||
|  | KS						5,54,54 | ||||||
|  | KH						5,5,5 | ||||||
|  | K						5,5,5 | ||||||
|  | L						8,8,8 | ||||||
|  | MN						66,66,66 | ||||||
|  | M						6,6,6 | ||||||
|  | NM						66,66,66 | ||||||
|  | N						6,6,6 | ||||||
|  | OI,OJ,OY				0,1,X | ||||||
|  | O						0,X,X | ||||||
|  | P,PF,PH					7,7,7 | ||||||
|  | Q						5,5,5 | ||||||
|  | RZ,RS					94,94,94|4,4,4 | ||||||
|  | R						9,9,9 | ||||||
|  | SCHTSCH,SCHTSH,SCHTCH	2,4,4 | ||||||
|  | SCH						4,4,4 | ||||||
|  | SHTCH,SHCH,SHTSH		2,4,4 | ||||||
|  | SHT,SCHT,SCHD			2,43,43 | ||||||
|  | SH						4,4,4 | ||||||
|  | STCH,STSCH,SC			2,4,4 | ||||||
|  | STRZ,STRS,STSH			2,4,4 | ||||||
|  | ST						2,43,43 | ||||||
|  | SZCZ,SZCS				2,4,4 | ||||||
|  | SZT,SHD,SZD,SD			2,43,43 | ||||||
|  | SZ						4,4,4 | ||||||
|  | S						4,4,4 | ||||||
|  | TCH,TTCH,TTSCH			4,4,4 | ||||||
|  | TH						3,3,3 | ||||||
|  | TRZ,TRS					4,4,4 | ||||||
|  | TSCH,TSH				4,4,4 | ||||||
|  | TS,TTS,TTSZ,TC			4,4,4 | ||||||
|  | TZ,TTZ,TZS,TSZ			4,4,4 | ||||||
|  | t						3,3,3|4,4,4 | ||||||
|  | T						3,3,3 | ||||||
|  | UI,UJ,UY,UE				0,1,X | ||||||
|  | U						0,X,X | ||||||
|  | V						7,7,7 | ||||||
|  | W						7,7,7 | ||||||
|  | X						5,54,54 | ||||||
|  | Y						1,X,X | ||||||
|  | ZDZ,ZDZH,ZHDZH			2,4,4 | ||||||
|  | ZD,ZHD					2,43,43 | ||||||
|  | ZH,ZS,ZSCH,ZSH			4,4,4 | ||||||
|  | Z						4,4,4 | ||||||
| @@ -65,3 +65,174 @@ SELECT dmetaphone_alt('gumbo'); | |||||||
|  KMP |  KMP | ||||||
| (1 row) | (1 row) | ||||||
|  |  | ||||||
|  | -- Wovels | ||||||
|  | SELECT daitch_mokotoff('Augsburg'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {054795} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('Breuer'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {791900} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('Freud'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {793000} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | -- The letter "H" | ||||||
|  | SELECT daitch_mokotoff('Halberstadt'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {587943,587433} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('Mannheim'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {665600} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | -- Adjacent sounds | ||||||
|  | SELECT daitch_mokotoff('Chernowitz'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {596740,496740} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | -- Adjacent letters with identical adjacent code digits | ||||||
|  | SELECT daitch_mokotoff('Cherkassy'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {595400,495400} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('Kleinman'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {586660} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | -- More than one word | ||||||
|  | SELECT daitch_mokotoff('Nowy Targ'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {673950} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | -- Padded with "0" | ||||||
|  | SELECT daitch_mokotoff('Berlin'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {798600} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | -- Other examples from https://www.avotaynu.com/soundex.htm | ||||||
|  | SELECT daitch_mokotoff('Ceniow'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {567000,467000} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('Tsenyuv'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {467000} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('Holubica'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {587500,587400} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('Golubitsa'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {587400} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('Przemysl'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {794648,746480} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('Pshemeshil'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {746480} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('Rosochowaciec'); | ||||||
|  |                       daitch_mokotoff                       | ||||||
|  | ----------------------------------------------------------- | ||||||
|  |  {945755,945754,945745,945744,944755,944754,944745,944744} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('Rosokhovatsets'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {945744} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | -- Ignored characters | ||||||
|  | SELECT daitch_mokotoff('''OBrien'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {079600} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('O''Brien'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {079600} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | -- "Difficult" cases, likely to cause trouble for other implementations. | ||||||
|  | SELECT daitch_mokotoff('CJC'); | ||||||
|  |                daitch_mokotoff                | ||||||
|  | --------------------------------------------- | ||||||
|  |  {550000,540000,545000,450000,400000,440000} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('BESST'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {743000} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('BOUEY'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {710000} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('HANNMANN'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {566600} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('MCCOYJR'); | ||||||
|  |                       daitch_mokotoff                       | ||||||
|  | ----------------------------------------------------------- | ||||||
|  |  {651900,654900,654190,654490,645190,645490,641900,644900} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('ACCURSO'); | ||||||
|  |                       daitch_mokotoff                       | ||||||
|  | ----------------------------------------------------------- | ||||||
|  |  {059400,054000,054940,054400,045940,045400,049400,044000} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('BIERSCHBACH'); | ||||||
|  |                       daitch_mokotoff                       | ||||||
|  | ----------------------------------------------------------- | ||||||
|  |  {794575,794574,794750,794740,745750,745740,747500,747400} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										61
									
								
								contrib/fuzzystrmatch/expected/fuzzystrmatch_utf8.out
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										61
									
								
								contrib/fuzzystrmatch/expected/fuzzystrmatch_utf8.out
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,61 @@ | |||||||
|  | /* | ||||||
|  |  * This test must be run in a database with UTF-8 encoding, | ||||||
|  |  * because other encodings don't support all the characters used. | ||||||
|  |  */ | ||||||
|  | SELECT getdatabaseencoding() <> 'UTF8' | ||||||
|  |        AS skip_test \gset | ||||||
|  | \if :skip_test | ||||||
|  | \quit | ||||||
|  | \endif | ||||||
|  | set client_encoding = utf8; | ||||||
|  | -- CREATE EXTENSION IF NOT EXISTS fuzzystrmatch; | ||||||
|  | -- Accents | ||||||
|  | SELECT daitch_mokotoff('Müller'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {689000} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('Schäfer'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {479000} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('Straßburg'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {294795} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('Éregon'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {095600} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | -- Special characters added at https://www.jewishgen.org/InfoFiles/Soundex.html | ||||||
|  | SELECT daitch_mokotoff('gąszczu'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {564000,540000} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('brzęczy'); | ||||||
|  |         daitch_mokotoff         | ||||||
|  | ------------------------------- | ||||||
|  |  {794640,794400,746400,744000} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('ţamas'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {364000,464000} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('țamas'); | ||||||
|  |  daitch_mokotoff  | ||||||
|  | ----------------- | ||||||
|  |  {364000,464000} | ||||||
|  | (1 row) | ||||||
|  |  | ||||||
							
								
								
									
										8
									
								
								contrib/fuzzystrmatch/expected/fuzzystrmatch_utf8_1.out
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										8
									
								
								contrib/fuzzystrmatch/expected/fuzzystrmatch_utf8_1.out
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,8 @@ | |||||||
|  | /* | ||||||
|  |  * This test must be run in a database with UTF-8 encoding, | ||||||
|  |  * because other encodings don't support all the characters used. | ||||||
|  |  */ | ||||||
|  | SELECT getdatabaseencoding() <> 'UTF8' | ||||||
|  |        AS skip_test \gset | ||||||
|  | \if :skip_test | ||||||
|  | \quit | ||||||
							
								
								
									
										8
									
								
								contrib/fuzzystrmatch/fuzzystrmatch--1.1--1.2.sql
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										8
									
								
								contrib/fuzzystrmatch/fuzzystrmatch--1.1--1.2.sql
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,8 @@ | |||||||
|  | /* contrib/fuzzystrmatch/fuzzystrmatch--1.1--1.2.sql */ | ||||||
|  |  | ||||||
|  | -- complain if script is sourced in psql, rather than via ALTER EXTENSION | ||||||
|  | \echo Use "ALTER EXTENSION fuzzystrmatch UPDATE TO '1.2'" to load this file. \quit | ||||||
|  |  | ||||||
|  | CREATE FUNCTION daitch_mokotoff(text) RETURNS text[] | ||||||
|  | AS 'MODULE_PATHNAME', 'daitch_mokotoff' | ||||||
|  | LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; | ||||||
| @@ -1,6 +1,6 @@ | |||||||
| # fuzzystrmatch extension | # fuzzystrmatch extension | ||||||
| comment = 'determine similarities and distance between strings' | comment = 'determine similarities and distance between strings' | ||||||
| default_version = '1.1' | default_version = '1.2' | ||||||
| module_pathname = '$libdir/fuzzystrmatch' | module_pathname = '$libdir/fuzzystrmatch' | ||||||
| relocatable = true | relocatable = true | ||||||
| trusted = true | trusted = true | ||||||
|   | |||||||
| @@ -1,10 +1,19 @@ | |||||||
| # Copyright (c) 2022-2023, PostgreSQL Global Development Group | # Copyright (c) 2022-2023, PostgreSQL Global Development Group | ||||||
|  |  | ||||||
| fuzzystrmatch_sources = files( | fuzzystrmatch_sources = files( | ||||||
|   'fuzzystrmatch.c', |   'daitch_mokotoff.c', | ||||||
|   'dmetaphone.c', |   'dmetaphone.c', | ||||||
|  |   'fuzzystrmatch.c', | ||||||
| ) | ) | ||||||
|  |  | ||||||
|  | daitch_mokotoff_h = custom_target('daitch_mokotoff', | ||||||
|  |   input: 'daitch_mokotoff_header.pl', | ||||||
|  |   output: 'daitch_mokotoff.h', | ||||||
|  |   command: [perl, '@INPUT@', '@OUTPUT@'], | ||||||
|  | ) | ||||||
|  | generated_sources += daitch_mokotoff_h | ||||||
|  | fuzzystrmatch_sources += daitch_mokotoff_h | ||||||
|  |  | ||||||
| if host_system == 'windows' | if host_system == 'windows' | ||||||
|   fuzzystrmatch_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ |   fuzzystrmatch_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ | ||||||
|     '--NAME', 'fuzzystrmatch', |     '--NAME', 'fuzzystrmatch', | ||||||
| @@ -13,6 +22,7 @@ endif | |||||||
|  |  | ||||||
| fuzzystrmatch = shared_module('fuzzystrmatch', | fuzzystrmatch = shared_module('fuzzystrmatch', | ||||||
|   fuzzystrmatch_sources, |   fuzzystrmatch_sources, | ||||||
|  |   include_directories: include_directories('.'), | ||||||
|   kwargs: contrib_mod_args, |   kwargs: contrib_mod_args, | ||||||
| ) | ) | ||||||
| contrib_targets += fuzzystrmatch | contrib_targets += fuzzystrmatch | ||||||
| @@ -21,6 +31,7 @@ install_data( | |||||||
|   'fuzzystrmatch.control', |   'fuzzystrmatch.control', | ||||||
|   'fuzzystrmatch--1.0--1.1.sql', |   'fuzzystrmatch--1.0--1.1.sql', | ||||||
|   'fuzzystrmatch--1.1.sql', |   'fuzzystrmatch--1.1.sql', | ||||||
|  |   'fuzzystrmatch--1.1--1.2.sql', | ||||||
|   kwargs: contrib_data_args, |   kwargs: contrib_data_args, | ||||||
| ) | ) | ||||||
|  |  | ||||||
| @@ -31,6 +42,7 @@ tests += { | |||||||
|   'regress': { |   'regress': { | ||||||
|     'sql': [ |     'sql': [ | ||||||
|       'fuzzystrmatch', |       'fuzzystrmatch', | ||||||
|  |       'fuzzystrmatch_utf8', | ||||||
|     ], |     ], | ||||||
|   }, |   }, | ||||||
| } | } | ||||||
|   | |||||||
| @@ -19,3 +19,48 @@ SELECT metaphone('GUMBO', 4); | |||||||
|  |  | ||||||
| SELECT dmetaphone('gumbo'); | SELECT dmetaphone('gumbo'); | ||||||
| SELECT dmetaphone_alt('gumbo'); | SELECT dmetaphone_alt('gumbo'); | ||||||
|  |  | ||||||
|  | -- Wovels | ||||||
|  | SELECT daitch_mokotoff('Augsburg'); | ||||||
|  | SELECT daitch_mokotoff('Breuer'); | ||||||
|  | SELECT daitch_mokotoff('Freud'); | ||||||
|  |  | ||||||
|  | -- The letter "H" | ||||||
|  | SELECT daitch_mokotoff('Halberstadt'); | ||||||
|  | SELECT daitch_mokotoff('Mannheim'); | ||||||
|  |  | ||||||
|  | -- Adjacent sounds | ||||||
|  | SELECT daitch_mokotoff('Chernowitz'); | ||||||
|  |  | ||||||
|  | -- Adjacent letters with identical adjacent code digits | ||||||
|  | SELECT daitch_mokotoff('Cherkassy'); | ||||||
|  | SELECT daitch_mokotoff('Kleinman'); | ||||||
|  |  | ||||||
|  | -- More than one word | ||||||
|  | SELECT daitch_mokotoff('Nowy Targ'); | ||||||
|  |  | ||||||
|  | -- Padded with "0" | ||||||
|  | SELECT daitch_mokotoff('Berlin'); | ||||||
|  |  | ||||||
|  | -- Other examples from https://www.avotaynu.com/soundex.htm | ||||||
|  | SELECT daitch_mokotoff('Ceniow'); | ||||||
|  | SELECT daitch_mokotoff('Tsenyuv'); | ||||||
|  | SELECT daitch_mokotoff('Holubica'); | ||||||
|  | SELECT daitch_mokotoff('Golubitsa'); | ||||||
|  | SELECT daitch_mokotoff('Przemysl'); | ||||||
|  | SELECT daitch_mokotoff('Pshemeshil'); | ||||||
|  | SELECT daitch_mokotoff('Rosochowaciec'); | ||||||
|  | SELECT daitch_mokotoff('Rosokhovatsets'); | ||||||
|  |  | ||||||
|  | -- Ignored characters | ||||||
|  | SELECT daitch_mokotoff('''OBrien'); | ||||||
|  | SELECT daitch_mokotoff('O''Brien'); | ||||||
|  |  | ||||||
|  | -- "Difficult" cases, likely to cause trouble for other implementations. | ||||||
|  | SELECT daitch_mokotoff('CJC'); | ||||||
|  | SELECT daitch_mokotoff('BESST'); | ||||||
|  | SELECT daitch_mokotoff('BOUEY'); | ||||||
|  | SELECT daitch_mokotoff('HANNMANN'); | ||||||
|  | SELECT daitch_mokotoff('MCCOYJR'); | ||||||
|  | SELECT daitch_mokotoff('ACCURSO'); | ||||||
|  | SELECT daitch_mokotoff('BIERSCHBACH'); | ||||||
|   | |||||||
							
								
								
									
										26
									
								
								contrib/fuzzystrmatch/sql/fuzzystrmatch_utf8.sql
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								contrib/fuzzystrmatch/sql/fuzzystrmatch_utf8.sql
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,26 @@ | |||||||
|  | /* | ||||||
|  |  * This test must be run in a database with UTF-8 encoding, | ||||||
|  |  * because other encodings don't support all the characters used. | ||||||
|  |  */ | ||||||
|  |  | ||||||
|  | SELECT getdatabaseencoding() <> 'UTF8' | ||||||
|  |        AS skip_test \gset | ||||||
|  | \if :skip_test | ||||||
|  | \quit | ||||||
|  | \endif | ||||||
|  |  | ||||||
|  | set client_encoding = utf8; | ||||||
|  |  | ||||||
|  | -- CREATE EXTENSION IF NOT EXISTS fuzzystrmatch; | ||||||
|  |  | ||||||
|  | -- Accents | ||||||
|  | SELECT daitch_mokotoff('Müller'); | ||||||
|  | SELECT daitch_mokotoff('Schäfer'); | ||||||
|  | SELECT daitch_mokotoff('Straßburg'); | ||||||
|  | SELECT daitch_mokotoff('Éregon'); | ||||||
|  |  | ||||||
|  | -- Special characters added at https://www.jewishgen.org/InfoFiles/Soundex.html | ||||||
|  | SELECT daitch_mokotoff('gąszczu'); | ||||||
|  | SELECT daitch_mokotoff('brzęczy'); | ||||||
|  | SELECT daitch_mokotoff('ţamas'); | ||||||
|  | SELECT daitch_mokotoff('țamas'); | ||||||
| @@ -17,6 +17,8 @@ | |||||||
|    At present, the <function>soundex</function>, <function>metaphone</function>, |    At present, the <function>soundex</function>, <function>metaphone</function>, | ||||||
|    <function>dmetaphone</function>, and <function>dmetaphone_alt</function> functions do |    <function>dmetaphone</function>, and <function>dmetaphone_alt</function> functions do | ||||||
|    not work well with multibyte encodings (such as UTF-8). |    not work well with multibyte encodings (such as UTF-8). | ||||||
|  |    Use <function>daitch_mokotoff</function> | ||||||
|  |    or <function>levenshtein</function> with such data. | ||||||
|   </para> |   </para> | ||||||
|  </caution> |  </caution> | ||||||
|  |  | ||||||
| @@ -88,6 +90,159 @@ SELECT * FROM s WHERE difference(s.nm, 'john') > 2; | |||||||
| </programlisting> | </programlisting> | ||||||
|  </sect2> |  </sect2> | ||||||
|  |  | ||||||
|  |  <sect2 id="fuzzystrmatch-daitch-mokotoff"> | ||||||
|  |   <title>Daitch-Mokotoff Soundex</title> | ||||||
|  |  | ||||||
|  |   <para> | ||||||
|  |    Like the original Soundex system, Daitch-Mokotoff Soundex matches | ||||||
|  |    similar-sounding names by converting them to the same code. | ||||||
|  |    However, Daitch-Mokotoff Soundex is significantly more useful for | ||||||
|  |    non-English names than the original system. | ||||||
|  |    Major improvements over the original system include: | ||||||
|  |  | ||||||
|  |    <itemizedlist spacing="compact" mark="bullet"> | ||||||
|  |     <listitem> | ||||||
|  |      <para> | ||||||
|  |       The code is based on the first six meaningful letters rather than four. | ||||||
|  |      </para> | ||||||
|  |     </listitem> | ||||||
|  |     <listitem> | ||||||
|  |      <para> | ||||||
|  |       A letter or combination of letters maps into ten possible codes rather | ||||||
|  |       than seven. | ||||||
|  |      </para> | ||||||
|  |     </listitem> | ||||||
|  |     <listitem> | ||||||
|  |      <para> | ||||||
|  |       Where two consecutive letters have a single sound, they are coded as a | ||||||
|  |       single number. | ||||||
|  |      </para> | ||||||
|  |     </listitem> | ||||||
|  |     <listitem> | ||||||
|  |      <para> | ||||||
|  |       When a letter or combination of letters may have different sounds, | ||||||
|  |       multiple codes are emitted to cover all possibilities. | ||||||
|  |      </para> | ||||||
|  |     </listitem> | ||||||
|  |    </itemizedlist> | ||||||
|  |   </para> | ||||||
|  |  | ||||||
|  |   <indexterm> | ||||||
|  |    <primary>daitch_mokotoff</primary> | ||||||
|  |   </indexterm> | ||||||
|  |  | ||||||
|  |   <para> | ||||||
|  |    This function generates the Daitch-Mokotoff soundex codes for its input: | ||||||
|  |   </para> | ||||||
|  |  | ||||||
|  | <synopsis> | ||||||
|  | daitch_mokotoff(<parameter>source</parameter> text) returns text[] | ||||||
|  | </synopsis> | ||||||
|  |  | ||||||
|  |   <para> | ||||||
|  |    The result may contain one or more codes depending on how many plausible | ||||||
|  |    pronunciations there are, so it is represented as an array. | ||||||
|  |   </para> | ||||||
|  |  | ||||||
|  |   <para> | ||||||
|  |    Since a Daitch-Mokotoff soundex code consists of only 6 digits, | ||||||
|  |    <parameter>source</parameter> should preferably be a single word or name. | ||||||
|  |   </para> | ||||||
|  |  | ||||||
|  |   <para> | ||||||
|  |    Here are some examples: | ||||||
|  |   </para> | ||||||
|  |  | ||||||
|  | <programlisting> | ||||||
|  | SELECT daitch_mokotoff('George'); | ||||||
|  |  daitch_mokotoff | ||||||
|  | ----------------- | ||||||
|  |  {595000} | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('John'); | ||||||
|  |  daitch_mokotoff | ||||||
|  | ----------------- | ||||||
|  |  {160000,460000} | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('Bierschbach'); | ||||||
|  |                       daitch_mokotoff | ||||||
|  | ----------------------------------------------------------- | ||||||
|  |  {794575,794574,794750,794740,745750,745740,747500,747400} | ||||||
|  |  | ||||||
|  | SELECT daitch_mokotoff('Schwartzenegger'); | ||||||
|  |  daitch_mokotoff | ||||||
|  | ----------------- | ||||||
|  |  {479465} | ||||||
|  | </programlisting> | ||||||
|  |  | ||||||
|  |   <para> | ||||||
|  |    For matching of single names, returned text arrays can be matched | ||||||
|  |    directly using the <literal>&&</literal> operator: any overlap | ||||||
|  |    can be considered a match.  A GIN index may | ||||||
|  |    be used for efficiency; see <xref linkend="gin"/> and this example: | ||||||
|  |   </para> | ||||||
|  |  | ||||||
|  | <programlisting> | ||||||
|  | CREATE TABLE s (nm text); | ||||||
|  | CREATE INDEX ix_s_dm ON s USING gin (daitch_mokotoff(nm)) WITH (fastupdate = off); | ||||||
|  |  | ||||||
|  | INSERT INTO s (nm) VALUES | ||||||
|  |   ('Schwartzenegger'), | ||||||
|  |   ('John'), | ||||||
|  |   ('James'), | ||||||
|  |   ('Steinman'), | ||||||
|  |   ('Steinmetz'); | ||||||
|  |  | ||||||
|  | SELECT * FROM s WHERE daitch_mokotoff(nm) && daitch_mokotoff('Swartzenegger'); | ||||||
|  | SELECT * FROM s WHERE daitch_mokotoff(nm) && daitch_mokotoff('Jane'); | ||||||
|  | SELECT * FROM s WHERE daitch_mokotoff(nm) && daitch_mokotoff('Jens'); | ||||||
|  | </programlisting> | ||||||
|  |  | ||||||
|  |   <para> | ||||||
|  |    For indexing and matching of any number of names in any order, Full Text | ||||||
|  |    Search features can be used. See <xref linkend="textsearch"/> and this | ||||||
|  |    example: | ||||||
|  |   </para> | ||||||
|  |  | ||||||
|  | <programlisting> | ||||||
|  | CREATE FUNCTION soundex_tsvector(v_name text) RETURNS tsvector | ||||||
|  | BEGIN ATOMIC | ||||||
|  |   SELECT to_tsvector('simple', | ||||||
|  |                      string_agg(array_to_string(daitch_mokotoff(n), ' '), ' ')) | ||||||
|  |   FROM regexp_split_to_table(v_name, '\s+') AS n; | ||||||
|  | END; | ||||||
|  |  | ||||||
|  | CREATE FUNCTION soundex_tsquery(v_name text) RETURNS tsquery | ||||||
|  | BEGIN ATOMIC | ||||||
|  |   SELECT string_agg('(' || array_to_string(daitch_mokotoff(n), '|') || ')', '&')::tsquery | ||||||
|  |   FROM regexp_split_to_table(v_name, '\s+') AS n; | ||||||
|  | END; | ||||||
|  |  | ||||||
|  | CREATE TABLE s (nm text); | ||||||
|  | CREATE INDEX ix_s_txt ON s USING gin (soundex_tsvector(nm)) WITH (fastupdate = off); | ||||||
|  |  | ||||||
|  | INSERT INTO s (nm) VALUES | ||||||
|  |   ('John Doe'), | ||||||
|  |   ('Jane Roe'), | ||||||
|  |   ('Public John Q.'), | ||||||
|  |   ('George Best'), | ||||||
|  |   ('John Yamson'); | ||||||
|  |  | ||||||
|  | SELECT * FROM s WHERE soundex_tsvector(nm) @@ soundex_tsquery('john'); | ||||||
|  | SELECT * FROM s WHERE soundex_tsvector(nm) @@ soundex_tsquery('jane doe'); | ||||||
|  | SELECT * FROM s WHERE soundex_tsvector(nm) @@ soundex_tsquery('john public'); | ||||||
|  | SELECT * FROM s WHERE soundex_tsvector(nm) @@ soundex_tsquery('besst, giorgio'); | ||||||
|  | SELECT * FROM s WHERE soundex_tsvector(nm) @@ soundex_tsquery('Jameson John'); | ||||||
|  | </programlisting> | ||||||
|  |  | ||||||
|  |   <para> | ||||||
|  |    To avoid recalculating soundex codes during index rechecks, an index | ||||||
|  |    on a separate column can be used instead of an index on an | ||||||
|  |    expression.  A stored generated column can be used for this; see | ||||||
|  |    <xref linkend="ddl-generated-columns"/>. | ||||||
|  |   </para> | ||||||
|  |  </sect2> | ||||||
|  |  | ||||||
|  <sect2 id="fuzzystrmatch-levenshtein"> |  <sect2 id="fuzzystrmatch-levenshtein"> | ||||||
|   <title>Levenshtein</title> |   <title>Levenshtein</title> | ||||||
|  |  | ||||||
| @@ -104,10 +259,10 @@ SELECT * FROM s WHERE difference(s.nm, 'john') > 2; | |||||||
|   </indexterm> |   </indexterm> | ||||||
|  |  | ||||||
| <synopsis> | <synopsis> | ||||||
| levenshtein(text source, text target, int ins_cost, int del_cost, int sub_cost) returns int | levenshtein(source text, target text, ins_cost int, del_cost int, sub_cost int) returns int | ||||||
| levenshtein(text source, text target) returns int | levenshtein(source text, target text) returns int | ||||||
| levenshtein_less_equal(text source, text target, int ins_cost, int del_cost, int sub_cost, int max_d) returns int | levenshtein_less_equal(source text, target text, ins_cost int, del_cost int, sub_cost int, max_d int) returns int | ||||||
| levenshtein_less_equal(text source, text target, int max_d) returns int | levenshtein_less_equal(source text, target text, max_d int) returns int | ||||||
| </synopsis> | </synopsis> | ||||||
|  |  | ||||||
|   <para> |   <para> | ||||||
| @@ -177,7 +332,7 @@ test=# SELECT levenshtein_less_equal('extensive', 'exhaustive', 4); | |||||||
|   </indexterm> |   </indexterm> | ||||||
|  |  | ||||||
| <synopsis> | <synopsis> | ||||||
| metaphone(text source, int max_output_length) returns text | metaphone(source text, max_output_length int) returns text | ||||||
| </synopsis> | </synopsis> | ||||||
|  |  | ||||||
|   <para> |   <para> | ||||||
| @@ -220,8 +375,8 @@ test=# SELECT metaphone('GUMBO', 4); | |||||||
|   </indexterm> |   </indexterm> | ||||||
|  |  | ||||||
| <synopsis> | <synopsis> | ||||||
| dmetaphone(text source) returns text | dmetaphone(source text) returns text | ||||||
| dmetaphone_alt(text source) returns text | dmetaphone_alt(source text) returns text | ||||||
| </synopsis> | </synopsis> | ||||||
|  |  | ||||||
|   <para> |   <para> | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user