mirror of
https://github.com/postgres/postgres.git
synced 2025-04-20 00:42:27 +03:00
Add support for Daitch-Mokotoff Soundex in contrib/fuzzystrmatch.
This modernized version of Soundex works significantly better than the original, particularly for non-English names. Dag Lem, reviewed by quite a few people along the way. Discussion: https://postgr.es/m/yger1atbgfy.fsf@sid.nimrod.no
This commit is contained in:
parent
728015a470
commit
a290378a37
2
contrib/fuzzystrmatch/.gitignore
vendored
2
contrib/fuzzystrmatch/.gitignore
vendored
@ -1,3 +1,5 @@
|
|||||||
|
# Generated files
|
||||||
|
/daitch_mokotoff.h
|
||||||
# Generated subdirectories
|
# Generated subdirectories
|
||||||
/log/
|
/log/
|
||||||
/results/
|
/results/
|
||||||
|
@ -3,14 +3,17 @@
|
|||||||
MODULE_big = fuzzystrmatch
|
MODULE_big = fuzzystrmatch
|
||||||
OBJS = \
|
OBJS = \
|
||||||
$(WIN32RES) \
|
$(WIN32RES) \
|
||||||
|
daitch_mokotoff.o \
|
||||||
dmetaphone.o \
|
dmetaphone.o \
|
||||||
fuzzystrmatch.o
|
fuzzystrmatch.o
|
||||||
|
|
||||||
EXTENSION = fuzzystrmatch
|
EXTENSION = fuzzystrmatch
|
||||||
DATA = fuzzystrmatch--1.1.sql fuzzystrmatch--1.0--1.1.sql
|
DATA = fuzzystrmatch--1.1.sql fuzzystrmatch--1.1--1.2.sql \
|
||||||
|
fuzzystrmatch--1.0--1.1.sql
|
||||||
|
|
||||||
PGFILEDESC = "fuzzystrmatch - similarities and distance between strings"
|
PGFILEDESC = "fuzzystrmatch - similarities and distance between strings"
|
||||||
|
|
||||||
REGRESS = fuzzystrmatch
|
REGRESS = fuzzystrmatch fuzzystrmatch_utf8
|
||||||
|
|
||||||
ifdef USE_PGXS
|
ifdef USE_PGXS
|
||||||
PG_CONFIG = pg_config
|
PG_CONFIG = pg_config
|
||||||
@ -22,3 +25,16 @@ top_builddir = ../..
|
|||||||
include $(top_builddir)/src/Makefile.global
|
include $(top_builddir)/src/Makefile.global
|
||||||
include $(top_srcdir)/contrib/contrib-global.mk
|
include $(top_srcdir)/contrib/contrib-global.mk
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
# Force this dependency to be known even without dependency info built:
|
||||||
|
daitch_mokotoff.o: daitch_mokotoff.h
|
||||||
|
|
||||||
|
daitch_mokotoff.h: daitch_mokotoff_header.pl
|
||||||
|
$(PERL) $< $@
|
||||||
|
|
||||||
|
# daitch_mokotoff.h is included in tarballs, so it has to be made by
|
||||||
|
# "distprep" and not cleaned except by "maintainer-clean".
|
||||||
|
distprep: daitch_mokotoff.h
|
||||||
|
|
||||||
|
maintainer-clean:
|
||||||
|
rm -f daitch_mokotoff.h
|
||||||
|
577
contrib/fuzzystrmatch/daitch_mokotoff.c
Normal file
577
contrib/fuzzystrmatch/daitch_mokotoff.c
Normal file
@ -0,0 +1,577 @@
|
|||||||
|
/*
|
||||||
|
* Daitch-Mokotoff Soundex
|
||||||
|
*
|
||||||
|
* Copyright (c) 2023, PostgreSQL Global Development Group
|
||||||
|
*
|
||||||
|
* This module was originally sponsored by Finance Norway /
|
||||||
|
* Trafikkforsikringsforeningen, and implemented by Dag Lem <dag@nimrod.no>
|
||||||
|
*
|
||||||
|
* The implementation of the Daitch-Mokotoff Soundex System aims at correctness
|
||||||
|
* and high performance, and can be summarized as follows:
|
||||||
|
*
|
||||||
|
* - The processing of each phoneme is initiated by an O(1) table lookup.
|
||||||
|
* - For phonemes containing more than one character, a coding tree is traversed
|
||||||
|
* to process the complete phoneme.
|
||||||
|
* - The (alternate) soundex codes are produced digit by digit in-place in
|
||||||
|
* another tree structure.
|
||||||
|
*
|
||||||
|
* References:
|
||||||
|
*
|
||||||
|
* https://www.avotaynu.com/soundex.htm
|
||||||
|
* https://www.jewishgen.org/InfoFiles/Soundex.html
|
||||||
|
* https://familypedia.fandom.com/wiki/Daitch-Mokotoff_Soundex
|
||||||
|
* https://stevemorse.org/census/soundex.html (dmlat.php, dmsoundex.php)
|
||||||
|
* https://github.com/apache/commons-codec/ (dmrules.txt, DaitchMokotoffSoundex.java)
|
||||||
|
* https://metacpan.org/pod/Text::Phonetic (DaitchMokotoff.pm)
|
||||||
|
*
|
||||||
|
* A few notes on other implementations:
|
||||||
|
*
|
||||||
|
* - All other known implementations have the same unofficial rules for "UE",
|
||||||
|
* these are also adopted by this implementation (0, 1, NC).
|
||||||
|
* - The only other known implementation which is capable of generating all
|
||||||
|
* correct soundex codes in all cases is the JOS Soundex Calculator at
|
||||||
|
* https://www.jewishgen.org/jos/jossound.htm
|
||||||
|
* - "J" is considered (only) a vowel in dmlat.php
|
||||||
|
* - The official rules for "RS" are commented out in dmlat.php
|
||||||
|
* - Identical code digits for adjacent letters are not collapsed correctly in
|
||||||
|
* dmsoundex.php when double digit codes are involved. E.g. "BESST" yields
|
||||||
|
* 744300 instead of 743000 as for "BEST".
|
||||||
|
* - "J" is considered (only) a consonant in DaitchMokotoffSoundex.java
|
||||||
|
* - "Y" is not considered a vowel in DaitchMokotoffSoundex.java
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "postgres.h"
|
||||||
|
|
||||||
|
#include "catalog/pg_type.h"
|
||||||
|
#include "mb/pg_wchar.h"
|
||||||
|
#include "utils/array.h"
|
||||||
|
#include "utils/builtins.h"
|
||||||
|
#include "utils/memutils.h"
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The soundex coding chart table is adapted from
|
||||||
|
* https://www.jewishgen.org/InfoFiles/Soundex.html
|
||||||
|
* See daitch_mokotoff_header.pl for details.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Generated coding chart table */
|
||||||
|
#include "daitch_mokotoff.h"
|
||||||
|
|
||||||
|
#define DM_CODE_DIGITS 6
|
||||||
|
|
||||||
|
/* Node in soundex code tree */
|
||||||
|
typedef struct dm_node
|
||||||
|
{
|
||||||
|
int soundex_length; /* Length of generated soundex code */
|
||||||
|
char soundex[DM_CODE_DIGITS]; /* Soundex code */
|
||||||
|
int is_leaf; /* Candidate for complete soundex code */
|
||||||
|
int last_update; /* Letter number for last update of node */
|
||||||
|
char code_digit; /* Last code digit, 0 - 9 */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* One or two alternate code digits leading to this node. If there are two
|
||||||
|
* digits, one of them is always an 'X'. Repeated code digits and 'X' lead
|
||||||
|
* back to the same node.
|
||||||
|
*/
|
||||||
|
char prev_code_digits[2];
|
||||||
|
/* One or two alternate code digits moving forward. */
|
||||||
|
char next_code_digits[2];
|
||||||
|
/* ORed together code index(es) used to reach current node. */
|
||||||
|
int prev_code_index;
|
||||||
|
int next_code_index;
|
||||||
|
/* Possible nodes branching out from this node - digits 0-9. */
|
||||||
|
struct dm_node *children[10];
|
||||||
|
/* Next node in linked list. Alternating index for each iteration. */
|
||||||
|
struct dm_node *next[2];
|
||||||
|
} dm_node;
|
||||||
|
|
||||||
|
/* Template for new node in soundex code tree. */
|
||||||
|
static const dm_node start_node = {
|
||||||
|
.soundex_length = 0,
|
||||||
|
.soundex = "000000", /* Six digits */
|
||||||
|
.is_leaf = 0,
|
||||||
|
.last_update = 0,
|
||||||
|
.code_digit = '\0',
|
||||||
|
.prev_code_digits = {'\0', '\0'},
|
||||||
|
.next_code_digits = {'\0', '\0'},
|
||||||
|
.prev_code_index = 0,
|
||||||
|
.next_code_index = 0,
|
||||||
|
.children = {NULL},
|
||||||
|
.next = {NULL}
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Dummy soundex codes at end of input. */
|
||||||
|
static const dm_codes end_codes[2] =
|
||||||
|
{
|
||||||
|
{
|
||||||
|
"X", "X", "X"
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
 * Mapping from ISO8859-1 to upper-case ASCII, covering the range 0x60..0xFF.
 *
 * The table is indexed by (code point - 0x60) and must therefore hold exactly
 * 0x100 - 0x60 = 160 characters.  It is laid out as ten rows of 16 code
 * points so that the position of every entry can be checked by eye; code
 * points with no useful ASCII equivalent (DEL, the C1 controls, NBSP and most
 * of the 0xA2..0xBE symbols) map to a blank.
 */
static const char iso8859_1_to_ascii_upper[] =
	"`ABCDEFGHIJKLMNO"			/* 0x60: ` a-o */
	"PQRSTUVWXYZ{|}~ "			/* 0x70: p-z { | } ~ DEL */
	"                "			/* 0x80: C1 controls */
	"                "			/* 0x90: C1 controls */
	" !              "			/* 0xA0: NBSP, inverted '!', symbols */
	"               ?"			/* 0xB0: symbols, inverted '?' */
	"AAAAAAECEEEEIIII"			/* 0xC0: A-grave .. I-diaeresis */
	"DNOOOOO*OUUUUYDS"			/* 0xD0: Eth .. sharp s */
	"AAAAAAECEEEEIIII"			/* 0xE0: a-grave .. i-diaeresis */
	"DNOOOOO/OUUUUYDY";			/* 0xF0: eth .. y-diaeresis */
|
||||||
|
|
||||||
|
/* Internal C implementation */
|
||||||
|
static bool daitch_mokotoff_coding(const char *word, ArrayBuildState *soundex);
|
||||||
|
|
||||||
|
|
||||||
|
PG_FUNCTION_INFO_V1(daitch_mokotoff);
|
||||||
|
|
||||||
|
Datum
|
||||||
|
daitch_mokotoff(PG_FUNCTION_ARGS)
|
||||||
|
{
|
||||||
|
text *arg = PG_GETARG_TEXT_PP(0);
|
||||||
|
Datum retval;
|
||||||
|
char *string;
|
||||||
|
ArrayBuildState *soundex;
|
||||||
|
MemoryContext old_ctx,
|
||||||
|
tmp_ctx;
|
||||||
|
|
||||||
|
/* Work in a temporary context to simplify cleanup. */
|
||||||
|
tmp_ctx = AllocSetContextCreate(CurrentMemoryContext,
|
||||||
|
"daitch_mokotoff temporary context",
|
||||||
|
ALLOCSET_DEFAULT_SIZES);
|
||||||
|
old_ctx = MemoryContextSwitchTo(tmp_ctx);
|
||||||
|
|
||||||
|
/* We must convert the string to UTF-8 if it isn't already. */
|
||||||
|
string = pg_server_to_any(text_to_cstring(arg), VARSIZE_ANY_EXHDR(arg),
|
||||||
|
PG_UTF8);
|
||||||
|
|
||||||
|
/* The result is built in this ArrayBuildState. */
|
||||||
|
soundex = initArrayResult(TEXTOID, tmp_ctx, false);
|
||||||
|
|
||||||
|
if (!daitch_mokotoff_coding(string, soundex))
|
||||||
|
{
|
||||||
|
/* No encodable characters in input */
|
||||||
|
MemoryContextSwitchTo(old_ctx);
|
||||||
|
MemoryContextDelete(tmp_ctx);
|
||||||
|
PG_RETURN_NULL();
|
||||||
|
}
|
||||||
|
|
||||||
|
retval = makeArrayResult(soundex, old_ctx);
|
||||||
|
|
||||||
|
MemoryContextSwitchTo(old_ctx);
|
||||||
|
MemoryContextDelete(tmp_ctx);
|
||||||
|
|
||||||
|
PG_RETURN_DATUM(retval);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Initialize soundex code tree node for next code digit. */
|
||||||
|
static void
|
||||||
|
initialize_node(dm_node *node, int last_update)
|
||||||
|
{
|
||||||
|
if (node->last_update < last_update)
|
||||||
|
{
|
||||||
|
node->prev_code_digits[0] = node->next_code_digits[0];
|
||||||
|
node->prev_code_digits[1] = node->next_code_digits[1];
|
||||||
|
node->next_code_digits[0] = '\0';
|
||||||
|
node->next_code_digits[1] = '\0';
|
||||||
|
node->prev_code_index = node->next_code_index;
|
||||||
|
node->next_code_index = 0;
|
||||||
|
node->is_leaf = 0;
|
||||||
|
node->last_update = last_update;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Update soundex code tree node with next code digit. */
|
||||||
|
static void
|
||||||
|
add_next_code_digit(dm_node *node, int code_index, char code_digit)
|
||||||
|
{
|
||||||
|
/* OR in index 1 or 2. */
|
||||||
|
node->next_code_index |= code_index;
|
||||||
|
|
||||||
|
if (!node->next_code_digits[0])
|
||||||
|
node->next_code_digits[0] = code_digit;
|
||||||
|
else if (node->next_code_digits[0] != code_digit)
|
||||||
|
node->next_code_digits[1] = code_digit;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Mark soundex code tree node as leaf. */
|
||||||
|
static void
|
||||||
|
set_leaf(dm_node *first_node[2], dm_node *last_node[2],
|
||||||
|
dm_node *node, int ix_node)
|
||||||
|
{
|
||||||
|
if (!node->is_leaf)
|
||||||
|
{
|
||||||
|
node->is_leaf = 1;
|
||||||
|
|
||||||
|
if (first_node[ix_node] == NULL)
|
||||||
|
first_node[ix_node] = node;
|
||||||
|
else
|
||||||
|
last_node[ix_node]->next[ix_node] = node;
|
||||||
|
|
||||||
|
last_node[ix_node] = node;
|
||||||
|
node->next[ix_node] = NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Find next node corresponding to code digit, or create a new node. */
|
||||||
|
static dm_node *
|
||||||
|
find_or_create_child_node(dm_node *parent, char code_digit,
|
||||||
|
ArrayBuildState *soundex)
|
||||||
|
{
|
||||||
|
int i = code_digit - '0';
|
||||||
|
dm_node **nodes = parent->children;
|
||||||
|
dm_node *node = nodes[i];
|
||||||
|
|
||||||
|
if (node)
|
||||||
|
{
|
||||||
|
/* Found existing child node. Skip completed nodes. */
|
||||||
|
return node->soundex_length < DM_CODE_DIGITS ? node : NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Create new child node. */
|
||||||
|
node = palloc_object(dm_node);
|
||||||
|
nodes[i] = node;
|
||||||
|
|
||||||
|
*node = start_node;
|
||||||
|
memcpy(node->soundex, parent->soundex, sizeof(parent->soundex));
|
||||||
|
node->soundex_length = parent->soundex_length;
|
||||||
|
node->soundex[node->soundex_length++] = code_digit;
|
||||||
|
node->code_digit = code_digit;
|
||||||
|
node->next_code_index = node->prev_code_index;
|
||||||
|
|
||||||
|
if (node->soundex_length < DM_CODE_DIGITS)
|
||||||
|
{
|
||||||
|
return node;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* Append completed soundex code to output array. */
|
||||||
|
text *out = cstring_to_text_with_len(node->soundex,
|
||||||
|
DM_CODE_DIGITS);
|
||||||
|
|
||||||
|
accumArrayResult(soundex,
|
||||||
|
PointerGetDatum(out),
|
||||||
|
false,
|
||||||
|
TEXTOID,
|
||||||
|
CurrentMemoryContext);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Update node for next code digit(s). */
|
||||||
|
static void
|
||||||
|
update_node(dm_node *first_node[2], dm_node *last_node[2],
|
||||||
|
dm_node *node, int ix_node,
|
||||||
|
int letter_no, int prev_code_index, int next_code_index,
|
||||||
|
const char *next_code_digits, int digit_no,
|
||||||
|
ArrayBuildState *soundex)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
char next_code_digit = next_code_digits[digit_no];
|
||||||
|
int num_dirty_nodes = 0;
|
||||||
|
dm_node *dirty_nodes[2];
|
||||||
|
|
||||||
|
initialize_node(node, letter_no);
|
||||||
|
|
||||||
|
if (node->prev_code_index && !(node->prev_code_index & prev_code_index))
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* If the sound (vowel / consonant) of this letter encoding doesn't
|
||||||
|
* correspond to the coding index of the previous letter, we skip this
|
||||||
|
* letter encoding. Note that currently, only "J" can be either a
|
||||||
|
* vowel or a consonant.
|
||||||
|
*/
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (next_code_digit == 'X' ||
|
||||||
|
(digit_no == 0 &&
|
||||||
|
(node->prev_code_digits[0] == next_code_digit ||
|
||||||
|
node->prev_code_digits[1] == next_code_digit)))
|
||||||
|
{
|
||||||
|
/* The code digit is the same as one of the previous (i.e. not added). */
|
||||||
|
dirty_nodes[num_dirty_nodes++] = node;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (next_code_digit != 'X' &&
|
||||||
|
(digit_no > 0 ||
|
||||||
|
node->prev_code_digits[0] != next_code_digit ||
|
||||||
|
node->prev_code_digits[1]))
|
||||||
|
{
|
||||||
|
/* The code digit is different from one of the previous (i.e. added). */
|
||||||
|
node = find_or_create_child_node(node, next_code_digit, soundex);
|
||||||
|
if (node)
|
||||||
|
{
|
||||||
|
initialize_node(node, letter_no);
|
||||||
|
dirty_nodes[num_dirty_nodes++] = node;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = 0; i < num_dirty_nodes; i++)
|
||||||
|
{
|
||||||
|
/* Add code digit leading to the current node. */
|
||||||
|
add_next_code_digit(dirty_nodes[i], next_code_index, next_code_digit);
|
||||||
|
|
||||||
|
if (next_code_digits[++digit_no])
|
||||||
|
{
|
||||||
|
update_node(first_node, last_node, dirty_nodes[i], ix_node,
|
||||||
|
letter_no, prev_code_index, next_code_index,
|
||||||
|
next_code_digits, digit_no,
|
||||||
|
soundex);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* Add incomplete leaf node to linked list. */
|
||||||
|
set_leaf(first_node, last_node, dirty_nodes[i], ix_node);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Update soundex tree leaf nodes. */
|
||||||
|
static void
|
||||||
|
update_leaves(dm_node *first_node[2], int *ix_node, int letter_no,
|
||||||
|
const dm_codes *codes, const dm_codes *next_codes,
|
||||||
|
ArrayBuildState *soundex)
|
||||||
|
{
|
||||||
|
int i,
|
||||||
|
j,
|
||||||
|
code_index;
|
||||||
|
dm_node *node,
|
||||||
|
*last_node[2];
|
||||||
|
const dm_code *code,
|
||||||
|
*next_code;
|
||||||
|
int ix_node_next = (*ix_node + 1) & 1; /* Alternating index: 0, 1 */
|
||||||
|
|
||||||
|
/* Initialize for new linked list of leaves. */
|
||||||
|
first_node[ix_node_next] = NULL;
|
||||||
|
last_node[ix_node_next] = NULL;
|
||||||
|
|
||||||
|
/* Process all nodes. */
|
||||||
|
for (node = first_node[*ix_node]; node; node = node->next[*ix_node])
|
||||||
|
{
|
||||||
|
/* One or two alternate code sequences. */
|
||||||
|
for (i = 0; i < 2 && (code = codes[i]) && code[0][0]; i++)
|
||||||
|
{
|
||||||
|
/* Coding for previous letter - before vowel: 1, all other: 2 */
|
||||||
|
int prev_code_index = (code[0][0] > '1') + 1;
|
||||||
|
|
||||||
|
/* One or two alternate next code sequences. */
|
||||||
|
for (j = 0; j < 2 && (next_code = next_codes[j]) && next_code[0][0]; j++)
|
||||||
|
{
|
||||||
|
/* Determine which code to use. */
|
||||||
|
if (letter_no == 0)
|
||||||
|
{
|
||||||
|
/* This is the first letter. */
|
||||||
|
code_index = 0;
|
||||||
|
}
|
||||||
|
else if (next_code[0][0] <= '1')
|
||||||
|
{
|
||||||
|
/* The next letter is a vowel. */
|
||||||
|
code_index = 1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* All other cases. */
|
||||||
|
code_index = 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* One or two sequential code digits. */
|
||||||
|
update_node(first_node, last_node, node, ix_node_next,
|
||||||
|
letter_no, prev_code_index, code_index,
|
||||||
|
code[code_index], 0,
|
||||||
|
soundex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
*ix_node = ix_node_next;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Return next character, converted from UTF-8 to uppercase ASCII.
|
||||||
|
* *ix is the current string index and is incremented by the character length.
|
||||||
|
*/
|
||||||
|
static char
|
||||||
|
read_char(const unsigned char *str, int *ix)
|
||||||
|
{
|
||||||
|
/* Substitute character for skipped code points. */
|
||||||
|
const char na = '\x1a';
|
||||||
|
pg_wchar c;
|
||||||
|
|
||||||
|
/* Decode UTF-8 character to ISO 10646 code point. */
|
||||||
|
str += *ix;
|
||||||
|
c = utf8_to_unicode(str);
|
||||||
|
|
||||||
|
/* Advance *ix, but (for safety) not if we've reached end of string. */
|
||||||
|
if (c)
|
||||||
|
*ix += pg_utf_mblen(str);
|
||||||
|
|
||||||
|
/* Convert. */
|
||||||
|
if (c >= (unsigned char) '[' && c <= (unsigned char) ']')
|
||||||
|
{
|
||||||
|
/* ASCII characters [, \, and ] are reserved for Ą, Ę, and Ţ/Ț. */
|
||||||
|
return na;
|
||||||
|
}
|
||||||
|
else if (c < 0x60)
|
||||||
|
{
|
||||||
|
/* Other non-lowercase ASCII characters can be used as-is. */
|
||||||
|
return (char) c;
|
||||||
|
}
|
||||||
|
else if (c < 0x100)
|
||||||
|
{
|
||||||
|
/* ISO-8859-1 code point; convert to upper-case ASCII via table. */
|
||||||
|
return iso8859_1_to_ascii_upper[c - 0x60];
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* Conversion of non-ASCII characters in the coding chart. */
|
||||||
|
switch (c)
|
||||||
|
{
|
||||||
|
case 0x0104:
|
||||||
|
case 0x0105:
|
||||||
|
/* Ą/ą */
|
||||||
|
return '[';
|
||||||
|
case 0x0118:
|
||||||
|
case 0x0119:
|
||||||
|
/* Ę/ę */
|
||||||
|
return '\\';
|
||||||
|
case 0x0162:
|
||||||
|
case 0x0163:
|
||||||
|
case 0x021A:
|
||||||
|
case 0x021B:
|
||||||
|
/* Ţ/ţ or Ț/ț */
|
||||||
|
return ']';
|
||||||
|
default:
|
||||||
|
return na;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Return the next character in the range 'A' .. ']' as produced by
 * read_char(), discarding any characters outside that range.  Returns '\0'
 * once read_char() returns '\0'.
 */
static char
read_valid_char(const char *str, int *ix)
{
	for (;;)
	{
		char		ch = read_char((const unsigned char *) str, ix);

		if (ch == '\0' || (ch >= 'A' && ch <= ']'))
			return ch;
	}
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Return sound coding for "letter" (letter sequence) */
|
||||||
|
static const dm_codes *
|
||||||
|
read_letter(const char *str, int *ix)
|
||||||
|
{
|
||||||
|
char c,
|
||||||
|
cmp;
|
||||||
|
int i,
|
||||||
|
j;
|
||||||
|
const dm_letter *letters;
|
||||||
|
const dm_codes *codes;
|
||||||
|
|
||||||
|
/* First letter in sequence. */
|
||||||
|
if ((c = read_valid_char(str, ix)) == '\0')
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
letters = &letter_[c - 'A'];
|
||||||
|
codes = letters->codes;
|
||||||
|
i = *ix;
|
||||||
|
|
||||||
|
/* Any subsequent letters in sequence. */
|
||||||
|
while ((letters = letters->letters) && (c = read_valid_char(str, &i)))
|
||||||
|
{
|
||||||
|
for (j = 0; (cmp = letters[j].letter); j++)
|
||||||
|
{
|
||||||
|
if (cmp == c)
|
||||||
|
{
|
||||||
|
/* Letter found. */
|
||||||
|
letters = &letters[j];
|
||||||
|
if (letters->codes)
|
||||||
|
{
|
||||||
|
/* Coding for letter sequence found. */
|
||||||
|
codes = letters->codes;
|
||||||
|
*ix = i;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!cmp)
|
||||||
|
{
|
||||||
|
/* The sequence of letters has no coding. */
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return codes;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Generate all Daitch-Mokotoff soundex codes for word,
|
||||||
|
* adding them to the "soundex" ArrayBuildState.
|
||||||
|
* Returns false if string has no encodable characters, else true.
|
||||||
|
*/
|
||||||
|
static bool
|
||||||
|
daitch_mokotoff_coding(const char *word, ArrayBuildState *soundex)
|
||||||
|
{
|
||||||
|
int i = 0;
|
||||||
|
int letter_no = 0;
|
||||||
|
int ix_node = 0;
|
||||||
|
const dm_codes *codes,
|
||||||
|
*next_codes;
|
||||||
|
dm_node *first_node[2],
|
||||||
|
*node;
|
||||||
|
|
||||||
|
/* First letter. */
|
||||||
|
if (!(codes = read_letter(word, &i)))
|
||||||
|
{
|
||||||
|
/* No encodable character in input. */
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Starting point. */
|
||||||
|
first_node[ix_node] = palloc_object(dm_node);
|
||||||
|
*first_node[ix_node] = start_node;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Loop until either the word input is exhausted, or all generated soundex
|
||||||
|
* codes are completed to six digits.
|
||||||
|
*/
|
||||||
|
while (codes && first_node[ix_node])
|
||||||
|
{
|
||||||
|
next_codes = read_letter(word, &i);
|
||||||
|
|
||||||
|
/* Update leaf nodes. */
|
||||||
|
update_leaves(first_node, &ix_node, letter_no,
|
||||||
|
codes, next_codes ? next_codes : end_codes,
|
||||||
|
soundex);
|
||||||
|
|
||||||
|
codes = next_codes;
|
||||||
|
letter_no++;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Append all remaining (incomplete) soundex codes to output array. */
|
||||||
|
for (node = first_node[ix_node]; node; node = node->next[ix_node])
|
||||||
|
{
|
||||||
|
text *out = cstring_to_text_with_len(node->soundex,
|
||||||
|
DM_CODE_DIGITS);
|
||||||
|
|
||||||
|
accumArrayResult(soundex,
|
||||||
|
PointerGetDatum(out),
|
||||||
|
false,
|
||||||
|
TEXTOID,
|
||||||
|
CurrentMemoryContext);
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
223
contrib/fuzzystrmatch/daitch_mokotoff_header.pl
Executable file
223
contrib/fuzzystrmatch/daitch_mokotoff_header.pl
Executable file
@ -0,0 +1,223 @@
|
|||||||
|
#!/usr/bin/perl
|
||||||
|
#
|
||||||
|
# Generation of types and lookup tables for Daitch-Mokotoff soundex.
|
||||||
|
#
|
||||||
|
# Copyright (c) 2023, PostgreSQL Global Development Group
|
||||||
|
#
|
||||||
|
# This module was originally sponsored by Finance Norway /
|
||||||
|
# Trafikkforsikringsforeningen, and implemented by Dag Lem <dag@nimrod.no>
|
||||||
|
#
|
||||||
|
|
||||||
|
use strict;
|
||||||
|
use warnings;
|
||||||
|
|
||||||
|
use utf8;
|
||||||
|
use open IO => ':utf8', ':std';
|
||||||
|
use Data::Dumper;
|
||||||
|
|
||||||
|
die "Usage: $0 OUTPUT_FILE\n" if @ARGV != 1;
|
||||||
|
my $output_file = $ARGV[0];
|
||||||
|
|
||||||
|
# Open the output file
|
||||||
|
open my $OUTPUT, '>', $output_file
|
||||||
|
or die "Could not open output file $output_file: $!\n";
|
||||||
|
|
||||||
|
# Read the coding chart from the DATA section.  For each line, the C
# initializer text for its codes is stored in %codes, and every letter
# combination on the line is entered into the letter transition tree.
my %codes;
my $table = [ {}, [ [ "", "", "" ] ] ];
while (my $line = <DATA>)
{
	chomp $line;
	my ($letter_list, $code_spec) = split(/\s+/, $line);

	# Alternatives are separated by "|", the codes within one by ",".
	my @alternatives = map { [ split(/,/, $_) ] } split(/\|/, $code_spec);

	# Name of the C array holding this set of codes.
	my $key =
	  "codes_" . join("_or_", map { join("_", @$_) } @alternatives);

	# One brace-enclosed initializer per alternative.
	my @initializers;
	for my $alt (@alternatives)
	{
		my $quoted = join(", ", map { "\"$_\"" } @$alt);
		push(@initializers, "\t{\n\t\t$quoted\n\t}");
	}
	$codes{$key} = join(",\n", @initializers);

	for my $letter (split(/,/, $letter_list))
	{
		my @chars = split(//, $letter);
		my $final = pop(@chars);

		# Walk (creating as needed) one tree level per leading character.
		my $level = $table->[0];
		for my $ch (@chars)
		{
			$level->{$ch} //= [ {}, undef ];
			$level->{$ch}[0] //= {};
			$level = $level->{$ch}[0];
		}

		# The codes belong to the last character of the combination.
		$level->{$final}[1] = $key;
	}
}
close(DATA);
|
||||||
|
|
||||||
|
print $OUTPUT <<EOF;
|
||||||
|
/*
|
||||||
|
* Constants and lookup tables for Daitch-Mokotoff Soundex
|
||||||
|
*
|
||||||
|
* Copyright (c) 2023, PostgreSQL Global Development Group
|
||||||
|
*
|
||||||
|
* This file is generated by daitch_mokotoff_header.pl
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Coding chart table: Soundex codes */
|
||||||
|
typedef char dm_code[2 + 1]; /* One or two sequential code digits + NUL */
|
||||||
|
typedef dm_code dm_codes[3]; /* Start of name, before a vowel, any other */
|
||||||
|
|
||||||
|
/* Coding chart table: Letter in input sequence */
|
||||||
|
struct dm_letter
|
||||||
|
{
|
||||||
|
char letter; /* Present letter in sequence */
|
||||||
|
const struct dm_letter *letters; /* List of possible successive letters */
|
||||||
|
const dm_codes *codes; /* Code sequence(s) for complete sequence */
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef struct dm_letter dm_letter;
|
||||||
|
|
||||||
|
/* Codes for letter sequence at start of name, before a vowel, and any other. */
|
||||||
|
EOF
|
||||||
|
|
||||||
|
for my $key (sort keys %codes)
|
||||||
|
{
|
||||||
|
print $OUTPUT "static const dm_codes $key\[2\] =\n{\n"
|
||||||
|
. $codes{$key}
|
||||||
|
. "\n};\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
print $OUTPUT <<EOF;
|
||||||
|
|
||||||
|
/* Coding for alternative following letters in sequence. */
|
||||||
|
EOF
|
||||||
|
|
||||||
|
# Emit the C array "letter_<prefix>" for one level of the letter tree.
#
# $ref is a [ successors-hash, codes-name ] pair and $letter is the prefix
# of characters leading to it.  Arrays for deeper levels are emitted first,
# so that each array only refers to arrays already written out.
sub hash2code
{
	my ($ref, $letter) = @_;

	my $successors = $ref->[0];
	my @entries;

	for my $key (sort keys %$successors)
	{
		my $entry = $successors->{$key};
		my $name = "$letter$key";

		my $children = "NULL";
		if (defined $entry->[0])
		{
			hash2code($entry, $name);
			$children = "letter_$name";
		}

		my $codes = $entry->[1] // "NULL";
		push(@entries, "\t{\n\t\t'$key', $children, $codes\n\t}");
	}

	print $OUTPUT "static const dm_letter letter_$letter\[\] =\n{\n";
	print $OUTPUT "$_,\n" for @entries;

	# Terminating entry with letter '\0'.
	print $OUTPUT "\t{\n\t\t'\\0'\n\t}\n";
	print $OUTPUT "};\n";
}
|
||||||
|
|
||||||
|
# Emit the letter tables, starting from the root of the tree.
hash2code($table, '');

# Buffered output may only be written out when the handle is closed, so a
# failed write (e.g. a full disk) can surface here.  Fail loudly rather than
# silently leaving a truncated header behind.
close $OUTPUT
  or die "Could not close output file $output_file: $!\n";
|
||||||
|
|
||||||
|
# Table adapted from https://www.jewishgen.org/InfoFiles/Soundex.html
|
||||||
|
#
|
||||||
|
# The conversion from the coding chart to the table should be self
|
||||||
|
# explanatory, but note the differences stated below.
|
||||||
|
#
|
||||||
|
# X = NC (not coded)
|
||||||
|
#
|
||||||
|
# The non-ASCII letters in the coding chart are coded with substitute
|
||||||
|
# lowercase ASCII letters, which sort after the uppercase ASCII letters:
|
||||||
|
#
|
||||||
|
# Ą => a (use '[' for table lookup)
|
||||||
|
# Ę => e (use '\\' for table lookup)
|
||||||
|
# Ţ => t (use ']' for table lookup)
|
||||||
|
#
|
||||||
|
# The rule for "UE" does not correspond to the coding chart, however
|
||||||
|
# it is used by all other known implementations, including the one at
|
||||||
|
# https://www.jewishgen.org/jos/jossound.htm (try e.g. "bouey").
|
||||||
|
#
|
||||||
|
# Note that the implementation assumes that vowels are assigned code
|
||||||
|
# 0 or 1. "J" can be either a vowel or a consonant.
|
||||||
|
#
|
||||||
|
|
||||||
|
__DATA__
|
||||||
|
AI,AJ,AY 0,1,X
|
||||||
|
AU 0,7,X
|
||||||
|
a X,X,6|X,X,X
|
||||||
|
A 0,X,X
|
||||||
|
B 7,7,7
|
||||||
|
CHS 5,54,54
|
||||||
|
CH 5,5,5|4,4,4
|
||||||
|
CK 5,5,5|45,45,45
|
||||||
|
CZ,CS,CSZ,CZS 4,4,4
|
||||||
|
C 5,5,5|4,4,4
|
||||||
|
DRZ,DRS 4,4,4
|
||||||
|
DS,DSH,DSZ 4,4,4
|
||||||
|
DZ,DZH,DZS 4,4,4
|
||||||
|
D,DT 3,3,3
|
||||||
|
EI,EJ,EY 0,1,X
|
||||||
|
EU 1,1,X
|
||||||
|
e X,X,6|X,X,X
|
||||||
|
E 0,X,X
|
||||||
|
FB 7,7,7
|
||||||
|
F 7,7,7
|
||||||
|
G 5,5,5
|
||||||
|
H 5,5,X
|
||||||
|
IA,IE,IO,IU 1,X,X
|
||||||
|
I 0,X,X
|
||||||
|
J 1,X,X|4,4,4
|
||||||
|
KS 5,54,54
|
||||||
|
KH 5,5,5
|
||||||
|
K 5,5,5
|
||||||
|
L 8,8,8
|
||||||
|
MN 66,66,66
|
||||||
|
M 6,6,6
|
||||||
|
NM 66,66,66
|
||||||
|
N 6,6,6
|
||||||
|
OI,OJ,OY 0,1,X
|
||||||
|
O 0,X,X
|
||||||
|
P,PF,PH 7,7,7
|
||||||
|
Q 5,5,5
|
||||||
|
RZ,RS 94,94,94|4,4,4
|
||||||
|
R 9,9,9
|
||||||
|
SCHTSCH,SCHTSH,SCHTCH 2,4,4
|
||||||
|
SCH 4,4,4
|
||||||
|
SHTCH,SHCH,SHTSH 2,4,4
|
||||||
|
SHT,SCHT,SCHD 2,43,43
|
||||||
|
SH 4,4,4
|
||||||
|
STCH,STSCH,SC 2,4,4
|
||||||
|
STRZ,STRS,STSH 2,4,4
|
||||||
|
ST 2,43,43
|
||||||
|
SZCZ,SZCS 2,4,4
|
||||||
|
SZT,SHD,SZD,SD 2,43,43
|
||||||
|
SZ 4,4,4
|
||||||
|
S 4,4,4
|
||||||
|
TCH,TTCH,TTSCH 4,4,4
|
||||||
|
TH 3,3,3
|
||||||
|
TRZ,TRS 4,4,4
|
||||||
|
TSCH,TSH 4,4,4
|
||||||
|
TS,TTS,TTSZ,TC 4,4,4
|
||||||
|
TZ,TTZ,TZS,TSZ 4,4,4
|
||||||
|
t 3,3,3|4,4,4
|
||||||
|
T 3,3,3
|
||||||
|
UI,UJ,UY,UE 0,1,X
|
||||||
|
U 0,X,X
|
||||||
|
V 7,7,7
|
||||||
|
W 7,7,7
|
||||||
|
X 5,54,54
|
||||||
|
Y 1,X,X
|
||||||
|
ZDZ,ZDZH,ZHDZH 2,4,4
|
||||||
|
ZD,ZHD 2,43,43
|
||||||
|
ZH,ZS,ZSCH,ZSH 4,4,4
|
||||||
|
Z 4,4,4
|
@ -65,3 +65,174 @@ SELECT dmetaphone_alt('gumbo');
|
|||||||
KMP
|
KMP
|
||||||
(1 row)
|
(1 row)
|
||||||
|
|
||||||
|
-- Wovels
|
||||||
|
SELECT daitch_mokotoff('Augsburg');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{054795}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('Breuer');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{791900}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('Freud');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{793000}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
-- The letter "H"
|
||||||
|
SELECT daitch_mokotoff('Halberstadt');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{587943,587433}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('Mannheim');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{665600}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
-- Adjacent sounds
|
||||||
|
SELECT daitch_mokotoff('Chernowitz');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{596740,496740}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
-- Adjacent letters with identical adjacent code digits
|
||||||
|
SELECT daitch_mokotoff('Cherkassy');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{595400,495400}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('Kleinman');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{586660}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
-- More than one word
|
||||||
|
SELECT daitch_mokotoff('Nowy Targ');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{673950}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
-- Padded with "0"
|
||||||
|
SELECT daitch_mokotoff('Berlin');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{798600}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
-- Other examples from https://www.avotaynu.com/soundex.htm
|
||||||
|
SELECT daitch_mokotoff('Ceniow');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{567000,467000}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('Tsenyuv');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{467000}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('Holubica');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{587500,587400}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('Golubitsa');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{587400}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('Przemysl');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{794648,746480}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('Pshemeshil');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{746480}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('Rosochowaciec');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------------------------------------------------
|
||||||
|
{945755,945754,945745,945744,944755,944754,944745,944744}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('Rosokhovatsets');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{945744}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
-- Ignored characters
|
||||||
|
SELECT daitch_mokotoff('''OBrien');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{079600}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('O''Brien');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{079600}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
-- "Difficult" cases, likely to cause trouble for other implementations.
|
||||||
|
SELECT daitch_mokotoff('CJC');
|
||||||
|
daitch_mokotoff
|
||||||
|
---------------------------------------------
|
||||||
|
{550000,540000,545000,450000,400000,440000}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('BESST');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{743000}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('BOUEY');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{710000}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('HANNMANN');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{566600}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('MCCOYJR');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------------------------------------------------
|
||||||
|
{651900,654900,654190,654490,645190,645490,641900,644900}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('ACCURSO');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------------------------------------------------
|
||||||
|
{059400,054000,054940,054400,045940,045400,049400,044000}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('BIERSCHBACH');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------------------------------------------------
|
||||||
|
{794575,794574,794750,794740,745750,745740,747500,747400}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
61
contrib/fuzzystrmatch/expected/fuzzystrmatch_utf8.out
Normal file
61
contrib/fuzzystrmatch/expected/fuzzystrmatch_utf8.out
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
/*
|
||||||
|
* This test must be run in a database with UTF-8 encoding,
|
||||||
|
* because other encodings don't support all the characters used.
|
||||||
|
*/
|
||||||
|
SELECT getdatabaseencoding() <> 'UTF8'
|
||||||
|
AS skip_test \gset
|
||||||
|
\if :skip_test
|
||||||
|
\quit
|
||||||
|
\endif
|
||||||
|
set client_encoding = utf8;
|
||||||
|
-- CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
|
||||||
|
-- Accents
|
||||||
|
SELECT daitch_mokotoff('Müller');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{689000}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('Schäfer');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{479000}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('Straßburg');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{294795}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('Éregon');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{095600}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
-- Special characters added at https://www.jewishgen.org/InfoFiles/Soundex.html
|
||||||
|
SELECT daitch_mokotoff('gąszczu');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{564000,540000}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('brzęczy');
|
||||||
|
daitch_mokotoff
|
||||||
|
-------------------------------
|
||||||
|
{794640,794400,746400,744000}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('ţamas');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{364000,464000}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('țamas');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{364000,464000}
|
||||||
|
(1 row)
|
||||||
|
|
8
contrib/fuzzystrmatch/expected/fuzzystrmatch_utf8_1.out
Normal file
8
contrib/fuzzystrmatch/expected/fuzzystrmatch_utf8_1.out
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
/*
|
||||||
|
* This test must be run in a database with UTF-8 encoding,
|
||||||
|
* because other encodings don't support all the characters used.
|
||||||
|
*/
|
||||||
|
SELECT getdatabaseencoding() <> 'UTF8'
|
||||||
|
AS skip_test \gset
|
||||||
|
\if :skip_test
|
||||||
|
\quit
|
8
contrib/fuzzystrmatch/fuzzystrmatch--1.1--1.2.sql
Normal file
8
contrib/fuzzystrmatch/fuzzystrmatch--1.1--1.2.sql
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
/* contrib/fuzzystrmatch/fuzzystrmatch--1.1--1.2.sql */
|
||||||
|
|
||||||
|
-- complain if script is sourced in psql, rather than via ALTER EXTENSION
|
||||||
|
\echo Use "ALTER EXTENSION fuzzystrmatch UPDATE TO '1.2'" to load this file. \quit
|
||||||
|
|
||||||
|
CREATE FUNCTION daitch_mokotoff(text) RETURNS text[]
|
||||||
|
AS 'MODULE_PATHNAME', 'daitch_mokotoff'
|
||||||
|
LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
|
@ -1,6 +1,6 @@
|
|||||||
# fuzzystrmatch extension
|
# fuzzystrmatch extension
|
||||||
comment = 'determine similarities and distance between strings'
|
comment = 'determine similarities and distance between strings'
|
||||||
default_version = '1.1'
|
default_version = '1.2'
|
||||||
module_pathname = '$libdir/fuzzystrmatch'
|
module_pathname = '$libdir/fuzzystrmatch'
|
||||||
relocatable = true
|
relocatable = true
|
||||||
trusted = true
|
trusted = true
|
||||||
|
@ -1,10 +1,19 @@
|
|||||||
# Copyright (c) 2022-2023, PostgreSQL Global Development Group
|
# Copyright (c) 2022-2023, PostgreSQL Global Development Group
|
||||||
|
|
||||||
fuzzystrmatch_sources = files(
|
fuzzystrmatch_sources = files(
|
||||||
'fuzzystrmatch.c',
|
'daitch_mokotoff.c',
|
||||||
'dmetaphone.c',
|
'dmetaphone.c',
|
||||||
|
'fuzzystrmatch.c',
|
||||||
)
|
)
|
||||||
|
|
||||||
|
daitch_mokotoff_h = custom_target('daitch_mokotoff',
|
||||||
|
input: 'daitch_mokotoff_header.pl',
|
||||||
|
output: 'daitch_mokotoff.h',
|
||||||
|
command: [perl, '@INPUT@', '@OUTPUT@'],
|
||||||
|
)
|
||||||
|
generated_sources += daitch_mokotoff_h
|
||||||
|
fuzzystrmatch_sources += daitch_mokotoff_h
|
||||||
|
|
||||||
if host_system == 'windows'
|
if host_system == 'windows'
|
||||||
fuzzystrmatch_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
|
fuzzystrmatch_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
|
||||||
'--NAME', 'fuzzystrmatch',
|
'--NAME', 'fuzzystrmatch',
|
||||||
@ -13,6 +22,7 @@ endif
|
|||||||
|
|
||||||
fuzzystrmatch = shared_module('fuzzystrmatch',
|
fuzzystrmatch = shared_module('fuzzystrmatch',
|
||||||
fuzzystrmatch_sources,
|
fuzzystrmatch_sources,
|
||||||
|
include_directories: include_directories('.'),
|
||||||
kwargs: contrib_mod_args,
|
kwargs: contrib_mod_args,
|
||||||
)
|
)
|
||||||
contrib_targets += fuzzystrmatch
|
contrib_targets += fuzzystrmatch
|
||||||
@ -21,6 +31,7 @@ install_data(
|
|||||||
'fuzzystrmatch.control',
|
'fuzzystrmatch.control',
|
||||||
'fuzzystrmatch--1.0--1.1.sql',
|
'fuzzystrmatch--1.0--1.1.sql',
|
||||||
'fuzzystrmatch--1.1.sql',
|
'fuzzystrmatch--1.1.sql',
|
||||||
|
'fuzzystrmatch--1.1--1.2.sql',
|
||||||
kwargs: contrib_data_args,
|
kwargs: contrib_data_args,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -31,6 +42,7 @@ tests += {
|
|||||||
'regress': {
|
'regress': {
|
||||||
'sql': [
|
'sql': [
|
||||||
'fuzzystrmatch',
|
'fuzzystrmatch',
|
||||||
|
'fuzzystrmatch_utf8',
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
@ -19,3 +19,48 @@ SELECT metaphone('GUMBO', 4);
|
|||||||
|
|
||||||
SELECT dmetaphone('gumbo');
|
SELECT dmetaphone('gumbo');
|
||||||
SELECT dmetaphone_alt('gumbo');
|
SELECT dmetaphone_alt('gumbo');
|
||||||
|
|
||||||
|
-- Wovels
|
||||||
|
SELECT daitch_mokotoff('Augsburg');
|
||||||
|
SELECT daitch_mokotoff('Breuer');
|
||||||
|
SELECT daitch_mokotoff('Freud');
|
||||||
|
|
||||||
|
-- The letter "H"
|
||||||
|
SELECT daitch_mokotoff('Halberstadt');
|
||||||
|
SELECT daitch_mokotoff('Mannheim');
|
||||||
|
|
||||||
|
-- Adjacent sounds
|
||||||
|
SELECT daitch_mokotoff('Chernowitz');
|
||||||
|
|
||||||
|
-- Adjacent letters with identical adjacent code digits
|
||||||
|
SELECT daitch_mokotoff('Cherkassy');
|
||||||
|
SELECT daitch_mokotoff('Kleinman');
|
||||||
|
|
||||||
|
-- More than one word
|
||||||
|
SELECT daitch_mokotoff('Nowy Targ');
|
||||||
|
|
||||||
|
-- Padded with "0"
|
||||||
|
SELECT daitch_mokotoff('Berlin');
|
||||||
|
|
||||||
|
-- Other examples from https://www.avotaynu.com/soundex.htm
|
||||||
|
SELECT daitch_mokotoff('Ceniow');
|
||||||
|
SELECT daitch_mokotoff('Tsenyuv');
|
||||||
|
SELECT daitch_mokotoff('Holubica');
|
||||||
|
SELECT daitch_mokotoff('Golubitsa');
|
||||||
|
SELECT daitch_mokotoff('Przemysl');
|
||||||
|
SELECT daitch_mokotoff('Pshemeshil');
|
||||||
|
SELECT daitch_mokotoff('Rosochowaciec');
|
||||||
|
SELECT daitch_mokotoff('Rosokhovatsets');
|
||||||
|
|
||||||
|
-- Ignored characters
|
||||||
|
SELECT daitch_mokotoff('''OBrien');
|
||||||
|
SELECT daitch_mokotoff('O''Brien');
|
||||||
|
|
||||||
|
-- "Difficult" cases, likely to cause trouble for other implementations.
|
||||||
|
SELECT daitch_mokotoff('CJC');
|
||||||
|
SELECT daitch_mokotoff('BESST');
|
||||||
|
SELECT daitch_mokotoff('BOUEY');
|
||||||
|
SELECT daitch_mokotoff('HANNMANN');
|
||||||
|
SELECT daitch_mokotoff('MCCOYJR');
|
||||||
|
SELECT daitch_mokotoff('ACCURSO');
|
||||||
|
SELECT daitch_mokotoff('BIERSCHBACH');
|
||||||
|
26
contrib/fuzzystrmatch/sql/fuzzystrmatch_utf8.sql
Normal file
26
contrib/fuzzystrmatch/sql/fuzzystrmatch_utf8.sql
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
/*
|
||||||
|
* This test must be run in a database with UTF-8 encoding,
|
||||||
|
* because other encodings don't support all the characters used.
|
||||||
|
*/
|
||||||
|
|
||||||
|
SELECT getdatabaseencoding() <> 'UTF8'
|
||||||
|
AS skip_test \gset
|
||||||
|
\if :skip_test
|
||||||
|
\quit
|
||||||
|
\endif
|
||||||
|
|
||||||
|
set client_encoding = utf8;
|
||||||
|
|
||||||
|
-- CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
|
||||||
|
|
||||||
|
-- Accents
|
||||||
|
SELECT daitch_mokotoff('Müller');
|
||||||
|
SELECT daitch_mokotoff('Schäfer');
|
||||||
|
SELECT daitch_mokotoff('Straßburg');
|
||||||
|
SELECT daitch_mokotoff('Éregon');
|
||||||
|
|
||||||
|
-- Special characters added at https://www.jewishgen.org/InfoFiles/Soundex.html
|
||||||
|
SELECT daitch_mokotoff('gąszczu');
|
||||||
|
SELECT daitch_mokotoff('brzęczy');
|
||||||
|
SELECT daitch_mokotoff('ţamas');
|
||||||
|
SELECT daitch_mokotoff('țamas');
|
@ -17,6 +17,8 @@
|
|||||||
At present, the <function>soundex</function>, <function>metaphone</function>,
|
At present, the <function>soundex</function>, <function>metaphone</function>,
|
||||||
<function>dmetaphone</function>, and <function>dmetaphone_alt</function> functions do
|
<function>dmetaphone</function>, and <function>dmetaphone_alt</function> functions do
|
||||||
not work well with multibyte encodings (such as UTF-8).
|
not work well with multibyte encodings (such as UTF-8).
|
||||||
|
Use <function>daitch_mokotoff</function>
|
||||||
|
or <function>levenshtein</function> with such data.
|
||||||
</para>
|
</para>
|
||||||
</caution>
|
</caution>
|
||||||
|
|
||||||
@ -88,6 +90,159 @@ SELECT * FROM s WHERE difference(s.nm, 'john') > 2;
|
|||||||
</programlisting>
|
</programlisting>
|
||||||
</sect2>
|
</sect2>
|
||||||
|
|
||||||
|
<sect2 id="fuzzystrmatch-daitch-mokotoff">
|
||||||
|
<title>Daitch-Mokotoff Soundex</title>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
Like the original Soundex system, Daitch-Mokotoff Soundex matches
|
||||||
|
similar-sounding names by converting them to the same code.
|
||||||
|
However, Daitch-Mokotoff Soundex is significantly more useful for
|
||||||
|
non-English names than the original system.
|
||||||
|
Major improvements over the original system include:
|
||||||
|
|
||||||
|
<itemizedlist spacing="compact" mark="bullet">
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
The code is based on the first six meaningful letters rather than four.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
A letter or combination of letters maps into ten possible codes rather
|
||||||
|
than seven.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
Where two consecutive letters have a single sound, they are coded as a
|
||||||
|
single number.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
<listitem>
|
||||||
|
<para>
|
||||||
|
When a letter or combination of letters may have different sounds,
|
||||||
|
multiple codes are emitted to cover all possibilities.
|
||||||
|
</para>
|
||||||
|
</listitem>
|
||||||
|
</itemizedlist>
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<indexterm>
|
||||||
|
<primary>daitch_mokotoff</primary>
|
||||||
|
</indexterm>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
This function generates the Daitch-Mokotoff soundex codes for its input:
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<synopsis>
|
||||||
|
daitch_mokotoff(<parameter>source</parameter> text) returns text[]
|
||||||
|
</synopsis>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
The result may contain one or more codes depending on how many plausible
|
||||||
|
pronunciations there are, so it is represented as an array.
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
Since a Daitch-Mokotoff soundex code consists of only 6 digits,
|
||||||
|
<parameter>source</parameter> should preferably be a single word or name.
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
Here are some examples:
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<programlisting>
|
||||||
|
SELECT daitch_mokotoff('George');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{595000}
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('John');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{160000,460000}
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('Bierschbach');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------------------------------------------------
|
||||||
|
{794575,794574,794750,794740,745750,745740,747500,747400}
|
||||||
|
|
||||||
|
SELECT daitch_mokotoff('Schwartzenegger');
|
||||||
|
daitch_mokotoff
|
||||||
|
-----------------
|
||||||
|
{479465}
|
||||||
|
</programlisting>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
For matching of single names, returned text arrays can be matched
|
||||||
|
directly using the <literal>&&</literal> operator: any overlap
|
||||||
|
can be considered a match. A GIN index may
|
||||||
|
be used for efficiency; see <xref linkend="gin"/> and this example:
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<programlisting>
|
||||||
|
CREATE TABLE s (nm text);
|
||||||
|
CREATE INDEX ix_s_dm ON s USING gin (daitch_mokotoff(nm)) WITH (fastupdate = off);
|
||||||
|
|
||||||
|
INSERT INTO s (nm) VALUES
|
||||||
|
('Schwartzenegger'),
|
||||||
|
('John'),
|
||||||
|
('James'),
|
||||||
|
('Steinman'),
|
||||||
|
('Steinmetz');
|
||||||
|
|
||||||
|
SELECT * FROM s WHERE daitch_mokotoff(nm) && daitch_mokotoff('Swartzenegger');
|
||||||
|
SELECT * FROM s WHERE daitch_mokotoff(nm) && daitch_mokotoff('Jane');
|
||||||
|
SELECT * FROM s WHERE daitch_mokotoff(nm) && daitch_mokotoff('Jens');
|
||||||
|
</programlisting>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
For indexing and matching of any number of names in any order, Full Text
|
||||||
|
Search features can be used. See <xref linkend="textsearch"/> and this
|
||||||
|
example:
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<programlisting>
|
||||||
|
CREATE FUNCTION soundex_tsvector(v_name text) RETURNS tsvector
|
||||||
|
BEGIN ATOMIC
|
||||||
|
SELECT to_tsvector('simple',
|
||||||
|
string_agg(array_to_string(daitch_mokotoff(n), ' '), ' '))
|
||||||
|
FROM regexp_split_to_table(v_name, '\s+') AS n;
|
||||||
|
END;
|
||||||
|
|
||||||
|
CREATE FUNCTION soundex_tsquery(v_name text) RETURNS tsquery
|
||||||
|
BEGIN ATOMIC
|
||||||
|
SELECT string_agg('(' || array_to_string(daitch_mokotoff(n), '|') || ')', '&')::tsquery
|
||||||
|
FROM regexp_split_to_table(v_name, '\s+') AS n;
|
||||||
|
END;
|
||||||
|
|
||||||
|
CREATE TABLE s (nm text);
|
||||||
|
CREATE INDEX ix_s_txt ON s USING gin (soundex_tsvector(nm)) WITH (fastupdate = off);
|
||||||
|
|
||||||
|
INSERT INTO s (nm) VALUES
|
||||||
|
('John Doe'),
|
||||||
|
('Jane Roe'),
|
||||||
|
('Public John Q.'),
|
||||||
|
('George Best'),
|
||||||
|
('John Yamson');
|
||||||
|
|
||||||
|
SELECT * FROM s WHERE soundex_tsvector(nm) @@ soundex_tsquery('john');
|
||||||
|
SELECT * FROM s WHERE soundex_tsvector(nm) @@ soundex_tsquery('jane doe');
|
||||||
|
SELECT * FROM s WHERE soundex_tsvector(nm) @@ soundex_tsquery('john public');
|
||||||
|
SELECT * FROM s WHERE soundex_tsvector(nm) @@ soundex_tsquery('besst, giorgio');
|
||||||
|
SELECT * FROM s WHERE soundex_tsvector(nm) @@ soundex_tsquery('Jameson John');
|
||||||
|
</programlisting>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
To avoid recalculation of soundex codes during index
|
||||||
|
rechecks, an index on a separate column can be used instead of an index on
|
||||||
|
an expression. A stored generated column can be used for this; see
|
||||||
|
<xref linkend="ddl-generated-columns"/>.
|
||||||
|
</para>
|
||||||
|
</sect2>
|
||||||
|
|
||||||
<sect2 id="fuzzystrmatch-levenshtein">
|
<sect2 id="fuzzystrmatch-levenshtein">
|
||||||
<title>Levenshtein</title>
|
<title>Levenshtein</title>
|
||||||
|
|
||||||
@ -104,10 +259,10 @@ SELECT * FROM s WHERE difference(s.nm, 'john') > 2;
|
|||||||
</indexterm>
|
</indexterm>
|
||||||
|
|
||||||
<synopsis>
|
<synopsis>
|
||||||
levenshtein(text source, text target, int ins_cost, int del_cost, int sub_cost) returns int
|
levenshtein(source text, target text, ins_cost int, del_cost int, sub_cost int) returns int
|
||||||
levenshtein(text source, text target) returns int
|
levenshtein(source text, target text) returns int
|
||||||
levenshtein_less_equal(text source, text target, int ins_cost, int del_cost, int sub_cost, int max_d) returns int
|
levenshtein_less_equal(source text, target text, ins_cost int, del_cost int, sub_cost int, max_d int) returns int
|
||||||
levenshtein_less_equal(text source, text target, int max_d) returns int
|
levenshtein_less_equal(source text, target text, max_d int) returns int
|
||||||
</synopsis>
|
</synopsis>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
@ -177,7 +332,7 @@ test=# SELECT levenshtein_less_equal('extensive', 'exhaustive', 4);
|
|||||||
</indexterm>
|
</indexterm>
|
||||||
|
|
||||||
<synopsis>
|
<synopsis>
|
||||||
metaphone(text source, int max_output_length) returns text
|
metaphone(source text, max_output_length int) returns text
|
||||||
</synopsis>
|
</synopsis>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
@ -220,8 +375,8 @@ test=# SELECT metaphone('GUMBO', 4);
|
|||||||
</indexterm>
|
</indexterm>
|
||||||
|
|
||||||
<synopsis>
|
<synopsis>
|
||||||
dmetaphone(text source) returns text
|
dmetaphone(source text) returns text
|
||||||
dmetaphone_alt(text source) returns text
|
dmetaphone_alt(source text) returns text
|
||||||
</synopsis>
|
</synopsis>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
|
Loading…
x
Reference in New Issue
Block a user