1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-27 12:41:57 +03:00

Add support for Daitch-Mokotoff Soundex in contrib/fuzzystrmatch.

This modernized version of Soundex works significantly better than
the original, particularly for non-English names.

Dag Lem, reviewed by quite a few people along the way

Discussion: https://postgr.es/m/yger1atbgfy.fsf@sid.nimrod.no
This commit is contained in:
Tom Lane
2023-04-07 17:31:51 -04:00
parent 728015a470
commit a290378a37
13 changed files with 1315 additions and 11 deletions

View File

@ -65,3 +65,174 @@ SELECT dmetaphone_alt('gumbo');
KMP
(1 row)
-- Vowels
SELECT daitch_mokotoff('Augsburg');
daitch_mokotoff
-----------------
{054795}
(1 row)
SELECT daitch_mokotoff('Breuer');
daitch_mokotoff
-----------------
{791900}
(1 row)
SELECT daitch_mokotoff('Freud');
daitch_mokotoff
-----------------
{793000}
(1 row)
-- The letter "H"
SELECT daitch_mokotoff('Halberstadt');
daitch_mokotoff
-----------------
{587943,587433}
(1 row)
SELECT daitch_mokotoff('Mannheim');
daitch_mokotoff
-----------------
{665600}
(1 row)
-- Adjacent sounds
SELECT daitch_mokotoff('Chernowitz');
daitch_mokotoff
-----------------
{596740,496740}
(1 row)
-- Adjacent letters with identical adjacent code digits
SELECT daitch_mokotoff('Cherkassy');
daitch_mokotoff
-----------------
{595400,495400}
(1 row)
SELECT daitch_mokotoff('Kleinman');
daitch_mokotoff
-----------------
{586660}
(1 row)
-- More than one word
SELECT daitch_mokotoff('Nowy Targ');
daitch_mokotoff
-----------------
{673950}
(1 row)
-- Padded with "0"
SELECT daitch_mokotoff('Berlin');
daitch_mokotoff
-----------------
{798600}
(1 row)
-- Other examples from https://www.avotaynu.com/soundex.htm
SELECT daitch_mokotoff('Ceniow');
daitch_mokotoff
-----------------
{567000,467000}
(1 row)
SELECT daitch_mokotoff('Tsenyuv');
daitch_mokotoff
-----------------
{467000}
(1 row)
SELECT daitch_mokotoff('Holubica');
daitch_mokotoff
-----------------
{587500,587400}
(1 row)
SELECT daitch_mokotoff('Golubitsa');
daitch_mokotoff
-----------------
{587400}
(1 row)
SELECT daitch_mokotoff('Przemysl');
daitch_mokotoff
-----------------
{794648,746480}
(1 row)
SELECT daitch_mokotoff('Pshemeshil');
daitch_mokotoff
-----------------
{746480}
(1 row)
SELECT daitch_mokotoff('Rosochowaciec');
daitch_mokotoff
-----------------------------------------------------------
{945755,945754,945745,945744,944755,944754,944745,944744}
(1 row)
SELECT daitch_mokotoff('Rosokhovatsets');
daitch_mokotoff
-----------------
{945744}
(1 row)
-- Ignored characters
SELECT daitch_mokotoff('''OBrien');
daitch_mokotoff
-----------------
{079600}
(1 row)
SELECT daitch_mokotoff('O''Brien');
daitch_mokotoff
-----------------
{079600}
(1 row)
-- "Difficult" cases, likely to cause trouble for other implementations.
SELECT daitch_mokotoff('CJC');
daitch_mokotoff
---------------------------------------------
{550000,540000,545000,450000,400000,440000}
(1 row)
SELECT daitch_mokotoff('BESST');
daitch_mokotoff
-----------------
{743000}
(1 row)
SELECT daitch_mokotoff('BOUEY');
daitch_mokotoff
-----------------
{710000}
(1 row)
SELECT daitch_mokotoff('HANNMANN');
daitch_mokotoff
-----------------
{566600}
(1 row)
SELECT daitch_mokotoff('MCCOYJR');
daitch_mokotoff
-----------------------------------------------------------
{651900,654900,654190,654490,645190,645490,641900,644900}
(1 row)
SELECT daitch_mokotoff('ACCURSO');
daitch_mokotoff
-----------------------------------------------------------
{059400,054000,054940,054400,045940,045400,049400,044000}
(1 row)
SELECT daitch_mokotoff('BIERSCHBACH');
daitch_mokotoff
-----------------------------------------------------------
{794575,794574,794750,794740,745750,745740,747500,747400}
(1 row)

View File

@ -0,0 +1,61 @@
/*
* This test must be run in a database with UTF-8 encoding,
* because other encodings don't support all the characters used.
*/
SELECT getdatabaseencoding() <> 'UTF8'
AS skip_test \gset
\if :skip_test
\quit
\endif
set client_encoding = utf8;
-- CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;
-- Accents
SELECT daitch_mokotoff('Müller');
daitch_mokotoff
-----------------
{689000}
(1 row)
SELECT daitch_mokotoff('Schäfer');
daitch_mokotoff
-----------------
{479000}
(1 row)
SELECT daitch_mokotoff('Straßburg');
daitch_mokotoff
-----------------
{294795}
(1 row)
SELECT daitch_mokotoff('Éregon');
daitch_mokotoff
-----------------
{095600}
(1 row)
-- Special characters added at https://www.jewishgen.org/InfoFiles/Soundex.html
SELECT daitch_mokotoff('gąszczu');
daitch_mokotoff
-----------------
{564000,540000}
(1 row)
SELECT daitch_mokotoff('brzęczy');
daitch_mokotoff
-------------------------------
{794640,794400,746400,744000}
(1 row)
SELECT daitch_mokotoff('ţamas');
daitch_mokotoff
-----------------
{364000,464000}
(1 row)
SELECT daitch_mokotoff('țamas');
daitch_mokotoff
-----------------
{364000,464000}
(1 row)

View File

@ -0,0 +1,8 @@
/*
* This test must be run in a database with UTF-8 encoding,
* because other encodings don't support all the characters used.
*/
SELECT getdatabaseencoding() <> 'UTF8'
AS skip_test \gset
\if :skip_test
\quit