mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-31 10:30:33 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			220 lines
		
	
	
		
			4.9 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			220 lines
		
	
	
		
			4.9 KiB
		
	
	
	
		
			Perl
		
	
	
		
			Executable File
		
	
	
	
	
| #!/usr/bin/perl
 | |
| #
 | |
| # Generation of types and lookup tables for Daitch-Mokotoff soundex.
 | |
| #
 | |
| # Copyright (c) 2023-2025, PostgreSQL Global Development Group
 | |
| #
 | |
| # This module was originally sponsored by Finance Norway /
 | |
| # Trafikkforsikringsforeningen, and implemented by Dag Lem <dag@nimrod.no>
 | |
| #
 | |
| 
 | |
| use strict;
 | |
| use warnings FATAL => 'all';
 | |
| 
 | |
| die "Usage: $0 OUTPUT_FILE\n" if @ARGV != 1;
 | |
| my $output_file = $ARGV[0];
 | |
| 
 | |
| # Open the output file
 | |
| open my $OUTPUT, '>', $output_file
 | |
|   or die "Could not open output file $output_file: $!\n";
 | |
| 
 | |
| # Parse code table and generate tree for letter transitions.
 | |
| my %codes;
 | |
| my $table = [ {}, [ [ "", "", "" ] ] ];
 | |
| while (<DATA>)
 | |
| {
 | |
| 	chomp;
 | |
| 	my ($letters, $codes) = split(/\s+/);
 | |
| 	my @codes = map { [ split(/,/) ] } split(/\|/, $codes);
 | |
| 
 | |
| 	my $key = "codes_" . join("_or_", map { join("_", @$_) } @codes);
 | |
| 	my $val = join(
 | |
| 		",\n",
 | |
| 		map {
 | |
| 			"\t{\n\t\t"
 | |
| 			  . join(", ", map { "\"$_\"" } @$_) . "\n\t}"
 | |
| 		} @codes);
 | |
| 	$codes{$key} = $val;
 | |
| 
 | |
| 	for my $letter (split(/,/, $letters))
 | |
| 	{
 | |
| 		my $ref = $table->[0];
 | |
| 		# Link each character to the next in the letter combination.
 | |
| 		my @c = split(//, $letter);
 | |
| 		my $last_c = pop(@c);
 | |
| 		for my $c (@c)
 | |
| 		{
 | |
| 			$ref->{$c} //= [ {}, undef ];
 | |
| 			$ref->{$c}[0] //= {};
 | |
| 			$ref = $ref->{$c}[0];
 | |
| 		}
 | |
| 		# The sound code for the letter combination is stored at the last character.
 | |
| 		$ref->{$last_c}[1] = $key;
 | |
| 	}
 | |
| }
 | |
| close(DATA);
 | |
| 
 | |
| print $OUTPUT <<EOF;
 | |
| /*
 | |
|  * Constants and lookup tables for Daitch-Mokotoff Soundex
 | |
|  *
 | |
|  * Copyright (c) 2023-2025, PostgreSQL Global Development Group
 | |
|  *
 | |
|  * This file is generated by daitch_mokotoff_header.pl
 | |
|  */
 | |
| 
 | |
| /* Coding chart table: Soundex codes */
 | |
| typedef char dm_code[2 + 1];	/* One or two sequential code digits + NUL */
 | |
| typedef dm_code dm_codes[3];	/* Start of name, before a vowel, any other */
 | |
| 
 | |
| /* Coding chart table: Letter in input sequence */
 | |
| struct dm_letter
 | |
| {
 | |
| 	char		letter;			/* Present letter in sequence */
 | |
| 	const struct dm_letter *letters;	/* List of possible successive letters */
 | |
| 	const dm_codes *codes;		/* Code sequence(s) for complete sequence */
 | |
| };
 | |
| 
 | |
| typedef struct dm_letter dm_letter;
 | |
| 
 | |
| /* Codes for letter sequence at start of name, before a vowel, and any other. */
 | |
| EOF
 | |
| 
 | |
| for my $key (sort keys %codes)
 | |
| {
 | |
| 	print $OUTPUT "static const dm_codes $key\[2\] =\n{\n"
 | |
| 	  . $codes{$key}
 | |
| 	  . "\n};\n";
 | |
| }
 | |
| 
 | |
| print $OUTPUT <<EOF;
 | |
| 
 | |
| /* Coding for alternative following letters in sequence. */
 | |
| EOF
 | |
| 
 | |
| sub hash2code
 | |
| {
 | |
| 	my ($ref, $letter) = @_;
 | |
| 
 | |
| 	my @letters = ();
 | |
| 
 | |
| 	my $h = $ref->[0];
 | |
| 	for my $key (sort keys %$h)
 | |
| 	{
 | |
| 		$ref = $h->{$key};
 | |
| 		my $children = "NULL";
 | |
| 		if (defined $ref->[0])
 | |
| 		{
 | |
| 			$children = "letter_$letter$key";
 | |
| 			hash2code($ref, "$letter$key");
 | |
| 		}
 | |
| 		my $codes = $ref->[1] // "NULL";
 | |
| 		push(@letters, "\t{\n\t\t'$key', $children, $codes\n\t}");
 | |
| 	}
 | |
| 
 | |
| 	print $OUTPUT "static const dm_letter letter_$letter\[\] =\n{\n";
 | |
| 	for (@letters)
 | |
| 	{
 | |
| 		print $OUTPUT "$_,\n";
 | |
| 	}
 | |
| 	print $OUTPUT "\t{\n\t\t'\\0'\n\t}\n";
 | |
| 	print $OUTPUT "};\n";
 | |
| }
 | |
| 
 | |
| hash2code($table, '');
 | |
| 
 | |
| close $OUTPUT;
 | |
| 
 | |
| # Table adapted from https://www.jewishgen.org/InfoFiles/Soundex.html
 | |
| #
 | |
| # The conversion from the coding chart to the table should be self
 | |
| # explanatory, but note the differences stated below.
 | |
| #
 | |
| # X = NC (not coded)
 | |
| #
 | |
| # The non-ASCII letters in the coding chart are coded with substitute
 | |
| # lowercase ASCII letters, which sort after the uppercase ASCII letters:
 | |
| #
 | |
| # Ą => a (use '[' for table lookup)
 | |
| # Ę => e (use '\\' for table lookup)
 | |
| # Ţ => t (use ']' for table lookup)
 | |
| #
 | |
| # The rule for "UE" does not correspond to the coding chart, however
 | |
| # it is used by all other known implementations, including the one at
 | |
| # https://www.jewishgen.org/jos/jossound.htm (try e.g. "bouey").
 | |
| #
 | |
| # Note that the implementation assumes that vowels are assigned code
 | |
| # 0 or 1. "J" can be either a vowel or a consonant.
 | |
| #
 | |
| 
 | |
| __DATA__
 | |
| AI,AJ,AY				0,1,X
 | |
| AU						0,7,X
 | |
| a						X,X,6|X,X,X
 | |
| A						0,X,X
 | |
| B						7,7,7
 | |
| CHS						5,54,54
 | |
| CH						5,5,5|4,4,4
 | |
| CK						5,5,5|45,45,45
 | |
| CZ,CS,CSZ,CZS			4,4,4
 | |
| C						5,5,5|4,4,4
 | |
| DRZ,DRS					4,4,4
 | |
| DS,DSH,DSZ				4,4,4
 | |
| DZ,DZH,DZS				4,4,4
 | |
| D,DT					3,3,3
 | |
| EI,EJ,EY				0,1,X
 | |
| EU						1,1,X
 | |
| e						X,X,6|X,X,X
 | |
| E						0,X,X
 | |
| FB						7,7,7
 | |
| F						7,7,7
 | |
| G						5,5,5
 | |
| H						5,5,X
 | |
| IA,IE,IO,IU				1,X,X
 | |
| I						0,X,X
 | |
| J						1,X,X|4,4,4
 | |
| KS						5,54,54
 | |
| KH						5,5,5
 | |
| K						5,5,5
 | |
| L						8,8,8
 | |
| MN						66,66,66
 | |
| M						6,6,6
 | |
| NM						66,66,66
 | |
| N						6,6,6
 | |
| OI,OJ,OY				0,1,X
 | |
| O						0,X,X
 | |
| P,PF,PH					7,7,7
 | |
| Q						5,5,5
 | |
| RZ,RS					94,94,94|4,4,4
 | |
| R						9,9,9
 | |
| SCHTSCH,SCHTSH,SCHTCH	2,4,4
 | |
| SCH						4,4,4
 | |
| SHTCH,SHCH,SHTSH		2,4,4
 | |
| SHT,SCHT,SCHD			2,43,43
 | |
| SH						4,4,4
 | |
| STCH,STSCH,SC			2,4,4
 | |
| STRZ,STRS,STSH			2,4,4
 | |
| ST						2,43,43
 | |
| SZCZ,SZCS				2,4,4
 | |
| SZT,SHD,SZD,SD			2,43,43
 | |
| SZ						4,4,4
 | |
| S						4,4,4
 | |
| TCH,TTCH,TTSCH			4,4,4
 | |
| TH						3,3,3
 | |
| TRZ,TRS					4,4,4
 | |
| TSCH,TSH				4,4,4
 | |
| TS,TTS,TTSZ,TC			4,4,4
 | |
| TZ,TTZ,TZS,TSZ			4,4,4
 | |
| t						3,3,3|4,4,4
 | |
| T						3,3,3
 | |
| UI,UJ,UY,UE				0,1,X
 | |
| U						0,X,X
 | |
| V						7,7,7
 | |
| W						7,7,7
 | |
| X						5,54,54
 | |
| Y						1,X,X
 | |
| ZDZ,ZDZH,ZHDZH			2,4,4
 | |
| ZD,ZHD					2,43,43
 | |
| ZH,ZS,ZSCH,ZSH			4,4,4
 | |
| Z						4,4,4
 |