mirror of
https://github.com/postgres/postgres.git
synced 2025-07-27 12:41:57 +03:00
The fti.pl supplied with the fulltextindex module generates ALL possible
substrings of two characters or greater, and is case-sensitive. This patch makes it work correctly. It generates only the suffixes of each word and lowercases them, as specified by the README file. This brings it into line with the fti.c function, makes it properly case-insensitive, removes the problem of duplicate rows being returned from an fti search, and greatly reduces the size of the generated index table. It was written by my co-worker, Brett Toolin. Christopher Kings-Lynne
This commit is contained in:
@ -1,6 +1,6 @@
|
|||||||
#!/usr/bin/perl
|
#!/usr/bin/perl
|
||||||
#
|
#
|
||||||
# This script extracts all substrings out of a specific column in a table
|
# This script extracts all suffixes of all words in a specific column in a table
|
||||||
# and generates output that can be loaded into a new table with the
|
# and generates output that can be loaded into a new table with the
|
||||||
# psql '\copy' command. The new table should have the following structure:
|
# psql '\copy' command. The new table should have the following structure:
|
||||||
#
|
#
|
||||||
@ -52,27 +52,28 @@ $PGRES_BAD_RESPONSE = 5 ;
|
|||||||
$PGRES_NONFATAL_ERROR = 6 ;
|
$PGRES_NONFATAL_ERROR = 6 ;
|
||||||
$PGRES_FATAL_ERROR = 7 ;
|
$PGRES_FATAL_ERROR = 7 ;
|
||||||
|
|
||||||
|
# the minimum length of word to include in the full text index
|
||||||
|
$MIN_WORD_LENGTH = 2;
|
||||||
|
|
||||||
|
# the minimum length of the substrings in the full text index
|
||||||
|
$MIN_SUBSTRING_LENGTH = 2;
|
||||||
|
|
||||||
$[ = 0; # make sure string offsets start at 0
|
$[ = 0; # make sure string offsets start at 0
|
||||||
|
|
||||||
sub break_up {
|
sub break_up {
|
||||||
my $string = pop @_;
|
my $string = pop @_;
|
||||||
|
|
||||||
|
# convert strings to lower case
|
||||||
|
$string = lc($string);
|
||||||
@strings = split(/\W+/, $string);
|
@strings = split(/\W+/, $string);
|
||||||
@subs = ();
|
@subs = ();
|
||||||
|
|
||||||
foreach $s (@strings) {
|
foreach $s (@strings) {
|
||||||
$len = length($s);
|
$len = length($s);
|
||||||
next if ($len < 4);
|
next if ($len <= $MIN_WORD_LENGTH);
|
||||||
|
for ($i = 0; $i <= $len - $MIN_SUBSTRING_LENGTH; $i++) {
|
||||||
$lpos = $len-1;
|
$tmp = substr($s, $i);
|
||||||
while ($lpos >= 3) {
|
push(@subs, $tmp);
|
||||||
$fpos = $lpos - 3;
|
|
||||||
while ($fpos >= 0) {
|
|
||||||
$sub = substr($s, $fpos, $lpos - $fpos + 1);
|
|
||||||
push(@subs, $sub);
|
|
||||||
$fpos = $fpos - 1;
|
|
||||||
}
|
|
||||||
$lpos = $lpos - 1;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user