1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-07 00:36:50 +03:00

Rename and slightly redefine the default text search parser's "word"

categories, as per discussion.  asciiword (formerly lword) is still
ASCII-letters-only, and numword (formerly word) is still the most general
mixed-alpha-and-digits case.  But word (formerly nlword) is now
any-group-of-letters-with-at-least-one-non-ASCII, rather than all-non-ASCII as
before.  This is no worse than before for parsing mixed Russian/English text,
which seems to have been the design center for the original coding; and it
should simplify matters for parsing most European languages.  In particular
it will not be necessary for any language to accept strings containing digits
as being regular "words".  The hyphenated-word categories are adjusted
similarly.
This commit is contained in:
Tom Lane
2007-10-23 20:46:12 +00:00
parent 344d0cae64
commit dbaec70c15
10 changed files with 464 additions and 447 deletions

View File

@ -2,7 +2,7 @@
#
# Makefile for src/backend/snowball
#
# $PostgreSQL: pgsql/src/backend/snowball/Makefile,v 1.3 2007/08/27 10:29:49 mha Exp $
# $PostgreSQL: pgsql/src/backend/snowball/Makefile,v 1.4 2007/10/23 20:46:12 tgl Exp $
#
#-------------------------------------------------------------------------
@ -46,8 +46,9 @@ OBJS= dict_snowball.o api.o utilities.o \
stem_UTF_8_swedish.o \
stem_UTF_8_turkish.o
# second column is name of latin dictionary, if different
# Note order dependency: use of some other language as latin dictionary
# first column is language name and also name of dictionary for not-all-ASCII
# words, second is name of dictionary for all-ASCII words
# Note order dependency: use of some other language as ASCII dictionary
# must come after creation of that language
LANGUAGES= \
danish danish \
@ -95,8 +96,8 @@ ifeq ($(enable_shared), yes)
while [ "$$#" -gt 0 ] ; \
do \
lang=$$1; shift; \
nonlatdictname=$$lang; \
latdictname=$$1; shift; \
nonascdictname=$$lang; \
ascdictname=$$1; shift; \
if [ -s $(srcdir)/stopwords/$${lang}.stop ] ; then \
stop=", StopWords=$${lang}" ; \
else \
@ -106,8 +107,8 @@ ifeq ($(enable_shared), yes)
sed -e "s#_LANGNAME_#$$lang#g" | \
sed -e "s#_DICTNAME_#$${lang}_stem#g" | \
sed -e "s#_CFGNAME_#$$lang#g" | \
sed -e "s#_LATDICTNAME_#$${latdictname}_stem#g" | \
sed -e "s#_NONLATDICTNAME_#$${nonlatdictname}_stem#g" | \
sed -e "s#_ASCDICTNAME_#$${ascdictname}_stem#g" | \
sed -e "s#_NONASCDICTNAME_#$${nonascdictname}_stem#g" | \
sed -e "s#_STOPWORDS_#$$stop#g" ; \
done >> $@
else

View File

@ -1,4 +1,4 @@
-- $PostgreSQL: pgsql/src/backend/snowball/snowball.sql.in,v 1.4 2007/09/03 02:30:43 tgl Exp $$
-- $PostgreSQL: pgsql/src/backend/snowball/snowball.sql.in,v 1.5 2007/10/23 20:46:12 tgl Exp $$
-- text search configuration for _LANGNAME_ language
CREATE TEXT SEARCH DICTIONARY _DICTNAME_
@ -12,14 +12,15 @@ CREATE TEXT SEARCH CONFIGURATION _CFGNAME_
COMMENT ON TEXT SEARCH CONFIGURATION _CFGNAME_ IS 'configuration for _LANGNAME_ language';
ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
FOR email, url, host, sfloat, version, uri, file, float, int, uint
FOR email, url, host, sfloat, version, uri, file, float, int, uint,
numword, hword_numpart, numhword
WITH simple;
ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
FOR lhword, lpart_hword, lword
WITH _LATDICTNAME_;
FOR asciiword, hword_asciipart, asciihword
WITH _ASCDICTNAME_;
ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
FOR hword, nlhword, nlpart_hword, nlword, word, part_hword
WITH _NONLATDICTNAME_;
FOR word, hword_part, hword
WITH _NONASCDICTNAME_;