mirror of
https://github.com/postgres/postgres.git
synced 2025-07-27 12:41:57 +03:00
Unaccent dictionary.
This commit is contained in:
@ -1,4 +1,4 @@
|
||||
<!-- $PostgreSQL: pgsql/doc/src/sgml/contrib.sgml,v 1.13 2009/04/27 16:27:35 momjian Exp $ -->
|
||||
<!-- $PostgreSQL: pgsql/doc/src/sgml/contrib.sgml,v 1.14 2009/08/18 10:34:39 teodor Exp $ -->
|
||||
|
||||
<appendix id="contrib">
|
||||
<title>Additional Supplied Modules</title>
|
||||
@ -113,6 +113,7 @@ psql -d dbname -f <replaceable>SHAREDIR</>/contrib/<replaceable>module</>.sql
|
||||
&tablefunc;
|
||||
&test-parser;
|
||||
&tsearch2;
|
||||
&unaccent;
|
||||
&uuid-ossp;
|
||||
&vacuumlo;
|
||||
&xml2;
|
||||
|
@ -1,4 +1,4 @@
|
||||
<!-- $PostgreSQL: pgsql/doc/src/sgml/filelist.sgml,v 1.63 2009/08/17 22:14:44 petere Exp $ -->
|
||||
<!-- $PostgreSQL: pgsql/doc/src/sgml/filelist.sgml,v 1.64 2009/08/18 10:34:39 teodor Exp $ -->
|
||||
|
||||
<!entity history SYSTEM "history.sgml">
|
||||
<!entity info SYSTEM "info.sgml">
|
||||
@ -126,6 +126,7 @@
|
||||
<!entity tablefunc SYSTEM "tablefunc.sgml">
|
||||
<!entity test-parser SYSTEM "test-parser.sgml">
|
||||
<!entity tsearch2 SYSTEM "tsearch2.sgml">
|
||||
<!entity unaccent SYSTEM "unaccent.sgml">
|
||||
<!entity uuid-ossp SYSTEM "uuid-ossp.sgml">
|
||||
<!entity vacuumlo SYSTEM "vacuumlo.sgml">
|
||||
<!entity xml2 SYSTEM "xml2.sgml">
|
||||
|
150
doc/src/sgml/unaccent.sgml
Normal file
150
doc/src/sgml/unaccent.sgml
Normal file
@ -0,0 +1,150 @@
|
||||
<sect1 id="unaccent">
|
||||
<title>unaccent</title>
|
||||
|
||||
<indexterm zone="unaccent">
|
||||
<primary>unaccent</primary>
|
||||
</indexterm>
|
||||
|
||||
<para>
|
||||
<filename>unaccent</> removes accents (diacritic signs) from a lexeme.
|
||||
It's a filtering dictionary, that means its output is
|
||||
always passed to the next dictionary (if any), contrary to the standard
|
||||
behaviour. Currently, it supports most important accents from european
|
||||
languages.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Limitation: Current implementation of <filename>unaccent</>
|
||||
dictionary cannot be used as a normalizing dictionary for
|
||||
<filename>thesaurus</filename> dictionary.
|
||||
</para>
|
||||
|
||||
<sect2>
|
||||
<title>Configuration</title>
|
||||
|
||||
<para>
|
||||
A <literal>unaccent</> dictionary accepts the following options:
|
||||
</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
<literal>RULES</> is the base name of the file containing the list of
|
||||
translation rules. This file must be stored in
|
||||
<filename>$SHAREDIR/tsearch_data/</> (where <literal>$SHAREDIR</> means
|
||||
the <productname>PostgreSQL</> installation's shared-data directory).
|
||||
Its name must end in <literal>.rules</> (which is not to be included in
|
||||
the <literal>RULES</> parameter).
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
<para>
|
||||
The rules file has the following format:
|
||||
</para>
|
||||
<itemizedlist>
|
||||
<listitem>
|
||||
<para>
|
||||
Each line represents pair: character_with_accent character_without_accent
|
||||
<programlisting>
|
||||
À A
|
||||
Á A
|
||||
 A
|
||||
à A
|
||||
Ä A
|
||||
Å A
|
||||
Æ A
|
||||
</programlisting>
|
||||
</para>
|
||||
</listitem>
|
||||
</itemizedlist>
|
||||
|
||||
<para>
|
||||
Look at <filename>unaccent.rules</>, which is installed in
|
||||
<filename>$SHAREDIR/tsearch_data/</>, for an example.
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2>
|
||||
<title>Usage</title>
|
||||
|
||||
<para>
|
||||
Running the installation script creates a text search template
|
||||
<literal>unaccent</> and a dictionary <literal>unaccent</>
|
||||
based on it, with default parameters. You can alter the
|
||||
parameters, for example
|
||||
|
||||
<programlisting>
|
||||
=# ALTER TEXT SEARCH DICTIONARY unaccent (RULES='my_rules');
|
||||
</programlisting>
|
||||
|
||||
or create new dictionaries based on the template.
|
||||
</para>
|
||||
|
||||
<para>
|
||||
To test the dictionary, you can try
|
||||
|
||||
<programlisting>
|
||||
=# select ts_lexize('unaccent','Hôtel');
|
||||
ts_lexize
|
||||
-----------
|
||||
{Hotel}
|
||||
(1 row)
|
||||
</programlisting>
|
||||
</para>
|
||||
|
||||
<para>
|
||||
Filtering dictionary are useful for correct work of
|
||||
<function>ts_headline</function> function.
|
||||
<programlisting>
|
||||
=# CREATE TEXT SEARCH CONFIGURATION fr ( COPY = french );
|
||||
=# ALTER TEXT SEARCH CONFIGURATION fr
|
||||
ALTER MAPPING FOR hword, hword_part, word
|
||||
WITH unaccent, french_stem;
|
||||
=# select to_tsvector('fr','Hôtels de la Mer');
|
||||
to_tsvector
|
||||
-------------------
|
||||
'hotel':1 'mer':4
|
||||
(1 row)
|
||||
|
||||
=# select to_tsvector('fr','Hôtel de la Mer') @@ to_tsquery('fr','Hotels');
|
||||
?column?
|
||||
----------
|
||||
t
|
||||
(1 row)
|
||||
=# select ts_headline('fr','Hôtel de la Mer',to_tsquery('fr','Hotels'));
|
||||
ts_headline
|
||||
------------------------
|
||||
<b>Hôtel</b>de la Mer
|
||||
(1 row)
|
||||
|
||||
</programlisting>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
<sect2>
|
||||
<title>Function</title>
|
||||
|
||||
<para>
|
||||
<function>unaccent</> function removes accents (diacritic signs) from
|
||||
argument string. Basically, it's a wrapper around
|
||||
<filename>unaccent</> dictionary.
|
||||
</para>
|
||||
|
||||
<indexterm>
|
||||
<primary>unaccent</primary>
|
||||
</indexterm>
|
||||
|
||||
<synopsis>
|
||||
unaccent(<optional><replaceable class="PARAMETER">dictionary</replaceable>,
|
||||
</optional> <replaceable class="PARAMETER">string</replaceable>)
|
||||
returns <type>text</type>
|
||||
</synopsis>
|
||||
|
||||
<para>
|
||||
<programlisting>
|
||||
SELECT unaccent('unaccent','Hôtel');
|
||||
SELECT unaccent('Hôtel');
|
||||
</programlisting>
|
||||
</para>
|
||||
</sect2>
|
||||
|
||||
</sect1>
|
Reference in New Issue
Block a user