1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-28 23:42:10 +03:00

Add sample text search dictionary templates and parsers, to replace the

hard-to-maintain textual examples currently in the SGML docs.  From
Sergey Karpov.
This commit is contained in:
Tom Lane
2007-10-15 21:36:50 +00:00
parent fb631dba2a
commit 5fcb079858
24 changed files with 1324 additions and 9 deletions

View File

@ -0,0 +1,19 @@
# $PostgreSQL: pgsql/contrib/test_parser/Makefile,v 1.1 2007/10/15 21:36:50 tgl Exp $

# Build description for the test_parser example module.  The variables below
# are inputs to the makefiles included at the bottom of this file (PGXS, or
# the in-tree contrib makefiles); see the PGXS documentation for the exact
# meaning of each one.
MODULE_big = test_parser
OBJS = test_parser.o
DATA_built = test_parser.sql
DATA = uninstall_test_parser.sql
DOCS = README.test_parser
REGRESS = test_parser

# Two ways to build, selected by whether USE_PGXS is defined.
ifdef USE_PGXS
# USE_PGXS defined: ask pg_config (overridable via PG_CONFIG) for the location
# of the PGXS makefile and include it.
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
else
# USE_PGXS not defined: build as contrib/test_parser, with the top of the
# build tree two directories up.
subdir = contrib/test_parser
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
include $(top_srcdir)/contrib/contrib-global.mk
endif

View File

@ -0,0 +1,52 @@
Example parser
==============
This is an example of a custom parser for full text search.
It recognizes space-delimited words and returns only two token types:
- 3, word, Word
- 12, blank, Space symbols
The token numbers have been chosen to keep compatibility with the default
ts_headline() function, since we do not want to implement our own version.
* Configuration
The parser has no user-configurable parameters.
* Usage
1. Compile and install
2. Load the parser
psql mydb < test_parser.sql
3. Test it
mydb# SELECT * FROM ts_parse('testparser','That''s my first own parser');
tokid | token
-------+--------
3 | That's
12 |
3 | my
12 |
3 | first
12 |
3 | own
12 |
3 | parser
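mydb# CREATE TEXT SEARCH CONFIGURATION testcfg (PARSER = testparser);
mydb# ALTER TEXT SEARCH CONFIGURATION testcfg ADD MAPPING FOR word WITH simple;
mydb# SELECT to_tsvector('testcfg','That''s my first own parser');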
to_tsvector
-------------------------------------------------
'my':2 'own':4 'first':3 'parser':5 'that''s':1
mydb# SELECT ts_headline('testcfg','Supernovae stars are the brightest phenomena in galaxies', to_tsquery('testcfg', 'stars'));
ts_headline
-----------------------------------------------------------------
Supernovae <b>stars</b> are the brightest phenomena in galaxies
That's all.

View File

@ -0,0 +1,50 @@
--
-- first, define the parser. Turn off echoing so that expected file
-- does not depend on contents of this file.
--
SET client_min_messages = warning;
\set ECHO none
RESET client_min_messages;
-- make test configuration using parser
CREATE TEXT SEARCH CONFIGURATION testcfg (PARSER = testparser);
ALTER TEXT SEARCH CONFIGURATION testcfg ADD MAPPING FOR word WITH simple;
-- ts_parse
SELECT * FROM ts_parse('testparser', 'That''s simple parser can''t parse urls like http://some.url/here/');
tokid | token
-------+-----------------------
3 | That's
12 |
3 | simple
12 |
3 | parser
12 |
3 | can't
12 |
3 | parse
12 |
3 | urls
12 |
3 | like
12 |
3 | http://some.url/here/
(15 rows)
SELECT to_tsvector('testcfg','That''s my first own parser');
to_tsvector
-------------------------------------------------
'my':2 'own':4 'first':3 'parser':5 'that''s':1
(1 row)
SELECT to_tsquery('testcfg', 'star');
to_tsquery
------------
'star'
(1 row)
SELECT ts_headline('testcfg','Supernovae stars are the brightest phenomena in galaxies',
to_tsquery('testcfg', 'stars'));
ts_headline
-----------------------------------------------------------------
Supernovae <b>stars</b> are the brightest phenomena in galaxies
(1 row)

View File

@ -0,0 +1,26 @@
--
-- first, define the parser. Turn off echoing so that expected file
-- does not depend on contents of this file.
--
SET client_min_messages = warning;
\set ECHO none
-- NOTE: the expected output shows that nothing between "\set ECHO none" and
-- "\set ECHO all" is echoed, so this is the only place in the script where a
-- comment can be added without also changing the expected file.  Every other
-- line of this script appears verbatim in the expected output.
-- Run the install script, which creates the testprs_* functions and the
-- "testparser" text search parser.
\i test_parser.sql
\set ECHO all
RESET client_min_messages;
-- make test configuration using parser
CREATE TEXT SEARCH CONFIGURATION testcfg (PARSER = testparser);
ALTER TEXT SEARCH CONFIGURATION testcfg ADD MAPPING FOR word WITH simple;
-- ts_parse
SELECT * FROM ts_parse('testparser', 'That''s simple parser can''t parse urls like http://some.url/here/');
SELECT to_tsvector('testcfg','That''s my first own parser');
SELECT to_tsquery('testcfg', 'star');
SELECT ts_headline('testcfg','Supernovae stars are the brightest phenomena in galaxies',
to_tsquery('testcfg', 'stars'));

View File

@ -0,0 +1,130 @@
/*-------------------------------------------------------------------------
*
* test_parser.c
* Simple example of a text search parser
*
* Copyright (c) 2007, PostgreSQL Global Development Group
*
* IDENTIFICATION
* $PostgreSQL: pgsql/contrib/test_parser/test_parser.c,v 1.1 2007/10/15 21:36:50 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "fmgr.h"
PG_MODULE_MAGIC;
/*
* types
*/
/*
 * ParserState
 *		Working state of this parser for one input text: the text itself,
 *		its length, and how far the parser has advanced through it.
 */
typedef struct
{
	char	   *buffer;			/* text to parse */
	int			len;			/* length of the text in buffer */
	int			pos;			/* position of the parser */
} ParserState;
/*
 * LexDescr
 *		Description of one token type.  The layout is a copy of the
 *		definition in wparser.h of tsearch2.
 */
typedef struct
{
	int			lexid;			/* token type number */
	char	   *alias;			/* short name of the token type */
	char	   *descr;			/* description of the token type */
} LexDescr;
/*
 * prototypes
 *
 * The four SQL-callable entry points of this module.  Each one is declared
 * here and registered with PG_FUNCTION_INFO_V1 below.
 */
Datum		testprs_start(PG_FUNCTION_ARGS);
Datum		testprs_getlexeme(PG_FUNCTION_ARGS);
Datum		testprs_end(PG_FUNCTION_ARGS);
Datum		testprs_lextype(PG_FUNCTION_ARGS);

PG_FUNCTION_INFO_V1(testprs_start);
PG_FUNCTION_INFO_V1(testprs_getlexeme);
PG_FUNCTION_INFO_V1(testprs_end);
PG_FUNCTION_INFO_V1(testprs_lextype);
/*
* functions
*/
/*
 * testprs_start
 *		Set up the parser state for one input text.
 *
 * Argument 0 is a pointer to the text to parse and argument 1 is its length.
 * The text is not copied; the state only keeps the pointer.  Returns a newly
 * palloc'd ParserState positioned at offset 0.
 */
Datum
testprs_start(PG_FUNCTION_ARGS)
{
	char	   *text = (char *) PG_GETARG_POINTER(0);
	int			textlen = PG_GETARG_INT32(1);
	ParserState *state;

	state = (ParserState *) palloc0(sizeof(ParserState));
	state->buffer = text;
	state->len = textlen;
	state->pos = 0;

	PG_RETURN_POINTER(state);
}
/*
 * testprs_getlexeme
 *		Return the next token of the text being parsed.
 *
 * Arguments:
 *	0 - the ParserState for this parse
 *	1 - output (char **): set to point at the first byte of the token; the
 *		pointer refers into the state's buffer, nothing is copied
 *	2 - output (int *): set to the length of the token
 *
 * Returns the token type: 12 for a run of spaces, 3 for a run of non-space
 * characters, and 0 once the end of the text has been reached (token length
 * is then 0).  Only the ASCII space character counts as a blank.
 *
 * The text is delimited by pst->len, so every access to the buffer is
 * guarded by a "pos < len" test that is evaluated BEFORE the byte is read.
 * Reading buffer[len] would look one byte past the end of the input.
 */
Datum
testprs_getlexeme(PG_FUNCTION_ARGS)
{
	ParserState *pst = (ParserState *) PG_GETARG_POINTER(0);
	char	  **t = (char **) PG_GETARG_POINTER(1);
	int		   *tlen = (int *) PG_GETARG_POINTER(2);
	int			startpos = pst->pos;	/* offset at which this token begins */
	int			type;

	*t = pst->buffer + pst->pos;

	if (pst->pos < pst->len &&
		(pst->buffer)[pst->pos] == ' ')
	{
		/* blank type */
		type = 12;
		/* go to the next non-space character */
		while (pst->pos < pst->len &&
			   (pst->buffer)[pst->pos] == ' ')
			(pst->pos)++;
	}
	else
	{
		/* word type */
		type = 3;
		/* go to the next space character */
		while (pst->pos < pst->len &&
			   (pst->buffer)[pst->pos] != ' ')
			(pst->pos)++;
	}

	*tlen = pst->pos - startpos;

	/* we are finished if (*tlen == 0) */
	if (*tlen == 0)
		type = 0;

	PG_RETURN_INT32(type);
}
/*
 * testprs_end
 *		Release the parser state passed as argument 0.
 *
 * Only the ParserState itself is freed; the text buffer it points to is
 * left alone.
 */
Datum
testprs_end(PG_FUNCTION_ARGS)
{
	ParserState *state = (ParserState *) PG_GETARG_POINTER(0);

	pfree(state);

	PG_RETURN_VOID();
}
/*
 * testprs_lextype
 *		Return the list of token types this parser can produce.
 *
 * The result is a palloc'd array of LexDescr with two real entries
 * (3 = "word", 12 = "blank") followed by an entry whose lexid is 0.
 *
 * Remarks:
 * - we have to return the blanks for headline reason
 * - we use the same lexids like Teodor in the default
 *	 word parser; in this way we can reuse the headline
 *	 function of the default word parser.
 */
Datum
testprs_lextype(PG_FUNCTION_ARGS)
{
	/*
	 * Two token types plus the trailing lexid-0 entry.  palloc0 is used so
	 * that the alias/descr pointers of the trailing entry are NULL rather
	 * than uninitialized memory.
	 */
	LexDescr   *descr = (LexDescr *) palloc0(sizeof(LexDescr) * (2 + 1));

	/* there are only two types in this parser */
	descr[0].lexid = 3;
	descr[0].alias = pstrdup("word");
	descr[0].descr = pstrdup("Word");

	descr[1].lexid = 12;
	descr[1].alias = pstrdup("blank");
	descr[1].descr = pstrdup("Space symbols");

	/* trailing entry; lexid 0 is not a token type of this parser */
	descr[2].lexid = 0;

	PG_RETURN_POINTER(descr);
}

View File

@ -0,0 +1,36 @@
-- $PostgreSQL: pgsql/contrib/test_parser/test_parser.sql.in,v 1.1 2007/10/15 21:36:50 tgl Exp $

-- Installation script for the test_parser example: creates the four C
-- functions and the "testparser" text search parser that uses them.

-- Adjust this setting to control where the objects get created.
SET search_path = public;

-- Create everything in a single transaction so that a failure part-way
-- through leaves nothing behind.
BEGIN;

-- Sets up the parser state for a text and its length.
CREATE FUNCTION testprs_start(internal, int4)
    RETURNS internal
    LANGUAGE C STRICT
    AS 'MODULE_PATHNAME';

-- Fetches the next token.
CREATE FUNCTION testprs_getlexeme(internal, internal, internal)
    RETURNS internal
    LANGUAGE C STRICT
    AS 'MODULE_PATHNAME';

-- Releases the parser state.
CREATE FUNCTION testprs_end(internal)
    RETURNS void
    LANGUAGE C STRICT
    AS 'MODULE_PATHNAME';

-- Lists the token types the parser produces.
CREATE FUNCTION testprs_lextype(internal)
    RETURNS internal
    LANGUAGE C STRICT
    AS 'MODULE_PATHNAME';

-- The parser itself.  Headline generation is delegated to the built-in
-- pg_catalog.prsd_headline.
CREATE TEXT SEARCH PARSER testparser (
    START    = testprs_start,
    GETTOKEN = testprs_getlexeme,
    END      = testprs_end,
    HEADLINE = pg_catalog.prsd_headline,
    LEXTYPES = testprs_lextype
);

COMMIT;

View File

@ -0,0 +1,11 @@
-- Removal script for the test_parser example: drops the "testparser" text
-- search parser and the four C functions behind it.

-- Adjust this setting to match the schema the objects were created in.
SET search_path = public;

-- The parser references the functions, so it has to go first.
DROP TEXT SEARCH PARSER testparser;

-- The functions do not depend on each other.
DROP FUNCTION testprs_lextype(internal);
DROP FUNCTION testprs_end(internal);
DROP FUNCTION testprs_getlexeme(internal, internal, internal);
DROP FUNCTION testprs_start(internal, int4);