Unaccent dictionary.

2025-12-04 12:02:48 +03:00 · 2009-08-18 10:34:39 +00:00
parent a88a48011c
commit 92e05bc6a5
12 changed files with 808 additions and 3 deletions
--- a/contrib/unaccent/unaccent.c
+++ b/contrib/unaccent/unaccent.c
@@ -0,0 +1,318 @@
+/*-------------------------------------------------------------------------
+ *
+ * unaccent.c
+ *    Text search unaccent dictionary
+ *
+ * Copyright (c) 2009, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *    $PostgreSQL: pgsql/contrib/unaccent/unaccent.c,v 1.1 2009/08/18 10:34:39 teodor Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "catalog/namespace.h"
+#include "commands/defrem.h"
+#include "fmgr.h"
+#include "lib/stringinfo.h"
+#include "mb/pg_wchar.h"
+#include "tsearch/ts_cache.h"
+#include "tsearch/ts_locale.h"
+#include "tsearch/ts_public.h"
+#include "utils/builtins.h"
+
+/* Magic block (from fmgr.h) that every loadable PostgreSQL module must emit once. */
+PG_MODULE_MAGIC;
+
+/*
+ * The unaccent dictionary uses an uncompressed tree, indexed by byte, to
+ * find the replacement for a (possibly multibyte) character.  Each node of
+ * the tree is an array of 256 SuffixChar structs; the n-th element of the
+ * array corresponds to byte value n.
+ *
+ * Despite the "Suffix" in the name, keys are walked from their first byte
+ * towards their last (see placeChar and findReplaceTo), i.e. the structure
+ * is used like a trie.
+ */
+typedef struct SuffixChar {
+	/* child node (array of 256 entries) for the next byte, or NULL */
+	struct SuffixChar	*nextChar;
+	/* replacement bytes if a key ends here, else NULL; not NUL-terminated */
+	char				*replaceTo;
+	/* number of bytes in replaceTo */
+	int					replacelen;
+} SuffixChar;
+
+/*
+ * placeChar - put str into the tree's structure, byte by byte.
+ *
+ *	node		root of the (sub)tree to insert into, or NULL to have a new
+ *				256-entry node allocated
+ *	str, lenstr	key bytes selecting the path through the tree
+ *	replaceTo, replacelen
+ *				replacement bytes stored at the end of that path; they are
+ *				copied and need not be NUL-terminated
+ *
+ * Returns the root node (newly allocated if node was NULL), so callers can
+ * write "root = placeChar(root, ...)".  If the same key is inserted twice,
+ * a WARNING is raised and the first replacement is kept.  An empty key
+ * (lenstr <= 0) is ignored and node is returned unchanged.
+ */
+static SuffixChar*
+placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
+{
+	SuffixChar	*level;
+	SuffixChar	*slot;
+
+	/*
+	 * With no key bytes there is no str[0] to index by, and counting lenstr
+	 * down would never reach 1.
+	 */
+	if ( lenstr <= 0 )
+		return node;
+
+	if ( !node )
+		node = palloc0(sizeof(SuffixChar) * 256);
+
+	/* descend one level per leading byte, creating missing levels */
+	level = node;
+	while ( lenstr > 1 )
+	{
+		slot = level + *str;
+		if ( !slot->nextChar )
+			slot->nextChar = palloc0(sizeof(SuffixChar) * 256);
+		level = slot->nextChar;
+		str++;
+		lenstr--;
+	}
+
+	/* the last byte of the key selects the entry holding the replacement */
+	slot = level + *str;
+	if ( slot->replaceTo )
+		elog(WARNING, "duplicate TO argument, use first one");
+	else
+	{
+		slot->replacelen = replacelen;
+		slot->replaceTo = palloc( replacelen );
+		memcpy(slot->replaceTo, replaceTo, replacelen);
+	}
+
+	return node;
+}
+
+/*
+ * initSuffixTree - create the lookup tree from a rules file.
+ *
+ * filename is resolved with get_tsearch_config_filename(filename, "rules").
+ * Each line is expected to hold two whitespace-separated tokens: the string
+ * to replace and its replacement.  Lines that do not parse as two tokens
+ * are ignored.  Lines are read with tsearch_readline(), which is where the
+ * conversion into the current encoding happens; a line that raises
+ * ERRCODE_UNTRANSLATABLE_CHARACTER is skipped and reading continues with
+ * the next line.  Any other error is re-thrown.
+ *
+ * Returns the root of the tree, or NULL if no rule was loaded.  Raises
+ * ERROR if the file cannot be opened.
+ */
+static SuffixChar*
+initSuffixTree(char *filename) 
+{
+	/*
+	 * Both variables are assigned inside PG_TRY and read after a possible
+	 * longjmp into PG_CATCH, so they must be volatile to have defined
+	 * values there.
+	 */
+	SuffixChar * volatile rootSuffixTree = NULL;
+	volatile bool	skip;
+	MemoryContext ccxt = CurrentMemoryContext;
+	tsearch_readline_state	trst;
+
+	filename = get_tsearch_config_filename(filename, "rules");
+	if (!tsearch_readline_begin(&trst, filename))
+		ereport(ERROR,
+				(errcode(ERRCODE_CONFIG_FILE_ERROR),
+				 errmsg("could not open unaccent file \"%s\": %m",
+						filename)));
+
+	do
+	{
+		char	src[4096];
+		char	trg[4096];
+		char   *line;
+
+		/* assume we will have to come back after skipping a bad line */
+		skip = true;
+
+		PG_TRY();
+		{
+			/*
+			 * pg_do_encoding_conversion() (called by tsearch_readline())
+			 * raises an error if it finds characters that cannot be
+			 * translated into the current encoding.  We just skip such
+			 * lines and carry on with the next one.
+			 */
+			while ((line = tsearch_readline(&trst)) != NULL)
+			{
+				/* field widths keep sscanf within src[] and trg[] */
+				if ( sscanf(line, "%4095s\t%4095s\n", src, trg) == 2 )
+					rootSuffixTree = placeChar(rootSuffixTree,
+												(unsigned char*)src, strlen(src),
+												trg, strlen(trg));
+
+				/* free the line whether or not it parsed */
+				pfree(line);
+			}
+
+			/*
+			 * Reached end of file without an error: we are done.  This must
+			 * be set here rather than per line, otherwise an empty file
+			 * would loop forever and an error after the first good line
+			 * would silently drop the rest of the file.
+			 */
+			skip = false;
+		}
+		PG_CATCH();
+		{
+			ErrorData  *errdata;
+			MemoryContext ecxt;
+
+			/* CopyErrorData() must not run in the error context */
+			ecxt = MemoryContextSwitchTo(ccxt);
+			errdata = CopyErrorData();
+			if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
+			{
+				FlushErrorState();
+				FreeErrorData(errdata);
+			}
+			else
+			{
+				MemoryContextSwitchTo(ecxt);
+				PG_RE_THROW();
+			}
+		}
+		PG_END_TRY();
+	}
+	while(skip);
+
+	tsearch_readline_end(&trst);
+
+	return rootSuffixTree;
+}
+
+/*
+ * findReplaceTo - look up one multibyte character in the tree.
+ *
+ * Starting at node, follows one level per byte of src.  Returns the entry
+ * selected by the last of the srclen bytes (its replaceTo may still be
+ * NULL), or NULL if the path ends before all bytes are consumed.
+ */
+static SuffixChar * 
+findReplaceTo( SuffixChar *node, unsigned char *src, int srclen )
+{
+	SuffixChar *level;
+
+	for ( level = node; level != NULL; src++, srclen-- )
+	{
+		SuffixChar *slot = level + *src;
+
+		/* last byte of the character: this is the entry we want */
+		if ( srclen == 1 )
+			return slot;
+
+		level = slot->nextChar;
+	}
+
+	return NULL;
+}
+
+/*
+ * unaccent_init - dictionary init method.
+ *
+ * Argument 0 is the list of dictionary options (DefElem nodes).  Exactly
+ * one option is accepted, "Rules" (case-insensitive), naming the rules file
+ * to load with initSuffixTree().  Raises ERROR for a repeated or missing
+ * Rules option and for any unrecognized option.
+ *
+ * Returns the root of the loaded tree as a pointer Datum.
+ */
+PG_FUNCTION_INFO_V1(unaccent_init);
+Datum       unaccent_init(PG_FUNCTION_ARGS);
+Datum
+unaccent_init(PG_FUNCTION_ARGS)
+{
+	List       *dictoptions = (List *) PG_GETARG_POINTER(0);
+	SuffixChar *rootSuffixTree = NULL;
+	bool        fileloaded = false;
+	ListCell   *l;
+
+	foreach(l, dictoptions)
+	{
+		DefElem    *defel = (DefElem *) lfirst(l);
+
+		if (pg_strcasecmp("Rules", defel->defname) == 0)
+		{
+			if (fileloaded)
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("multiple Rules parameters")));
+			}
+
+			rootSuffixTree = initSuffixTree(defGetString(defel));
+			fileloaded = true;
+		}
+		else
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("unrecognized Unaccent parameter: \"%s\"",
+							defel->defname)));
+		}
+	}
+
+	if (!fileloaded)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("missing Rules parameter")));
+	}
+
+	PG_RETURN_POINTER(rootSuffixTree);
+}
+
+/*
+ * unaccent_lexize - dictionary lexize method.
+ *
+ * Arguments: 0 = tree built by unaccent_init, 1 = pointer to the input
+ * bytes, 2 = input length in bytes.
+ *
+ * Scans the input one character at a time (per pg_mblen) and substitutes
+ * every character that has a replacement in the tree.  Returns NULL if
+ * nothing was replaced.  Otherwise returns a palloc'd array of two TSLexeme
+ * (the second zeroed as terminator) whose first entry has flags TSL_FILTER
+ * and a palloc'd, NUL-terminated lexeme holding the rewritten string.
+ */
+PG_FUNCTION_INFO_V1(unaccent_lexize);
+Datum       unaccent_lexize(PG_FUNCTION_ARGS);
+Datum
+unaccent_lexize(PG_FUNCTION_ARGS)
+{
+	SuffixChar *rootSuffixTree = (SuffixChar*)PG_GETARG_POINTER(0);
+	char       *srcchar = (char *) PG_GETARG_POINTER(1);
+	int32		len = PG_GETARG_INT32(2);
+	char	   *srcstart = srcchar;
+	char	   *srcend = srcchar + len;
+	StringInfoData buf;
+	TSLexeme   *res;
+
+	/* the output buffer is allocated only if a replacement is needed */
+	buf.data = NULL;
+
+	while( srcchar < srcend )
+	{
+		int			charlen = pg_mblen(srcchar);
+		SuffixChar *node;
+
+		/* never let a truncated multibyte character run past the input */
+		if ( charlen > srcend - srcchar )
+			charlen = srcend - srcchar;
+
+		node = findReplaceTo( rootSuffixTree, (unsigned char *) srcchar, charlen );
+		if ( node && node->replaceTo )
+		{
+			if ( buf.data == NULL )
+			{
+				initStringInfo(&buf);
+				/* copy the unchanged prefix we have skipped over so far */
+				if ( srcchar != srcstart )
+					appendBinaryStringInfo(&buf, srcstart, srcchar - srcstart);
+			}
+
+			/*
+			 * A replacement may be much longer than the character it
+			 * replaces, so the buffer has to be growable.
+			 */
+			appendBinaryStringInfo(&buf, node->replaceTo, node->replacelen);
+		}
+		else if ( buf.data != NULL )
+		{
+			appendBinaryStringInfo(&buf, srcchar, charlen);
+		}
+
+		srcchar += charlen;
+	}
+
+	/* nothing replaced: report "no change" */
+	if ( buf.data == NULL )
+		PG_RETURN_POINTER(NULL);
+
+	res = palloc0(sizeof(TSLexeme) * 2);
+	res->lexeme = buf.data;		/* StringInfo keeps it NUL-terminated */
+	res->flags = TSL_FILTER;
+
+	PG_RETURN_POINTER(res);
+}
+
+/*
+ * unaccent_dict - function-like wrapper for the dictionary.
+ *
+ * Called with one argument (the text), the dictionary named "unaccent" is
+ * looked up; called with two, the first argument is the dictionary OID and
+ * the second the text.  The dictionary's lexize method is invoked on the
+ * text; if it produces a lexeme, that is returned as text, otherwise a copy
+ * of the input is returned.
+ */
+PG_FUNCTION_INFO_V1(unaccent_dict);
+Datum       unaccent_dict(PG_FUNCTION_ARGS);
+Datum
+unaccent_dict(PG_FUNCTION_ARGS)
+{
+	bool	implicitDict = (PG_NARGS() == 1);
+	int		strArg = implicitDict ? 0 : 1;
+	Oid		dictOid;
+	text	*str;
+	text	*txt;
+	TSDictionaryCacheEntry	*dict;
+	TSLexeme *res;
+
+	if (implicitDict)
+		dictOid = TSDictionaryGetDictid(stringToQualifiedNameList("unaccent"), false);
+	else
+		dictOid = PG_GETARG_OID(0);
+
+	str = PG_GETARG_TEXT_P(strArg);
+	dict = lookup_ts_dictionary_cache(dictOid);
+
+	res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
+													 PointerGetDatum(dict->dictData),
+													 PointerGetDatum(VARDATA(str)),
+													 Int32GetDatum(VARSIZE(str) - VARHDRSZ),
+													 PointerGetDatum(NULL)));
+
+	PG_FREE_IF_COPY(str, strArg);
+
+	/* no result, or a result without a lexeme: hand back the input as is */
+	if ( res == NULL || res->lexeme == NULL )
+	{
+		if ( res != NULL )
+			pfree(res);
+		PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
+	}
+
+	txt = cstring_to_text(res->lexeme);
+
+	pfree(res->lexeme);
+	pfree(res);
+
+	PG_RETURN_TEXT_P(txt);
+}