mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-31 10:30:33 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			321 lines
		
	
	
		
			6.6 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			321 lines
		
	
	
		
			6.6 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*-------------------------------------------------------------------------
 | |
|  *
 | |
|  * unaccent.c
 | |
|  *	  Text search unaccent dictionary
 | |
|  *
 | |
|  * Copyright (c) 2009-2010, PostgreSQL Global Development Group
 | |
|  *
 | |
|  * IDENTIFICATION
 | |
|  *	  contrib/unaccent/unaccent.c
 | |
|  *
 | |
|  *-------------------------------------------------------------------------
 | |
|  */
 | |
| 
 | |
| #include "postgres.h"
 | |
| 
 | |
| #include "fmgr.h"
 | |
| #include "catalog/namespace.h"
 | |
| #include "commands/defrem.h"
 | |
| #include "mb/pg_wchar.h"
 | |
| #include "tsearch/ts_cache.h"
 | |
| #include "tsearch/ts_locale.h"
 | |
| #include "tsearch/ts_public.h"
 | |
| #include "utils/builtins.h"
 | |
| 
 | |
| PG_MODULE_MAGIC;
 | |
| 
 | |
/*
 * Lookup structure used by the unaccent dictionary.
 *
 * Each tree node is an array of 256 SuffixChar entries, one per possible
 * byte value; the n-th entry of a node corresponds to byte value n.  Strings
 * are inserted and looked up byte by byte starting from their first byte,
 * so, despite the "suffix tree" name, the structure is walked like an
 * uncompressed byte-wise trie.
 */
typedef struct SuffixChar
{
	/* node (array of 256 entries) for the following byte, or NULL if none */
	struct SuffixChar *nextChar;

	/* replacement bytes for the string ending at this entry, or NULL */
	char	   *replaceTo;

	/* number of bytes stored in replaceTo (no terminator is stored) */
	int			replacelen;
} SuffixChar;
 | |
| 
 | |
| /*
 | |
|  * placeChar - put str into tree's structure, byte by byte.
 | |
|  */
 | |
| static SuffixChar *
 | |
| placeChar(SuffixChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)
 | |
| {
 | |
| 	SuffixChar *curnode;
 | |
| 
 | |
| 	if (!node)
 | |
| 	{
 | |
| 		node = palloc(sizeof(SuffixChar) * 256);
 | |
| 		memset(node, 0, sizeof(SuffixChar) * 256);
 | |
| 	}
 | |
| 
 | |
| 	curnode = node + *str;
 | |
| 
 | |
| 	if (lenstr == 1)
 | |
| 	{
 | |
| 		if (curnode->replaceTo)
 | |
| 			elog(WARNING, "duplicate TO argument, use first one");
 | |
| 		else
 | |
| 		{
 | |
| 			curnode->replacelen = replacelen;
 | |
| 			curnode->replaceTo = palloc(replacelen);
 | |
| 			memcpy(curnode->replaceTo, replaceTo, replacelen);
 | |
| 		}
 | |
| 	}
 | |
| 	else
 | |
| 	{
 | |
| 		curnode->nextChar = placeChar(curnode->nextChar, str + 1, lenstr - 1, replaceTo, replacelen);
 | |
| 	}
 | |
| 
 | |
| 	return node;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * initSuffixTree  - create suffix tree from file. Function converts
 | |
|  * UTF8-encoded file into current encoding.
 | |
|  */
 | |
| static SuffixChar *
 | |
| initSuffixTree(char *filename)
 | |
| {
 | |
| 	SuffixChar *volatile rootSuffixTree = NULL;
 | |
| 	MemoryContext ccxt = CurrentMemoryContext;
 | |
| 	tsearch_readline_state trst;
 | |
| 	volatile bool skip;
 | |
| 
 | |
| 	filename = get_tsearch_config_filename(filename, "rules");
 | |
| 	if (!tsearch_readline_begin(&trst, filename))
 | |
| 		ereport(ERROR,
 | |
| 				(errcode(ERRCODE_CONFIG_FILE_ERROR),
 | |
| 				 errmsg("could not open unaccent file \"%s\": %m",
 | |
| 						filename)));
 | |
| 
 | |
| 	do
 | |
| 	{
 | |
| 		char		src[4096];
 | |
| 		char		trg[4096];
 | |
| 		int			srclen;
 | |
| 		int			trglen;
 | |
| 		char	   *line = NULL;
 | |
| 
 | |
| 		skip = true;
 | |
| 
 | |
| 		PG_TRY();
 | |
| 		{
 | |
| 			/*
 | |
| 			 * pg_do_encoding_conversion() (called by tsearch_readline()) will
 | |
| 			 * emit exception if it finds untranslatable characters in current
 | |
| 			 * locale. We just skip such characters.
 | |
| 			 */
 | |
| 			while ((line = tsearch_readline(&trst)) != NULL)
 | |
| 			{
 | |
| 				if (sscanf(line, "%s\t%s\n", src, trg) != 2)
 | |
| 					continue;
 | |
| 
 | |
| 				srclen = strlen(src);
 | |
| 				trglen = strlen(trg);
 | |
| 
 | |
| 				rootSuffixTree = placeChar(rootSuffixTree,
 | |
| 										   (unsigned char *) src, srclen,
 | |
| 										   trg, trglen);
 | |
| 				skip = false;
 | |
| 				pfree(line);
 | |
| 			}
 | |
| 		}
 | |
| 		PG_CATCH();
 | |
| 		{
 | |
| 			ErrorData  *errdata;
 | |
| 			MemoryContext ecxt;
 | |
| 
 | |
| 			ecxt = MemoryContextSwitchTo(ccxt);
 | |
| 			errdata = CopyErrorData();
 | |
| 			if (errdata->sqlerrcode == ERRCODE_UNTRANSLATABLE_CHARACTER)
 | |
| 			{
 | |
| 				FlushErrorState();
 | |
| 			}
 | |
| 			else
 | |
| 			{
 | |
| 				MemoryContextSwitchTo(ecxt);
 | |
| 				PG_RE_THROW();
 | |
| 			}
 | |
| 		}
 | |
| 		PG_END_TRY();
 | |
| 	}
 | |
| 	while (skip);
 | |
| 
 | |
| 	tsearch_readline_end(&trst);
 | |
| 
 | |
| 	return rootSuffixTree;
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * findReplaceTo - find multibyte character in tree
 | |
|  */
 | |
| static SuffixChar *
 | |
| findReplaceTo(SuffixChar *node, unsigned char *src, int srclen)
 | |
| {
 | |
| 	while (node)
 | |
| 	{
 | |
| 		node = node + *src;
 | |
| 		if (srclen == 1)
 | |
| 			return node;
 | |
| 
 | |
| 		src++;
 | |
| 		srclen--;
 | |
| 		node = node->nextChar;
 | |
| 	}
 | |
| 
 | |
| 	return NULL;
 | |
| }
 | |
| 
 | |
| PG_FUNCTION_INFO_V1(unaccent_init);
 | |
| Datum		unaccent_init(PG_FUNCTION_ARGS);
 | |
| Datum
 | |
| unaccent_init(PG_FUNCTION_ARGS)
 | |
| {
 | |
| 	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
 | |
| 	SuffixChar *rootSuffixTree = NULL;
 | |
| 	bool		fileloaded = false;
 | |
| 	ListCell   *l;
 | |
| 
 | |
| 	foreach(l, dictoptions)
 | |
| 	{
 | |
| 		DefElem    *defel = (DefElem *) lfirst(l);
 | |
| 
 | |
| 		if (pg_strcasecmp("Rules", defel->defname) == 0)
 | |
| 		{
 | |
| 			if (fileloaded)
 | |
| 				ereport(ERROR,
 | |
| 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 | |
| 						 errmsg("multiple Rules parameters")));
 | |
| 			rootSuffixTree = initSuffixTree(defGetString(defel));
 | |
| 			fileloaded = true;
 | |
| 		}
 | |
| 		else
 | |
| 		{
 | |
| 			ereport(ERROR,
 | |
| 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 | |
| 					 errmsg("unrecognized Unaccent parameter: \"%s\"",
 | |
| 							defel->defname)));
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	if (!fileloaded)
 | |
| 	{
 | |
| 		ereport(ERROR,
 | |
| 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 | |
| 				 errmsg("missing Rules parameter")));
 | |
| 	}
 | |
| 
 | |
| 	PG_RETURN_POINTER(rootSuffixTree);
 | |
| }
 | |
| 
 | |
| PG_FUNCTION_INFO_V1(unaccent_lexize);
 | |
| Datum		unaccent_lexize(PG_FUNCTION_ARGS);
 | |
| Datum
 | |
| unaccent_lexize(PG_FUNCTION_ARGS)
 | |
| {
 | |
| 	SuffixChar *rootSuffixTree = (SuffixChar *) PG_GETARG_POINTER(0);
 | |
| 	char	   *srcchar = (char *) PG_GETARG_POINTER(1);
 | |
| 	int32		len = PG_GETARG_INT32(2);
 | |
| 	char	   *srcstart,
 | |
| 			   *trgchar = NULL;
 | |
| 	int			charlen;
 | |
| 	TSLexeme   *res = NULL;
 | |
| 	SuffixChar *node;
 | |
| 
 | |
| 	srcstart = srcchar;
 | |
| 	while (srcchar - srcstart < len)
 | |
| 	{
 | |
| 		charlen = pg_mblen(srcchar);
 | |
| 
 | |
| 		node = findReplaceTo(rootSuffixTree, (unsigned char *) srcchar, charlen);
 | |
| 		if (node && node->replaceTo)
 | |
| 		{
 | |
| 			if (!res)
 | |
| 			{
 | |
| 				/* allocate res only it it's needed */
 | |
| 				res = palloc0(sizeof(TSLexeme) * 2);
 | |
| 				res->lexeme = trgchar = palloc(len * pg_database_encoding_max_length() + 1 /* \0 */ );
 | |
| 				res->flags = TSL_FILTER;
 | |
| 				if (srcchar != srcstart)
 | |
| 				{
 | |
| 					memcpy(trgchar, srcstart, srcchar - srcstart);
 | |
| 					trgchar += (srcchar - srcstart);
 | |
| 				}
 | |
| 			}
 | |
| 			memcpy(trgchar, node->replaceTo, node->replacelen);
 | |
| 			trgchar += node->replacelen;
 | |
| 		}
 | |
| 		else if (res)
 | |
| 		{
 | |
| 			memcpy(trgchar, srcchar, charlen);
 | |
| 			trgchar += charlen;
 | |
| 		}
 | |
| 
 | |
| 		srcchar += charlen;
 | |
| 	}
 | |
| 
 | |
| 	if (res)
 | |
| 		*trgchar = '\0';
 | |
| 
 | |
| 	PG_RETURN_POINTER(res);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * Function-like wrapper for dictionary
 | |
|  */
 | |
| PG_FUNCTION_INFO_V1(unaccent_dict);
 | |
| Datum		unaccent_dict(PG_FUNCTION_ARGS);
 | |
| Datum
 | |
| unaccent_dict(PG_FUNCTION_ARGS)
 | |
| {
 | |
| 	text	   *str;
 | |
| 	int			strArg;
 | |
| 	Oid			dictOid;
 | |
| 	TSDictionaryCacheEntry *dict;
 | |
| 	TSLexeme   *res;
 | |
| 
 | |
| 	if (PG_NARGS() == 1)
 | |
| 	{
 | |
| 		dictOid = get_ts_dict_oid(stringToQualifiedNameList("unaccent"), false);
 | |
| 		strArg = 0;
 | |
| 	}
 | |
| 	else
 | |
| 	{
 | |
| 		dictOid = PG_GETARG_OID(0);
 | |
| 		strArg = 1;
 | |
| 	}
 | |
| 	str = PG_GETARG_TEXT_P(strArg);
 | |
| 
 | |
| 	dict = lookup_ts_dictionary_cache(dictOid);
 | |
| 
 | |
| 	res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(dict->lexize),
 | |
| 											 PointerGetDatum(dict->dictData),
 | |
| 											   PointerGetDatum(VARDATA(str)),
 | |
| 									  Int32GetDatum(VARSIZE(str) - VARHDRSZ),
 | |
| 													 PointerGetDatum(NULL)));
 | |
| 
 | |
| 	PG_FREE_IF_COPY(str, strArg);
 | |
| 
 | |
| 	if (res == NULL)
 | |
| 	{
 | |
| 		PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
 | |
| 	}
 | |
| 	else if (res->lexeme == NULL)
 | |
| 	{
 | |
| 		pfree(res);
 | |
| 		PG_RETURN_TEXT_P(PG_GETARG_TEXT_P_COPY(strArg));
 | |
| 	}
 | |
| 	else
 | |
| 	{
 | |
| 		text	   *txt = cstring_to_text(res->lexeme);
 | |
| 
 | |
| 		pfree(res->lexeme);
 | |
| 		pfree(res);
 | |
| 
 | |
| 		PG_RETURN_TEXT_P(txt);
 | |
| 	}
 | |
| }
 |