mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-31 10:30:33 +03:00 
			
		
		
		
	conversion of basic ASCII letters. Remove all uses of strcasecmp and strncasecmp in favor of new functions pg_strcasecmp and pg_strncasecmp; remove most but not all direct uses of toupper and tolower in favor of pg_toupper and pg_tolower. These functions use the same notions of case folding already developed for identifier case conversion. I left the straight locale-based folding in place for situations where we are just manipulating user data and not trying to match it to built-in strings --- for example, the SQL upper() function is still locale dependent. Perhaps this will prove not to be what's wanted, but at the moment we can initdb and pass regression tests in Turkish locale.
		
			
				
	
	
		
			360 lines
		
	
	
		
			7.5 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			360 lines
		
	
	
		
			7.5 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*
 | |
|  * default word parser
 | |
|  * Teodor Sigaev <teodor@sigaev.ru>
 | |
|  */
 | |
| #include <errno.h>
 | |
| #include <stdlib.h>
 | |
| #include <string.h>
 | |
| 
 | |
| #include "postgres.h"
 | |
| #include "utils/builtins.h"
 | |
| 
 | |
| #include "dict.h"
 | |
| #include "wparser.h"
 | |
| #include "common.h"
 | |
| #include "ts_cfg.h"
 | |
| #include "wordparser/parser.h"
 | |
| #include "wordparser/deflex.h"
 | |
| 
 | |
| PG_FUNCTION_INFO_V1(prsd_lextype);
 | |
| Datum		prsd_lextype(PG_FUNCTION_ARGS);
 | |
| 
 | |
| Datum
 | |
| prsd_lextype(PG_FUNCTION_ARGS)
 | |
| {
 | |
| 	LexDescr   *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
 | |
| 	int			i;
 | |
| 
 | |
| 	for (i = 1; i <= LASTNUM; i++)
 | |
| 	{
 | |
| 		descr[i - 1].lexid = i;
 | |
| 		descr[i - 1].alias = pstrdup(tok_alias[i]);
 | |
| 		descr[i - 1].descr = pstrdup(lex_descr[i]);
 | |
| 	}
 | |
| 
 | |
| 	descr[LASTNUM].lexid = 0;
 | |
| 
 | |
| 	PG_RETURN_POINTER(descr);
 | |
| }
 | |
| 
 | |
| PG_FUNCTION_INFO_V1(prsd_start);
 | |
| Datum		prsd_start(PG_FUNCTION_ARGS);
 | |
| Datum
 | |
| prsd_start(PG_FUNCTION_ARGS)
 | |
| {
 | |
| 	tsearch2_start_parse_str((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1));
 | |
| 	PG_RETURN_POINTER(NULL);
 | |
| }
 | |
| 
 | |
| PG_FUNCTION_INFO_V1(prsd_getlexeme);
 | |
| Datum		prsd_getlexeme(PG_FUNCTION_ARGS);
 | |
| Datum
 | |
| prsd_getlexeme(PG_FUNCTION_ARGS)
 | |
| {
 | |
| 	/* ParserState *p=(ParserState*)PG_GETARG_POINTER(0); */
 | |
| 	char	  **t = (char **) PG_GETARG_POINTER(1);
 | |
| 	int		   *tlen = (int *) PG_GETARG_POINTER(2);
 | |
| 	int			type = tsearch2_yylex();
 | |
| 
 | |
| 	*t = token;
 | |
| 	*tlen = tokenlen;
 | |
| 	PG_RETURN_INT32(type);
 | |
| }
 | |
| 
 | |
| PG_FUNCTION_INFO_V1(prsd_end);
 | |
| Datum		prsd_end(PG_FUNCTION_ARGS);
 | |
| Datum
 | |
| prsd_end(PG_FUNCTION_ARGS)
 | |
| {
 | |
| 	/* ParserState *p=(ParserState*)PG_GETARG_POINTER(0); */
 | |
| 	tsearch2_end_parse();
 | |
| 	PG_RETURN_VOID();
 | |
| }
 | |
| 
 | |
/*
 * Token-type classification used by the headline code below.
 *
 * The numbers are lexeme type ids; presumably they correspond to the ids
 * declared in wordparser/deflex.h -- TODO confirm the meaning of each id
 * there.  Every macro evaluates its argument more than once, so pass only
 * side-effect-free expressions.
 */
#define LEAVETOKEN(x) \
	((x) == 12)
#define COMPLEXTOKEN(x) \
	((x) == 5 || (x) == 15 || (x) == 16 || (x) == 17)
#define ENDPUNCTOKEN(x) \
	((x) == 12)

/* types 12, 13, 14 and 23 */
#define IDIGNORE(x) \
	((x) == 12 || (x) == 13 || (x) == 14 || (x) == 23)

/* types 5, 13, 15, 16 and 17; prsd_headline sets "replace" on these */
#define HLIDIGNORE(x) \
	((x) == 5 || (x) == 13 || (x) == 15 || (x) == 16 || (x) == 17)

/* types that are not counted as words when measuring a headline */
#define NONWORDTOKEN(x) \
	(HLIDIGNORE(x) || (x) == 12)

/* types on which a headline fragment should not end */
#define NOENDTOKEN(x) \
	(NONWORDTOKEN(x) || IDIGNORE(x) || \
	 (x) == 7 || (x) == 8 || (x) == 20 || (x) == 21 || (x) == 22)
 | |
| 
 | |
| typedef struct
 | |
| {
 | |
| 	HLWORD	   *words;
 | |
| 	int			len;
 | |
| }	hlCheck;
 | |
| 
 | |
| static bool
 | |
| checkcondition_HL(void *checkval, ITEM * val)
 | |
| {
 | |
| 	int			i;
 | |
| 
 | |
| 	for (i = 0; i < ((hlCheck *) checkval)->len; i++)
 | |
| 	{
 | |
| 		if (((hlCheck *) checkval)->words[i].item == val)
 | |
| 			return true;
 | |
| 	}
 | |
| 	return false;
 | |
| }
 | |
| 
 | |
| 
 | |
| static bool
 | |
| hlCover(HLPRSTEXT * prs, QUERYTYPE * query, int *p, int *q)
 | |
| {
 | |
| 	int			i,
 | |
| 				j;
 | |
| 	ITEM	   *item = GETQUERY(query);
 | |
| 	int			pos = *p;
 | |
| 
 | |
| 	*q = 0;
 | |
| 	*p = 0x7fffffff;
 | |
| 
 | |
| 	for (j = 0; j < query->size; j++)
 | |
| 	{
 | |
| 		if (item->type != VAL)
 | |
| 		{
 | |
| 			item++;
 | |
| 			continue;
 | |
| 		}
 | |
| 		for (i = pos; i < prs->curwords; i++)
 | |
| 		{
 | |
| 			if (prs->words[i].item == item)
 | |
| 			{
 | |
| 				if (i > *q)
 | |
| 					*q = i;
 | |
| 				break;
 | |
| 			}
 | |
| 		}
 | |
| 		item++;
 | |
| 	}
 | |
| 
 | |
| 	if (*q == 0)
 | |
| 		return false;
 | |
| 
 | |
| 	item = GETQUERY(query);
 | |
| 	for (j = 0; j < query->size; j++)
 | |
| 	{
 | |
| 		if (item->type != VAL)
 | |
| 		{
 | |
| 			item++;
 | |
| 			continue;
 | |
| 		}
 | |
| 		for (i = *q; i >= pos; i--)
 | |
| 		{
 | |
| 			if (prs->words[i].item == item)
 | |
| 			{
 | |
| 				if (i < *p)
 | |
| 					*p = i;
 | |
| 				break;
 | |
| 			}
 | |
| 		}
 | |
| 		item++;
 | |
| 	}
 | |
| 
 | |
| 	if (*p <= *q)
 | |
| 	{
 | |
| 		hlCheck		ch;
 | |
| 
 | |
| 		ch.words = &(prs->words[*p]);
 | |
| 		ch.len = *q - *p + 1;
 | |
| 		if (TS_execute(GETQUERY(query), &ch, false, checkcondition_HL))
 | |
| 			return true;
 | |
| 		else
 | |
| 		{
 | |
| 			(*p)++;
 | |
| 			return hlCover(prs, query, p, q);
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return false;
 | |
| }
 | |
| 
 | |
| PG_FUNCTION_INFO_V1(prsd_headline);
 | |
| Datum		prsd_headline(PG_FUNCTION_ARGS);
 | |
| Datum
 | |
| prsd_headline(PG_FUNCTION_ARGS)
 | |
| {
 | |
| 	HLPRSTEXT  *prs = (HLPRSTEXT *) PG_GETARG_POINTER(0);
 | |
| 	text	   *opt = (text *) PG_GETARG_POINTER(1);	/* can't be toasted */
 | |
| 	QUERYTYPE  *query = (QUERYTYPE *) PG_GETARG_POINTER(2);		/* can't be toasted */
 | |
| 
 | |
| 	/* from opt + start and and tag */
 | |
| 	int			min_words = 15;
 | |
| 	int			max_words = 35;
 | |
| 	int			shortword = 3;
 | |
| 
 | |
| 	int			p = 0,
 | |
| 				q = 0;
 | |
| 	int			bestb = -1,
 | |
| 				beste = -1;
 | |
| 	int			bestlen = -1;
 | |
| 	int			pose = 0, posb,
 | |
| 				poslen,
 | |
| 				curlen;
 | |
| 
 | |
| 	int			i;
 | |
| 
 | |
| 	/* config */
 | |
| 	prs->startsel = NULL;
 | |
| 	prs->stopsel = NULL;
 | |
| 	if (opt)
 | |
| 	{
 | |
| 		Map		   *map,
 | |
| 				   *mptr;
 | |
| 
 | |
| 		parse_cfgdict(opt, &map);
 | |
| 		mptr = map;
 | |
| 
 | |
| 		while (mptr && mptr->key)
 | |
| 		{
 | |
| 			if (pg_strcasecmp(mptr->key, "MaxWords") == 0)
 | |
| 				max_words = pg_atoi(mptr->value, 4, 1);
 | |
| 			else if (pg_strcasecmp(mptr->key, "MinWords") == 0)
 | |
| 				min_words = pg_atoi(mptr->value, 4, 1);
 | |
| 			else if (pg_strcasecmp(mptr->key, "ShortWord") == 0)
 | |
| 				shortword = pg_atoi(mptr->value, 4, 1);
 | |
| 			else if (pg_strcasecmp(mptr->key, "StartSel") == 0)
 | |
| 				prs->startsel = pstrdup(mptr->value);
 | |
| 			else if (pg_strcasecmp(mptr->key, "StopSel") == 0)
 | |
| 				prs->stopsel = pstrdup(mptr->value);
 | |
| 
 | |
| 			pfree(mptr->key);
 | |
| 			pfree(mptr->value);
 | |
| 
 | |
| 			mptr++;
 | |
| 		}
 | |
| 		pfree(map);
 | |
| 
 | |
| 		if (min_words >= max_words)
 | |
| 			ereport(ERROR,
 | |
| 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 | |
| 					 errmsg("MinWords should be less than MaxWords")));
 | |
| 		if (min_words <= 0)
 | |
| 			ereport(ERROR,
 | |
| 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 | |
| 					 errmsg("MinWords should be positive")));
 | |
| 		if (shortword < 0)
 | |
| 			ereport(ERROR,
 | |
| 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 | |
| 					 errmsg("ShortWord should be >= 0")));
 | |
| 	}
 | |
| 
 | |
| 	while (hlCover(prs, query, &p, &q))
 | |
| 	{
 | |
| 		/* find cover len in words */
 | |
| 		curlen = 0;
 | |
| 		poslen = 0;
 | |
| 		for (i = p; i <= q && curlen < max_words; i++)
 | |
| 		{
 | |
| 			if (!NONWORDTOKEN(prs->words[i].type))
 | |
| 				curlen++;
 | |
| 			if (prs->words[i].item && !prs->words[i].repeated)
 | |
| 				poslen++;
 | |
| 			pose = i;
 | |
| 		}
 | |
| 
 | |
| 		if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
 | |
| 		{
 | |
| 			/* best already finded, so try one more cover */
 | |
| 			p++;
 | |
| 			continue;
 | |
| 		}
 | |
| 
 | |
| 		posb=p;
 | |
| 		if (curlen < max_words)
 | |
| 		{						/* find good end */
 | |
| 			for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
 | |
| 			{
 | |
| 				if (i != q)
 | |
| 				{
 | |
| 					if (!NONWORDTOKEN(prs->words[i].type))
 | |
| 						curlen++;
 | |
| 					if (prs->words[i].item && !prs->words[i].repeated)
 | |
| 						poslen++;
 | |
| 				}
 | |
| 				pose = i;
 | |
| 				if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
 | |
| 					continue;
 | |
| 				if (curlen >= min_words)
 | |
| 					break;
 | |
| 			}
 | |
| 			if ( curlen < min_words && i>=prs->curwords ) { /* got end of text and our cover is shoter than min_words */
 | |
| 				for(i=p; i>= 0; i--) {
 | |
| 					if (!NONWORDTOKEN(prs->words[i].type))
 | |
| 						curlen++;
 | |
| 					if (prs->words[i].item && !prs->words[i].repeated)
 | |
| 						poslen++;
 | |
| 					if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
 | |
| 						continue;
 | |
| 					if (curlen >= min_words)
 | |
| 						break;
 | |
| 				}
 | |
| 				posb=(i>=0) ? i : 0;
 | |
| 			}
 | |
| 		}
 | |
| 		else
 | |
| 		{						/* shorter cover :((( */
 | |
| 			for (; curlen > min_words; i--)
 | |
| 			{
 | |
| 				if (!NONWORDTOKEN(prs->words[i].type))
 | |
| 					curlen--;
 | |
| 				if (prs->words[i].item && !prs->words[i].repeated)
 | |
| 					poslen--;
 | |
| 				pose = i;
 | |
| 				if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
 | |
| 					continue;
 | |
| 				break;
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
 | |
| 			(bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
 | |
| 			 (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
 | |
| 		{
 | |
| 			bestb = posb;
 | |
| 			beste = pose;
 | |
| 			bestlen = poslen;
 | |
| 		}
 | |
| 
 | |
| 		p++;
 | |
| 	}
 | |
| 
 | |
| 	if (bestlen < 0)
 | |
| 	{
 | |
| 		curlen = 0;
 | |
| 		poslen = 0;
 | |
| 		for (i = 0; i < prs->curwords && curlen < min_words; i++)
 | |
| 		{
 | |
| 			if (!NONWORDTOKEN(prs->words[i].type))
 | |
| 				curlen++;
 | |
| 			pose = i;
 | |
| 		}
 | |
| 		bestb = 0;
 | |
| 		beste = pose;
 | |
| 	}
 | |
| 
 | |
| 	for (i = bestb; i <= beste; i++)
 | |
| 	{
 | |
| 		if (prs->words[i].item)
 | |
| 			prs->words[i].selected = 1;
 | |
| 		if (prs->words[i].repeated)
 | |
| 			prs->words[i].skip = 1;
 | |
| 		if (HLIDIGNORE(prs->words[i].type))
 | |
| 			prs->words[i].replace = 1;
 | |
| 
 | |
| 		prs->words[i].in = 1;
 | |
| 	}
 | |
| 
 | |
| 	if (!prs->startsel)
 | |
| 		prs->startsel = pstrdup("<b>");
 | |
| 	if (!prs->stopsel)
 | |
| 		prs->stopsel = pstrdup("</b>");
 | |
| 	prs->startsellen = strlen(prs->startsel);
 | |
| 	prs->stopsellen = strlen(prs->stopsel);
 | |
| 
 | |
| 	PG_RETURN_POINTER(prs);
 | |
| }
 |