From cbe25dcff73a297adbada9dc1d6cad3df18014e9 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 6 Nov 2021 13:28:53 -0400 Subject: [PATCH] Disallow making an empty lexeme via array_to_tsvector(). The tsvector data type has always forbidden lexemes to be empty. However, array_to_tsvector() didn't get that memo, and would allow an empty-string array element to become an empty lexeme. This could result in dump/restore failures later, not to mention whatever semantic issues might be behind the original prohibition. However, other functions that take a plain text input directly as a lexeme value do not need a similar restriction, because they only match the string against existing tsvector entries. In particular it'd be a bad idea to make ts_delete() reject empty strings, since that is the most convenient way to clean up any bad data that might have gotten into a tsvector column via this bug. Reflecting on that, let's also remove the prohibition against NULL array elements in tsvector_delete_arr and tsvector_setweight_by_filter. It seems more consistent to ignore them, as an empty-string element would be ignored. There's a case for back-patching this, since it's clearly a bug fix. On balance though, it doesn't seem like something to change in a minor release. Jean-Christophe Arnu Discussion: https://postgr.es/m/CAHZmTm1YVndPgUVRoag2WL0w900XcoiivDDj-gTTYBsG25c65A@mail.gmail.com --- doc/src/sgml/func.sgml | 14 ++++++++++++-- src/backend/utils/adt/tsvector_op.c | 20 +++++++++++++------- src/test/regress/expected/tstypes.out | 19 ++++++++++++++----- src/test/regress/sql/tstypes.sql | 8 +++++--- 4 files changed, 44 insertions(+), 17 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 4b49dff2ffc..24447c00177 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -12920,8 +12920,10 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple tsvector - Converts an array of lexemes to a tsvector. - The given strings are used as-is without further processing. + Converts an array of text strings to a tsvector. + The given strings are used as lexemes as-is, without further + processing. Array elements must not be empty strings + or NULL. array_to_tsvector('{fat,cat,rat}'::text[]) @@ -13104,6 +13106,9 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple Assigns the specified weight to elements of the vector that are listed in lexemes. + The strings in lexemes are taken as lexemes + as-is, without further processing. Strings that do not match any + lexeme in vector are ignored. setweight('fat:2,4 cat:3 rat:5,6B'::tsvector, 'A', '{cat,rat}') @@ -13265,6 +13270,8 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple Removes any occurrence of the given lexeme from the vector. + The lexeme string is treated as a lexeme as-is, + without further processing. ts_delete('fat:2,4 cat:3 rat:5A'::tsvector, 'fat') @@ -13281,6 +13288,9 @@ CREATE TYPE rainbow AS ENUM ('red', 'orange', 'yellow', 'green', 'blue', 'purple Removes any occurrences of the lexemes in lexemes from the vector. + The strings in lexemes are taken as lexemes + as-is, without further processing. Strings that do not match any + lexeme in vector are ignored. ts_delete('fat:2,4 cat:3 rat:5A'::tsvector, ARRAY['fat','rat']) diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c index 9236ebcc8fe..11ccb5297c9 100644 --- a/src/backend/utils/adt/tsvector_op.c +++ b/src/backend/utils/adt/tsvector_op.c @@ -322,10 +322,9 @@ tsvector_setweight_by_filter(PG_FUNCTION_ARGS) int lex_len, lex_pos; + /* Ignore null array elements, they surely don't match */ if (nulls[i]) - ereport(ERROR, - (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), - errmsg("lexeme array may not contain nulls"))); + continue; lex = VARDATA(dlexemes[i]); lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ; @@ -602,10 +601,9 @@ tsvector_delete_arr(PG_FUNCTION_ARGS) int lex_len, lex_pos; + /* Ignore null array elements, they surely don't match */ if (nulls[i]) - ereport(ERROR, - (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), - errmsg("lexeme array may not contain nulls"))); + continue; lex = VARDATA(dlexemes[i]); lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ; @@ -761,13 +759,21 @@ array_to_tsvector(PG_FUNCTION_ARGS) deconstruct_array(v, TEXTOID, -1, false, TYPALIGN_INT, &dlexemes, &nulls, &nitems); - /* Reject nulls (maybe we should just ignore them, instead?) */ + /* + * Reject nulls and zero length strings (maybe we should just ignore them, + * instead?) + */ for (i = 0; i < nitems; i++) { if (nulls[i]) ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), errmsg("lexeme array may not contain nulls"))); + + if (VARSIZE(dlexemes[i]) - VARHDRSZ == 0) + ereport(ERROR, + (errcode(ERRCODE_ZERO_LENGTH_CHARACTER_STRING), + errmsg("lexeme array may not contain empty strings"))); } /* Sort and de-dup, because this is required for a valid tsvector. */ diff --git a/src/test/regress/expected/tstypes.out b/src/test/regress/expected/tstypes.out index 2601e312df4..92c1c6e10bb 100644 --- a/src/test/regress/expected/tstypes.out +++ b/src/test/regress/expected/tstypes.out @@ -85,6 +85,10 @@ SELECT 'a:3A b:2a'::tsvector || 'ba:1234 a:1B'; 'a':3A,4B 'b':2A 'ba':1237 (1 row) +SELECT $$'' '1' '2'$$::tsvector; -- error, empty lexeme is not allowed +ERROR: syntax error in tsvector: "'' '1' '2'" +LINE 1: SELECT $$'' '1' '2'$$::tsvector; + ^ --Base tsquery test SELECT '1'::tsquery; tsquery @@ -1258,8 +1262,12 @@ SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceshi 'base' 'hidden' 'strike' (1 row) -SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]); -ERROR: lexeme array may not contain nulls +SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', '', NULL]); + ts_delete +-------------------------- + 'base' 'hidden' 'strike' +(1 row) + SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector); unnest --------------------------------------------- @@ -1328,8 +1336,11 @@ SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']); 'base' 'hidden' 'rebel' 'spaceship' 'strike' (1 row) +-- null and empty string are disallowed, since we mustn't make an empty lexeme SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', NULL]); ERROR: lexeme array may not contain nulls +SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', '']); +ERROR: lexeme array may not contain empty strings -- array_to_tsvector must sort and de-dup SELECT array_to_tsvector(ARRAY['foo','bar','baz','bar']); array_to_tsvector @@ -1367,14 +1378,12 @@ SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', ' 'a':1C,3C 'asd':1C 'w':5,6,12B,13A 'zxc':81C,222C,567C (1 row) -SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}'); +SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', '', NULL]); setweight --------------------------------- 'a' 'asd' 'w':5,6,12B,13A 'zxc' (1 row) -SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', NULL]); -ERROR: lexeme array may not contain nulls SELECT ts_filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12 won:9'::tsvector, '{a}'); ts_filter ------------------------------------------------------------- diff --git a/src/test/regress/sql/tstypes.sql b/src/test/regress/sql/tstypes.sql index 30c8c702f09..61e8f49c916 100644 --- a/src/test/regress/sql/tstypes.sql +++ b/src/test/regress/sql/tstypes.sql @@ -17,6 +17,7 @@ SELECT $$'\\as' ab\c ab\\c AB\\\c ab\\\\c$$::tsvector; SELECT tsvectorin(tsvectorout($$'\\as' ab\c ab\\c AB\\\c ab\\\\c$$::tsvector)); SELECT '''w'':4A,3B,2C,1D,5 a:8'; SELECT 'a:3A b:2a'::tsvector || 'ba:1234 a:1B'; +SELECT $$'' '1' '2'$$::tsvector; -- error, empty lexeme is not allowed --Base tsquery test SELECT '1'::tsquery; @@ -239,7 +240,7 @@ SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3': SELECT ts_delete('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector, ARRAY['spaceship','leya','rebel']); SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel']); SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel','rebel']); -SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', NULL]); +SELECT ts_delete('base hidden rebel spaceship strike'::tsvector, ARRAY['spaceship','leya','rebel', '', NULL]); SELECT unnest('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D strike:3'::tsvector); SELECT unnest('base hidden rebel spaceship strike'::tsvector); @@ -251,7 +252,9 @@ SELECT tsvector_to_array('base:7 hidden:6 rebel:1 spaceship:2,33A,34B,35C,36D st SELECT tsvector_to_array('base hidden rebel spaceship strike'::tsvector); SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship','strike']); +-- null and empty string are disallowed, since we mustn't make an empty lexeme SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', NULL]); +SELECT array_to_tsvector(ARRAY['base','hidden','rebel','spaceship', '']); -- array_to_tsvector must sort and de-dup SELECT array_to_tsvector(ARRAY['foo','bar','baz','bar']); @@ -260,8 +263,7 @@ SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c'); SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}'); SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a}'); SELECT setweight('a:1,3A asd:1C w:5,6,12B,13A zxc:81,222A,567'::tsvector, 'c', '{a,zxc}'); -SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', '{a,zxc}'); -SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', NULL]); +SELECT setweight('a asd w:5,6,12B,13A zxc'::tsvector, 'c', ARRAY['a', 'zxc', '', NULL]); SELECT ts_filter('base:7A empir:17 evil:15 first:11 galact:16 hidden:6A rebel:1A spaceship:2A strike:3A victori:12 won:9'::tsvector, '{a}'); SELECT ts_filter('base hidden rebel spaceship strike'::tsvector, '{a}');