diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index b43872cca5c..61583df3a21 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -1,4 +1,4 @@ - + Full Text Search @@ -1862,12 +1862,12 @@ LIMIT 10; tag - HTML tag - <A HREF="dictionaries.html"> + XML tag + <a href="dictionaries.html"> entity - HTML entity + XML entity &amp; diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c index 3f95f60579e..b80175456d2 100644 --- a/src/backend/tsearch/wparser_def.c +++ b/src/backend/tsearch/wparser_def.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.10 2007/11/15 22:25:16 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.11 2007/11/20 02:25:22 adunstan Exp $ * *------------------------------------------------------------------------- */ @@ -50,7 +50,7 @@ #define DECIMAL 20 #define SIGNEDINT 21 #define UNSIGNEDINT 22 -#define HTMLENTITY 23 +#define XMLENTITY 23 #define LASTNUM 23 @@ -95,7 +95,7 @@ static const char *const lex_descr[] = { "Hyphenated word part, all letters", "Hyphenated word part, all ASCII", "Space symbols", - "HTML tag", + "XML tag", "Protocol head", "Hyphenated word, letters and digits", "Hyphenated word, all ASCII", @@ -105,7 +105,7 @@ static const char *const lex_descr[] = { "Decimal notation", "Signed integer", "Unsigned integer", - "HTML entity" + "XML entity" }; @@ -132,11 +132,13 @@ typedef enum TPS_InMantissaFirst, TPS_InMantissaSign, TPS_InMantissa, - TPS_InHTMLEntityFirst, - TPS_InHTMLEntity, - TPS_InHTMLEntityNumFirst, - TPS_InHTMLEntityNum, - TPS_InHTMLEntityEnd, + TPS_InXMLEntityFirst, + TPS_InXMLEntity, + TPS_InXMLEntityNumFirst, + TPS_InXMLEntityNum, + TPS_InXMLEntityHexNumFirst, + TPS_InXMLEntityHexNum, + TPS_InXMLEntityEnd, TPS_InTagFirst, TPS_InXMLBegin, TPS_InTagCloseFirst, @@ -653,7 +655,7 @@ static const TParserStateActionItem actionTPS_Base[] = { {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL}, {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL}, - {p_iseqC, '&', A_PUSH, TPS_InHTMLEntityFirst, 0, NULL}, + {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL}, {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL}, {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL}, @@ -811,35 +813,56 @@ static const TParserStateActionItem actionTPS_InMantissa[] = { {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL} }; -static const TParserStateActionItem actionTPS_InHTMLEntityFirst[] = { +static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_iseqC, '#', A_NEXT, TPS_InHTMLEntityNumFirst, 0, NULL}, - {p_isasclet, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL}, + {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL}, + {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL}, + {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL}, + {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; -static const TParserStateActionItem actionTPS_InHTMLEntity[] = { +static const TParserStateActionItem actionTPS_InXMLEntity[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_isasclet, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL}, - {p_iseqC, ';', A_NEXT, TPS_InHTMLEntityEnd, 0, NULL}, + {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL}, + {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL}, + {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL}, + {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL}, + {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL}, + {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL}, + {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; -static const TParserStateActionItem actionTPS_InHTMLEntityNumFirst[] = { +static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_isdigit, 0, A_NEXT, TPS_InHTMLEntityNum, 0, NULL}, + {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL}, + {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; -static const TParserStateActionItem actionTPS_InHTMLEntityNum[] = { +static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_isdigit, 0, A_NEXT, TPS_InHTMLEntityNum, 0, NULL}, - {p_iseqC, ';', A_NEXT, TPS_InHTMLEntityEnd, 0, NULL}, + {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; -static const TParserStateActionItem actionTPS_InHTMLEntityEnd[] = { - {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, HTMLENTITY, NULL} +static const TParserStateActionItem actionTPS_InXMLEntityNum[] = { + {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, + {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL}, + {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL}, + {NULL, 0, A_POP, TPS_Null, 0, NULL} +}; + +static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = { + {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, + {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL}, + {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL}, + {NULL, 0, A_POP, TPS_Null, 0, NULL} +}; + +static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = { + {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL} }; static const TParserStateActionItem actionTPS_InTagFirst[] = { @@ -854,8 +877,8 @@ static const TParserStateActionItem actionTPS_InTagFirst[] = { static const TParserStateActionItem actionTPS_InXMLBegin[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, /* words[i].type)) + if (XMLHLIDIGNORE(prs->words[i].type)) prs->words[i].replace = 1; } diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out index b6f8f05d228..eb004020758 100644 --- a/src/test/regress/expected/tsearch.out +++ b/src/test/regress/expected/tsearch.out @@ -222,7 +222,7 @@ SELECT * FROM ts_token_type('default'); 10 | hword_part | Hyphenated word part, all letters 11 | hword_asciipart | Hyphenated word part, all ASCII 12 | blank | Space symbols - 13 | tag | HTML tag + 13 | tag | XML tag 14 | protocol | Protocol head 15 | numhword | Hyphenated word, letters and digits 16 | asciihword | Hyphenated word, all ASCII @@ -232,7 +232,7 @@ SELECT * FROM ts_token_type('default'); 20 | float | Decimal notation 21 | int | Signed integer 22 | uint | Unsigned integer - 23 | entity | HTML entity + 23 | entity | XML entity (23 rows) SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf qwer jf sdjk ewr1> ewri2