Add prefix support for synonym dictionary · larkly/postgres-docker@abd8c94 · GitHub
[go: up one dir, main page]

Skip to content

Commit abd8c94

Browse files
committed
Add prefix support for synonym dictionary
1 parent 0c73808 commit abd8c94

File tree

5 files changed

+111
-8
lines changed

5 files changed

+111
-8
lines changed

doc/src/sgml/textsearch.sgml

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.52 2009/06/17 21:58:49 tgl Exp $ -->
1+
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.53 2009/08/14 14:53:20 teodor Exp $ -->
22

33
<chapter id="textsearch">
44
<title id="textsearch-title">Full Text Search</title>
@@ -2288,6 +2288,63 @@ SELECT * FROM ts_debug('english', 'Paris');
22882288
asciiword | Word, all ASCII | Paris | {my_synonym,english_stem} | my_synonym | {paris}
22892289
</programlisting>
22902290
</para>
2291+
2292+
<para>
2293+
An asterisk (<literal>*</literal>) at the end of a definition word indicates
2294+
that the definition word is a prefix, and the <function>to_tsquery()</function>
2295+
function will transform that definition to the prefix search format (see
2296+
<xref linkend="textsearch-parsing-queries">).
2297+
Note that the asterisk is ignored in <function>to_tsvector()</function>.
2298+
</para>
2299+
2300+
<para>
2301+
Contents of <filename>$SHAREDIR/tsearch_data/synonym_sample.syn</>:
2302+
</para>
2303+
<programlisting>
2304+
postgres pgsql
2305+
postgresql pgsql
2306+
postgre pgsql
2307+
gogle googl
2308+
indices index*
2309+
</programlisting>
2310+
2311+
<para>
2312+
Results:
2313+
</para>
2314+
<programlisting>
2315+
=# create text search dictionary syn( template=synonym,synonyms='synonym_sample');
2316+
=# select ts_lexize('syn','indices');
2317+
ts_lexize
2318+
-----------
2319+
{index}
2320+
(1 row)
2321+
2322+
=# create text search configuration tst ( copy=simple);
2323+
=# alter text search configuration tst alter mapping for asciiword with syn;
2324+
=# select to_tsquery('tst','indices');
2325+
to_tsquery
2326+
------------
2327+
'index':*
2328+
(1 row)
2329+
2330+
=# select 'indexes are very useful'::tsvector;
2331+
tsvector
2332+
---------------------------------
2333+
'are' 'indexes' 'useful' 'very'
2334+
(1 row)
2335+
2336+
=# select 'indexes are very useful'::tsvector @@ to_tsquery('tst','indices');
2337+
?column?
2338+
----------
2339+
t
2340+
(1 row)
2341+
2342+
=# select to_tsvector('tst','indices');
2343+
to_tsvector
2344+
-------------
2345+
'index':1
2346+
(1 row)
2347+
</programlisting>
22912348

22922349
<para>
22932350
The only parameter required by the <literal>synonym</> template is

src/backend/tsearch/dict_synonym.c

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
*
88
*
99
* IDENTIFICATION
10-
* $PostgreSQL: pgsql/src/backend/tsearch/dict_synonym.c,v 1.10 2009/01/01 17:23:48 momjian Exp $
10+
* $PostgreSQL: pgsql/src/backend/tsearch/dict_synonym.c,v 1.11 2009/08/14 14:53:20 teodor Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -23,6 +23,8 @@ typedef struct
2323
{
2424
char *in;
2525
char *out;
26+
int outlen;
27+
uint16 flags;
2628
} Syn;
2729

2830
typedef struct
@@ -36,11 +38,14 @@ typedef struct
3638
* Finds the next whitespace-delimited word within the 'in' string.
3739
* Returns a pointer to the first character of the word, and a pointer
3840
* to the next byte after the last character in the word (in *end).
41+
* Character '*' at the end of the word will not be treated as a word
42+
* character if flags is not null.
3943
*/
4044
static char *
41-
findwrd(char *in, char **end)
45+
findwrd(char *in, char **end, uint16 *flags)
4246
{
4347
char *start;
48+
char *lastchar;
4449

4550
/* Skip leading spaces */
4651
while (*in && t_isspace(in))
@@ -53,13 +58,27 @@ findwrd(char *in, char **end)
5358
return NULL;
5459
}
5560

56-
start = in;
61+
lastchar = start = in;
5762

5863
/* Find end of word */
5964
while (*in && !t_isspace(in))
65+
{
66+
lastchar = in;
6067
in += pg_mblen(in);
68+
}
69+
70+
if ( in - lastchar == 1 && t_iseq(lastchar, '*') && flags )
71+
{
72+
*flags = TSL_PREFIX;
73+
*end = lastchar;
74+
}
75+
else
76+
{
77+
if (flags)
78+
*flags = 0;
79+
*end = in;
80+
}
6181

62-
*end = in;
6382
return start;
6483
}
6584

@@ -84,6 +103,7 @@ dsynonym_init(PG_FUNCTION_ARGS)
84103
*end = NULL;
85104
int cur = 0;
86105
char *line = NULL;
106+
uint16 flags = 0;
87107

88108
foreach(l, dictoptions)
89109
{
@@ -117,7 +137,7 @@ dsynonym_init(PG_FUNCTION_ARGS)
117137

118138
while ((line = tsearch_readline(&trst)) != NULL)
119139
{
120-
starti = findwrd(line, &end);
140+
starti = findwrd(line, &end, NULL);
121141
if (!starti)
122142
{
123143
/* Empty line */
@@ -130,7 +150,7 @@ dsynonym_init(PG_FUNCTION_ARGS)
130150
}
131151
*end = '\0';
132152

133-
starto = findwrd(end + 1, &end);
153+
starto = findwrd(end + 1, &end, &flags);
134154
if (!starto)
135155
{
136156
/* A line with only one word (+whitespace). Ignore silently. */
@@ -168,6 +188,9 @@ dsynonym_init(PG_FUNCTION_ARGS)
168188
d->syn[cur].out = lowerstr(starto);
169189
}
170190

191+
d->syn[cur].outlen = strlen(starto);
192+
d->syn[cur].flags = flags;
193+
171194
cur++;
172195

173196
skipline:
@@ -212,7 +235,8 @@ dsynonym_lexize(PG_FUNCTION_ARGS)
212235
PG_RETURN_POINTER(NULL);
213236

214237
res = palloc0(sizeof(TSLexeme) * 2);
215-
res[0].lexeme = pstrdup(found->out);
238+
res[0].lexeme = pnstrdup(found->out, found->outlen);
239+
res[0].flags = found->flags;
216240

217241
PG_RETURN_POINTER(res);
218242
}

src/backend/tsearch/synonym_sample.syn

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@ postgres pgsql
22
postgresql pgsql
33
postgre pgsql
44
gogle googl
5+
indices index*

src/test/regress/expected/tsdicts.out

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,12 @@ SELECT ts_lexize('synonym', 'Gogle');
208208
{googl}
209209
(1 row)
210210

211+
SELECT ts_lexize('synonym', 'indices');
212+
ts_lexize
213+
-----------
214+
{index}
215+
(1 row)
216+
211217
-- Create and simple test thesaurus dictionary
212218
-- More tests in configuration checks because ts_lexize()
213219
-- cannot pass more than one word to thesaurus.
@@ -290,6 +296,18 @@ SELECT to_tsvector('synonym_tst', 'Most common mistake is to write Gogle instead
290296
'common':2 'googl':7,10 'instead':8 'mistak':3 'write':6
291297
(1 row)
292298

299+
SELECT to_tsvector('synonym_tst', 'Indexes or indices - Which is right plural form of index?');
300+
to_tsvector
301+
----------------------------------------------
302+
'form':8 'index':1,3,10 'plural':7 'right':6
303+
(1 row)
304+
305+
SELECT to_tsquery('synonym_tst', 'Index & indices');
306+
to_tsquery
307+
---------------------
308+
'index' & 'index':*
309+
(1 row)
310+
293311
-- test thesaurus in configuration
294312
-- see thesaurus_sample.ths to understand 'odd' resulting tsvector
295313
CREATE TEXT SEARCH CONFIGURATION thesaurus_tst (

src/test/regress/sql/tsdicts.sql

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ CREATE TEXT SEARCH DICTIONARY synonym (
5656

5757
SELECT ts_lexize('synonym', 'PoStGrEs');
5858
SELECT ts_lexize('synonym', 'Gogle');
59+
SELECT ts_lexize('synonym', 'indices');
5960

6061
-- Create and simple test thesaurus dictionary
6162
-- More tests in configuration checks because ts_lexize()
@@ -104,6 +105,8 @@ ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR
104105

105106
SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre');
106107
SELECT to_tsvector('synonym_tst', 'Most common mistake is to write Gogle instead of Google');
108+
SELECT to_tsvector('synonym_tst', 'Indexes or indices - Which is right plural form of index?');
109+
SELECT to_tsquery('synonym_tst', 'Index & indices');
107110

108111
-- test thesaurus in configuration
109112
-- see thesaurus_sample.ths to understand 'odd' resulting tsvector

0 commit comments

Comments
 (0)
0