7
7
*
8
8
*
9
9
* IDENTIFICATION
10
- * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_parser.c,v 1.1 2007/09/07 15:09 :56 teodor Exp $
10
+ * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_parser.c,v 1.2 2007/10/21 22:29 :56 tgl Exp $
11
11
*
12
12
*-------------------------------------------------------------------------
13
13
*/
20
20
#include "tsearch/ts_utils.h"
21
21
#include "utils/memutils.h"
22
22
23
+
24
+ /*
25
+ * Private state of tsvector parser. Note that tsquery also uses this code to
26
+ * parse its input, hence the boolean flags. The two flags are both true or
27
+ * both false in current usage, but we keep them separate for clarity.
28
+ * is_tsquery affects *only* the content of error messages.
29
+ */
23
30
struct TSVectorParseStateData
24
31
{
25
- char *prsbuf;
26
- char *word; /* buffer to hold the current word */
27
- int len; /* size in bytes allocated for 'word' */
28
- bool oprisdelim;
32
+ char *prsbuf; /* next input character to be parsed */
33
+ char *bufstart; /* start of the whole input string (used only for errors) */
34
+ char *word; /* buffer to hold the current word */
35
+ int len; /* size in bytes allocated for 'word' */
36
+ int eml; /* max bytes per character in the database encoding */
37
+ bool oprisdelim; /* treat ! | & ( ) as delimiters? */
38
+ bool is_tsquery; /* say "tsquery" not "tsvector" in errors? */
29
39
};
30
40
41
+
31
42
/*
32
43
* Initializes parser for the input string. If oprisdelim is set, the
33
44
* following characters are treated as delimiters in addition to whitespace:
34
45
* ! | & ( )
*
* is_tsquery is only recorded in the state; it selects whether syntax-error
* messages say "tsquery" or "tsvector".
*
* Returns a palloc'd parser state whose 'word' buffer is also palloc'd
* (initially 32 bytes). 'input' is not copied: the state keeps pointers
* into it.
35
46
*/
36
47
TSVectorParseState
37
- init_tsvector_parser(char *input, bool oprisdelim)
48
+ init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery )
38
49
{
39
50
TSVectorParseState state;
40
51
41
52
state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
42
53
state->prsbuf = input;
54
+ state->bufstart = input; /* remembered so error messages can quote the whole input */
43
55
state->len = 32; /* initial size of 'word'; RESIZEPRSBUF doubles it on demand */
44
56
state->word = (char *) palloc(state->len);
57
+ state->eml = pg_database_encoding_max_length(); /* cached for use by RESIZEPRSBUF */
45
58
state->oprisdelim = oprisdelim;
59
+ state->is_tsquery = is_tsquery;
46
60
47
61
return state;
48
62
}
49
63
50
64
/*
51
- * Reinitializes parser for parsing 'input', instead of previous input.
65
+ * Reinitializes parser to parse 'input', instead of previous input.
52
66
*/
53
67
void
54
68
reset_tsvector_parser(TSVectorParseState state, char *input)
@@ -66,21 +80,21 @@ close_tsvector_parser(TSVectorParseState state)
66
80
pfree(state);
67
81
}
68
82
83
+ /*
* Increase the size of 'word' if needed to hold one more character.
*
* Expands in a scope that has 'state' (the parser state) and 'curpos' (the
* current write position inside state->word). When the bytes already used
* plus state->eml would reach state->len, the buffer size is doubled and
* 'curpos' is re-derived from its saved offset, because state->word has
* been reassigned by repalloc.
*/
69
84
#define RESIZEPRSBUF \
70
85
do { \
71
- if ( curpos - state->word + pg_database_encoding_max_length() >= state->len ) \
86
+ int clen = curpos - state->word; \
87
+ if ( clen + state->eml >= state->len ) \
72
88
{ \
73
- int clen = curpos - state->word; \
74
89
state->len *= 2; \
75
- state->word = (char*) repalloc( (void*) state->word, state->len ); \
90
+ state->word = (char *) repalloc(state->word, state->len); \
76
91
curpos = state->word + clen; \
77
92
} \
78
93
} while (0)
79
94
80
-
81
95
/* True if x points at a character whose pg_mblen() is 1 and which is one of ! & | ( ) */
#define ISOPERATOR(x) ( pg_mblen(x)==1 && ( *(x)=='!' || *(x)=='&' || *(x)=='|' || *(x)=='(' || *(x)==')' ) )
82
96
83
- /* Fills the output parameters, and returns true */
97
+ /* Fills gettoken_tsvector's output parameters, and returns true */
84
98
#define RETURN_TOKEN \
85
99
do { \
86
100
if (pos_ptr != NULL) \
@@ -111,18 +125,34 @@ do { \
111
125
#define WAITPOSDELIM 7
112
126
#define WAITCHARCMPLX 8
113
127
128
/*
* Report a syntax error with ereport(ERROR, ...) and ERRCODE_SYNTAX_ERROR.
* The message quotes the whole input string (state->bufstart) and says
* "tsquery" or "tsvector" depending on state->is_tsquery.
*
* PRSSYNTAXERROR is shorthand that passes the variable named 'state' from
* the scope where it is expanded.
*
* NOTE(review): two complete errmsg() strings are used instead of
* substituting the type name -- presumably so that each message remains a
* whole translatable string; confirm against the project's message style.
*/
+ #define PRSSYNTAXERROR prssyntaxerror(state)
129
+
130
+ static void
131
+ prssyntaxerror(TSVectorParseState state)
132
+ {
133
+ ereport(ERROR,
134
+ (errcode(ERRCODE_SYNTAX_ERROR),
135
+ state->is_tsquery ?
136
+ errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
137
+ errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
138
+ }
139
+
140
+
114
141
/*
115
- * Get next token from string being parsed. Returns false if
116
- * end of input string is reached, otherwise strval, lenval, pos_ptr
117
- * and poslen output parameters are filled in:
142
+ * Get next token from string being parsed. Returns true if successful,
143
+ * false if end of input string is reached. On success, these output
144
+ * parameters are filled in:
118
145
*
119
- * *strval token
120
- * *lenval length of*strval
146
+ * *strval pointer to token
147
+ * *lenval length of *strval
121
148
* *pos_ptr pointer to a palloc'd array of positions and weights
122
149
* associated with the token. If the caller is not interested
123
150
* in the information, NULL can be supplied. Otherwise
124
151
* the caller is responsible for pfreeing the array.
125
152
* *poslen number of elements in *pos_ptr
153
+ * *endptr scan resumption point
154
+ *
155
+ * Pass NULL for unwanted output parameters.
126
156
*/
127
157
bool
128
158
gettoken_tsvector(TSVectorParseState state,
@@ -155,9 +185,7 @@ gettoken_tsvector(TSVectorParseState state,
155
185
oldstate = WAITENDWORD;
156
186
}
157
187
else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
158
- ereport(ERROR,
159
- (errcode(ERRCODE_SYNTAX_ERROR),
160
- errmsg("syntax error in tsvector")));
188
+ PRSSYNTAXERROR;
161
189
else if (!t_isspace(state->prsbuf))
162
190
{
163
191
COPYCHAR(curpos, state->prsbuf);
@@ -170,7 +198,8 @@ gettoken_tsvector(TSVectorParseState state,
170
198
if (*(state->prsbuf) == '\0')
171
199
ereport(ERROR,
172
200
(errcode(ERRCODE_SYNTAX_ERROR),
173
- errmsg("there is no escaped character")));
201
+ errmsg("there is no escaped character: \"%s\"",
202
+ state->bufstart)));
174
203
else
175
204
{
176
205
RESIZEPRSBUF;
@@ -192,18 +221,14 @@ gettoken_tsvector(TSVectorParseState state,
192
221
{
193
222
RESIZEPRSBUF;
194
223
if (curpos == state->word)
195
- ereport(ERROR,
196
- (errcode(ERRCODE_SYNTAX_ERROR),
197
- errmsg("syntax error in tsvector")));
224
+ PRSSYNTAXERROR;
198
225
*(curpos) = '\0';
199
226
RETURN_TOKEN;
200
227
}
201
228
else if (t_iseq(state->prsbuf, ':'))
202
229
{
203
230
if (curpos == state->word)
204
- ereport(ERROR,
205
- (errcode(ERRCODE_SYNTAX_ERROR),
206
- errmsg("syntax error in tsvector")));
231
+ PRSSYNTAXERROR;
207
232
*(curpos) = '\0';
208
233
if (state->oprisdelim)
209
234
RETURN_TOKEN;
@@ -229,9 +254,7 @@ gettoken_tsvector(TSVectorParseState state,
229
254
oldstate = WAITENDCMPLX;
230
255
}
231
256
else if (*(state->prsbuf) == '\0')
232
- ereport(ERROR,
233
- (errcode(ERRCODE_SYNTAX_ERROR),
234
- errmsg("syntax error in tsvector")));
257
+ PRSSYNTAXERROR;
235
258
else
236
259
{
237
260
RESIZEPRSBUF;
@@ -253,9 +276,7 @@ gettoken_tsvector(TSVectorParseState state,
253
276
RESIZEPRSBUF;
254
277
*(curpos) = '\0';
255
278
if (curpos == state->word)
256
- ereport(ERROR,
257
- (errcode(ERRCODE_SYNTAX_ERROR),
258
- errmsg("syntax error in tsvector")));
279
+ PRSSYNTAXERROR;
259
280
if (state->oprisdelim)
260
281
{
261
282
/* state->prsbuf+=pg_mblen(state->prsbuf); */
@@ -290,17 +311,17 @@ gettoken_tsvector(TSVectorParseState state,
290
311
}
291
312
npos++;
292
313
WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
314
+ /* we cannot get here in tsquery, so no need for 2 errmsgs */
293
315
if (WEP_GETPOS(pos[npos - 1]) == 0)
294
316
ereport(ERROR,
295
317
(errcode(ERRCODE_SYNTAX_ERROR),
296
- errmsg("wrong position info in tsvector")));
318
+ errmsg("wrong position info in tsvector: \"%s\"",
319
+ state->bufstart)));
297
320
WEP_SETWEIGHT(pos[npos - 1], 0);
298
321
statecode = WAITPOSDELIM;
299
322
}
300
323
else
301
- ereport(ERROR,
302
- (errcode(ERRCODE_SYNTAX_ERROR),
303
- errmsg("syntax error in tsvector")));
324
+ PRSSYNTAXERROR;
304
325
}
305
326
else if (statecode == WAITPOSDELIM)
306
327
{
@@ -309,42 +330,32 @@ gettoken_tsvector(TSVectorParseState state,
309
330
else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
310
331
{
311
332
if (WEP_GETWEIGHT(pos[npos - 1]))
312
- ereport(ERROR,
313
- (errcode(ERRCODE_SYNTAX_ERROR),
314
- errmsg("syntax error in tsvector")));
333
+ PRSSYNTAXERROR;
315
334
WEP_SETWEIGHT(pos[npos - 1], 3);
316
335
}
317
336
else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
318
337
{
319
338
if (WEP_GETWEIGHT(pos[npos - 1]))
320
- ereport(ERROR,
321
- (errcode(ERRCODE_SYNTAX_ERROR),
322
- errmsg("syntax error in tsvector")));
339
+ PRSSYNTAXERROR;
323
340
WEP_SETWEIGHT(pos[npos - 1], 2);
324
341
}
325
342
else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
326
343
{
327
344
if (WEP_GETWEIGHT(pos[npos - 1]))
328
- ereport(ERROR,
329
- (errcode(ERRCODE_SYNTAX_ERROR),
330
- errmsg("syntax error in tsvector")));
345
+ PRSSYNTAXERROR;
331
346
WEP_SETWEIGHT(pos[npos - 1], 1);
332
347
}
333
348
else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
334
349
{
335
350
if (WEP_GETWEIGHT(pos[npos - 1]))
336
- ereport(ERROR,
337
- (errcode(ERRCODE_SYNTAX_ERROR),
338
- errmsg("syntax error in tsvector")));
351
+ PRSSYNTAXERROR;
339
352
WEP_SETWEIGHT(pos[npos - 1], 0);
340
353
}
341
354
else if (t_isspace(state->prsbuf) ||
342
355
*(state->prsbuf) == '\0')
343
356
RETURN_TOKEN;
344
357
else if (!t_isdigit(state->prsbuf))
345
- ereport(ERROR,
346
- (errcode(ERRCODE_SYNTAX_ERROR),
347
- errmsg("syntax error in tsvector")));
358
+ PRSSYNTAXERROR;
348
359
}
349
360
else /* internal error */
350
361
elog(ERROR, "internal error in gettoken_tsvector");
0 commit comments