Fix assorted bugs in contrib/unaccent's configuration file parsing. · jaylevitt/postgres@27864da · GitHub
[go: up one dir, main page]

Skip to content

Commit 27864da

Browse files
committed
Fix assorted bugs in contrib/unaccent's configuration file parsing.
Make it use t_isspace() to identify whitespace, rather than relying on sscanf which is known to get it wrong on some platform/locale combinations. Get rid of fixed-size buffers. Make it actually continue to parse the file after ignoring a line with untranslatable characters, as was obviously intended. The first of these issues is per gripe from J Smith, though not exactly either of his proposed patches.
1 parent 8bfc2b5 commit 27864da

File tree

1 file changed

+67
-19
lines changed

1 file changed

+67
-19
lines changed

contrib/unaccent/unaccent.c

Lines changed: 67 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -93,35 +93,83 @@ initSuffixTree(char *filename)
9393

9494
do
9595
{
96-
char src[4096];
97-
char trg[4096];
98-
int srclen;
99-
int trglen;
100-
char *line = NULL;
101-
96+
/*
97+
* pg_do_encoding_conversion() (called by tsearch_readline()) will
98+
* emit exception if it finds untranslatable characters in current
99+
* locale. We just skip such lines, continuing with the next.
100+
*/
102101
skip = true;
103102

104103
PG_TRY();
105104
{
106-
/*
107-
* pg_do_encoding_conversion() (called by tsearch_readline()) will
108-
* emit exception if it finds untranslatable characters in current
109-
* locale. We just skip such characters.
110-
*/
105+
char *line;
106+
111107
while ((line = tsearch_readline(&trst)) != NULL)
112108
{
113-
if (sscanf(line, "%s\t%s\n", src, trg) != 2)
114-
continue;
109+
/*
110+
* The format of each line must be "src trg" where src and trg
111+
* are sequences of one or more non-whitespace characters,
112+
* separated by whitespace. Whitespace at start or end of
113+
* line is ignored.
114+
*/
115+
int state;
116+
char *ptr;
117+
char *src = NULL;
118+
char *trg = NULL;
119+
int ptrlen;
120+
int srclen = 0;
121+
int trglen = 0;
122+
123+
state = 0;
124+
for (ptr = line; *ptr; ptr += ptrlen)
125+
{
126+
ptrlen = pg_mblen(ptr);
127+
/* ignore whitespace, but end src or trg */
128+
if (t_isspace(ptr))
129+
{
130+
if (state == 1)
131+
state = 2;
132+
else if (state == 3)
133+
state = 4;
134+
continue;
135+
}
136+
switch (state)
137+
{
138+
case 0:
139+
/* start of src */
140+
src = ptr;
141+
srclen = ptrlen;
142+
state = 1;
143+
break;
144+
case 1:
145+
/* continue src */
146+
srclen += ptrlen;
147+
break;
148+
case 2:
149+
/* start of trg */
150+
trg = ptr;
151+
trglen = ptrlen;
152+
state = 3;
153+
break;
154+
case 3:
155+
/* continue trg */
156+
trglen += ptrlen;
157+
break;
158+
default:
159+
/* bogus line format */
160+
state = -1;
161+
break;
162+
}
163+
}
115164

116-
srclen = strlen(src);
117-
trglen = strlen(trg);
165+
if (state >= 3)
166+
rootSuffixTree = placeChar(rootSuffixTree,
167+
(unsigned char *) src, srclen,
168+
trg, trglen);
118169

119-
rootSuffixTree = placeChar(rootSuffixTree,
120-
(unsigned char *) src, srclen,
121-
trg, trglen);
122-
skip = false;
123170
pfree(line);
124171
}
172+
skip = false;
125173
}
126174
PG_CATCH();
127175
{

0 commit comments

Comments
 (0)
0