8000 Fix conversion of SIMILAR TO regexes for character classes · postgres/postgres@d46911e · GitHub
[go: up one dir, main page]

Skip to content

Commit d46911e

Browse files
committed
Fix conversion of SIMILAR TO regexes for character classes
The code that translates SIMILAR TO pattern matching expressions to POSIX-style regular expressions did not consider that square brackets can be nested. For example, in an expression like [[:alpha:]%_], the logic replaced the placeholders '_' and '%' but it should not. This commit fixes the conversion logic by tracking the nesting level of square brackets marking character class areas, while considering that in expressions like []] or [^]] the first closing square bracket is a regular character. Multiple tests are added to show how the conversions should or should not apply applied while in a character class area, with specific cases added for all the characters converted outside character classes like an opening parenthesis '(', dollar sign '$', etc. Author: Laurenz Albe <laurenz.albe@cybertec.at> Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Reviewed-by: Michael Paquier <michael@paquier.xyz> Discussion: https://postgr.es/m/16ab039d1af455652bdf4173402ddda145f2c73b.camel@cybertec.at Backpatch-through: 13
1 parent 3e782ca commit d46911e

File tree

3 files changed

+114
-6
lines changed

3 files changed

+114
-6
lines changed

src/backend/utils/adt/regexp.c

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -773,8 +773,11 @@ similar_escape_internal(text *pat_text, text *esc_text)
773773
int plen,
774774
elen;
775775
bool afterescape = false;
776-
bool incharclass = false;
777776
int nquotes = 0;
777+
int charclass_depth = 0; /* Nesting level of character classes,
778+
* encompassed by square brackets */
779+
int charclass_start = 0; /* State of the character class start,
780< 8000 /code>+
* for carets */
778781

779782
p = VARDATA_ANY(pat_text);
780783
plen = VARSIZE_ANY_EXHDR(pat_text);
@@ -904,7 +907,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
904907
/* fast path */
905908
if (afterescape)
906909
{
907-
if (pchar == '"' && !incharclass) /* escape-double-quote? */
910+
if (pchar == '"' && charclass_depth < 1) /* escape-double-quote? */
908911
{
909912
/* emit appropriate part separator, per notes above */
910913
if (nquotes == 0)
@@ -953,18 +956,41 @@ similar_escape_internal(text *pat_text, text *esc_text)
953956
/* SQL escape character; do not send to output */
954957
afterescape = true;
955958
}
956-
else if (incharclass)
959+
else if (charclass_depth > 0)
957960
{
958961
if (pchar == '\\')
959962
*r++ = '\\';
960963
*r++ = pchar;
961-
if (pchar == ']')
962-
incharclass = false;
964+
965+
/*
966+
* Ignore a closing bracket at the start of a character class.
967+
* Such a bracket is taken literally rather than closing the
968+
* class. "charclass_start" is 1 right at the beginning of a
969+
* class and 2 after an initial caret.
970+
*/
971+
if (pchar == ']' && charclass_start > 2)
972+
charclass_depth--;
973+
else if (pchar == '[')
974+
charclass_depth++;
975+
976+
/*
977+
* If there is a caret right after the opening bracket, it negates
978+
* the character class, but a following closing bracket should
979+
* still be treated as a normal character. That holds only for
980+
* the first caret, so only the values 1 and 2 mean that closing
981+
* brackets should be taken literally.
982+
*/
983+
if (pchar == '^')
984+
charclass_start++;
985+
else
986+
charclass_start = 3; /* definitely past the start */
963987
}
964988
else if (pchar == '[')
965989
{
990+
/* start of a character class */
966991
*r++ = pchar;
967-
incharclass = true;
992+
charclass_depth++;
993+
charclass_start = 1;
968994
}
969995
else if (pchar == '%')
970996
{

src/test/regress/expected/strings.out

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -614,6 +614,68 @@ SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
614614
SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
615615
ERROR: invalid escape string
616616
HINT: Escape string must be empty or one character.
617+
-- Characters that should be left alone in character classes when a
618+
-- SIMILAR TO regexp pattern is converted to POSIX style.
619+
-- Underscore "_"
620+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_';
621+
QUERY PLAN
622+
------------------------------------------------
623+
Seq Scan on text_tbl
624+
Filter: (f1 ~ '^(?:.[_[:alpha:]_].)$'::text)
625+
(2 rows)
626+
627+
-- Percentage "%"
628+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%';
629+
QUERY PLAN
630+
--------------------------------------------------
631+
Seq Scan on text_tbl
632+
Filter: (f1 ~ '^(?:.*[%[:alnum:]%].*)$'::text)
633+
(2 rows)
634+
635+
-- Dot "."
636+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].';
637+
QUERY PLAN
638+
--------------------------------------------------
639+
Seq Scan on text_tbl
640+
Filter: (f1 ~ '^(?:\.[.[:alnum:].]\.)$'::text)
641+
(2 rows)
642+
643+
-- Dollar "$"
644+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$';
645+
QUERY PLAN
646+
--------------------------------------------------
647+
Seq Scan on text_tbl
648+
Filter: (f1 ~ '^(?:\$[$[:alnum:]$]\$)$'::text)
649+
(2 rows)
650+
651+
-- Opening parenthesis "("
652+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](](';
653+
ERROR: invalid regular expression: parentheses () not balanced
654+
-- Caret "^"
655+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^';
656+
QUERY PLAN
657+
------------------------------------------------------------------------
658+
Seq Scan on text_tbl
659+
Filter: (f1 ~ '^(?:\^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]\^)$'::text)
660+
(2 rows)
661+
662+
-- Closing square bracket "]" at the beginning of character class
663+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%';
664+
QUERY PLAN
665+
------------------------------------------------
666+
Seq Scan on text_tbl
667+
Filter: (f1 ~ '^(?:[]%][^]%][^%].*)$'::text)
668+
(2 rows)
669+
670+
-- Closing square bracket effective after two carets at the beginning
671+
-- of character class.
672+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
673+
QUERY PLAN
674+
---------------------------------------
675+
Seq Scan on text_tbl
676+
Filter: (f1 ~ '^(?:[^^]\^)$'::text)
677+
(2 rows)
678+
617679
-- Test backslash escapes in regexp_replace's replacement string
618680
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
619681
regexp_replace

src/test/regress/sql/strings.sql

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,26 @@ SELECT 'abcd\efg' SIMILAR TO '_bcd\%' ESCAPE '' AS true;
197197
SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
198198
SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
199199

200+
-- Characters that should be left alone in character classes when a
201+
-- SIMILAR TO regexp pattern is converted to POSIX style.
202+
-- Underscore "_"
203+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '_[_[:alpha:]_]_';
204+
-- Percentage "%"
205+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '%[%[:alnum:]%]%';
206+
-- Dot "."
207+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '.[.[:alnum:].].';
208+
-- Dollar "$"
209+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '$[$[:alnum:]$]$';
210+
-- Opening parenthesis "("
211+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '([([:alnum:](](';
212+
-- Caret "^"
213+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '^[^[:alnum:]^[^^][[^^]][\^][[\^]]\^]^';
214+
-- Closing square bracket "]" at the beginning of character class
215+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[]%][^]%][^%]%';
216+
-- Closing square bracket effective after two carets at the beginning
217+
-- of character class.
218+
EXPLAIN (COSTS OFF) SELECT * FROM TEXT_TBL WHERE f1 SIMILAR TO '[^^]^';
219+
200220
-- Test backslash escapes in regexp_replace's replacement string
201221
SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
202222
SELECT regexp_replace('foobarrbazz', E'(.)\\1', E'X\\&Y', 'g');

0 commit comments

Comments
 (0)
0