String#gsub! Elide MatchData allocation when we know it can't escape · ruby/ruby@1895aaa · GitHub
[go: up one dir, main page]

Skip to content

Commit 1895aaa

Browse files
byroot and etiennebarrie committed
String#gsub! Elide MatchData allocation when we know it can't escape
If gsub is used with a string replacement or a map that doesn't have a default proc, we know for sure that no code can cause the MatchData to escape the `gsub` call.

In such cases, we still have to allocate a new MatchData because we don't know the lifetime of the backref, but for any subsequent match we can re-use the MatchData we allocated ourselves, reducing allocations significantly.

This partially fixes [Misc #20652], except when a block is used, and partially reduces the performance impact of abc0304 / [Bug #17507].

```
compare-ruby: ruby 3.5.0dev (2025-02-24T09:44:57Z master 5cf1463) +PRISM [arm64-darwin24]
built-ruby: ruby 3.5.0dev (2025-02-24T10:58:27Z gsub-elude-match da966636e9) +PRISM [arm64-darwin24]
warming up....

|                 |compare-ruby|built-ruby|
|:----------------|-----------:|---------:|
|escape           |      3.577k|    3.697k|
|                 |           -|     1.03x|
|escape_bin       |      5.869k|    6.743k|
|                 |           -|     1.15x|
|escape_utf8      |      3.448k|    3.738k|
|                 |           -|     1.08x|
|escape_utf8_bin  |      6.361k|    7.267k|
|                 |           -|     1.14x|
```

Co-Authored-By: Étienne Barrié <etienne.barrie@gmail.com>
1 parent 5cf1463 commit 1895aaa

File tree

3 files changed

+44
-20
lines changed

3 files changed

+44
-20
lines changed

internal/re.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,12 @@
1414
/* re.c */
1515
VALUE rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline);
1616
VALUE rb_reg_check_preprocess(VALUE);
17-
long rb_reg_search0(VALUE, VALUE, long, int, int);
17+
long rb_reg_search0(VALUE, VALUE, long, int, int, VALUE *);
1818
VALUE rb_reg_match_p(VALUE re, VALUE str, long pos);
1919
bool rb_reg_start_with_p(VALUE re, VALUE str);
2020
VALUE rb_reg_hash(VALUE re);
2121
VALUE rb_reg_equal(VALUE re1, VALUE re2);
22-
void rb_backref_set_string(VALUE string, long pos, long len);
22+
VALUE rb_backref_set_string(VALUE string, long pos, long len);
2323
void rb_match_unbusy(VALUE);
2424
int rb_match_count(VALUE match);
2525
VALUE rb_reg_new_ary(VALUE ary, int options);

re.c

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1521,7 +1521,7 @@ match_set_string(VALUE m, VALUE string, long pos, long len)
15211521
rmatch->regs.end[0] = pos + len;
15221522
}
15231523

1524-
void
1524+
VALUE
15251525
rb_backref_set_string(VALUE string, long pos, long len)
15261526
{
15271527
VALUE match = rb_backref_get();
@@ -1530,6 +1530,7 @@ rb_backref_set_string(VALUE string, long pos, long len)
15301530
}
15311531
match_set_string(match, string, pos, len);
15321532
rb_backref_set(match);
1533+
return match;
15331534
}
15341535

15351536
/*
@@ -1812,12 +1813,20 @@ rb_reg_search_set_match(VALUE re, VALUE str, long pos, int reverse, int set_back
18121813
return ONIG_MISMATCH;
18131814
}
18141815

1815-
VALUE match = match_alloc(rb_cMatch);
1816+
VALUE match;
1817+
if (set_match && RTEST(*set_match)) {
1818+
match = *set_match;
1819+
}
1820+
else {
1821+
match = match_alloc(rb_cMatch);
1822+
}
1823+
18161824
rb_matchext_t *rm = RMATCH_EXT(match);
18171825
rm->regs = regs;
18181826

18191827
if (set_backref_str) {
18201828
RB_OBJ_WRITE(match, &RMATCH(match)->str, rb_str_new4(str));
1829+
rb_obj_reveal(match, rb_cMatch);
18211830
}
18221831
else {
18231832
/* Note that a MatchData object with RMATCH(match)->str == 0 is incomplete!
@@ -1835,15 +1844,15 @@ rb_reg_search_set_match(VALUE re, VALUE str, long pos, int reverse, int set_back
18351844
}
18361845

18371846
long
1838-
rb_reg_search0(VALUE re, VALUE str, long pos, int reverse, int set_backref_str)
1847+
rb_reg_search0(VALUE re, VALUE str, long pos, int reverse, int set_backref_str, VALUE *match)
18391848
{
1840-
return rb_reg_search_set_match(re, str, pos, reverse, set_backref_str, NULL);
1849+
return rb_reg_search_set_match(re, str, pos, reverse, set_backref_str, match);
18411850
}
18421851

18431852
long
18441853
rb_reg_search(VALUE re, VALUE str, long pos, int reverse)
18451854
{
1846-
return rb_reg_search0(re, str, pos, reverse, 1);
1855+
return rb_reg_search_set_match(re, str, pos, reverse, 1, NULL);
18471856
}
18481857

18491858
static OnigPosition

string.c

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6122,14 +6122,17 @@ get_pat_quoted(VALUE pat, int check)
61226122
}
61236123

61246124
static long
6125-
rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6125+
rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
61266126
{
61276127
if (BUILTIN_TYPE(pat) == T_STRING) {
61286128
pos = rb_str_byteindex(str, pat, pos);
61296129
if (set_backref_str) {
61306130
if (pos >= 0) {
61316131
str = rb_str_new_frozen_String(str);
6132-
rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6132+
VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
6133+
if (match) {
6134+
*match = match_data;
6135+
}
61336136
}
61346137
else {
61356138
rb_backref_set(Qnil);
@@ -6138,10 +6141,16 @@ rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
61386141
return pos;
61396142
}
61406143
else {
6141-
return rb_reg_search0(pat, str, pos, 0, set_backref_str);
6144+
return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
61426145
}
61436146
}
61446147

6148+
static long
6149+
rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
6150+
{
6151+
return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
6152+
}
6153+
61456154

61466155
/*
61476156
* call-seq:
@@ -6293,12 +6302,12 @@ rb_str_sub(int argc, VALUE *argv, VALUE str)
62936302
static VALUE
62946303
str_gsub(int argc, VALUE *argv, VALUE str, int bang)
62956304
{
6296-
VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil;
6305+
VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
62976306
long beg, beg0, end0;
62986307
long offset, blen, slen, len, last;
62996308
enum {STR, ITER, FAST_MAP, MAP} mode = STR;
63006309
char *sp, *cp;
6301-
int need_backref = -1;
6310+
int need_backref_str = -1;
63026311
rb_encoding *str_enc;
63036312

63046313
switch (argc) {
@@ -6324,7 +6333,8 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
63246333
}
63256334

63266335
pat = get_pat_quoted(argv[0], 1);
6327-
beg = rb_pat_search(pat, str, 0, need_backref);
6336+
beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
6337+
63286338
if (beg < 0) {
63296339
if (bang) return Qnil; /* no match, no substitution */
63306340
return str_duplicate(rb_cString, str);
@@ -6341,7 +6351,6 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
63416351
ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
63426352

63436353
do {
6344-
VALUE match = rb_backref_get();
63456354
struct re_registers *regs = RMATCH_REGS(match);
63466355
if (RB_TYPE_P(pat, T_STRING)) {
63476356
beg0 = beg;
@@ -6354,7 +6363,7 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
63546363
if (mode == ITER) match0 = rb_reg_nth_match(0, match);
63556364
}
63566365

6357-
if (mode) {
6366+
if (mode != STR) {
63586367
if (mode == ITER) {
63596368
val = rb_obj_as_string(rb_yield(match0));
63606369
}
@@ -6378,10 +6387,10 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
63786387
rb_raise(rb_eRuntimeError, "block should not cheat");
63796388
}
63806389
}
6381-
else if (need_backref) {
6390+
else if (need_backref_str) {
63826391
val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
6383-
if (need_backref < 0) {
6384-
need_backref = val != repl;
6392+
if (need_backref_str < 0) {
6393+
need_backref_str = val != repl;
63856394
}
63866395
}
63876396
else {
@@ -6409,14 +6418,20 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
64096418
}
64106419
cp = RSTRING_PTR(str) + offset;
64116420
if (offset > RSTRING_LEN(str)) break;
6412-
beg = rb_pat_search(pat, str, offset, need_backref);
6421+
6422+
// In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
6423+
if (mode != FAST_MAP && mode != STR) {
6424+
match = Qnil;
6425+
}
6426+
beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
64136427

64146428
RB_GC_GUARD(match);
64156429
} while (beg >= 0);
6430+
64166431
if (RSTRING_LEN(str) > offset) {
64176432
rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
64186433
}
6419-
rb_pat_search(pat, str, last, 1);
6434+
rb_pat_search0(pat, str, last, 1, &match);
64206435
if (bang) {
64216436
str_shared_replace(str, dest);
64226437
}

0 commit comments

Comments
 (0)
0