tested a bit; utf-8 bugs removed · jruby/ruby@7ce7d91 · GitHub
[go: up one dir, main page]

Skip to content

Commit 7ce7d91

Browse files
author
matz
committed
tested a bit; utf-8 bugs removed
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/branches/ruby_m17n@1229 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
1 parent 841e67c commit 7ce7d91

File tree

4 files changed

+104
-70
lines changed

4 files changed

+104
-70
lines changed

intern.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -303,6 +303,7 @@ struct m17n_encoding *ruby_default_encoding;
303303
struct m17n_encoding *rb_m17n_get_encoding _((VALUE obj));
304304
void rb_m17n_associate_encoding _((VALUE obj, struct m17n_encoding* enc));
305305
void rb_m17n_enc_check _((VALUE str1, VALUE str2, struct m17n_encoding **encp));
306+
int rb_str_sublen _((VALUE, int));
306307
VALUE rb_enc_get_encoding _((VALUE obj));
307308
VALUE rb_enc_set_encoding _((VALUE obj, VALUE encoding));
308309
VALUE rb_str_new _((const char*, long));

m17n.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -579,10 +579,10 @@ utf8_codepoint(p, e, enc)
579579

580580
if (p == e) return 0;
581581
if (*p < 0x80) return *p;
582-
c = *p++;
582+
c = *p;
583583
n = utf8_mbctab[c];
584584
if (e-p < n) return 0;
585-
n--;
585+
p++; n--;
586586
c &= (1<<(6-n))-1;
587587
while (n--) {
588588
c = c << 6 | (*p++ & ((1<<6)-1));

re.c

Lines changed: 4 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -34,20 +34,8 @@ enc_nth(str, nth)
3434
VALUE str;
3535
int nth;
3636
{
37-
m17n_encoding *enc = rb_m17n_get_encoding(str);
38-
if (m17n_mbmaxlen(enc) == 1) return INT2NUM(nth);
39-
else {
40-
char *p = RSTRING(str)->ptr;
41-
char *e = p + nth;
42-
int i;
43-
44-
i = 0;
45-
while (p < e) {
46-
p += mbclen(*p);
47-
i++;
48-
}
49-
return INT2NUM(i);
50-
}
37+
int i = rb_str_sublen(str, nth);
38+
return INT2NUM(i);
5139
}
5240

5341
extern int ruby_in_compile;
@@ -133,18 +121,17 @@ rb_reg_desc(s, len, re)
133121
m17n_encoding *enc;
134122
VALUE str = rb_str_new2("/");
135123

136-
137124
enc = re ? rb_m17n_get_encoding(re) : ruby_default_encoding;
138125
rb_m17n_associate_encoding(str, enc);
139126
rb_reg_expr_str(str, s, len);
140127
rb_str_cat2(str, "/");
141128
if (re) {
142129
rb_reg_check(re);
143-
if (RREGEXP(re)->ptr->options & RE_OPTION_MULTILINE)
144-
rb_str_cat2(str, "m");
145130
/* /p is obsolete; to be removed */
146131
if ((RREGEXP(re)->ptr->options & RE_OPTION_POSIXLINE) == RE_OPTION_POSIXLINE)
147132
rb_str_cat2(str, "p");
133+
else if (RREGEXP(re)->ptr->options & RE_OPTION_MULTILINE)
134+
rb_str_cat2(str, "m");
148135
if (RREGEXP(re)->ptr->options & RE_OPTION_IGNORECASE)
149136
rb_str_cat2(str, "i");
150137
if (RREGEXP(re)->ptr->options & RE_OPTION_EXTENDED)
@@ -1123,7 +1110,6 @@ Init_Regexp()
11231110
rb_cMatch = rb_define_class("MatchData", rb_cObject);
11241111
rb_define_global_const("MatchingData", rb_cMatch);
11251112
rb_undef_method(CLASS_OF(rb_cMatch), "new");
1126-
11271113
rb_define_method(rb_cMatch, "clone", match_clone, 0);
11281114
rb_define_method(rb_cMatch, "size", match_size, 0);
11291115
rb_define_method(rb_cMatch, "length", match_size, 0);

string.c

Lines changed: 97 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -140,11 +140,11 @@ rb_enc_set_encoding(obj, encoding)
140140
return encoding;
141141
}
142142

143-
static void
144-
rb_str_copy_encoding(str1, str2)
145-
VALUE str1, str2;
143+
void
144+
rb_m17n_copy_encoding(obj1, obj2)
145+
VALUE obj1, obj2;
146146
{
147-
rb_m17n_associate_encoding(str1, rb_m17n_get_encoding(str2));
147+
rb_m17n_associate_encoding(obj1, rb_m17n_get_encoding(obj2));
148148
}
149149

150150
#define str_ptr(str) RSTRING(str)->ptr
@@ -280,7 +280,7 @@ rb_str_become(str, str2)
280280
RSTRING(str)->orig = RSTRING(str2)->orig;
281281
RSTRING(str2)->ptr = 0; /* abandon str2 */
282282
RSTRING(str2)->len = 0;
283-
rb_str_copy_encoding(str, str2);
283+
rb_m17n_copy_encoding(str, str2);
284284
if (OBJ_TAINTED(str2)) OBJ_TAINT(str);
285285
}
286286

@@ -512,6 +512,35 @@ str_nth(enc, p, e, idx)
512512
return p;
513513
}
514514

515+
static int
516+
str_sublen(str, len, enc)
517+
VALUE str;
518+
int len;
519+
m17n_encoding *enc;
520+
{
521+
if (m17n_mbmaxlen(enc) == 1) return len;
522+
else {
523+
char *p = str_ptr(str);
524+
char *e = p + len;
525+
int i;
526+
527+
i = 0;
528+
while (p < e) {
529+
p += mbclen(*p);
530+
i++;
531+
}
532+
return i;
533+
}
534+
}
535+
536+
int
537+
rb_str_sublen(str, len)
538+
VALUE str;
539+
int len;
540+
{
541+
return str_sublen(str, len, rb_m17n_get_encoding(str));
542+
}
543+
515544
VALUE
516545
rb_str_substr(str, beg, len)
517546
VALUE str;
@@ -798,6 +827,7 @@ rb_str_match(x, y)
798827
if (start == -1) {
799828
return Qnil;
800829
}
830+
start = rb_str_sublen(x, start);
801831
return INT2NUM(start);
802832

803833
default:
@@ -823,20 +853,22 @@ rb_str_index(str, sub, offset)
823853

824854
rb_m17n_enc_check(str, sub, &enc);
825855
if (offset < 0) {
826-
offset += str_len(str);
856+
offset += str_strlen(str, enc);
827857
if (offset < 0) return -1;
828858
}
829-
if (str_len(str) - offset < str_len(sub)) return -1;
830-
s = str_ptr(str)+offset;
859+
if (str_strlen(str, enc) - offset < str_strlen(sub, enc))
860+
return -1;
861+
s = str_nth(enc, str_ptr(str), str_end(str), offset);
831862
p = str_ptr(sub);
832863
len = str_len(sub);
833864
if (len == 0) return offset;
834865
e = str_ptr(str) + str_len(str) - len + 1;
835866
while (s < e) {
836867
if (str_memcmp(s, p, len, enc) == 0) {
837-
return (s-(str_ptr(str)));
868+
return offset;
838869
}
839-
s++;
870+
offset++;
871+
s += m17n_mbclen(enc, *s);
840872
}
841873
return -1;
842874
}
@@ -866,6 +898,7 @@ rb_str_index_m(argc, argv, str)
866898
case T_REGEXP:
867899
pos = rb_reg_adjust_startpos(sub, str, pos, 0);
868900
pos = rb_reg_search(sub, str, pos, 0);
901+
pos = rb_str_sublen(str, pos);
869902
break;
870903

871904
case T_STRING:
@@ -901,8 +934,8 @@ rb_str_rindex(argc, argv, str)
901934
{
902935
VALUE sub;
903936
VALUE position;
904-
int pos, len;
905-
char *s, *sbeg, *t;
937+
int pos, len, len2;
938+
char *s, *sbeg, *e, *t;
906939
m17n_encoding *enc;
907940

908941
if (rb_scan_args(argc, argv, "11", &sub, &position) == 2) {
@@ -923,31 +956,32 @@ rb_str_rindex(argc, argv, str)
923956
if (RREGEXP(sub)->len) {
924957
pos = rb_reg_adjust_startpos(sub, str, pos, 1);
925958
pos = rb_reg_search(sub, str, pos, 1);
959+
pos = str_sublen(str, pos, enc);
926960
}
927961
if (pos >= 0) return INT2NUM(pos);
928962
break;
929963

930964
case T_STRING:
931965
rb_m17n_enc_check(str, sub, &enc);
932-
len = str_len(sub);
933-
if (str_len(str) < len) return Qnil;
934-
if (str_len(str) - pos < len) {
935-
pos = str_len(str) - len;
966+
len = str_strlen(sub);
967+
len2 = str_strlen(str);
968+
if (len > len2) return Qnil;
969+
if (len2 - pos < len) {
970+
pos = len2 - len;
971+
}
972+
if (len == 0) {
973+
return INT2NUM(pos);
936974
}
937975
sbeg = str_ptr(str);
938-
s = str_ptr(str) + pos;
976+
e = str_end(str);
939977
t = str_ptr(sub);
940-
if (len) {
941-
while (sbeg <= s) {
942-
if (str_memcmp(s, t, len, enc) == 0) {
943-
return INT2NUM(s - str_ptr(str));
944-
}
945-
s--;
978+
do {
979+
s = str_nth(enc, sbeg, e, pos);
980+
if (str_memcmp(s, t, len, enc) == 0) {
981+
return INT2NUM(pos);
946982
}
947-
}
948-
else {
949-
return INT2NUM(pos);
950-
}
983+
pos--;
984+
} while (sbeg <= s);
951985
break;
952986

953987
case T_FIXNUM:
@@ -1002,7 +1036,7 @@ rb_str_succ(orig)
10021036
{
10031037
m17n_encoding *enc = rb_m17n_get_encoding(orig);
10041038
VALUE str;
1005-
char *sbeg, *s;
1039+
char *sbeg, *s, *e;
10061040
int c = -1;
10071041
int n = 0;
10081042

@@ -1011,9 +1045,11 @@ rb_str_succ(orig)
10111045
if (str_len(str) == 0) return str;
10121046

10131047
sbeg = str_ptr(str); s = sbeg + str_len(str) - 1;
1048+
e = str_end(str);
10141049

10151050
while (sbeg <= s) {
1016-
if (m17n_isalnum(enc, *s)) {
1051+
int c = m17n_codepoint(enc, s, e);
1052+
if (m17n_isalnum(enc, c)) {
10171053
if ((c = succ_char(s)) == 0) break;
10181054
n = s - sbeg;
10191055
}
@@ -1278,25 +1314,30 @@ rb_str_aset_m(argc, argv, str)
12781314
{
12791315
rb_str_modify(str);
12801316
if (argc == 3) {
1281-
long beg, len;
1317+
m17n_encoding *enc;
1318+
long beg, len, slen, b, l;
12821319

12831320
if (TYPE(argv[2]) != T_STRING) argv[2] = rb_str_to_str(argv[2]);
12841321
beg = NUM2INT(argv[0]);
12851322
len = NUM2INT(argv[1]);
12861323
if (len < 0) rb_raise(rb_eIndexError, "negative length %d", len);
1324+
slen = str_strlen(str);
12871325
if (beg < 0) {
1288-
beg += str_len(str);
1326+
beg += slen;
12891327
}
1290-
if (beg < 0 || str_len(str) < beg) {
1328+
if (beg < 0 || slen < beg) {
12911329
if (beg < 0) {
1292-
beg -= str_len(str);
1330+
beg -= slen;
12931331
}
12941332
rb_raise(rb_eIndexError, "index %d out of string", beg);
12951333
}
1296-
if (beg + len > str_len(str)) {
1297-
len = str_len(str) - beg;
1334+
if (beg + len > slen) {
1335+
len = slen - beg;
12981336
}
1299-
rb_str_replace(str, beg, len, argv[2]);
1337+
rb_m17n_enc_check(str, argv[2], &enc);
1338+
b = str_nth(enc, str_ptr(str), str_end(str), beg) - str_ptr(str);
1339+
l = str_nth(enc, str_ptr(str), str_end(str), beg+len) - str_ptr(str) - b;
1340+
rb_str_replace(str, b, l, argv[2]);
13001341
return argv[2];
13011342
}
13021343
if (argc != 2) {
@@ -1570,7 +1611,7 @@ rb_str_replace_m(str, str2)
15701611
memcpy(str_ptr(str), str_ptr(str2), str_len(str2));
15711612
}
15721613

1573-
rb_str_copy_encoding(str, str2);
1614+
rb_m17n_copy_encoding(str, str2);
15741615
if (OBJ_TAINTED(str2)) OBJ_TAINT(str);
15751616
return str;
15761617
}
@@ -1660,7 +1701,7 @@ rb_str_reverse(str)
16601701
s += clen;
16611702
}
16621703
}
1663-
rb_str_copy_encoding(obj, str);
1704+
rb_m17n_copy_encoding(obj, str);
16641705

16651706
return obj;
16661707
}
@@ -2359,10 +2400,11 @@ rb_str_squeeze_bang(argc, argv, str)
23592400
int clen = m17n_codelen(enc, c);
23602401
VALUE v = INT2NUM(c);
23612402

2362-
if (c != save && del && !NIL_P(rb_hash_aref(del, v)) &&
2363-
(!nodel || NIL_P(rb_hash_aref(nodel, v)))) {
2364-
save = c;
2403+
if (c != save &&
2404+
((del && NIL_P(rb_hash_aref(del, v))) ||
2405+
(!nodel || NIL_P(rb_hash_aref(nodel, v))))) {
23652406
if (t != s) m17n_mbcput(enc, c, t);
2407+
save = c;
23662408
t += clen;
23672409
}
23682410
s += clen;
@@ -2476,8 +2518,9 @@ rb_str_split_m(argc, argv, str)
24762518
fs_set:
24772519
switch (TYPE(spat)) {
24782520
case T_STRING:
2479-
if (str_len(spat) == 1) {
2480-
char_sep = (unsigned char)str_ptr(spat)[0];
2521+
if (str_strlen(spat, enc) == 1) {
2522+
rb_m17n_enc_check(str, spat, &enc);
2523+
char_sep = m17n_codepoint(enc, str_ptr(spat), str_end(spat));
24812524
}
24822525
else {
24832526
spat = rb_reg_regcomp(spat);
@@ -2547,12 +2590,13 @@ rb_str_split_m(argc, argv, str)
25472590
int last_null = 0;
25482591
struct re_registers *regs;
25492592

2593+
rb_m17n_enc_check(str, spat, &enc);
25502594
while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
25512595
regs = RMATCH(rb_backref_get())->regs;
25522596
if (start == end && BEG(0) == END(0)) {
25532597
if (last_null == 1) {
25542598
tmp = rb_str_new(str_ptr(str)+beg, mbclen(str_ptr(str)[beg]));
2555-
rb_str_copy_encoding(tmp, str);
2599+
rb_m17n_copy_encoding(tmp, str);
25562600
rb_ary_push(result, tmp);
25572601
beg = start;
25582602
}
@@ -2564,7 +2608,7 @@ rb_str_split_m(argc, argv, str)
25642608
}
25652609
else {
25662610
tmp = rb_str_new(str_ptr(str)+beg, end-beg);
2567-
rb_str_copy_encoding(tmp, str);
2611+
rb_m17n_copy_encoding(tmp, str);
25682612
rb_ary_push(result, tmp);
25692613
beg = start = END(0);
25702614
}
@@ -2576,18 +2620,21 @@ rb_str_split_m(argc, argv, str)
25762620
tmp = rb_str_new(0, 0);
25772621
else
25782622
tmp = rb_reg_nth_match(idx, rb_backref_get());
2579-
rb_str_copy_encoding(tmp, str);
2623+
rb_m17n_copy_encoding(tmp, str);
25802624
rb_ary_push(result, tmp);
25812625
}
25822626
if (!NIL_P(limit) && lim <= ++i) break;
25832627
}
2628+
beg = str_sublen(str, beg, enc);
25842629
}
25852630
if (!NIL_P(limit) || str_len(str) > beg || lim < 0) {
2586-
if (str_len(str) == beg)
2631+
if (str_strlen(str, enc) == beg)
25872632
tmp = rb_str_new(0, 0);
2588-
else
2589-
tmp = rb_str_new(str_ptr(str)+beg, str_len(str)-beg);
2590-
rb_str_copy_encoding(tmp, str);
2633+
else {
2634+
char *p = str_nth(enc, str_ptr(str), str_end(str), beg);
2635+
tmp = rb_str_new(p, str_end(str)-p);
2636+
}
2637+
rb_m17n_copy_encoding(tmp, str);
25912638
rb_ary_push(result, tmp);
25922639
}
25932640
if (NIL_P(limit) && lim == 0) {

0 commit comments

Comments
 (0)
0