@@ -140,11 +140,11 @@ rb_enc_set_encoding(obj, encoding)
140
140
return encoding ;
141
141
}
142
142
143
- static void
144
- rb_str_copy_encoding ( str1 , str2 )
145
- VALUE str1 , str2 ;
143
+ void
144
+ rb_m17n_copy_encoding ( obj1 , obj2 )
145
+ VALUE obj1 , obj2 ;
146
146
{
147
- rb_m17n_associate_encoding (str1 , rb_m17n_get_encoding (str2 ));
147
+ rb_m17n_associate_encoding (obj1 , rb_m17n_get_encoding (obj2 ));
148
148
}
149
149
150
150
#define str_ptr (str ) RSTRING(str)->ptr
@@ -280,7 +280,7 @@ rb_str_become(str, str2)
280
280
RSTRING (str )-> orig = RSTRING (str2 )-> orig ;
281
281
RSTRING (str2 )-> ptr = 0 ; /* abandon str2 */
282
282
RSTRING (str2 )-> len = 0 ;
283
- rb_str_copy_encoding (str , str2 );
283
+ rb_m17n_copy_encoding (str , str2 );
284
284
if (OBJ_TAINTED (str2 )) OBJ_TAINT (str );
285
285
}
286
286
@@ -512,6 +512,35 @@ str_nth(enc, p, e, idx)
512
512
return p ;
513
513
}
514
514
515
+ static int
516
+ str_sublen (str , len , enc )
517
+ VALUE str ;
518
+ int len ;
519
+ m17n_encoding * enc ;
520
+ {
521
+ if (m17n_mbmaxlen (enc ) == 1 ) return len ;
522
+ else {
523
+ char * p = str_ptr (str );
524
+ char * e = p + len ;
525
+ int i ;
526
+
527
+ i = 0 ;
528
+ while (p < e ) {
529
+ p += mbclen (* p );
530
+ i ++ ;
531
+ }
532
+ return i ;
533
+ }
534
+ }
535
+
536
+ int
537
+ rb_str_sublen (str , len )
538
+ VALUE str ;
539
+ int len ;
540
+ {
541
+ return str_sublen (str , len , rb_m17n_get_encoding (str ));
542
+ }
543
+
515
544
VALUE
516
545
rb_str_substr (str , beg , len )
517
546
VALUE str ;
@@ -798,6 +827,7 @@ rb_str_match(x, y)
798
827
if (start == -1 ) {
799
828
return Qnil ;
800
829
}
830
+ start = rb_str_sublen (x , start );
801
831
return INT2NUM (start );
802
832
803
833
default :
@@ -823,20 +853,22 @@ rb_str_index(str, sub, offset)
823
853
824
854
rb_m17n_enc_check (str , sub , & enc );
825
855
if (offset < 0 ) {
826
- offset += str_len (str );
856
+ offset += str_strlen (str , enc );
827
857
if (offset < 0 ) return -1 ;
828
858
}
829
- if (str_len (str ) - offset < str_len (sub )) return -1 ;
830
- s = str_ptr (str )+ offset ;
859
+ if (str_strlen (str , enc ) - offset < str_strlen (sub , enc ))
860
+ return -1 ;
861
+ s = str_nth (enc , str_ptr (str ), str_end (str ), offset );
831
862
p = str_ptr (sub );
832
863
len = str_len (sub );
833
864
if (len == 0 ) return offset ;
834
865
e = str_ptr (str ) + str_len (str ) - len + 1 ;
835
866
while (s < e ) {
836
867
if (str_memcmp (s , p , len , enc ) == 0 ) {
837
- return ( s - ( str_ptr ( str ))) ;
868
+ return offset ;
838
869
}
839
- s ++ ;
870
+ offset ++ ;
871
+ s += m17n_mbclen (enc , * s );
840
872
}
841
873
return -1 ;
842
874
}
@@ -866,6 +898,7 @@ rb_str_index_m(argc, argv, str)
866
898
case T_REGEXP :
867
899
pos = rb_reg_adjust_startpos (sub , str , pos , 0 );
868
900
pos = rb_reg_search (sub , str , pos , 0 );
901
+ pos = rb_str_sublen (str , pos );
869
902
break ;
870
903
871
904
case T_STRING :
@@ -901,8 +934,8 @@ rb_str_rindex(argc, argv, str)
901
934
{
902
935
VALUE sub ;
903
936
VALUE position ;
904
- int pos , len ;
905
- char * s , * sbeg , * t ;
937
+ int pos , len , len2 ;
938
+ char * s , * sbeg , * e , * t ;
906
939
m17n_encoding * enc ;
907
940
908
941
if (rb_scan_args (argc , argv , "11" , & sub , & position ) == 2 ) {
@@ -923,31 +956,32 @@ rb_str_rindex(argc, argv, str)
923
956
if (RREGEXP (sub )-> len ) {
924
957
pos = rb_reg_adjust_startpos (sub , str , pos , 1 );
925
958
pos = rb_reg_search (sub , str , pos , 1 );
959
+ pos = str_sublen (str , pos , enc );
926
960
}
927
961
if (pos >= 0 ) return INT2NUM (pos );
928
962
break ;
929
963
930
964
case T_STRING :
931
965
rb_m17n_enc_check (str , sub , & enc );
932
- len = str_len (sub );
933
- if (str_len (str ) < len ) return Qnil ;
934
- if (str_len (str ) - pos < len ) {
935
- pos = str_len (str ) - len ;
966
+ len = str_strlen (sub );
967
+ len2 = str_strlen (str );
968
+ if (len > len2 ) return Qnil ;
969
+ if (len2 - pos < len ) {
970
+ pos = len2 - len ;
971
+ }
972
+ if (len == 0 ) {
973
+ return INT2NUM (pos );
936
974
}
937
975
sbeg = str_ptr (str );
938
- s = str_ptr (str ) + pos ;
976
+ e = str_end (str );
939
977
t = str_ptr (sub );
940
- if (len ) {
941
- while (sbeg <= s ) {
942
- if (str_memcmp (s , t , len , enc ) == 0 ) {
943
- return INT2NUM (s - str_ptr (str ));
944
- }
945
- s -- ;
978
+ do {
979
+ s = str_nth (enc , sbeg , e , pos );
980
+ if (str_memcmp (s , t , len , enc ) == 0 ) {
981
+ return INT2NUM (pos );
946
982
}
947
- }
948
- else {
949
- return INT2NUM (pos );
950
- }
983
+ pos -- ;
984
+ } while (sbeg <= s );
951
985
break ;
952
986
953
987
case T_FIXNUM :
@@ -1002,7 +1036,7 @@ rb_str_succ(orig)
1002
1036
{
1003
1037
m17n_encoding * enc = rb_m17n_get_encoding (orig );
1004
1038
VALUE str ;
1005
- char * sbeg , * s ;
1039
+ char * sbeg , * s , * e ;
1006
1040
int c = -1 ;
1007
1041
int n = 0 ;
1008
1042
@@ -1011,9 +1045,11 @@ rb_str_succ(orig)
1011
1045
if (str_len (str ) == 0 ) return str ;
1012
1046
1013
1047
sbeg = str_ptr (str ); s = sbeg + str_len (str ) - 1 ;
1048
+ e = str_end (str );
1014
1049
1015
1050
while (sbeg <= s ) {
1016
- if (m17n_isalnum (enc , * s )) {
1051
+ int c = m17n_codepoint (enc , s , e );
1052
+ if (m17n_isalnum (enc , c )) {
1017
1053
if ((c = succ_char (s )) == 0 ) break ;
1018
1054
n = s - sbeg ;
1019
1055
}
@@ -1278,25 +1314,30 @@ rb_str_aset_m(argc, argv, str)
1278
1314
{
1279
1315
rb_str_modify (str );
1280
1316
if (argc == 3 ) {
1281
- long beg , len ;
1317
+ m17n_encoding * enc ;
1318
+ long beg , len , slen , b , l ;
1282
1319
1283
1320
if (TYPE (argv [2 ]) != T_STRING ) argv [2 ] = rb_str_to_str (argv [2 ]);
1284
1321
beg = NUM2INT (argv [0 ]);
1285
1322
len = NUM2INT (argv [1 ]);
1286
1323
if (len < 0 ) rb_raise (rb_eIndexError , "negative length %d" , len );
1324
+ slen = str_strlen (str );
1287
1325
if (beg < 0 ) {
1288
- beg += str_len ( str ) ;
1326
+ beg += slen ;
1289
1327
}
1290
- if (beg < 0 || str_len ( str ) < beg ) {
1328
+ if (beg < 0 || slen < beg ) {
1291
1329
if (beg < 0 ) {
1292
- beg -= str_len ( str ) ;
1330
+ beg -= slen ;
1293
1331
}
1294
1332
rb_raise (rb_eIndexError , "index %d out of string" , beg );
1295
1333
}
1296
- if (beg + len > str_len ( str ) ) {
1297
- len = str_len ( str ) - beg ;
1334
+ if (beg + len > slen ) {
1335
+ len = slen - beg ;
1298
1336
}
1299
- rb_str_replace (str , beg , len , argv [2 ]);
1337
+ rb_m17n_enc_check (str , argv [2 ], & enc );
1338
+ b = str_nth (enc , str_ptr (str ), str_end (str ), beg ) - str_ptr (str );
1339
+ l = str_nth (enc , str_ptr (str ), str_end (str ), beg + len ) - str_ptr (str ) - b ;
1340
+ rb_str_replace (str , b , l , argv [2 ]);
1300
1341
return argv [2 ];
1301
1342
}
1302
1343
if (argc != 2 ) {
@@ -1570,7 +1611,7 @@ rb_str_replace_m(str, str2)
1570
1611
memcpy (str_ptr (str ), str_ptr (str2 ), str_len (str2 ));
1571
1612
}
1572
1613
1573
- rb_str_copy_encoding (str , str2 );
1614
+ rb_m17n_copy_encoding (str , str2 );
1574
1615
if (OBJ_TAINTED (str2 )) OBJ_TAINT (str );
1575
1616
return str ;
1576
1617
}
@@ -1660,7 +1701,7 @@ rb_str_reverse(str)
1660
1701
s += clen ;
1661
1702
}
1662
1703
}
1663
- rb_str_copy_encoding (obj , str );
1704
+ rb_m17n_copy_encoding (obj , str );
1664
1705
1665
1706
return obj ;
1666
1707
}
@@ -2359,10 +2400,11 @@ rb_str_squeeze_bang(argc, argv, str)
2359
2400
int clen = m17n_codelen (enc , c );
2360
2401
VALUE v = INT2NUM (c );
2361
2402
2362
- if (c != save && del && ! NIL_P ( rb_hash_aref ( del , v )) &&
2363
- (! nodel || NIL_P (rb_hash_aref (nodel , v )))) {
2364
- save = c ;
2403
+ if (c != save &&
2404
+ (( del && NIL_P (rb_hash_aref (del , v ))) ||
2405
+ (! nodel || NIL_P ( rb_hash_aref ( nodel , v ))))) {
2365
2406
if (t != s ) m17n_mbcput (enc , c , t );
2407
+ save = c ;
2366
2408
t += clen ;
2367
2409
}
2368
2410
s += clen ;
@@ -2476,8 +2518,9 @@ rb_str_split_m(argc, argv, str)
2476
2518
fs_set :
2477
2519
switch (TYPE (spat )) {
2478
2520
case T_STRING :
2479
- if (str_len (spat ) == 1 ) {
2480
- char_sep = (unsigned char )str_ptr (spat )[0 ];
2521
+ if (str_strlen (spat , enc ) == 1 ) {
2522
+ rb_m17n_enc_check (str , spat , & enc );
2523
+ char_sep = m17n_codepoint (enc , str_ptr (spat ), str_end (spat ));
2481
2524
}
2482
2525
else {
2483
2526
spat = rb_reg_regcomp (spat );
@@ -2547,12 +2590,13 @@ rb_str_split_m(argc, argv, str)
2547
2590
int last_null = 0 ;
2548
2591
struct re_registers * regs ;
2549
2592
2593
+ rb_m17n_enc_check (str , spat , & enc );
2550
2594
while ((end = rb_reg_search (spat , str , start , 0 )) >= 0 ) {
2551
2595
regs = RMATCH (rb_backref_get ())-> regs ;
2552
2596
if (start == end && BEG (0 ) == END (0 )) {
2553
2597
if (last_null == 1 ) {
2554
2598
tmp = rb_str_new (str_ptr (str )+ beg , mbclen (str_ptr (str )[beg ]));
2555
- rb_str_copy_encoding (tmp , str );
2599
+ rb_m17n_copy_encoding (tmp , str );
2556
2600
rb_ary_push (result , tmp );
2557
2601
beg = start ;
2558
2602
}
@@ -2564,7 +2608,7 @@ rb_str_split_m(argc, argv, str)
2564
2608
}
2565
2609
else {
2566
2610
tmp = rb_str_new (str_ptr (str )+ beg , end - beg );
2567
- rb_str_copy_encoding (tmp , str );
2611
+ rb_m17n_copy_encoding (tmp , str );
2568
2612
rb_ary_push (result , tmp );
2569
2613
beg = start = END (0 );
2570
2614
}
@@ -2576,18 +2620,21 @@ rb_str_split_m(argc, argv, str)
2576
2620
tmp = rb_str_new (0 , 0 );
2577
2621
else
2578
2622
tmp = rb_reg_nth_match (idx , rb_backref_get ());
2579
- rb_str_copy_encoding (tmp , str );
2623
+ rb_m17n_copy_encoding (tmp , str );
2580
2624
rb_ary_push (result , tmp );
2581
2625
}
2582
2626
if (!NIL_P (limit ) && lim <= ++ i ) break ;
2583
2627
}
2628
+ beg = str_sublen (str , beg , enc );
2584
2629
}
2585
2630
if (!NIL_P (limit ) || str_len (str ) > beg || lim < 0 ) {
2586
- if (str_len (str ) == beg )
2631
+ if (str_strlen (str , enc ) == beg )
2587
2632
tmp = rb_str_new (0 , 0 );
2588
- else
2589
- tmp = rb_str_new (str_ptr (str )+ beg , str_len (str )- beg );
2590
- rb_str_copy_encoding (tmp , str );
2633
+ else {
2634
+ char * p = str_nth (enc , str_ptr (str ), str_end (str ), beg );
2635
+ tmp = rb_str_new (p , str_end (str )- p );
2636
+ }
2637
+ rb_m17n_copy_encoding (tmp , str );
2591
2638
rb_ary_push (result , tmp );
2592
2639
}
2593
2640
if (NIL_P (limit ) && lim == 0 ) {
0 commit comments