diff --git a/NEWS b/NEWS index 5ca32c67a2..1850bdcd43 100644 --- a/NEWS +++ b/NEWS @@ -93,6 +93,10 @@ with all sufficient information, see the ChangeLog file or Redmine * added Random.bytes. [Feature #4938] +* String + + * String#split yields each substring to the block if given. [Feature #4780] + === Stdlib updates (outstanding ones only) * ERB diff --git a/string.c b/string.c index 710c753336..01b5c4e421 100644 --- a/string.c +++ b/string.c @@ -7602,6 +7602,35 @@ static const char isspacetable[256] = { #define ascii_isspace(c) isspacetable[(unsigned char)(c)] +static long +split_string(VALUE result, VALUE str, long beg, long len, long empty_count) +{ + if (empty_count >= 0 && len == 0) { + return empty_count + 1; + } + if (empty_count > 0) { + /* make different substrings */ + if (result) { + do { + rb_ary_push(result, str_new_empty(str)); + } while (--empty_count > 0); + } + else { + do { + rb_yield(str_new_empty(str)); + } while (--empty_count > 0); + } + } + str = rb_str_subseq(str, beg, len); + if (result) { + rb_ary_push(result, str); + } + else { + rb_yield(str); + } + return empty_count; +} + /* * call-seq: * str.split(pattern=nil, [limit]) -> an_array @@ -7660,20 +7689,27 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) VALUE spat; VALUE limit; enum {awk, string, regexp} split_type; - long beg, end, i = 0; + long beg, end, i = 0, empty_count = -1; int lim = 0; VALUE result, tmp; + result = rb_block_given_p() ? Qfalse : Qnil; if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) { lim = NUM2INT(limit); if (lim <= 0) limit = Qnil; else if (lim == 1) { if (RSTRING_LEN(str) == 0) - return rb_ary_new2(0); - return rb_ary_new3(1, rb_str_dup(str)); + return result ? 
rb_ary_new2(0) : str; + tmp = rb_str_dup(str); + if (!result) { + rb_yield(tmp); + return str; + } + return rb_ary_new3(1, tmp); } i = 1; } + if (NIL_P(limit) && !lim) empty_count = 0; enc = STR_ENC_GET(str); split_type = regexp; @@ -7712,7 +7748,9 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) } } - result = rb_ary_new(); +#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count)) + + if (result) result = rb_ary_new(); beg = 0; if (split_type == awk) { char *ptr = RSTRING_PTR(str); @@ -7736,7 +7774,7 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) } } else if (ascii_isspace(c)) { - rb_ary_push(result, rb_str_subseq(str, beg, end-beg)); + SPLIT_STR(beg, end-beg); skip = 1; beg = ptr - bptr; if (!NIL_P(limit)) ++i; @@ -7763,7 +7801,7 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) } } else if (rb_isspace(c)) { - rb_ary_push(result, rb_str_subseq(str, beg, end-beg)); + SPLIT_STR(beg, end-beg); skip = 1; beg = ptr - bptr; if (!NIL_P(limit)) ++i; @@ -7792,8 +7830,7 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) ptr = t; continue; } - rb_ary_push(result, rb_str_subseq(str, substr_start - str_start, - (ptr+end) - substr_start)); + SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start); ptr += end + slen; substr_start = ptr; if (!NIL_P(limit) && lim <= ++i) break; @@ -7812,14 +7849,11 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) regs = RMATCH_REGS(rb_backref_get()); if (start == end && BEG(0) == END(0)) { if (!ptr) { - rb_ary_push(result, str_new_empty(str)); + SPLIT_STR(0, 0); break; } else if (last_null == 1) { - rb_ary_push(result, rb_str_subseq(str, beg, - rb_enc_fast_mbclen(ptr+beg, - ptr+len, - enc))); + SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, ptr+len, enc)); beg = start; } else { @@ -7832,37 +7866,23 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) } } else { - rb_ary_push(result, rb_str_subseq(str, beg, end-beg)); + SPLIT_STR(beg, end-beg); beg = start = END(0); } last_null = 0; for (idx=1; 
idx < regs->num_regs; idx++) { if (BEG(idx) == -1) continue; - if (BEG(idx) == END(idx)) - tmp = str_new_empty(str); - else - tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx)); - rb_ary_push(result, tmp); + SPLIT_STR(BEG(idx), END(idx)-BEG(idx)); } if (!NIL_P(limit) && lim <= ++i) break; } } if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) { - if (RSTRING_LEN(str) == beg) - tmp = str_new_empty(str); - else - tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg); - rb_ary_push(result, tmp); - } - if (NIL_P(limit) && lim == 0) { - long len; - while ((len = RARRAY_LEN(result)) > 0 && - (tmp = RARRAY_AREF(result, len-1), RSTRING_LEN(tmp) == 0)) - rb_ary_pop(result); + SPLIT_STR(beg, RSTRING_LEN(str)-beg); } - return result; + return result ? result : str; } VALUE diff --git a/test/ruby/test_string.rb b/test/ruby/test_string.rb index 3dadc8e04b..f71dfc7fee 100644 --- a/test/ruby/test_string.rb +++ b/test/ruby/test_string.rb @@ -1699,7 +1699,46 @@ CODE assert_equal([S("a"), S(""), S("b"), S("c"), S("")], S("a||b|c|").split(S('|'), -1)) assert_equal([], "".split(//, 1)) + ensure + $; = fs + end + def test_split_with_block + fs, $; = $;, nil + result = []; S(" a b\t c ").split {|s| result << s} + assert_equal([S("a"), S("b"), S("c")], result) + result = []; S(" a b\t c ").split(S(" ")) {|s| result << s} + assert_equal([S("a"), S("b"), S("c")], result) + + result = []; S(" a | b | c ").split(S("|")) {|s| result << s} + assert_equal([S(" a "), S(" b "), S(" c ")], result) + + result = []; S("aXXbXXcXX").split(/X./) {|s| result << s} + assert_equal([S("a"), S("b"), S("c")], result) + + result = []; S("abc").split(//) {|s| result << s} + assert_equal([S("a"), S("b"), S("c")], result) + + result = []; S("a|b|c").split(S('|'), 1) {|s| result << s} + assert_equal([S("a|b|c")], result) + + result = []; S("a|b|c").split(S('|'), 2) {|s| result << s} + assert_equal([S("a"), S("b|c")], result) + result = []; S("a|b|c").split(S('|'), 3) {|s| result << 
s} + assert_equal([S("a"), S("b"), S("c")], result) + + result = []; S("a|b|c|").split(S('|'), -1) {|s| result << s} + assert_equal([S("a"), S("b"), S("c"), S("")], result) + result = []; S("a|b|c||").split(S('|'), -1) {|s| result << s} + assert_equal([S("a"), S("b"), S("c"), S(""), S("")], result) + + result = []; S("a||b|c|").split(S('|')) {|s| result << s} + assert_equal([S("a"), S(""), S("b"), S("c")], result) + result = []; S("a||b|c|").split(S('|'), -1) {|s| result << s} + assert_equal([S("a"), S(""), S("b"), S("c"), S("")], result) + + result = []; "".split(//, 1) {|s| result << s} + assert_equal([], result) ensure $; = fs end @@ -1762,6 +1801,7 @@ CODE s.split("b", 1).map(&:upcase!) assert_equal("abc", s) end + def test_squeeze assert_equal(S("abc"), S("aaabbbbccc").squeeze) assert_equal(S("aa bb cc"), S("aa bb cc").squeeze(S(" ")))