From 9580a9ca91a84e410e643398130897b38e02a0a0 Mon Sep 17 00:00:00 2001 From: matz Date: Wed, 23 Jan 2008 18:43:51 +0000 Subject: [PATCH] * string.c (rb_str_each_line): use memchr(3) for faster newline search. * io.c (appendline): remove unused arguments * io.c (rb_io_getline_fast): make much simpler (and faster). git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@15199 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 9 +++++ include/ruby/encoding.h | 2 +- io.c | 80 ++++++++++++++++++++++++++--------------- string.c | 40 ++++++++++++--------- 4 files changed, 85 insertions(+), 46 deletions(-) diff --git a/ChangeLog b/ChangeLog index 902f7e8b83..2ca97f5c41 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,12 @@ +Thu Jan 24 03:23:44 2008 Yukihiro Matsumoto + + * string.c (rb_str_each_line): use memchr(3) for faster newline + search. + + * io.c (appendline): remove unused arguments + + * io.c (rb_io_getline_fast): make much simpler (and faster). + Thu Jan 24 02:13:07 2008 Yusuke Endoh * insns.def (expandarray): fix stack inc. diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h index 27abba1d53..ca06b4dd7a 100644 --- a/include/ruby/encoding.h +++ b/include/ruby/encoding.h @@ -134,7 +134,7 @@ int rb_enc_codelen(int code, rb_encoding *enc); #define rb_enc_right_char_head(s,p,enc) (char *)onigenc_get_right_adjust_char_head(enc,(UChar*)(s),(UChar*)(p)) /* ptr, ptr, encoding -> newline_or_not */ -#define rb_enc_is_newline(p,end,enc) ONIGENC_IS_MBC_NEWLINE(enc,p,end) +#define rb_enc_is_newline(p,end,enc) ONIGENC_IS_MBC_NEWLINE(enc,(UChar*)p,(UChar*)end) #define rb_enc_isctype(c,t,enc) ONIGENC_IS_CODE_CTYPE(enc,c,t) #define rb_enc_isascii(c,enc) ONIGENC_IS_CODE_ASCII(c) diff --git a/io.c b/io.c index 1206b9bcaa..79f2396383 100644 --- a/io.c +++ b/io.c @@ -1686,21 +1686,19 @@ rscheck(const char *rsptr, long rslen, VALUE rs) } static int -appendline(rb_io_t *fptr, int delim, const char *rsptr, int rslen, VALUE *strp, long *lp) +appendline(rb_io_t *fptr, int delim, VALUE *strp, long *lp) { VALUE str = *strp; int c = EOF; long limit = *lp; - if (rsptr == 0) - rslen = 1; - do { long pending = READ_DATA_PENDING_COUNT(fptr); if (pending > 0) { const char *p = READ_DATA_PENDING_PTR(fptr); const char *e; long last = 0, len = (c != EOF); + rb_encoding *enc = io_read_encoding(fptr); if (limit > 0 && pending > limit) pending = limit; e = memchr(p, delim, pending); @@ -1720,7 +1718,7 @@ appendline(rb_io_t *fptr, int delim, const char *rsptr, int rslen, VALUE *strp, if (limit > 0 && limit == pending) { char *p = fptr->rbuf+fptr->rbuf_off; char *pp = p + limit; - char *pl = rb_enc_left_char_head(p, pp, io_read_encoding(fptr)); + char *pl = rb_enc_left_char_head(p, pp, enc); if (pl < pp) { int diff = pp - pl; @@ -1790,27 +1788,53 @@ swallow(rb_io_t *fptr, int term) } static VALUE -rb_io_getline_fast(rb_io_t *fptr, unsigned char delim, long limit) +rb_io_getline_fast(rb_io_t *fptr) { VALUE str = Qnil; - int c, nolimit = 0; + int len = 0; + rb_encoding *enc = io_read_encoding(fptr); for (;;) { - c = appendline(fptr, delim, 0, 0, &str, &limit); - if (c == EOF || c == delim) break; - if (limit == 0) { - nolimit = 1; + long pending = READ_DATA_PENDING_COUNT(fptr); + + if (pending > 0) { + const char *p = READ_DATA_PENDING_PTR(fptr); + const char *e; + + e = memchr(p, '\n', pending); + if (e) { + const char *p0 = rb_enc_left_char_head(p, e, enc); + const char *pend = rb_enc_left_char_head(p, p+pending, enc); + if (rb_enc_is_newline(p0, pend, enc)) { + pending = p0 - p + rb_enc_mbclen(p0, pend, enc); + } + else { + e = 0; + } + } + if (NIL_P(str)) { + str = rb_str_new(p, pending); + fptr->rbuf_off += pending; + fptr->rbuf_len -= pending; + } + else { + rb_str_resize(str, len + pending); + read_buffered_data(RSTRING_PTR(str)+len, pending, fptr); + } + len += pending; + if (e) break; + } + rb_thread_wait_fd(fptr->fd); + rb_io_check_closed(fptr); + if (io_fillbuf(fptr) < 0) { + if (NIL_P(str)) return Qnil; break; } } - if (!NIL_P(str)) { - str = io_enc_str(str, fptr); - if (!nolimit) { - fptr->lineno++; - lineno = INT2FIX(fptr->lineno); - } - } + str = io_enc_str(str, fptr); + fptr->lineno++; + lineno = INT2FIX(fptr->lineno); return str; } @@ -1838,11 +1862,12 @@ prepare_getline_args(int argc, VALUE *argv, VALUE *rsp, long *limit, VALUE io) } } } - GetOpenFile(io, fptr); - if (!NIL_P(rs)) { - rb_encoding *enc_rs = rb_enc_get(rs); - rb_encoding *enc_io = io_read_encoding(fptr); + if (!NIL_P(rs) && rs != rb_default_rs) { + rb_encoding *enc_rs, *enc_io; + GetOpenFile(io, fptr); + enc_rs = rb_enc_get(rs); + enc_io = io_read_encoding(fptr); if (enc_io != enc_rs && (rb_enc_str_coderange(rs) != ENC_CODERANGE_7BIT || !rb_enc_asciicompat(enc_io))) { @@ -1876,8 +1901,8 @@ rb_io_getline_1(VALUE rs, long limit, VALUE io) else if (limit == 0) { return rb_enc_str_new(0, 0, io_read_encoding(fptr)); } - else if (rs == rb_default_rs) { - return rb_io_getline_fast(fptr, '\n', limit); + else if (rs == rb_default_rs && limit < 0) { + return rb_io_getline_fast(fptr); } else { int c, newline; @@ -1893,15 +1918,12 @@ rb_io_getline_1(VALUE rs, long limit, VALUE io) swallow(fptr, '\n'); rs = 0; } - else if (rslen == 1) { - return rb_io_getline_fast(fptr, (unsigned char)RSTRING_PTR(rs)[0], limit); - } else { rsptr = RSTRING_PTR(rs); } newline = rsptr[rslen - 1]; - while ((c = appendline(fptr, newline, rsptr, rslen, &str, &limit)) != EOF) { + while ((c = appendline(fptr, newline, &str, &limit)) != EOF) { if (c == newline) { const char *s, *p, *pp; @@ -1954,7 +1976,7 @@ rb_io_gets(VALUE io) GetOpenFile(io, fptr); rb_io_check_readable(fptr); - return rb_io_getline_fast(fptr, '\n', 0); + return rb_io_getline_fast(fptr); } /* diff --git a/string.c b/string.c index eacced74ca..0dfb098679 100644 --- a/string.c +++ b/string.c @@ -4470,9 +4470,8 @@ rb_str_each_line(int argc, VALUE *argv, VALUE str) rb_encoding *enc; VALUE rs; int newline; - char *p = RSTRING_PTR(str), *pend = p + RSTRING_LEN(str), *s = p; - char *ptr = p; - long len = RSTRING_LEN(str), rslen; + char *p, *pend, *s, *ptr; + long len, rslen; VALUE line; int n; @@ -4480,29 +4479,39 @@ rb_str_each_line(int argc, VALUE *argv, VALUE str) rs = rb_rs; } RETURN_ENUMERATOR(str, argc, argv); - if (NIL_P(rs)) { rb_yield(str); return str; } + str = rb_str_new4(str); + ptr = p = s = RSTRING_PTR(str); + pend = p + RSTRING_LEN(str); + len = RSTRING_LEN(str); StringValue(rs); - enc = rb_enc_check(str, rs); if (rs == rb_default_rs) { + enc = rb_enc_get(str); while (p < pend) { - n = rb_enc_mbclen(p, pend, enc); - if (rb_enc_is_newline(p, pend, enc)) { - line = rb_str_new5(str, s, p - s + n); - OBJ_INFECT(line, str); - rb_enc_copy(line, str); - rb_yield(line); - str_mod_check(str, ptr, len); - s = p + n; + char *p0; + + p = memchr(p, '\n', pend - p); + if (!p) break; + p0 = rb_enc_left_char_head(s, p, enc); + if (!rb_enc_is_newline(p0, pend, enc)) { + p++; + continue; } - p += n; + p = p0 + rb_enc_mbclen(s, p0, enc); + line = rb_str_new5(str, s, p - s); + OBJ_INFECT(line, str); + rb_enc_copy(line, str); + rb_yield(line); + str_mod_check(str, ptr, len); + s = p; } goto finish; } + enc = rb_enc_check(str, rs); rslen = RSTRING_LEN(rs); if (rslen == 0) { newline = '\n'; @@ -4535,8 +4544,7 @@ rb_str_each_line(int argc, VALUE *argv, VALUE str) finish: if (s != pend) { - if (p > pend) p = pend; - line = rb_str_new5(str, s, p - s); + line = rb_str_new5(str, s, pend - s); OBJ_INFECT(line, str); rb_enc_copy(line, str); rb_yield(line);