diff --git a/ChangeLog b/ChangeLog index 34603b33c3..30dbd45853 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,63 @@ +Sat Aug 25 11:45:37 2007 Yukihiro Matsumoto + + * encoding.c: provide basic features for M17N. + + * parse.y: encoding aware parsing. + + * parse.y (pragma_encoding): encoding specification pragma. + + * parse.y (rb_intern3): encoding specified symbols. + + * string.c (rb_str_length): length based on characters. + for older behavior, bytesize method added. + + * string.c (rb_str_index_m): index based on characters. rindex as + well. + + * string.c (succ_char): encoding aware succeeding string. + + * string.c (rb_str_reverse): reverse based on characters. + + * string.c (rb_str_inspect): encoding aware string description. + + * string.c (rb_str_upcase_bang): encoding aware case conversion. + downcase, capitalize, swapcase as well. + + * string.c (rb_str_tr_bang): tr based on characters. delete, + squeeze, tr_s, count as well. + + * string.c (rb_str_split_m): split based on characters. + + * string.c (rb_str_each_line): encoding aware each_line. + + * string.c (rb_str_each_char): added. iteration based on + characters. + + * string.c (rb_str_strip_bang): encoding aware whitespace + stripping. lstrip, rstrip as well. + + * string.c (rb_str_justify): encoding aware justifying (ljust, + rjust, center). + + * string.c (str_encoding): get encoding attribute from a string. + + * re.c (rb_reg_initialize): encoding aware regular expression + + * sprintf.c (rb_str_format): formatting (i.e. length count) based + on characters. + + * io.c (rb_io_getc): getc to return one-character string. + for older behavior, getbyte method added. + + * ext/stringio/stringio.c (strio_getc): ditto. + + * io.c (rb_io_ungetc): allow pushing arbitrary string at the + current reading point. + + * ext/stringio/stringio.c (strio_ungetc): ditto. + + * ext/strscan/strscan.c: encoding support. 
+ Sat Aug 25 10:59:19 2007 Koichi Sasada * cont.c: separate Continuation and Fiber from core. diff --git a/array.c b/array.c index a7361661c4..0b4ffa14d7 100644 --- a/array.c +++ b/array.c @@ -6,7 +6,7 @@ $Date$ created at: Fri Aug 6 09:46:12 JST 1993 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto Copyright (C) 2000 Network Applied Communication Laboratory, Inc. Copyright (C) 2000 Information-technology Promotion Agency, Japan diff --git a/bignum.c b/bignum.c index fcfe64983a..c944907c64 100644 --- a/bignum.c +++ b/bignum.c @@ -6,7 +6,7 @@ $Date$ created at: Fri Jun 10 00:48:55 JST 1994 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ diff --git a/class.c b/class.c index f881e690e4..dff402c759 100644 --- a/class.c +++ b/class.c @@ -6,7 +6,7 @@ $Date$ created at: Tue Aug 10 15:05:44 JST 1993 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ diff --git a/common.mk b/common.mk index 75c649a75f..6fdaa9e82e 100644 --- a/common.mk +++ b/common.mk @@ -25,6 +25,7 @@ OBJS = array.$(OBJEXT) \ compar.$(OBJEXT) \ dir.$(OBJEXT) \ dln.$(OBJEXT) \ + encoding.$(OBJEXT) \ enum.$(OBJEXT) \ enumerator.$(OBJEXT) \ error.$(OBJEXT) \ @@ -401,6 +402,7 @@ dmydln.$(OBJEXT): {$(VPATH)}dmydln.c {$(VPATH)}dln.c {$(VPATH)}ruby.h \ {$(VPATH)}config.h {$(VPATH)}defines.h {$(VPATH)}intern.h {$(VPATH)}missing.h \ {$(VPATH)}dln.h dmyext.$(OBJEXT): {$(VPATH)}dmyext.c +encoding.$(OBJEXT): {$(VPATH)}encoding.c {$(VPATH)}encoding.h enum.$(OBJEXT): {$(VPATH)}enum.c {$(VPATH)}ruby.h {$(VPATH)}config.h \ {$(VPATH)}defines.h {$(VPATH)}intern.h {$(VPATH)}missing.h \ {$(VPATH)}node.h {$(VPATH)}util.h @@ -523,7 +525,7 @@ sprintf.$(OBJEXT): {$(VPATH)}sprintf.c {$(VPATH)}ruby.h {$(VPATH)}config.h \ st.$(OBJEXT): {$(VPATH)}st.c {$(VPATH)}config.h 
{$(VPATH)}st.h {$(VPATH)}defines.h string.$(OBJEXT): {$(VPATH)}string.c {$(VPATH)}ruby.h {$(VPATH)}config.h \ {$(VPATH)}defines.h {$(VPATH)}intern.h {$(VPATH)}missing.h \ - {$(VPATH)}re.h {$(VPATH)}regex.h + {$(VPATH)}re.h {$(VPATH)}regex.h {$(VPATH)}encoding.h struct.$(OBJEXT): {$(VPATH)}struct.c {$(VPATH)}ruby.h {$(VPATH)}config.h \ {$(VPATH)}defines.h {$(VPATH)}intern.h {$(VPATH)}missing.h thread.$(OBJEXT): {$(VPATH)}thread.c {$(VPATH)}eval_intern.h \ diff --git a/compar.c b/compar.c index 7802567070..a8952b2d1c 100644 --- a/compar.c +++ b/compar.c @@ -6,7 +6,7 @@ $Date$ created at: Thu Aug 26 14:39:48 JST 1993 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ diff --git a/dir.c b/dir.c index f7b7eb1933..b26df63f71 100644 --- a/dir.c +++ b/dir.c @@ -6,7 +6,7 @@ $Date$ created at: Wed Jan 5 09:51:01 JST 1994 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto Copyright (C) 2000 Network Applied Communication Laboratory, Inc. 
Copyright (C) 2000 Information-technology Promotion Agency, Japan diff --git a/dln.c b/dln.c index 8b46954002..6403e9a87c 100644 --- a/dln.c +++ b/dln.c @@ -6,7 +6,7 @@ $Date$ created at: Tue Jan 18 17:05:06 JST 1994 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ diff --git a/dln.h b/dln.h index 4fd51cbee0..6905a36d5a 100644 --- a/dln.h +++ b/dln.h @@ -6,7 +6,7 @@ $Date$ created at: Wed Jan 19 16:53:09 JST 1994 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ diff --git a/enum.c b/enum.c index 73b2767863..fa4b480f9f 100644 --- a/enum.c +++ b/enum.c @@ -6,7 +6,7 @@ $Date$ created at: Fri Oct 1 15:15:19 JST 1993 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ diff --git a/error.c b/error.c index 77d63d7b38..e972bdd1b0 100644 --- a/error.c +++ b/error.c @@ -6,7 +6,7 @@ $Date$ created at: Mon Aug 9 16:11:34 JST 1993 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ diff --git a/euc_jp.c b/euc_jp.c index 65729feda8..f39c4de628 100644 --- a/euc_jp.c +++ b/euc_jp.c @@ -78,8 +78,9 @@ static int code_to_mbclen(OnigCodePoint code) { if (ONIGENC_IS_CODE_ASCII(code)) return 1; - else if ((code & 0xff0000) != 0) return 3; - else if ((code & 0xff00) != 0) return 2; + else if (code > 0xffffff) return 0; + else if ((code & 0xff0000) >= 0x800000) return 3; + else if ((code & 0xff00) >= 0x8000) return 2; else return 0; } diff --git a/eval.c b/eval.c index 721ac6302c..63579c4e4e 100644 --- a/eval.c +++ b/eval.c @@ -6,7 +6,7 @@ $Date$ created at: Thu Jun 10 14:22:17 JST 1993 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 
1993-2007 Yukihiro Matsumoto Copyright (C) 2000 Network Applied Communication Laboratory, Inc. Copyright (C) 2000 Information-technology Promotion Agency, Japan diff --git a/ext/socket/socket.c b/ext/socket/socket.c index aee4b21567..514ba3fd59 100644 --- a/ext/socket/socket.c +++ b/ext/socket/socket.c @@ -6,7 +6,7 @@ $Date$ created at: Thu Mar 31 12:21:29 JST 1994 - Copyright (C) 1993-2001 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto ************************************************/ diff --git a/ext/stringio/stringio.c b/ext/stringio/stringio.c index 480c9ed378..08a6341b70 100644 --- a/ext/stringio/stringio.c +++ b/ext/stringio/stringio.c @@ -13,7 +13,7 @@ **********************************************************************/ #include "ruby.h" -#include "rubyio.h" +#include "ruby/io.h" #if defined(HAVE_FCNTL_H) || defined(_WIN32) #include #elif defined(HAVE_SYS_FCNTL_H) @@ -84,6 +84,18 @@ get_strio(VALUE self) return ptr; } +static VALUE +strio_substr(struct StringIO *ptr, int pos, int len) +{ + VALUE str = ptr->string; + rb_encoding *enc = rb_enc_get(str); + int rlen = RSTRING_LEN(str) - pos; + + if (len > rlen) len = rlen; + if (len < 0) len = 0; + return rb_enc_str_new(RSTRING_PTR(str)+pos, len, enc); +} + #define StringIO(obj) get_strio(obj) #define CLOSED(ptr) (!((ptr)->flags & FMODE_READWRITE)) @@ -603,7 +615,7 @@ strio_each_byte(VALUE self) /* * call-seq: - * strio.getc -> fixnum or nil + * strio.getc -> string or nil * * See IO#getc. 
*/ @@ -611,15 +623,17 @@ static VALUE strio_getc(VALUE self) { struct StringIO *ptr = readable(StringIO(self)); - int c; - char ch; + rb_encoding *enc = rb_enc_get(ptr->string); + int len; + char *p; if (ptr->pos >= RSTRING_LEN(ptr->string)) { return Qnil; } - c = RSTRING_PTR(ptr->string)[ptr->pos++]; - ch = c & 0xff; - return rb_str_new(&ch, 1); + p = RSTRING_PTR(ptr->string)+ptr->pos; + len = rb_enc_mbclen(p, enc); + ptr->pos += len; + return rb_enc_str_new(p, len, rb_enc_get(ptr->string)); } /* @@ -671,30 +685,34 @@ static VALUE strio_ungetc(VALUE self, VALUE c) { struct StringIO *ptr = readable(StringIO(self)); - int cc; - long len, pos = ptr->pos; + long lpos, clen; + char *p, *pend; + rb_encoding *enc; if (NIL_P(c)) return Qnil; if (FIXNUM_P(c)) { - cc = FIX2INT(c); + int cc = FIX2INT(c); + char buf[16]; + + enc = rb_enc_get(ptr->string); + rb_enc_mbcput(cc, buf, enc); + c = rb_enc_str_new(buf, rb_enc_codelen(cc, enc), enc); } else { SafeStringValue(c); - if (RSTRING_LEN(c) > 1) { - rb_warn("IO#ungetc pushes back only one byte"); - } - cc = (unsigned char)RSTRING_PTR(c)[0]; + enc = rb_enc_check(ptr->string, c); } - if (cc != EOF && pos > 0) { - if ((len = RSTRING_LEN(ptr->string)) < pos-- || - (unsigned char)RSTRING_PTR(ptr->string)[pos] != - (unsigned char)cc) { - strio_extend(ptr, pos, 1); - RSTRING_PTR(ptr->string)[pos] = cc; - OBJ_INFECT(ptr->string, self); - } - --ptr->pos; + /* get logical position */ + lpos = 0; p = RSTRING_PTR(ptr->string); pend = p + ptr->pos - 1; + for (;;) { + clen = rb_enc_mbclen(p, enc); + if (p+clen >= pend) break; + p += clen; + lpos++; } + rb_str_update(ptr->string, lpos, ptr->pos ? 
1 : 0, c); + ptr->pos = p - RSTRING_PTR(ptr->string); + return Qnil; } @@ -800,7 +818,7 @@ strio_getline(int argc, VALUE *argv, struct StringIO *ptr) e = s + limit; } if (NIL_P(str)) { - str = rb_str_substr(ptr->string, ptr->pos, e - s); + str = strio_substr(ptr, ptr->pos, e - s); } else if ((n = RSTRING_LEN(str)) == 0) { p = s; @@ -816,13 +834,13 @@ strio_getline(int argc, VALUE *argv, struct StringIO *ptr) break; } } - str = rb_str_substr(ptr->string, s - RSTRING_PTR(ptr->string), e - s); + str = strio_substr(ptr, s - RSTRING_PTR(ptr->string), e - s); } else if (n == 1) { if ((p = memchr(s, RSTRING_PTR(str)[0], e - s)) != 0) { e = p + 1; } - str = rb_str_substr(ptr->string, ptr->pos, e - s); + str = strio_substr(ptr, ptr->pos, e - s); } else { if (n < e - s) { @@ -843,7 +861,7 @@ strio_getline(int argc, VALUE *argv, struct StringIO *ptr) } } } - str = rb_str_substr(ptr->string, ptr->pos, e - s); + str = strio_substr(ptr, ptr->pos, e - s); } ptr->pos = e - RSTRING_PTR(ptr->string); ptr->lineno++; @@ -944,7 +962,7 @@ strio_write(VALUE self, VALUE str) if (TYPE(str) != T_STRING) str = rb_obj_as_string(str); len = RSTRING_LEN(str); - if (!len) return INT2FIX(0); + if (len == 0) return INT2FIX(0); check_modifiable(ptr); olen = RSTRING_LEN(ptr->string); if (ptr->flags & FMODE_APPEND) { @@ -955,7 +973,8 @@ strio_write(VALUE self, VALUE str) } else { strio_extend(ptr, ptr->pos, len); - rb_str_update(ptr->string, ptr->pos, len, str); + memmove(RSTRING_PTR(ptr->string)+ptr->pos, RSTRING_PTR(str), len); + OBJ_INFECT(ptr->string, str); } OBJ_INFECT(ptr->string, self); ptr->pos += len; @@ -1070,7 +1089,7 @@ strio_read(int argc, VALUE *argv, VALUE self) rb_raise(rb_eArgError, "wrong number of arguments (%d for 0)", argc); } if (NIL_P(str)) { - str = rb_str_substr(ptr->string, ptr->pos, len); + str = strio_substr(ptr, ptr->pos, len); } else { long rest = RSTRING_LEN(ptr->string) - ptr->pos; diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index 63a0f1185e..44cd0f4131 
100644 --- a/ext/strscan/strscan.c +++ b/ext/strscan/strscan.c @@ -10,6 +10,7 @@ #include "ruby/ruby.h" #include "ruby/re.h" +#include "ruby/encoding.h" #define STRSCAN_VERSION "0.7.0" @@ -189,6 +190,7 @@ strscan_initialize(int argc, VALUE *argv, VALUE self) rb_scan_args(argc, argv, "11", &str, &need_dup); StringValue(str); p->str = str; + rb_enc_associate(self, rb_enc_get(str)); return self; } @@ -652,13 +654,14 @@ strscan_getch(VALUE self) { struct strscanner *p; long len; + rb_encoding *enc = rb_enc_get(self); GET_SCANNER(self, p); CLEAR_MATCH_STATUS(p); if (EOS_P(p)) return Qnil; - len = mbclen(*CURPTR(p)); + len = rb_enc_mbclen(CURPTR(p), enc); if (p->curr + len > S_LEN(p)) { len = S_LEN(p) - p->curr; } diff --git a/ext/syck/emitter.c b/ext/syck/emitter.c index 03bdaa2dd7..cadee64789 100644 --- a/ext/syck/emitter.c +++ b/ext/syck/emitter.c @@ -7,7 +7,7 @@ * Copyright (C) 2003 why the lucky stiff * * All Base64 code from Ruby's pack.c. - * Ruby is Copyright (C) 1993-2003 Yukihiro Matsumoto + * Ruby is Copyright (C) 1993-2007 Yukihiro Matsumoto */ #include "ruby/ruby.h" diff --git a/ext/tk/sample/demos-jp/anilabel.rb b/ext/tk/sample/demos-jp/anilabel.rb index 97781fbe77..c6e5c7385b 100644 --- a/ext/tk/sample/demos-jp/anilabel.rb +++ b/ext/tk/sample/demos-jp/anilabel.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # animated label widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/aniwave.rb b/ext/tk/sample/demos-jp/aniwave.rb index 81e2d76b30..5f94add111 100644 --- a/ext/tk/sample/demos-jp/aniwave.rb +++ b/ext/tk/sample/demos-jp/aniwave.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # animated wave demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/arrow.rb b/ext/tk/sample/demos-jp/arrow.rb index 477a0abf6f..43c6eef4eb 100644 --- a/ext/tk/sample/demos-jp/arrow.rb +++ b/ext/tk/sample/demos-jp/arrow.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # arrowhead widget demo (called by 'widget') # diff --git 
a/ext/tk/sample/demos-jp/bind.rb b/ext/tk/sample/demos-jp/bind.rb index e1e23b9893..779e395826 100644 --- a/ext/tk/sample/demos-jp/bind.rb +++ b/ext/tk/sample/demos-jp/bind.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # text (tag bindings) widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/bitmap.rb b/ext/tk/sample/demos-jp/bitmap.rb index b71c67d3fd..4594892c81 100644 --- a/ext/tk/sample/demos-jp/bitmap.rb +++ b/ext/tk/sample/demos-jp/bitmap.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # bitmap widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/button.rb b/ext/tk/sample/demos-jp/button.rb index 20f8cae299..7e9457f5b4 100644 --- a/ext/tk/sample/demos-jp/button.rb +++ b/ext/tk/sample/demos-jp/button.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # button widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/check.rb b/ext/tk/sample/demos-jp/check.rb index be675b9042..7545df80fa 100644 --- a/ext/tk/sample/demos-jp/check.rb +++ b/ext/tk/sample/demos-jp/check.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # checkbutton widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/check2.rb b/ext/tk/sample/demos-jp/check2.rb index f681a4d684..90c6dd736f 100644 --- a/ext/tk/sample/demos-jp/check2.rb +++ b/ext/tk/sample/demos-jp/check2.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # checkbutton widget demo2 (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/clrpick.rb b/ext/tk/sample/demos-jp/clrpick.rb index de8cd80fcd..d81ecebc83 100644 --- a/ext/tk/sample/demos-jp/clrpick.rb +++ b/ext/tk/sample/demos-jp/clrpick.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # widget demo prompts the user to select a color (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/colors.rb b/ext/tk/sample/demos-jp/colors.rb index c6128f9c00..68b40e69f0 100644 --- a/ext/tk/sample/demos-jp/colors.rb +++ b/ext/tk/sample/demos-jp/colors.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # listbox widget demo 
'colors' (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/cscroll.rb b/ext/tk/sample/demos-jp/cscroll.rb index 0d6db69af6..0be26133c5 100644 --- a/ext/tk/sample/demos-jp/cscroll.rb +++ b/ext/tk/sample/demos-jp/cscroll.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # simple scrollable canvas widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/ctext.rb b/ext/tk/sample/demos-jp/ctext.rb index 66e1fe8107..35d43febbc 100644 --- a/ext/tk/sample/demos-jp/ctext.rb +++ b/ext/tk/sample/demos-jp/ctext.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # Canvas Text widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/dialog1.rb b/ext/tk/sample/demos-jp/dialog1.rb index 0d6181bfc6..07e50306ab 100644 --- a/ext/tk/sample/demos-jp/dialog1.rb +++ b/ext/tk/sample/demos-jp/dialog1.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # a dialog box with a local grab (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/dialog2.rb b/ext/tk/sample/demos-jp/dialog2.rb index a934378dda..f747f8d6a8 100644 --- a/ext/tk/sample/demos-jp/dialog2.rb +++ b/ext/tk/sample/demos-jp/dialog2.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # a dialog box with a global grab (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/entry1.rb b/ext/tk/sample/demos-jp/entry1.rb index edf3b5f71d..d794282284 100644 --- a/ext/tk/sample/demos-jp/entry1.rb +++ b/ext/tk/sample/demos-jp/entry1.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # entry (no scrollbars) widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/entry2.rb b/ext/tk/sample/demos-jp/entry2.rb index 7d5740e663..528ad6dec4 100644 --- a/ext/tk/sample/demos-jp/entry2.rb +++ b/ext/tk/sample/demos-jp/entry2.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # entry (with scrollbars) widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/entry3.rb b/ext/tk/sample/demos-jp/entry3.rb index f57dc13553..46426af6ae 100644 --- a/ext/tk/sample/demos-jp/entry3.rb +++ 
b/ext/tk/sample/demos-jp/entry3.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # entry3.rb -- # # This demonstration script creates several entry widgets whose diff --git a/ext/tk/sample/demos-jp/filebox.rb b/ext/tk/sample/demos-jp/filebox.rb index f3608ab70f..04b4810b3b 100644 --- a/ext/tk/sample/demos-jp/filebox.rb +++ b/ext/tk/sample/demos-jp/filebox.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # widget demo prompts the user to select a file (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/floor.rb b/ext/tk/sample/demos-jp/floor.rb index b029580bd6..b7d07bdafa 100644 --- a/ext/tk/sample/demos-jp/floor.rb +++ b/ext/tk/sample/demos-jp/floor.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # floorDisplay widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/floor2.rb b/ext/tk/sample/demos-jp/floor2.rb index a20b31d45c..b7571a592f 100644 --- a/ext/tk/sample/demos-jp/floor2.rb +++ b/ext/tk/sample/demos-jp/floor2.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # floorDisplay widget demo 2 (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/form.rb b/ext/tk/sample/demos-jp/form.rb index fe456d3943..637dd9a8ea 100644 --- a/ext/tk/sample/demos-jp/form.rb +++ b/ext/tk/sample/demos-jp/form.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # form widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/goldberg.rb b/ext/tk/sample/demos-jp/goldberg.rb index 592b69f775..8bf0104c16 100644 --- a/ext/tk/sample/demos-jp/goldberg.rb +++ b/ext/tk/sample/demos-jp/goldberg.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # Ruby/Tk Goldverg demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/hello b/ext/tk/sample/demos-jp/hello index 859ebd950e..08f154d499 100644 --- a/ext/tk/sample/demos-jp/hello +++ b/ext/tk/sample/demos-jp/hello @@ -1,4 +1,5 @@ #!/usr/bin/env ruby +# -*- coding: euc-jp -*- require 'tk' TkButton.new(nil, diff --git a/ext/tk/sample/demos-jp/hscale.rb b/ext/tk/sample/demos-jp/hscale.rb index 37d215435c..690479d6d1 
100644 --- a/ext/tk/sample/demos-jp/hscale.rb +++ b/ext/tk/sample/demos-jp/hscale.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- require "tkcanvas" if defined?($hscale_demo) && $hscale_deom diff --git a/ext/tk/sample/demos-jp/icon.rb b/ext/tk/sample/demos-jp/icon.rb index e4fef5cae7..26382a57a7 100644 --- a/ext/tk/sample/demos-jp/icon.rb +++ b/ext/tk/sample/demos-jp/icon.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # iconic button widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/image1.rb b/ext/tk/sample/demos-jp/image1.rb index 3d47f844e5..3b56d240dc 100644 --- a/ext/tk/sample/demos-jp/image1.rb +++ b/ext/tk/sample/demos-jp/image1.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # two image widgets demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/image2.rb b/ext/tk/sample/demos-jp/image2.rb index e2e2a2b036..de627448c1 100644 --- a/ext/tk/sample/demos-jp/image2.rb +++ b/ext/tk/sample/demos-jp/image2.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # widget demo 'load image' (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/image3.rb b/ext/tk/sample/demos-jp/image3.rb index d9f378c116..36c1823745 100644 --- a/ext/tk/sample/demos-jp/image3.rb +++ b/ext/tk/sample/demos-jp/image3.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # image3.rb # # This demonstration script creates a simple collection of widgets diff --git a/ext/tk/sample/demos-jp/items.rb b/ext/tk/sample/demos-jp/items.rb index 38774d10d2..d538fac75f 100644 --- a/ext/tk/sample/demos-jp/items.rb +++ b/ext/tk/sample/demos-jp/items.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # canvas item types widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/ixset2 b/ext/tk/sample/demos-jp/ixset2 index 8947daa4b4..5b816e40b1 100644 --- a/ext/tk/sample/demos-jp/ixset2 +++ b/ext/tk/sample/demos-jp/ixset2 @@ -1,4 +1,5 @@ #!/usr/bin/env ruby +# -*- coding: euc-jp -*- # # ixset -- # A nice interface to "xset" to change X server settings diff --git 
a/ext/tk/sample/demos-jp/label.rb b/ext/tk/sample/demos-jp/label.rb index 59626289fc..a1ecc2ec80 100644 --- a/ext/tk/sample/demos-jp/label.rb +++ b/ext/tk/sample/demos-jp/label.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # label widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/labelframe.rb b/ext/tk/sample/demos-jp/labelframe.rb index 23c974dcc2..f16b601ffd 100644 --- a/ext/tk/sample/demos-jp/labelframe.rb +++ b/ext/tk/sample/demos-jp/labelframe.rb @@ -1,3 +1,5 @@ +# -*- coding: euc-jp -*- +# # labelframe.rb # # This demonstration script creates a toplevel window containing diff --git a/ext/tk/sample/demos-jp/menu.rb b/ext/tk/sample/demos-jp/menu.rb index add85f7f7b..6b9e5c9e5e 100644 --- a/ext/tk/sample/demos-jp/menu.rb +++ b/ext/tk/sample/demos-jp/menu.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # menus widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/menu84.rb b/ext/tk/sample/demos-jp/menu84.rb index 8c2a815d78..762cfa53b8 100644 --- a/ext/tk/sample/demos-jp/menu84.rb +++ b/ext/tk/sample/demos-jp/menu84.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # menus widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/menu8x.rb b/ext/tk/sample/demos-jp/menu8x.rb index 050f0decb4..23efa7e790 100644 --- a/ext/tk/sample/demos-jp/menu8x.rb +++ b/ext/tk/sample/demos-jp/menu8x.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # menus widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/menubu.rb b/ext/tk/sample/demos-jp/menubu.rb index aa90a3087f..e73c393aa5 100644 --- a/ext/tk/sample/demos-jp/menubu.rb +++ b/ext/tk/sample/demos-jp/menubu.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- require "tkcanvas" def optionMenu(menubutton, varName, firstValue, *rest) diff --git a/ext/tk/sample/demos-jp/msgbox.rb b/ext/tk/sample/demos-jp/msgbox.rb index 983e6b6589..0fe5db7dd6 100644 --- a/ext/tk/sample/demos-jp/msgbox.rb +++ b/ext/tk/sample/demos-jp/msgbox.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # 
message boxes widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/paned1.rb b/ext/tk/sample/demos-jp/paned1.rb index 8d16d03c08..137e187417 100644 --- a/ext/tk/sample/demos-jp/paned1.rb +++ b/ext/tk/sample/demos-jp/paned1.rb @@ -1,3 +1,5 @@ +# -*- coding: euc-jp -*- +# # paned1.rb # # This demonstration script creates a toplevel window containing diff --git a/ext/tk/sample/demos-jp/paned2.rb b/ext/tk/sample/demos-jp/paned2.rb index 1e82eddda4..b394432b1c 100644 --- a/ext/tk/sample/demos-jp/paned2.rb +++ b/ext/tk/sample/demos-jp/paned2.rb @@ -1,3 +1,5 @@ +# -*- coding: euc-jp -*- +# # paned2.rb -- # # This demonstration script creates a toplevel window containing diff --git a/ext/tk/sample/demos-jp/pendulum.rb b/ext/tk/sample/demos-jp/pendulum.rb index d703c74d5a..c245136d5c 100644 --- a/ext/tk/sample/demos-jp/pendulum.rb +++ b/ext/tk/sample/demos-jp/pendulum.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # This demonstration illustrates how Tcl/Tk can be used to construct # simulations of physical systems. 
diff --git a/ext/tk/sample/demos-jp/plot.rb b/ext/tk/sample/demos-jp/plot.rb index 902b144f72..09a3446836 100644 --- a/ext/tk/sample/demos-jp/plot.rb +++ b/ext/tk/sample/demos-jp/plot.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # 2-D plot widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/puzzle.rb b/ext/tk/sample/demos-jp/puzzle.rb index ad69775aab..6a3c8c8ef6 100644 --- a/ext/tk/sample/demos-jp/puzzle.rb +++ b/ext/tk/sample/demos-jp/puzzle.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # widet demo 'puzzle' (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/radio.rb b/ext/tk/sample/demos-jp/radio.rb index 5858b4222a..3a11c394a3 100644 --- a/ext/tk/sample/demos-jp/radio.rb +++ b/ext/tk/sample/demos-jp/radio.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # radiobutton widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/radio2.rb b/ext/tk/sample/demos-jp/radio2.rb index 5ac877d99a..b89520cdc5 100644 --- a/ext/tk/sample/demos-jp/radio2.rb +++ b/ext/tk/sample/demos-jp/radio2.rb @@ -1,3 +1,5 @@ +# -*- coding: euc-jp -*- +# # radio2.rb # # This demonstration script creates a toplevel window containing diff --git a/ext/tk/sample/demos-jp/radio3.rb b/ext/tk/sample/demos-jp/radio3.rb index 6e9a0f750b..a223a19bc2 100644 --- a/ext/tk/sample/demos-jp/radio3.rb +++ b/ext/tk/sample/demos-jp/radio3.rb @@ -1,3 +1,5 @@ +# -*- coding: euc-jp -*- +# # radio3.rb # # This demonstration script creates a toplevel window containing diff --git a/ext/tk/sample/demos-jp/rolodex-j b/ext/tk/sample/demos-jp/rolodex-j index dcc18cfa51..6c3ea7a484 100644 --- a/ext/tk/sample/demos-jp/rolodex-j +++ b/ext/tk/sample/demos-jp/rolodex-j @@ -1,4 +1,5 @@ #!/usr/bin/env ruby +# -*- coding: euc-jp -*- # # rolodex -- # このスクリプトは Tom LaStrange の rolodex の一部です。 diff --git a/ext/tk/sample/demos-jp/ruler.rb b/ext/tk/sample/demos-jp/ruler.rb index 94b4c921d3..c913e247d1 100644 --- a/ext/tk/sample/demos-jp/ruler.rb +++ b/ext/tk/sample/demos-jp/ruler.rb @@ -1,3 
+1,4 @@ +# -*- coding: euc-jp -*- # # ruler widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/sayings.rb b/ext/tk/sample/demos-jp/sayings.rb index ce195a0e53..24b011f5ab 100644 --- a/ext/tk/sample/demos-jp/sayings.rb +++ b/ext/tk/sample/demos-jp/sayings.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # listbox widget demo 'sayings' (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/search.rb b/ext/tk/sample/demos-jp/search.rb index f5268f987f..adb72fd809 100644 --- a/ext/tk/sample/demos-jp/search.rb +++ b/ext/tk/sample/demos-jp/search.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # Text Search widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/spin.rb b/ext/tk/sample/demos-jp/spin.rb index c7b8096723..b8eb99c4ed 100644 --- a/ext/tk/sample/demos-jp/spin.rb +++ b/ext/tk/sample/demos-jp/spin.rb @@ -1,3 +1,5 @@ +# -*- coding: euc-jp -*- +# # spin.rb -- # # This demonstration script creates several spinbox widgets. diff --git a/ext/tk/sample/demos-jp/states.rb b/ext/tk/sample/demos-jp/states.rb index dfae821261..3c58711bd1 100644 --- a/ext/tk/sample/demos-jp/states.rb +++ b/ext/tk/sample/demos-jp/states.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # listbox widget demo 'states' (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/style.rb b/ext/tk/sample/demos-jp/style.rb index 59e406bc80..50855a549a 100644 --- a/ext/tk/sample/demos-jp/style.rb +++ b/ext/tk/sample/demos-jp/style.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # text (display styles) widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/tcolor b/ext/tk/sample/demos-jp/tcolor index c7f7ec7289..17f7e1347e 100644 --- a/ext/tk/sample/demos-jp/tcolor +++ b/ext/tk/sample/demos-jp/tcolor @@ -1,4 +1,5 @@ #!/usr/bin/env ruby +# -*- coding: euc-jp -*- # # tcolor -- # このスクリプトはRGB,HSB,CYM形式をサポートする diff --git a/ext/tk/sample/demos-jp/text.rb b/ext/tk/sample/demos-jp/text.rb index 0057d5dbdc..25e0e64e9a 100644 --- a/ext/tk/sample/demos-jp/text.rb 
+++ b/ext/tk/sample/demos-jp/text.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # text (basic facilities) widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/twind.rb b/ext/tk/sample/demos-jp/twind.rb index 2b228e4acd..166a44cdb2 100644 --- a/ext/tk/sample/demos-jp/twind.rb +++ b/ext/tk/sample/demos-jp/twind.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # text (embedded windows) widget demo (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/twind2.rb b/ext/tk/sample/demos-jp/twind2.rb index b634f07b4b..e8009cef19 100644 --- a/ext/tk/sample/demos-jp/twind2.rb +++ b/ext/tk/sample/demos-jp/twind2.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- # # text (embedded windows) widget demo 2 (called by 'widget') # diff --git a/ext/tk/sample/demos-jp/unicodeout.rb b/ext/tk/sample/demos-jp/unicodeout.rb index 090cdf3059..7ab415fe57 100644 --- a/ext/tk/sample/demos-jp/unicodeout.rb +++ b/ext/tk/sample/demos-jp/unicodeout.rb @@ -1,3 +1,5 @@ +# -*- coding: euc-jp -*- +# # unicodeout.rb -- # # This demonstration script shows how you can produce output (in label diff --git a/ext/tk/sample/demos-jp/vscale.rb b/ext/tk/sample/demos-jp/vscale.rb index 86f6f7cdee..eb0cea250d 100644 --- a/ext/tk/sample/demos-jp/vscale.rb +++ b/ext/tk/sample/demos-jp/vscale.rb @@ -1,3 +1,4 @@ +# -*- coding: euc-jp -*- require "tkcanvas" if defined?($vscale_demo) && $vscale_demo diff --git a/ext/tk/sample/demos-jp/widget b/ext/tk/sample/demos-jp/widget index 11495dda54..b369bfba96 100644 --- a/ext/tk/sample/demos-jp/widget +++ b/ext/tk/sample/demos-jp/widget @@ -1,4 +1,5 @@ #!/usr/bin/env ruby +# -*- coding: euc-jp -*- # 漢字コード設定 ( tk.rb のロード時の encoding 推定/設定に使われる ) $KCODE = 'euc' diff --git a/ext/tk/sample/tkextlib/vu/canvSticker2.rb b/ext/tk/sample/tkextlib/vu/canvSticker2.rb index f54e748660..21f098a196 100644 --- a/ext/tk/sample/tkextlib/vu/canvSticker2.rb +++ b/ext/tk/sample/tkextlib/vu/canvSticker2.rb @@ -1,4 +1,5 @@ #!/usr/bin/env ruby +# -*- coding: utf-8 -*- require 'tk' 
require 'tkextlib/vu/charts' diff --git a/file.c b/file.c index 2f4643dcd3..260de7c822 100644 --- a/file.c +++ b/file.c @@ -6,7 +6,7 @@ $Date$ created at: Mon Nov 15 12:24:34 JST 1993 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto Copyright (C) 2000 Network Applied Communication Laboratory, Inc. Copyright (C) 2000 Information-technology Promotion Agency, Japan diff --git a/gc.c b/gc.c index a9683e7493..8054d9acb3 100644 --- a/gc.c +++ b/gc.c @@ -6,7 +6,7 @@ $Date$ created at: Tue Oct 5 09:44:46 JST 1993 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto Copyright (C) 2000 Network Applied Communication Laboratory, Inc. Copyright (C) 2000 Information-technology Promotion Agency, Japan diff --git a/hash.c b/hash.c index d9198f678a..27c0c5bdfc 100644 --- a/hash.c +++ b/hash.c @@ -6,7 +6,7 @@ $Date$ created at: Mon Nov 22 18:51:18 JST 1993 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto Copyright (C) 2000 Network Applied Communication Laboratory, Inc. Copyright (C) 2000 Information-technology Promotion Agency, Japan diff --git a/include/ruby/intern.h b/include/ruby/intern.h index aefee7f674..ae231c7d91 100644 --- a/include/ruby/intern.h +++ b/include/ruby/intern.h @@ -6,7 +6,7 @@ $Date$ created at: Thu Jun 10 14:22:17 JST 1993 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto Copyright (C) 2000 Network Applied Communication Laboratory, Inc. 
Copyright (C) 2000 Information-technology Promotion Agency, Japan @@ -337,7 +337,7 @@ RUBY_EXTERN VALUE rb_default_rs; RUBY_EXTERN VALUE rb_output_rs; VALUE rb_io_write(VALUE, VALUE); VALUE rb_io_gets(VALUE); -VALUE rb_io_getc(VALUE); +VALUE rb_io_getbyte(VALUE); VALUE rb_io_ungetc(VALUE, VALUE); VALUE rb_io_close(VALUE); VALUE rb_io_flush(VALUE); @@ -444,7 +444,7 @@ VALUE rb_reg_last_match(VALUE); VALUE rb_reg_match_pre(VALUE); VALUE rb_reg_match_post(VALUE); VALUE rb_reg_match_last(VALUE); -VALUE rb_reg_new(const char*, long, int); +VALUE rb_reg_new(VALUE, int); VALUE rb_reg_match(VALUE, VALUE); VALUE rb_reg_match2(VALUE); int rb_reg_options(VALUE); @@ -498,6 +498,7 @@ VALUE rb_str_unlocktmp(VALUE); VALUE rb_str_dup_frozen(VALUE); VALUE rb_str_plus(VALUE, VALUE); VALUE rb_str_times(VALUE, VALUE); +int rb_str_sublen(VALUE, int); VALUE rb_str_substr(VALUE, long, long); void rb_str_modify(VALUE); VALUE rb_str_freeze(VALUE); diff --git a/include/ruby/io.h b/include/ruby/io.h index 6d22de8df1..becf262eca 100644 --- a/include/ruby/io.h +++ b/include/ruby/io.h @@ -6,7 +6,7 @@ $Date$ created at: Fri Nov 12 16:47:09 JST 1993 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ @@ -22,6 +22,7 @@ extern "C" { #include #include +#include "ruby/encoding.h" #if defined(HAVE_STDIO_EXT_H) #include @@ -44,6 +45,7 @@ typedef struct rb_io_t { int rbuf_off; int rbuf_len; int rbuf_capa; + rb_encoding *enc; } rb_io_t; #define HAVE_RB_IO_T 1 diff --git a/include/ruby/node.h b/include/ruby/node.h index e3722e0c41..4d6d73c7fd 100644 --- a/include/ruby/node.h +++ b/include/ruby/node.h @@ -6,7 +6,7 @@ $Date$ created at: Fri May 28 15:14:02 JST 1993 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ diff --git a/include/ruby/re.h b/include/ruby/re.h index 
ae6e0357b6..5b0cc24e9a 100644 --- a/include/ruby/re.h +++ b/include/ruby/re.h @@ -6,7 +6,7 @@ $Date$ created at: Thu Sep 30 14:18:32 JST 1993 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ diff --git a/include/ruby/regex.h b/include/ruby/regex.h index 118c37c480..ad736775fe 100644 --- a/include/ruby/regex.h +++ b/include/ruby/regex.h @@ -5,7 +5,7 @@ $Author$ $Date$ - Copyright (C) 1993-2005 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ @@ -29,10 +29,8 @@ extern "C" { ONIG_EXTERN OnigEncoding OnigEncDefaultCharEncoding; -#undef ismbchar -#define ismbchar(c) (mbclen((c)) != 1) -#define mbclen(c) \ - ONIGENC_MBC_ENC_LEN(OnigEncDefaultCharEncoding, (UChar* )(&c)) +#define ismbchar(p, enc) (mbclen((p),(enc)) != 1) +#define mbclen(p,enc) rb_enc_mbclen((p), (enc)) #endif /* ifndef ONIG_RUBY_M17N */ diff --git a/include/ruby/ruby.h b/include/ruby/ruby.h index ea3265505b..2dbf29956e 100644 --- a/include/ruby/ruby.h +++ b/include/ruby/ruby.h @@ -5,7 +5,7 @@ $Author$ created at: Thu Jun 10 14:26:32 JST 1993 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto Copyright (C) 2000 Network Applied Communication Laboratory, Inc. Copyright (C) 2000 Information-technology Promotion Agency, Japan @@ -455,6 +455,7 @@ struct RString { (!(RBASIC(str)->flags & RSTRING_NOEMBED) ? 
\ RSTRING(str)->as.ary : \ RSTRING(str)->as.heap.ptr) +#define RSTRING_END(str) (RSTRING_PTR(str)+RSTRING_LEN(str)) struct RArray { struct RBasic basic; @@ -598,6 +599,32 @@ enum ruby_value_flags { #define FL_USER6 RUBY_FL_USER6 RUBY_FL_USER7 = (1<<(FL_USHIFT+7)), #define FL_USER7 RUBY_FL_USER7 + RUBY_FL_USER8 = (1<<(FL_USHIFT+8)), +#define FL_USER8 RUBY_FL_USER8 + RUBY_FL_USER9 = (1<<(FL_USHIFT+9)), +#define FL_USER9 RUBY_FL_USER9 + RUBY_FL_USER10 = (1<<(FL_USHIFT+10)), +#define FL_USER10 RUBY_FL_USER10 + RUBY_FL_USER11 = (1<<(FL_USHIFT+11)), +#define FL_USER11 RUBY_FL_USER11 + RUBY_FL_USER12 = (1<<(FL_USHIFT+12)), +#define FL_USER12 RUBY_FL_USER12 + RUBY_FL_USER13 = (1<<(FL_USHIFT+13)), +#define FL_USER13 RUBY_FL_USER13 + RUBY_FL_USER14 = (1<<(FL_USHIFT+14)), +#define FL_USER14 RUBY_FL_USER14 + RUBY_FL_USER15 = (1<<(FL_USHIFT+15)), +#define FL_USER15 RUBY_FL_USER15 + RUBY_FL_USER16 = (1<<(FL_USHIFT+16)), +#define FL_USER16 RUBY_FL_USER16 + RUBY_FL_USER17 = (1<<(FL_USHIFT+17)), +#define FL_USER17 RUBY_FL_USER17 + RUBY_FL_USER18 = (1<<(FL_USHIFT+18)), +#define FL_USER18 RUBY_FL_USER18 + RUBY_FL_USER19 = (1<<(FL_USHIFT+19)), +#define FL_USER19 RUBY_FL_USER19 + RUBY_FL_USER20 = (1<<(FL_USHIFT+20)), +#define FL_USER20 RUBY_FL_USER20 }; #define SPECIAL_CONST_P(x) (IMMEDIATE_P(x) || !RTEST(x)) @@ -667,6 +694,7 @@ void rb_gc_unregister_address(VALUE*); ID rb_intern(const char*); ID rb_intern2(const char*, long); +ID rb_intern_str(VALUE str); const char *rb_id2name(ID); ID rb_to_id(VALUE); VALUE rb_id2str(ID); diff --git a/include/ruby/signal.h b/include/ruby/signal.h index 29ffcd9f11..23db123d92 100644 --- a/include/ruby/signal.h +++ b/include/ruby/signal.h @@ -6,7 +6,7 @@ $Date$ created at: Wed Aug 16 01:15:38 JST 1995 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ diff --git a/include/ruby/util.h b/include/ruby/util.h index 8437872479..f9ce983269 100644 
--- a/include/ruby/util.h +++ b/include/ruby/util.h @@ -6,7 +6,7 @@ $Date$ created at: Thu Mar 9 11:55:53 JST 1995 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ diff --git a/inits.c b/inits.c index 7c53fb7aa2..ce7172987f 100644 --- a/inits.c +++ b/inits.c @@ -6,7 +6,7 @@ $Date$ created at: Tue Dec 28 16:01:58 JST 1993 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ diff --git a/insns.def b/insns.def index 60d9d9a634..685f49f606 100644 --- a/insns.def +++ b/insns.def @@ -406,7 +406,7 @@ toregexp (VALUE val) { volatile VALUE tmp = str; /* for GC */ - val = rb_reg_new(RSTRING_PTR(str), RSTRING_LEN(str), flag); + val = rb_reg_new(str, flag); } /** diff --git a/io.c b/io.c index 850c1a657e..e9fd2a0b07 100644 --- a/io.c +++ b/io.c @@ -6,7 +6,7 @@ $Date$ created at: Fri Oct 15 18:08:59 JST 1993 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto Copyright (C) 2000 Network Applied Communication Laboratory, Inc. 
Copyright (C) 2000 Information-technology Promotion Agency, Japan @@ -278,27 +278,38 @@ io_unread(rb_io_t *fptr) return; } -static int -io_ungetc(int c, rb_io_t *fptr) +static void +io_ungetc(VALUE str, rb_io_t *fptr) { + int len = RSTRING_LEN(str); + if (fptr->rbuf == NULL) { fptr->rbuf_off = 0; fptr->rbuf_len = 0; - fptr->rbuf_capa = 8192; + if (len > 8192) + fptr->rbuf_capa = len; + else + fptr->rbuf_capa = 8192; fptr->rbuf = ALLOC_N(char, fptr->rbuf_capa); } - if (c < 0 || fptr->rbuf_len == fptr->rbuf_capa) { - return -1; - } if (fptr->rbuf_off == 0) { - if (fptr->rbuf_len) - MEMMOVE(fptr->rbuf+1, fptr->rbuf, char, fptr->rbuf_len); - fptr->rbuf_off = 1; + if (fptr->rbuf_len) { + MEMMOVE(fptr->rbuf+len, fptr->rbuf, char, fptr->rbuf_len); + } + fptr->rbuf_off = len; } - fptr->rbuf_off--; - fptr->rbuf_len++; - fptr->rbuf[fptr->rbuf_off] = c; - return c; + else if (fptr->rbuf_off < len) { + int capa = fptr->rbuf_len + len; + char *buf = ALLOC_N(char, capa); + + if (fptr->rbuf_len) { + MEMMOVE(buf+len, fptr->rbuf+fptr->rbuf_off, char, fptr->rbuf_len); + } + fptr->rbuf_off = len; + } + fptr->rbuf_off-=len; + fptr->rbuf_len+=len; + MEMMOVE(fptr->rbuf+fptr->rbuf_off, RSTRING_PTR(str), char, len); } static rb_io_t * @@ -875,16 +886,10 @@ rb_io_rewind(VALUE io) } static int -io_getc(rb_io_t *fptr) +io_fillbuf(rb_io_t *fptr) { int r; - if (fptr->fd == 0 && (fptr->mode & FMODE_TTY) && TYPE(rb_stdout) == T_FILE) { - rb_io_t *ofp; - GetOpenFile(rb_stdout, ofp); - if (ofp->mode & FMODE_TTY) { - rb_io_flush(rb_stdout); - } - } + if (fptr->rbuf == NULL) { fptr->rbuf_off = 0; fptr->rbuf_len = 0; @@ -906,9 +911,7 @@ io_getc(rb_io_t *fptr) if (r == 0) return -1; /* EOF */ } - fptr->rbuf_off++; - fptr->rbuf_len--; - return (unsigned char)fptr->rbuf[fptr->rbuf_off-1]; + return 0; } /* @@ -947,20 +950,16 @@ VALUE rb_io_eof(VALUE io) { rb_io_t *fptr; - int ch; GetOpenFile(io, fptr); rb_io_check_readable(fptr); if (READ_DATA_PENDING(fptr)) return Qfalse; READ_CHECK(fptr); - ch = 
io_getc(fptr); - - if (ch != EOF) { - io_ungetc(ch, fptr); - return Qfalse; + if (io_fillbuf(fptr) < 0) { + return Qtrue; } - return Qtrue; + return Qfalse; } /* @@ -1167,13 +1166,9 @@ io_fread(VALUE str, long offset, rb_io_t *fptr) } rb_thread_wait_fd(fptr->fd); rb_io_check_closed(fptr); - c = io_getc(fptr); - if (c < 0) { + if (io_fillbuf(fptr) < 0) { break; } - RSTRING_PTR(str)[offset++] = c; - if (offset > RSTRING_LEN(str)) break; - n--; } return len - n; } @@ -1599,9 +1594,7 @@ appendline(rb_io_t *fptr, int delim, VALUE *strp, long *lp) } rb_thread_wait_fd(fptr->fd); rb_io_check_closed(fptr); - c = io_getc(fptr); - limit--; - if (c < 0) { + if (io_fillbuf(fptr) < 0) { *lp = limit; return c; } @@ -1640,10 +1633,8 @@ swallow(rb_io_t *fptr, int term) } rb_thread_wait_fd(fptr->fd); rb_io_check_closed(fptr); - c = io_getc(fptr); - if (c != term) { - io_ungetc(c, fptr); - return Qtrue; + if (io_fillbuf(fptr) < 0) { + break; } } while (c != EOF); return Qfalse; @@ -2020,20 +2011,24 @@ static VALUE rb_io_each_byte(VALUE io) { rb_io_t *fptr; - int c; + char *p, *e; RETURN_ENUMERATOR(io, 0, 0); GetOpenFile(io, fptr); for (;;) { + p = fptr->rbuf+fptr->rbuf_off; + e = p + fptr->rbuf_len; + while (p < e) { + rb_yield(INT2FIX(*p & 0xff)); + p++; + } rb_io_check_readable(fptr); READ_CHECK(fptr); - c = io_getc(fptr); - if (c < 0) { + if (io_fillbuf(fptr) < 0) { break; } - rb_yield(INT2FIX(c & 0xff)); - } + } return io; } @@ -2070,54 +2065,54 @@ rb_io_bytes(VALUE str) return rb_enumeratorize(str, ID2SYM(rb_intern("each_byte")), 0, 0); } -VALUE -rb_io_getc(VALUE io) -{ - rb_io_t *fptr; - int c; - - GetOpenFile(io, fptr); - rb_io_check_readable(fptr); - - READ_CHECK(fptr); - c = io_getc(fptr); - - if (c < 0) { - return Qnil; - } - return INT2FIX(c & 0xff); -} - /* * call-seq: - * ios.getc => string or nil - * + * ios.getc => string or nil + * * Reads a one-character string from ios. Returns * nil if called at end of file. 
- * + * * f = File.new("testfile") * f.getc #=> "8" * f.getc #=> "1" */ -VALUE -rb_io_getc_m(VALUE io) +static VALUE +rb_io_getc(VALUE io) { - char ch; + rb_encoding *enc; rb_io_t *fptr; - int c; + int n, left; + VALUE str; GetOpenFile(io, fptr); rb_io_check_readable(fptr); + enc = rb_enc_get(io); READ_CHECK(fptr); - c = io_getc(fptr); - - if (c < 0) { - return Qnil; + if (io_fillbuf(fptr) < 0) { + rb_eof_error(); } - ch = c & 0xff; - return rb_str_new(&ch, 1); + n = rb_enc_mbclen(fptr->rbuf+fptr->rbuf_off, enc); + if (n < fptr->rbuf_len) { + str = rb_str_new(fptr->rbuf+fptr->rbuf_off, n); + fptr->rbuf_off += n; + fptr->rbuf_len -= n; + } + else { + str = rb_str_new(0, n); + left = fptr->rbuf_len; + MEMCPY(RSTRING_PTR(str), fptr->rbuf+fptr->rbuf_off, char, left); + if (io_fillbuf(fptr) < 0) { + rb_eof_error(); + } + MEMCPY(RSTRING_PTR(str)+left, fptr->rbuf, char, n-left); + fptr->rbuf_off += left; + fptr->rbuf_len -= left; + } + rb_enc_associate(str, enc); + + return str; } int @@ -2139,14 +2134,74 @@ rb_getc(FILE *f) * call-seq: * ios.readchar => string * - * Reads a character as with IO#getc, but raises an + * Reads a one-character string from ios. Raises an * EOFError on end of file. + * + * f = File.new("testfile") + * f.readchar #=> "8" + * f.readchar #=> "1" */ static VALUE rb_io_readchar(VALUE io) { - VALUE c = rb_io_getc_m(io); + VALUE c = rb_io_getc(io); + + if (NIL_P(c)) { + rb_eof_error(); + } + return c; +} + +/* + * call-seq: + * ios.getbyte => fixnum or nil + * + * Gets the next 8-bit byte (0..255) from ios. Returns + * nil if called at end of file. 
+ * * f = File.new("testfile") + * f.getbyte #=> 84 + * f.getbyte #=> 104 + */ + +VALUE +rb_io_getbyte(VALUE io) +{ + rb_io_t *fptr; + int c; + + GetOpenFile(io, fptr); + rb_io_check_readable(fptr); + READ_CHECK(fptr); + if (fptr->fd == 0 && (fptr->mode & FMODE_TTY) && TYPE(rb_stdout) == T_FILE) { + rb_io_t *ofp; + GetOpenFile(rb_stdout, ofp); + if (ofp->mode & FMODE_TTY) { + rb_io_flush(rb_stdout); + } + } + if (io_fillbuf(fptr) < 0) { + return Qnil; + } + fptr->rbuf_off++; + fptr->rbuf_len--; + c = (unsigned char)fptr->rbuf[fptr->rbuf_off-1]; + return INT2FIX(c & 0xff); +} + +/* + * call-seq: + * ios.readbyte => fixnum + * + * Reads a byte as with IO#getbyte, but raises an + * EOFError on end of file. + */ + +static VALUE +rb_io_readbyte(VALUE io) +{ + VALUE c = rb_io_getbyte(io); if (NIL_P(c)) { rb_eof_error(); @@ -2173,25 +2228,24 @@ rb_io_readchar(VALUE io) VALUE rb_io_ungetc(VALUE io, VALUE c) { + rb_encoding *enc; rb_io_t *fptr; - int cc; GetOpenFile(io, fptr); rb_io_check_readable(fptr); if (NIL_P(c)) return Qnil; + enc = rb_enc_get(io); if (FIXNUM_P(c)) { - cc = FIX2INT(c); + int cc = FIX2INT(c); + char buf[16]; + + rb_enc_mbcput(cc, buf, enc); + c = rb_str_new(buf, rb_enc_codelen(cc, enc)); } else { SafeStringValue(c); - if (RSTRING_LEN(c) > 1) { - rb_warn("IO#ungetc pushes back only one byte"); - } - cc = (unsigned char)RSTRING_PTR(c)[0]; - } - if (io_ungetc(cc, fptr) == EOF && cc != EOF) { - rb_raise(rb_eIOError, "ungetc failed"); } + io_ungetc(c, fptr); return Qnil; } @@ -5465,7 +5519,29 @@ argf_getc(void) ch = rb_funcall3(current_file, rb_intern("getc"), 0, 0); } else { - ch = rb_io_getc_m(current_file); + ch = rb_io_getc(current_file); + } + if (NIL_P(ch) && next_p != -1) { + argf_close(current_file); + next_p = 1; + goto retry; + } + + return ch; +} + +static VALUE +argf_getbyte(void) +{ + VALUE ch; + + retry: + if (!next_argv()) return Qnil; + if (TYPE(current_file) != T_FILE) { + ch = rb_funcall3(current_file, rb_intern("getbyte"), 0, 0); + } 
+ else { + ch = rb_io_getbyte(current_file); } if (NIL_P(ch) && next_p != -1) { argf_close(current_file); @@ -5478,11 +5554,33 @@ argf_getc(void) static VALUE argf_readchar(void) +{ + VALUE ch; + + retry: + if (!next_argv()) return Qnil; + if (TYPE(current_file) != T_FILE) { + ch = rb_funcall3(current_file, rb_intern("getc"), 0, 0); + } + else { + ch = rb_io_getc(current_file); + } + if (NIL_P(ch) && next_p != -1) { + argf_close(current_file); + next_p = 1; + goto retry; + } + + return ch; +} + +static VALUE +argf_readbyte(void) { VALUE c; NEXT_ARGF_FORWARD(0, 0); - c = argf_getc(); + c = argf_getbyte(); if (NIL_P(c)) { rb_eof_error(); } @@ -5780,8 +5878,10 @@ Init_IO(void) rb_define_method(rb_cIO, "write", io_write, 1); rb_define_method(rb_cIO, "gets", rb_io_gets_m, -1); rb_define_method(rb_cIO, "readline", rb_io_readline, -1); - rb_define_method(rb_cIO, "getc", rb_io_getc_m, 0); + rb_define_method(rb_cIO, "getc", rb_io_getc, 0); + rb_define_method(rb_cIO, "getbyte", rb_io_getbyte, 0); rb_define_method(rb_cIO, "readchar", rb_io_readchar, 0); + rb_define_method(rb_cIO, "readbyte", rb_io_readbyte, 0); rb_define_method(rb_cIO, "ungetc",rb_io_ungetc, 1); rb_define_method(rb_cIO, "<<", rb_io_addstr, 1); rb_define_method(rb_cIO, "flush", rb_io_flush, 0); @@ -5851,7 +5951,9 @@ Init_IO(void) rb_define_singleton_method(argf, "gets", rb_f_gets, -1); rb_define_singleton_method(argf, "readline", rb_f_readline, -1); rb_define_singleton_method(argf, "getc", argf_getc, 0); + rb_define_singleton_method(argf, "getbyte", argf_getbyte, 0); rb_define_singleton_method(argf, "readchar", argf_readchar, 0); + rb_define_singleton_method(argf, "readbyte", argf_readbyte, 0); rb_define_singleton_method(argf, "tell", argf_tell, 0); rb_define_singleton_method(argf, "seek", argf_seek_m, -1); rb_define_singleton_method(argf, "rewind", argf_rewind, 0); diff --git a/main.c b/main.c index 09c0bdb8bf..b5c5bb3351 100644 --- a/main.c +++ b/main.c @@ -6,7 +6,7 @@ $Date$ created at: Fri Aug 19 13:19:58 
JST 1994 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ diff --git a/marshal.c b/marshal.c index c6c512bdb0..b4ba9b6140 100644 --- a/marshal.c +++ b/marshal.c @@ -6,7 +6,7 @@ $Date$ created at: Thu Apr 27 16:30:01 JST 1995 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ @@ -1105,7 +1105,7 @@ r_object0(struct load_arg *arg, int *ivp, VALUE extmod) { volatile VALUE str = r_bytes(arg); int options = r_byte(arg); - v = r_entry(rb_reg_new(RSTRING_PTR(str), RSTRING_LEN(str), options), arg); + v = r_entry(rb_reg_new(str, options), arg); } break; diff --git a/math.c b/math.c index d0c2eee487..1e55902db8 100644 --- a/math.c +++ b/math.c @@ -6,7 +6,7 @@ $Date$ created at: Tue Jan 25 14:12:56 JST 1994 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ diff --git a/numeric.c b/numeric.c index 9b723d8581..f6782aebd8 100644 --- a/numeric.c +++ b/numeric.c @@ -6,7 +6,7 @@ $Date$ created at: Fri Aug 13 18:33:09 JST 1993 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ diff --git a/object.c b/object.c index bcff31eebc..c6fa1176f3 100644 --- a/object.c +++ b/object.c @@ -6,7 +6,7 @@ $Date$ created at: Thu Jul 15 12:01:24 JST 1993 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto Copyright (C) 2000 Network Applied Communication Laboratory, Inc. 
Copyright (C) 2000 Information-technology Promotion Agency, Japan diff --git a/pack.c b/pack.c index 726048fae0..ff0cdae971 100644 --- a/pack.c +++ b/pack.c @@ -6,7 +6,7 @@ $Date$ created at: Thu Feb 10 15:17:05 JST 1994 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ diff --git a/parse.y b/parse.y index e76c7c3187..093d8c3593 100644 --- a/parse.y +++ b/parse.y @@ -6,7 +6,7 @@ $Date$ created at: Fri May 28 18:02:42 JST 1993 - Copyright (C) 1993-2004 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ @@ -20,6 +20,8 @@ #include "ruby/intern.h" #include "ruby/node.h" #include "ruby/st.h" +#include "ruby/encoding.h" +#include "regenc.h" #include #include #include @@ -255,8 +257,13 @@ struct parser_params { VALUE parsing_thread; int toplevel_p; #endif + + rb_encoding *enc; }; +#define STR_NEW(p,n) rb_enc_str_new((p),(n),parser->enc) +#define STR_NEW2(p) rb_enc_str_new((p),strlen(p),parser->enc) + #ifdef YYMALLOC void *rb_parser_malloc(struct parser_params *, size_t); void *rb_parser_realloc(struct parser_params *, void *, size_t); @@ -3555,7 +3562,7 @@ strings : string /*%%%*/ NODE *node = $1; if (!node) { - node = NEW_STR(rb_str_new(0, 0)); + node = NEW_STR(STR_NEW(0, 0)); } else { node = evstr2dstr(node); @@ -3594,7 +3601,7 @@ xstring : tXSTRING_BEG xstring_contents tSTRING_END /*%%%*/ NODE *node = $2; if (!node) { - node = NEW_XSTR(rb_str_new(0, 0)); + node = NEW_XSTR(STR_NEW(0, 0)); } else { switch (nd_type(node)) { @@ -3605,7 +3612,7 @@ xstring : tXSTRING_BEG xstring_contents tSTRING_END nd_set_type(node, NODE_DXSTR); break; default: - node = NEW_NODE(NODE_DXSTR, rb_str_new(0, 0), 1, NEW_LIST(node)); + node = NEW_NODE(NODE_DXSTR, STR_NEW(0, 0), 1, NEW_LIST(node)); break; } } @@ -3622,20 +3629,18 @@ regexp : tREGEXP_BEG xstring_contents tREGEXP_END int options = $3; NODE 
*node = $2; if (!node) { - node = NEW_LIT(reg_compile("", 0, options)); + node = NEW_LIT(rb_reg_compile(0, options & ~RE_OPTION_ONCE)); } else switch (nd_type(node)) { case NODE_STR: { VALUE src = node->nd_lit; nd_set_type(node, NODE_LIT); - node->nd_lit = reg_compile(RSTRING_PTR(src), - RSTRING_LEN(src), - options); + node->nd_lit = rb_reg_compile(src, options&~RE_OPTION_ONCE); } break; default: - node = NEW_NODE(NODE_DSTR, rb_str_new(0, 0), 1, NEW_LIST(node)); + node = NEW_NODE(NODE_DSTR, STR_NEW(0, 0), 1, NEW_LIST(node)); case NODE_DSTR: if (options & RE_OPTION_ONCE) { nd_set_type(node, NODE_DREGX_ONCE); @@ -3880,7 +3885,7 @@ dsym : tSYMBEG xstring_contents tSTRING_END nd_set_type($$, NODE_LIT); break; default: - $$ = NEW_NODE(NODE_DSYM, rb_str_new(0, 0), 1, NEW_LIST($$)); + $$ = NEW_NODE(NODE_DSYM, STR_NEW(0, 0), 1, NEW_LIST($$)); break; } } @@ -4518,7 +4523,7 @@ ripper_dispatch_scan_event(struct parser_params *parser, int t) if (lex_p < parser->tokp) rb_raise(rb_eRuntimeError, "lex_p < tokp"); if (lex_p == parser->tokp) return; - str = rb_str_new(parser->tokp, lex_p - parser->tokp); + str = STR_NEW(parser->tokp, lex_p - parser->tokp); yylval.val = ripper_dispatch1(parser, ripper_token2eventid(t), str); ripper_flush(parser); } @@ -4552,7 +4557,11 @@ ripper_dispatch_delayed_token(struct parser_params *parser, int t) /* As in Harbison and Steele. 
*/ # define SIGN_EXTEND_CHAR(c) ((((unsigned char)(c)) ^ 128) - 128) #endif -#define is_identchar(c) (SIGN_EXTEND_CHAR(c)!=-1&&(ISALNUM(c) || (c) == '_' || ismbchar(c))) + +#define parser_mbclen() mbclen((lex_p-1),parser->enc) +#define is_identchar(p, enc) (rb_enc_isalnum(*p, enc) || (*p) == '_' || ismbchar(p, enc)) +#define parser_ismbchar() ismbchar((lex_p-1), parser->enc) +#define parser_is_identchar() (!parser->eofp && is_identchar((lex_p-1),parser->enc)) static int parser_yyerror(struct parser_params *parser, const char *msg) @@ -4596,7 +4605,7 @@ parser_yyerror(struct parser_params *parser, const char *msg) rb_compile_error_append("%s", buf); } #else - dispatch1(parse_error, rb_str_new2(msg)); + dispatch1(parse_error, STR_NEW2(msg)); #endif /* !RIPPER */ return 0; } @@ -4634,7 +4643,7 @@ yycompile(struct parser_params *parser, const char *f, int line) if (!compile_for_eval && rb_safe_level() == 0) { ruby_debug_lines = ruby_suppress_tracing(debug_lines, (VALUE)f); if (ruby_debug_lines && line > 1) { - VALUE str = rb_str_new(0,0); + VALUE str = STR_NEW(0,0); n = line - 1; do { rb_ary_push(ruby_debug_lines, str); @@ -4660,7 +4669,15 @@ yycompile(struct parser_params *parser, const char *f, int line) tree = NEW_NIL(); } if (ruby_eval_tree_begin) { - tree->nd_body = NEW_PRELUDE(ruby_eval_tree_begin, tree->nd_body); + NODE *scope = ruby_eval_tree; + + if (scope) { + scope->nd_body = NEW_PRELUDE(ruby_eval_tree_begin, scope->nd_body); + } + return scope; + } + else { + return ruby_eval_tree; } return tree; } @@ -4682,7 +4699,7 @@ lex_get_str(struct parser_params *parser, VALUE s) if (*end++ == '\n') break; } lex_gets_ptr = end - RSTRING_PTR(s); - return rb_str_new(beg, end - beg); + return STR_NEW(beg, end - beg); } static VALUE @@ -5173,8 +5190,8 @@ parser_tokadd_string(struct parser_params *parser, } } } - else if (ismbchar(uc)) { - int i, len = mbclen(uc)-1; + else if (parser_ismbchar()) { + int i, len = parser_mbclen()-1; for (i = 0; i < len; i++) { tokadd(c); @@ 
-5252,7 +5269,7 @@ parser_parse_string(struct parser_params *parser, NODE *quote) } tokfix(); - set_yylval_str(rb_str_new(tok(), toklen())); + set_yylval_str(STR_NEW(tok(), toklen())); return tSTRING_CONTENT; } @@ -5278,8 +5295,7 @@ parser_heredoc_identifier(struct parser_params *parser) tokadd(func); term = c; while ((c = nextc()) != -1 && c != term) { - uc = (unsigned int)c; - len = mbclen(uc); + len = parser_mbclen(); do {tokadd(c);} while (--len > 0 && (c = nextc()) != -1); } if (c == -1) { @@ -5289,8 +5305,7 @@ parser_heredoc_identifier(struct parser_params *parser) break; default: - uc = (unsigned int)c; - if (!is_identchar(uc)) { + if (!parser_is_identchar()) { pushback(c); if (func & STR_FUNC_INDENT) { pushback('-'); @@ -5301,11 +5316,9 @@ parser_heredoc_identifier(struct parser_params *parser) term = '"'; tokadd(func |= str_dquote); do { - uc = (unsigned int)c; - len = mbclen(uc); + len = parser_mbclen(); do {tokadd(c);} while (--len > 0 && (c = nextc()) != -1); - } while ((c = nextc()) != -1 && - (uc = (unsigned char)c, is_identchar(uc))); + } while ((c = nextc()) != -1 && parser_is_identchar()); pushback(c); break; } @@ -5317,7 +5330,7 @@ parser_heredoc_identifier(struct parser_params *parser) len = lex_p - lex_pbeg; lex_goto_eol(parser); lex_strterm = rb_node_newnode(NODE_HEREDOC, - rb_str_new(tok(), toklen()), /* nd_lit */ + STR_NEW(tok(), toklen()), /* nd_lit */ len, /* nd_nth */ lex_lastline); /* nd_orig */ nd_set_line(lex_strterm, ruby_sourceline); @@ -5410,7 +5423,7 @@ parser_here_document(struct parser_params *parser, NODE *here) if (str) rb_str_cat(str, p, pend - p); else - str = rb_str_new(p, pend - p); + str = STR_NEW(p, pend - p); if (pend < lex_pend) rb_str_cat(str, "\n", 1); lex_goto_eol(parser); if (nextc() == -1) { @@ -5436,13 +5449,13 @@ parser_here_document(struct parser_params *parser, NODE *here) pushback(c); if ((c = tokadd_string(func, '\n', 0, NULL)) == -1) goto error; if (c != '\n') { - set_yylval_str(rb_str_new(tok(), toklen())); 
+ set_yylval_str(STR_NEW(tok(), toklen())); return tSTRING_CONTENT; } tokadd(nextc()); if ((c = nextc()) == -1) goto error; } while (!whole_match_p(eos, len, indent)); - str = rb_str_new(tok(), toklen()); + str = STR_NEW(tok(), toklen()); } heredoc_restore(lex_strterm); lex_strterm = NEW_STRTERM(-1, 0, 0); @@ -5487,6 +5500,7 @@ pragma_encoding(struct parser_params *parser, const char *name, const char *val) if (parser && parser->line_count != (parser->has_shebang ? 2 : 1)) return; rb_set_kcode(val); + parser->enc = rb_enc_find(val); } struct pragma { @@ -5540,7 +5554,7 @@ parser_pragma(struct parser_params *parser, const char *str, int len) #define str_copy(_s, _p, _n) ((_s) \ ? (rb_str_resize((_s), (_n)), \ MEMCPY(RSTRING_PTR(_s), (_p), char, (_n)), (_s)) \ - : ((_s) = rb_str_new((_p), (_n)))) + : ((_s) = STR_NEW((_p), (_n)))) if (len <= 7) return Qfalse; if (!(beg = pragma_marker(str, len))) return Qfalse; @@ -5934,8 +5948,7 @@ parser_yylex(struct parser_params *parser) compile_error(PARSER_ARG "incomplete character syntax"); return 0; } - uc = (unsigned char)c; - if (ISSPACE(c)){ + if (rb_enc_isspace(c, parser->enc)){ if (!IS_ARG()){ int c2 = 0; switch (c) { @@ -5968,8 +5981,8 @@ parser_yylex(struct parser_params *parser) return '?'; } newtok(); - if (ismbchar(uc)) { - int i, len = mbclen(uc)-1; + if (parser_ismbchar()) { + int i, len = parser_mbclen()-1; tokadd(c); for (i = 0; i < len; i++) { @@ -5977,7 +5990,8 @@ parser_yylex(struct parser_params *parser) tokadd(c); } } - else if ((ISALNUM(c) || c == '_') && lex_p < lex_pend && is_identchar(*lex_p)) { + else if ((rb_enc_isalnum(c, parser->enc) || c == '_') && + lex_p < lex_pend && is_identchar(lex_p, parser->enc)) { goto ternary; } else if (c == '\\') { @@ -5988,7 +6002,7 @@ parser_yylex(struct parser_params *parser) tokadd(c); } tokfix(); - set_yylval_str(rb_str_new(tok(), toklen())); + set_yylval_str(STR_NEW(tok(), toklen())); lex_state = EXPR_ENDARG; return tCHAR; @@ -6544,8 +6558,7 @@ parser_yylex(struct 
parser_params *parser) } else { term = nextc(); - uc = (unsigned char)c; - if (ISALNUM(term) || ismbchar(uc)) { + if (rb_enc_isalnum(term, parser->enc) || parser_ismbchar()) { yyerror("unknown type of %string"); return 0; } @@ -6625,8 +6638,7 @@ parser_yylex(struct parser_params *parser) switch (c) { case '_': /* $_: last read line string */ c = nextc(); - uc = (unsigned char)c; - if (is_identchar(uc)) { + if (parser_is_identchar()) { tokadd('$'); tokadd('_'); break; @@ -6660,8 +6672,7 @@ parser_yylex(struct parser_params *parser) tokadd('$'); tokadd(c); c = nextc(); - uc = (unsigned char)c; - if (is_identchar(uc)) { + if (parser_is_identchar()) { tokadd(c); } else { @@ -6703,8 +6714,7 @@ parser_yylex(struct parser_params *parser) return tNTH_REF; default: - uc = (unsigned char)c; - if (!is_identchar(uc)) { + if (!parser_is_identchar()) { pushback(c); return '$'; } @@ -6730,8 +6740,7 @@ parser_yylex(struct parser_params *parser) } return 0; } - uc = (unsigned char)c; - if (!is_identchar(uc)) { + if (!parser_is_identchar()) { pushback(c); return '@'; } @@ -6753,9 +6762,8 @@ parser_yylex(struct parser_params *parser) break; default: - uc = (unsigned char)c; - if (!is_identchar(uc)) { - compile_error(PARSER_ARG "Invalid char `\\%03o' in expression", c); + if (!parser_is_identchar()) { + rb_compile_error(PARSER_ARG "Invalid char `\\%03o' in expression", c); goto retry; } @@ -6763,21 +6771,18 @@ parser_yylex(struct parser_params *parser) break; } - uc = (unsigned char)c; do { + int i, len; tokadd(c); - if (ismbchar(uc)) { - int i, len = mbclen(uc)-1; - for (i = 0; i < len; i++) { - c = nextc(); - tokadd(c); - } + len = parser_mbclen()-1; + for (i = 0; i < len; i++) { + c = nextc(); + tokadd(c); } c = nextc(); - uc = (unsigned char)c; - } while (is_identchar(uc)); - if ((c == '!' || c == '?') && is_identchar(tok()[0]) && !peek('=')) { + } while (parser_is_identchar()); + if ((c == '!' 
|| c == '?') && !peek('=')) { tokadd(c); } else { @@ -7214,7 +7219,7 @@ gettable_gen(struct parser_params *parser, ID id) return NEW_FALSE(); } else if (id == keyword__FILE__) { - return NEW_STR(rb_str_new2(ruby_sourcefile)); + return NEW_STR(STR_NEW2(ruby_sourcefile)); } else if (id == keyword__LINE__) { return NEW_LIT(INT2FIX(ruby_sourceline)); @@ -8115,8 +8120,7 @@ dvar_curr_gen(struct parser_params *parser, ID id) static VALUE reg_compile_gen(struct parser_params* parser, const char *ptr, long len, int options) { - VALUE rb_reg_compile(const char *, long, int); - VALUE re = rb_reg_compile(ptr, len, (options) & ~RE_OPTION_ONCE); + VALUE re = rb_reg_compile(STR_NEW(ptr, len), (options) & ~RE_OPTION_ONCE); if (NIL_P(re)) { RB_GC_GUARD(re) = rb_obj_as_string(rb_errinfo()); @@ -8316,7 +8320,7 @@ internal_id_gen(struct parser_params *parser) } static int -is_special_global_name(const char *m) +is_special_global_name(const char *m, rb_encoding *enc) { switch (*m) { case '~': case '*': case '$': case '?': case '!': case '@': @@ -8328,11 +8332,11 @@ is_special_global_name(const char *m) break; case '-': ++m; - if (is_identchar(*m)) m += mbclen(*m); + if (is_identchar(m, enc)) m += rb_enc_mbclen(m, enc); break; default: - if (!ISDIGIT(*m)) return 0; - do ++m; while (ISDIGIT(*m)); + if (!rb_enc_isdigit(*m, enc)) return 0; + do ++m; while (rb_enc_isdigit(*m, enc)); } return !*m; } @@ -8342,6 +8346,7 @@ rb_symname_p(const char *name) { const char *m = name; int localid = Qfalse; + rb_encoding *enc = rb_enc_from_index(0); if (!m) return Qfalse; switch (*m) { @@ -8349,7 +8354,7 @@ rb_symname_p(const char *name) return Qfalse; case '$': - if (is_special_global_name(++m)) return Qtrue; + if (is_special_global_name(++m, enc)) return Qtrue; goto id; case '@': @@ -8396,10 +8401,10 @@ rb_symname_p(const char *name) break; default: - localid = !ISUPPER(*m); + localid = !rb_enc_isupper(*m, enc); id: - if (*m != '_' && !ISALPHA(*m) && !ismbchar(*m)) return Qfalse; - while 
(is_identchar(*m)) m += mbclen(*m); + if (*m != '_' && !rb_enc_isalpha(*m, enc) && !ismbchar(m, enc)) return Qfalse; + while (is_identchar(m, enc)) m += rb_enc_mbclen(m, enc); if (localid) { switch (*m) { case '!': case '?': case '=': ++m; @@ -8411,7 +8416,7 @@ rb_symname_p(const char *name) } ID -rb_intern2(const char *name, long len) +rb_intern3(const char *name, long len, rb_encoding *enc) { const char *m = name; VALUE str; @@ -8429,13 +8434,13 @@ rb_intern2(const char *name, long len) last = len-1; id = 0; - switch (*name) { + switch (*m) { case '$': id |= ID_GLOBAL; - if (is_special_global_name(++m)) goto new_id; + if (is_special_global_name(++m, enc)) goto new_id; break; case '@': - if (name[1] == '@') { + if (m[1] == '@') { m++; id |= ID_CLASS; } @@ -8445,20 +8450,21 @@ rb_intern2(const char *name, long len) m++; break; default: - if (name[0] != '_' && ISASCII(name[0]) && !ISALNUM(name[0])) { + if (m[0] != '_' && rb_enc_isascii((unsigned char)m[0], enc) + && !rb_enc_isalnum(m[0], enc)) { /* operators */ int i; for (i=0; op_tbl[i].token; i++) { - if (*op_tbl[i].name == *name && - strcmp(op_tbl[i].name, name) == 0) { + if (*op_tbl[i].name == *m && + strcmp(op_tbl[i].name, m) == 0) { id = op_tbl[i].token; goto id_register; } } } - if (name[last] == '=') { + if (m[last] == '=') { /* attribute assignment */ id = rb_intern2(name, last); if (id > tLAST_TOKEN && !is_attrset_id(id)) { @@ -8467,7 +8473,7 @@ rb_intern2(const char *name, long len) } id = ID_ATTRSET; } - else if (ISUPPER(name[0])) { + else if (rb_enc_isupper(m[0], enc)) { id = ID_CONST; } else { @@ -8475,9 +8481,9 @@ rb_intern2(const char *name, long len) } break; } - if (!ISDIGIT(*m)) { - while (m <= name + last && is_identchar(*m)) { - m += mbclen(*m); + if (!rb_enc_isdigit(*m, enc)) { + while (m <= name + last && is_identchar(m, enc)) { + m += rb_enc_mbclen(m, enc); } } if (m - name < len) id = ID_JUNK; @@ -8491,12 +8497,24 @@ rb_intern2(const char *name, long len) return id; } +ID +rb_intern2(const 
char *name, long len) +{ + return rb_intern3(name, len, rb_enc_from_index(0)); +} + ID rb_intern(const char *name) { return rb_intern2(name, strlen(name)); } +ID +rb_intern_str(VALUE str) +{ + return rb_intern3(RSTRING_PTR(str), RSTRING_LEN(str), rb_enc_get(str)); +} + VALUE rb_id2str(ID id) { @@ -8662,6 +8680,7 @@ parser_initialize(struct parser_params *parser) #ifdef YYMALLOC parser->heap = NULL; #endif + parser->enc = rb_enc_from_index(0); } extern void rb_mark_source_filename(char *); @@ -9013,27 +9032,27 @@ ripper_compile_error(struct parser_params *parser, const char *fmt, ...) static void ripper_warn0(struct parser_params *parser, const char *fmt) { - rb_funcall(parser->value, rb_intern("warn"), 1, rb_str_new2(fmt)); + rb_funcall(parser->value, rb_intern("warn"), 1, STR_NEW2(fmt)); } static void ripper_warnI(struct parser_params *parser, const char *fmt, int a) { rb_funcall(parser->value, rb_intern("warn"), 2, - rb_str_new2(fmt), INT2NUM(a)); + STR_NEW2(fmt), INT2NUM(a)); } static void ripper_warnS(struct parser_params *parser, const char *fmt, const char *str) { rb_funcall(parser->value, rb_intern("warn"), 2, - rb_str_new2(fmt), rb_str_new2(str)); + STR_NEW2(fmt), STR_NEW2(str)); } static void ripper_warning0(struct parser_params *parser, const char *fmt) { - rb_funcall(parser->value, rb_intern("warning"), 1, rb_str_new2(fmt)); + rb_funcall(parser->value, rb_intern("warning"), 1, STR_NEW2(fmt)); } #if 0 /* unused in ripper right now */ @@ -9041,7 +9060,7 @@ static void ripper_warningS(struct parser_params *parser, const char *fmt, const char *str) { rb_funcall(parser->value, rb_intern("warning"), 2, - rb_str_new2(fmt), rb_str_new2(str)); + STR_NEW2(fmt), STR_NEW2(str)); } #endif @@ -9094,7 +9113,7 @@ ripper_initialize(int argc, VALUE *argv, VALUE self) parser->parser_lex_input = src; parser->eofp = Qfalse; if (NIL_P(fname)) { - fname = rb_str_new2("(ripper)"); + fname = STR_NEW2("(ripper)"); } else { StringValue(fname); diff --git a/prec.c b/prec.c index 
981dcbc85c..da394027c6 100644 --- a/prec.c +++ b/prec.c @@ -6,7 +6,7 @@ $Date$ created at: Tue Jan 26 02:40:41 2000 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ diff --git a/process.c b/process.c index 5374375e1b..92aaefca8c 100644 --- a/process.c +++ b/process.c @@ -6,7 +6,7 @@ $Date$ created at: Tue Aug 10 14:30:50 JST 1993 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto Copyright (C) 2000 Network Applied Communication Laboratory, Inc. Copyright (C) 2000 Information-technology Promotion Agency, Japan diff --git a/random.c b/random.c index d37995baff..077415420f 100644 --- a/random.c +++ b/random.c @@ -6,7 +6,7 @@ $Date$ created at: Fri Dec 24 16:39:21 JST 1993 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ diff --git a/range.c b/range.c index 0d00c93cab..934f56b7a2 100644 --- a/range.c +++ b/range.c @@ -6,7 +6,7 @@ $Date$ created at: Thu Aug 19 17:46:47 JST 1993 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ diff --git a/re.c b/re.c index 824d56210c..d44f274197 100644 --- a/re.c +++ b/re.c @@ -5,12 +5,13 @@ $Author$ created at: Mon Aug 9 18:24:49 JST 1993 - Copyright (C) 1993-2006 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto **********************************************************************/ #include "ruby/ruby.h" #include "ruby/re.h" +#include "ruby/encoding.h" #include "regint.h" #include @@ -289,23 +290,27 @@ kcode_to_arg_value(unsigned int kcode) static void set_re_kcode_by_option(struct RRegexp *re, int options) { + rb_encoding *enc = 0; + + FL_UNSET(re, KCODE_MASK); switch (options & ARG_KCODE_MASK) { case ARG_KCODE_NONE: - FL_UNSET(re, KCODE_MASK); + 
enc = rb_enc_from_index(0); + FL_SET(re, KCODE_NONE); FL_SET(re, KCODE_FIXED); break; case ARG_KCODE_EUC: - FL_UNSET(re, KCODE_MASK); + enc = rb_enc_find("euc-jp"); FL_SET(re, KCODE_EUC); FL_SET(re, KCODE_FIXED); break; case ARG_KCODE_SJIS: - FL_UNSET(re, KCODE_MASK); - FL_SET(re, KCODE_SJIS); + enc = rb_enc_find("sjis"); FL_SET(re, KCODE_FIXED); + FL_SET(re, KCODE_SJIS); break; case ARG_KCODE_UTF8: - FL_UNSET(re, KCODE_MASK); + enc = rb_enc_find("utf-8"); FL_SET(re, KCODE_UTF8); FL_SET(re, KCODE_FIXED); break; @@ -315,6 +320,9 @@ set_re_kcode_by_option(struct RRegexp *re, int options) FL_SET(re, reg_kcode); break; } + if (enc) { + rb_enc_associate((VALUE)re, enc); + } } static int @@ -371,15 +379,9 @@ kcode_reset_option(void) int rb_reg_mbclen2(unsigned int c, VALUE re) { - int len; unsigned char uc = (unsigned char)c; - if (!FL_TEST(re, KCODE_FIXED)) - return mbclen(uc); - kcode_set_option(re); - len = mbclen(uc); - kcode_reset_option(); - return len; + return rb_enc_mbclen(&uc, rb_enc_get(re)); } static void @@ -393,16 +395,17 @@ rb_reg_check(VALUE re) static void rb_reg_expr_str(VALUE str, const char *s, long len) { + rb_encoding *enc = rb_enc_get(str); const char *p, *pend; int need_escape = 0; p = s; pend = p + len; while (p true or false @@ -1489,7 +1484,7 @@ match_inspect(VALUE match) VALUE rb_cRegexp; static int -rb_reg_initialize(VALUE obj, const char *s, long len, +rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc, int options, onig_errmsg_buffer err) { struct RRegexp *re = RREGEXP(obj); @@ -1504,7 +1499,12 @@ rb_reg_initialize(VALUE obj, const char *s, long len, re->ptr = 0; re->str = 0; - set_re_kcode_by_option(re, options); + if (options & ARG_KCODE_MASK) { + set_re_kcode_by_option(re, options); + } + else { + rb_enc_associate((VALUE)re, enc); + } if (options & ARG_KCODE_MASK) { kcode_set_option((VALUE)re); @@ -1525,6 +1525,13 @@ rb_reg_initialize(VALUE obj, const char *s, long len, return 0; } +static int 
+rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err) +{ + return rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), rb_enc_get(str), + options, err); +} + static VALUE rb_reg_s_alloc(VALUE klass) { @@ -1539,27 +1546,35 @@ rb_reg_s_alloc(VALUE klass) } VALUE -rb_reg_new(const char *s, long len, int options) +rb_reg_new(VALUE s, int options) { VALUE re = rb_reg_s_alloc(rb_cRegexp); onig_errmsg_buffer err; - if (rb_reg_initialize(re, s, len, options, err) != 0) { - rb_exc_raise(rb_reg_error_desc(s, len, options, err)); + if (rb_reg_initialize_str(re, s, options, err) != 0) { + rb_reg_raise_str(s, err, re); } return re; } VALUE -rb_reg_compile(const char *s, long len, int options) +rb_reg_compile(VALUE str, int options) { VALUE re = rb_reg_s_alloc(rb_cRegexp); onig_errmsg_buffer err; - if (rb_reg_initialize(re, s, len, options, err) != 0) { - rb_set_errinfo(rb_reg_error_desc(s, len, options, err)); - return Qnil; + if (!str) str = rb_str_new(0,0); + if (rb_reg_initialize_str(re, str, options, err) != 0) { + char opts[6]; + VALUE desc = rb_str_buf_new2(err); + + rb_str_buf_cat2(desc, ": /"); + rb_reg_expr_str(desc, RSTRING_PTR(str), RSTRING_LEN(str)); + opts[0] = '/'; + option_to_str(opts + 1, options); + strlcat(opts, arg_kcode(options), sizeof(opts)); + return rb_str_buf_cat2(desc, opts); } FL_SET(re, REG_LITERAL); return re; @@ -1581,8 +1596,7 @@ rb_reg_regcomp(VALUE str) case_cache = ruby_ignorecase; kcode_cache = reg_kcode; - return reg_cache = rb_reg_new(RSTRING_PTR(save_str), RSTRING_LEN(save_str), - ruby_ignorecase); + return reg_cache = rb_reg_new(save_str, ruby_ignorecase); } static int @@ -1843,9 +1857,8 @@ static VALUE rb_reg_initialize_m(int argc, VALUE *argv, VALUE self) { onig_errmsg_buffer err; - const char *s; - long len; int flags = 0; + VALUE str; if (argc == 0 || argc > 3) { rb_raise(rb_eArgError, "wrong number of arguments"); @@ -1859,8 +1872,8 @@ rb_reg_initialize_m(int argc, VALUE *argv, VALUE self) if 
(FL_TEST(argv[0], KCODE_FIXED)) { flags |= re_to_kcode_arg_value(argv[0]); } - s = RREGEXP(argv[0])->str; - len = RREGEXP(argv[0])->len; + str = rb_enc_str_new(RREGEXP(argv[0])->str, RREGEXP(argv[0])->len, + rb_enc_get(argv[0])); } else { if (argc >= 2) { @@ -1873,11 +1886,10 @@ rb_reg_initialize_m(int argc, VALUE *argv, VALUE self) flags &= ~ARG_KCODE_MASK; flags |= char_to_arg_kcode((int )kcode[0]); } - s = StringValuePtr(argv[0]); - len = RSTRING_LEN(argv[0]); + str = argv[0]; } - if (rb_reg_initialize(self, s, len, flags, err) != 0) { - rb_exc_raise(rb_reg_error_desc(s, len, flags, err)); + if (rb_reg_initialize_str(self, str, flags, err) != 0) { + rb_reg_raise_str(str, err, self); } return self; } @@ -1885,6 +1897,7 @@ rb_reg_initialize_m(int argc, VALUE *argv, VALUE self) VALUE rb_reg_quote(VALUE str) { + rb_encoding *enc = rb_enc_get(str); char *s, *send, *t; VALUE tmp; int c; @@ -1893,8 +1906,8 @@ rb_reg_quote(VALUE str) send = s + RSTRING_LEN(str); for (; s < send; s++) { c = *s; - if (ismbchar(*s)) { - int n = mbclen(*s); + if (ismbchar(s, enc)) { + int n = mbclen(s, enc); while (n-- && s < send) s++; @@ -1922,8 +1935,8 @@ rb_reg_quote(VALUE str) for (; s < send; s++) { c = *s; - if (ismbchar(*s)) { - int n = mbclen(*s); + if (ismbchar(s, enc)) { + int n = mbclen(s, enc); while (n-- && s < send) *t++ = *s++; @@ -2146,9 +2159,8 @@ rb_reg_init_copy(VALUE copy, VALUE re) rb_reg_check(re); s = RREGEXP(re)->str; len = RREGEXP(re)->len; - options = rb_reg_options(re); - if (rb_reg_initialize(copy, s, len, options, err) != 0) { - rb_exc_raise(rb_reg_error_desc(s, len, options, err)); + if (rb_reg_initialize(copy, s, len, rb_enc_get(re), rb_reg_options(re), err) != 0) { + rb_reg_raise(s, len, err, copy); } return copy; } @@ -2160,20 +2172,20 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp) char *p, *s, *e; unsigned char uc; int no; + rb_encoding *enc = rb_enc_check(str, src); - + rb_enc_check(str, regexp); p = s = RSTRING_PTR(str); 
e = s + RSTRING_LEN(str); while (s < e) { - char *ss = s; + char *ss = s++; - uc = (unsigned char)*s++; - if (ismbchar(uc)) { - s += mbclen(uc) - 1; + if (ismbchar(ss, enc)) { + s += mbclen(ss, enc) - 1; continue; } - if (uc != '\\' || s == e) continue; + if (*ss != '\\' || s == e) continue; if (!val) { val = rb_str_buf_new(ss-p); @@ -2203,8 +2215,7 @@ rb_reg_regsub(VALUE str, VALUE src, struct re_registers *regs, VALUE regexp) name_end = name = s + 1; while (name_end < e) { if (*name_end == '>') break; - uc = (unsigned char)*name_end; - name_end += mbclen(uc); + name_end += mbclen(name_end, enc); } if (name_end < e) { no = name_to_backref_number(regs, regexp, name, name_end); diff --git a/regint.h b/regint.h index 6c6d2746fa..1705410eec 100644 --- a/regint.h +++ b/regint.h @@ -127,6 +127,7 @@ #define onig_st_nothing_key_free st_nothing_key_free #define onig_st_is_member st_is_member +#define USE_UPPER_CASE_TABLE #else #define st_init_table onig_st_init_table diff --git a/ruby.c b/ruby.c index 7fb36355fa..ad3c172971 100644 --- a/ruby.c +++ b/ruby.c @@ -6,7 +6,7 @@ $Date$ created at: Tue Aug 10 12:47:31 JST 1993 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto Copyright (C) 2000 Network Applied Communication Laboratory, Inc. 
Copyright (C) 2000 Information-technology Promotion Agency, Japan @@ -676,7 +676,7 @@ proc_options(int argc, char **argv) case 'F': if (*++s) { - rb_fs = rb_reg_new(s, strlen(s), 0); + rb_fs = rb_reg_new(rb_str_new2(s), 0); } break; @@ -962,10 +962,14 @@ load_file(VALUE parser, const char *fname, int script) rb_raise(rb_eLoadError, "no Ruby script found in input"); } - c = rb_io_getc(f); + c = rb_io_getbyte(f); if (c == INT2FIX('#')) { - c = rb_io_getc(f); - if (c == INT2FIX('!') && !NIL_P(line = rb_io_gets(f))) { + c = rb_io_getbyte(f); + if (c == INT2FIX('!')) { + line = rb_io_gets(f); + if (NIL_P(line)) + return 0; + if ((p = strstr(RSTRING_PTR(line), "ruby")) == 0) { /* not ruby script, kick the program */ char **argv; @@ -1011,8 +1015,7 @@ load_file(VALUE parser, const char *fname, int script) } /* push back shebang for pragma may exist in next line */ - rb_io_ungetc(f, INT2FIX('\n')); - rb_io_ungetc(f, INT2FIX('!')); + rb_io_ungetc(f, rb_str_new2("!\n")); } else if (!NIL_P(c)) { rb_io_ungetc(f, c); diff --git a/signal.c b/signal.c index 772e5ec890..53e2a0f19f 100644 --- a/signal.c +++ b/signal.c @@ -6,7 +6,7 @@ $Date$ created at: Tue Dec 20 10:13:44 JST 1994 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto Copyright (C) 2000 Network Applied Communication Laboratory, Inc. Copyright (C) 2000 Information-technology Promotion Agency, Japan diff --git a/sprintf.c b/sprintf.c index 1c60837e78..7fbfe1b391 100644 --- a/sprintf.c +++ b/sprintf.c @@ -6,7 +6,7 @@ $Date$ created at: Fri Oct 15 10:39:26 JST 1993 - Copyright (C) 1993-2003 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto Copyright (C) 2000 Network Applied Communication Laboratory, Inc. Copyright (C) 2000 Information-technology Promotion Agency, Japan @@ -14,7 +14,7 @@ #include "ruby/ruby.h" #include "ruby/re.h" -#include +#include "ruby/encoding.h" #include #include @@ -115,7 +115,7 @@ sign_bits(int base, const char *p) ((nth >= argc) ? 
(rb_raise(rb_eArgError, "too few arguments"), 0) : argv[nth]) #define GETNUM(n, val) \ - for (; p < end && ISDIGIT(*p); p++) { \ + for (; p < end && rb_enc_isdigit(*p, enc); p++) { \ int next_n = 10 * n + (*p - '0'); \ if (next_n / 10 != n) {\ rb_raise(rb_eArgError, #val " too big"); \ @@ -254,6 +254,7 @@ rb_f_sprintf(int argc, const VALUE *argv) VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt) { + rb_encoding *enc; const char *p, *end; char *buf; int blen, bsiz; @@ -286,6 +287,7 @@ rb_str_format(int argc, const VALUE *argv, VALUE fmt) --argv; if (OBJ_TAINTED(fmt)) tainted = 1; StringValue(fmt); + enc = rb_enc_get(fmt); fmt = rb_str_new4(fmt); p = RSTRING_PTR(fmt); end = p + RSTRING_LEN(fmt); @@ -311,7 +313,7 @@ rb_str_format(int argc, const VALUE *argv, VALUE fmt) retry: switch (*p) { default: - if (ISPRINT(*p)) + if (rb_enc_isprint(*p, enc)) rb_raise(rb_eArgError, "malformed format string - %%%c", *p); else rb_raise(rb_eArgError, "malformed format string"); @@ -409,24 +411,38 @@ rb_str_format(int argc, const VALUE *argv, VALUE fmt) { VALUE val = GETARG(); VALUE tmp; - char c; + int c, n; tmp = rb_check_string_type(val); if (!NIL_P(tmp)) { - if (RSTRING_LEN(tmp) != 1) { + if (rb_enc_strlen(RSTRING_PTR(tmp),RSTRING_END(tmp),enc) != 1) { rb_raise(rb_eArgError, "%%c requires a character"); } - c = RSTRING_PTR(tmp)[0]; + c = rb_enc_codepoint(RSTRING_PTR(tmp), RSTRING_END(tmp), enc); } else { - c = NUM2INT(val) & 0xff; + c = NUM2INT(val); + } + n = rb_enc_codelen(c, enc); + if (n == 0) { + rb_raise(rb_eArgError, "invalid character"); } if (!(flags & FWIDTH)) { - PUSH(&c, 1); + CHECK(n); + rb_enc_mbcput(c, &buf[blen], enc); + blen += n; + } + else if ((flags & FMINUS)) { + CHECK(n); + rb_enc_mbcput(c, &buf[blen], enc); + blen += n; + FILL(' ', width-1); } else { - FILL(' ', width); - buf[blen - ((flags & FMINUS) ? 
width : 1)] = c; + FILL(' ', width-1); + CHECK(n); + rb_enc_mbcput(c, &buf[blen], enc); + blen += n; } } break; @@ -435,30 +451,42 @@ rb_str_format(int argc, const VALUE *argv, VALUE fmt) case 'p': { VALUE arg = GETARG(); - long len; + long len, slen; if (*p == 'p') arg = rb_inspect(arg); str = rb_obj_as_string(arg); if (OBJ_TAINTED(str)) tainted = 1; len = RSTRING_LEN(str); + enc = rb_enc_check(fmt, str); + if (flags&(FPREC|FWIDTH)) { + slen = rb_enc_strlen(RSTRING_PTR(str),RSTRING_END(str),enc); + if (slen < 0) { + rb_raise(rb_eArgError, "invalid mbstring sequence"); + } + } if (flags&FPREC) { - if (prec < len) { - len = prec; + if (prec < slen) { + char *p = rb_enc_nth(RSTRING_PTR(str), RSTRING_END(str), + prec, enc); + slen = prec; + len = p - RSTRING_PTR(str); } } /* need to adjust multi-byte string pos */ if (flags&FWIDTH) { - if (width > len) { - CHECK(width); - width -= len; + if (width > slen) { + width -= slen; if (!(flags&FMINUS)) { + CHECK(width); while (width--) { buf[blen++] = ' '; } } + CHECK(len); memcpy(&buf[blen], RSTRING_PTR(str), len); blen += len; if (flags&FMINUS) { + CHECK(width); while (width--) { buf[blen++] = ' '; } @@ -666,8 +694,9 @@ rb_str_format(int argc, const VALUE *argv, VALUE fmt) if (*p == 'X') { char *pp = s; - while (*pp) { - *pp = toupper(*pp); + int c; + while (c = (int)*pp) { + *pp = rb_enc_toupper(c, enc); pp++; } } diff --git a/string.c b/string.c index dd2d7473a8..d516d03438 100644 --- a/string.c +++ b/string.c @@ -6,7 +6,7 @@ $Date$ created at: Mon Aug 9 17:12:58 JST 1993 - Copyright (C) 1993-2006 Yukihiro Matsumoto + Copyright (C) 1993-2007 Yukihiro Matsumoto Copyright (C) 2000 Network Applied Communication Laboratory, Inc. 
Copyright (C) 2000 Information-technology Promotion Agency, Japan @@ -14,6 +14,7 @@ #include "ruby/ruby.h" #include "ruby/re.h" +#include "ruby/encoding.h" #define BEG(no) regs->beg[no] #define END(no) regs->end[no] @@ -30,6 +31,7 @@ VALUE rb_cSymbol; #define STR_TMPLOCK FL_USER7 #define STR_NOEMBED FL_USER1 +#define STR_SHARED FL_USER2 /* = ELTS_SHARED */ #define STR_ASSOC FL_USER3 #define STR_SHARED_P(s) FL_ALL(s, STR_NOEMBED|ELTS_SHARED) #define STR_ASSOC_P(s) FL_ALL(s, STR_NOEMBED|STR_ASSOC) @@ -90,11 +92,6 @@ VALUE rb_cSymbol; }\ } while (0) -char * -rb_str_ptr(VALUE str) { - return RSTRING_PTR(str); -} - VALUE rb_fs; static inline void @@ -159,6 +156,15 @@ rb_str_new(const char *ptr, long len) return str_new(rb_cString, ptr, len); } +VALUE +rb_enc_str_new(const char *ptr, long len, rb_encoding *enc) +{ + VALUE str = str_new(rb_cString, ptr, len); + + rb_enc_associate(str, enc); + return str; +} + VALUE rb_str_new2(const char *ptr) { @@ -203,6 +209,7 @@ str_new3(VALUE klass, VALUE str) RSTRING(str2)->as.heap.aux.shared = str; FL_SET(str2, ELTS_SHARED); } + rb_enc_copy((VALUE)str2, str); return str2; } @@ -233,6 +240,7 @@ str_new4(VALUE klass, VALUE str) FL_SET(str, ELTS_SHARED); RSTRING(str)->as.heap.aux.shared = str2; } + rb_enc_copy(str2, str); OBJ_INFECT(str2, str); return str2; } @@ -392,18 +400,48 @@ rb_str_init(int argc, VALUE *argv, VALUE str) return str; } +static int +str_strlen(VALUE str, rb_encoding *enc) +{ + int len; + + if (!enc) enc = rb_enc_get(str); + len = rb_enc_strlen(RSTRING_PTR(str), RSTRING_END(str), enc); + if (len < 0) { + rb_raise(rb_eArgError, "invalid mbstring sequence"); + } + return len; +} + /* * call-seq: * str.length => integer + * str.size => integer * - * Returns the length of str. + * Returns the character length of str. 
*/ static VALUE rb_str_length(VALUE str) { - long len = RSTRING_LEN(str); - return LONG2NUM(len); + int len; + + len = str_strlen(str, rb_enc_get(str)); + return INT2NUM(len); +} + +/* + * call-seq: + * str.bytesize => integer + * + * Returns the length of str in bytes. + */ + +static VALUE +rb_str_bytesize(str) + VALUE str; +{ + return INT2NUM(RSTRING_LEN(str)); } /* @@ -438,8 +476,10 @@ VALUE rb_str_plus(VALUE str1, VALUE str2) { VALUE str3; + rb_encoding *enc; StringValue(str2); + enc = rb_enc_check(str1, str2); str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2)); memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1)); memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1), @@ -448,6 +488,7 @@ rb_str_plus(VALUE str1, VALUE str2) if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2)) OBJ_TAINT(str3); + rb_enc_associate(str3, enc); return str3; } @@ -481,8 +522,8 @@ rb_str_times(VALUE str, VALUE times) RSTRING_PTR(str), RSTRING_LEN(str)); } RSTRING_PTR(str2)[RSTRING_LEN(str2)] = '\0'; - OBJ_INFECT(str2, str); + rb_enc_copy(str2, str); return str2; } @@ -504,8 +545,10 @@ rb_str_times(VALUE str, VALUE times) static VALUE rb_str_format_m(VALUE str, VALUE arg) { - if (TYPE(arg) == T_ARRAY) { - return rb_str_format(RARRAY_LEN(arg), RARRAY_PTR(arg), str); + VALUE tmp = rb_check_array_type(arg); + + if (!NIL_P(tmp)) { + return rb_str_format(RARRAY_LEN(tmp), RARRAY_PTR(tmp), str); } return rb_str_format(1, &arg, str); } @@ -632,19 +675,66 @@ rb_str_s_try_convert(VALUE dummy, VALUE str) return rb_check_string_type(str); } +static char* +str_nth(const char *p, const char *e, int nth, rb_encoding *enc) +{ + p = rb_enc_nth(p, e, nth, enc); + if (!p) { + rb_raise(rb_eArgError, "invalid mbstring sequence"); + } + if (p > e) { + rb_raise(rb_eIndexError, "index out of range"); + } + return (char*)p; +} + +static int +str_offset(const char *p, const char *e, int nth, rb_encoding *enc) +{ + const char *pp = str_nth(p, e, nth, enc); + + return pp - p; +} + +static int +str_sublen(VALUE 
str, int pos, rb_encoding *enc) +{ + if (rb_enc_mbmaxlen(enc) == 1 || pos < 0) return pos; + else { + char *p = RSTRING_PTR(str); + char *e = p + pos; + int i; + + i = 0; + while (p < e) { + p += rb_enc_mbclen(p, enc); + i++; + } + return i; + } +} + +int +rb_str_sublen(VALUE str, int len) +{ + return str_sublen(str, len, rb_enc_get(str)); +} + VALUE rb_str_substr(VALUE str, long beg, long len) { + rb_encoding *enc = rb_enc_get(str); VALUE str2; + int slen = str_strlen(str, enc); if (len < 0) return Qnil; - if (beg > RSTRING_LEN(str)) return Qnil; + if (beg > slen) return Qnil; if (beg < 0) { - beg += RSTRING_LEN(str); + beg += slen; if (beg < 0) return Qnil; } - if (beg + len > RSTRING_LEN(str)) { - len = RSTRING_LEN(str) - beg; + if (beg + len > slen) { + len = slen - beg; } if (len < 0) { len = 0; @@ -652,16 +742,11 @@ rb_str_substr(VALUE str, long beg, long len) if (len == 0) { str2 = rb_str_new5(str,0,0); } - else if (len > RSTRING_EMBED_LEN_MAX && - beg + len == RSTRING_LEN(str) && !STR_ASSOC_P(str)) { - str2 = rb_str_new4(str); - str2 = str_new3(rb_obj_class(str2), str2); - RSTRING(str2)->as.heap.ptr += RSTRING_LEN(str2) - len; - RSTRING(str2)->as.heap.len = len; - } else { - str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len); + char *p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc); + str2 = rb_str_new5(str, p, str_offset(p, RSTRING_END(str), len, enc)); } + rb_enc_copy(str2, str); OBJ_INFECT(str2, str); return str2; @@ -848,7 +933,10 @@ rb_str_buf_append(VALUE str, VALUE str2) VALUE rb_str_append(VALUE str, VALUE str2) { + rb_encoding *enc; + StringValue(str2); + enc = rb_enc_check(str, str2); rb_str_modify(str); if (RSTRING_LEN(str2) > 0) { if (STR_ASSOC_P(str)) { @@ -863,6 +951,7 @@ rb_str_append(VALUE str, VALUE str2) } } OBJ_INFECT(str, str2); + rb_enc_associate(str, enc); return str; } @@ -875,8 +964,8 @@ rb_str_append(VALUE str, VALUE str2) * str.concat(obj) => str * * Append---Concatenates the given object to str. 
If the object is a - * Fixnum between 0 and 255, it is converted to a character before - * concatenation. + * Fixnum, it is considered as a codepoint, and is converted + * to a character before concatenation. * * a = "hello " * a << "world" #=> "hello world" @@ -887,11 +976,17 @@ VALUE rb_str_concat(VALUE str1, VALUE str2) { if (FIXNUM_P(str2)) { - int i = FIX2INT(str2); - if (0 <= i && i <= 0xff) { /* byte */ - char c = i; - return rb_str_cat(str1, &c, 1); + rb_encoding *enc = rb_enc_get(str1); + int c = FIX2INT(str2); + int pos = RSTRING_LEN(str1); + int len = rb_enc_codelen(c, enc); + + if (len == 0) { + rb_raise(rb_eArgError, "invalid codepoint 0x%x", c); } + rb_str_resize(str1, pos+len); + rb_enc_mbcput(c, RSTRING_PTR(str1)+pos, enc); + return str1; } return rb_str_append(str1, str2); } @@ -1048,6 +1143,7 @@ rb_str_cmp(VALUE str1, VALUE str2) long len; int retval; + rb_enc_check(str1, str2); /* xxxx error-less encoding check? */ len = lesser(RSTRING_LEN(str1), RSTRING_LEN(str2)); retval = rb_memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len); if (retval == 0) { @@ -1079,6 +1175,7 @@ rb_str_equal(VALUE str1, VALUE str2) } return rb_equal(str2, str1); } + rb_enc_check(str1, str2); /* need weak check */ if (RSTRING_LEN(str1) == RSTRING_LEN(str2) && rb_str_cmp(str1, str2) == 0) { return Qtrue; @@ -1194,15 +1291,23 @@ static long rb_str_index(VALUE str, VALUE sub, long offset) { long pos; + char *s; + long len, slen; + rb_encoding *enc; + enc = rb_enc_check(str, sub); + len = str_strlen(str, enc); + slen = str_strlen(sub, enc); if (offset < 0) { - offset += RSTRING_LEN(str); + offset += len; if (offset < 0) return -1; } - if (RSTRING_LEN(str) - offset < RSTRING_LEN(sub)) return -1; - if (RSTRING_LEN(sub) == 0) return offset; + if (len - offset < slen) return -1; + if (slen == 0) return offset; + s = offset ? 
str_nth(RSTRING_PTR(str), RSTRING_END(str), offset, enc) : RSTRING_PTR(str); + /* need proceed one character at a time */ pos = rb_memsearch(RSTRING_PTR(sub), RSTRING_LEN(sub), - RSTRING_PTR(str)+offset, RSTRING_LEN(str)-offset); + s, RSTRING_LEN(str)-(s - RSTRING_PTR(str))); if (pos < 0) return pos; return pos + offset; } @@ -1240,7 +1345,7 @@ rb_str_index_m(int argc, VALUE *argv, VALUE str) pos = 0; } if (pos < 0) { - pos += RSTRING_LEN(str); + pos += str_strlen(str, rb_enc_get(str)); if (pos < 0) { if (TYPE(sub) == T_REGEXP) { rb_backref_set(Qnil); @@ -1253,19 +1358,9 @@ rb_str_index_m(int argc, VALUE *argv, VALUE str) case T_REGEXP: pos = rb_reg_adjust_startpos(sub, str, pos, 0); pos = rb_reg_search(sub, str, pos, 0); + pos = rb_str_sublen(str, pos); break; - case T_FIXNUM: { - int c = FIX2INT(sub); - long len = RSTRING_LEN(str); - char *p = RSTRING_PTR(str); - - for (;poslen) { pos = rb_reg_adjust_startpos(sub, str, pos, 1); pos = rb_reg_search(sub, str, pos, 1); + pos = rb_str_sublen(str, pos); } if (pos >= 0) return LONG2NUM(pos); break; @@ -1382,23 +1486,6 @@ rb_str_rindex_m(int argc, VALUE *argv, VALUE str) pos = rb_str_rindex(str, sub, pos); if (pos >= 0) return LONG2NUM(pos); break; - - case T_FIXNUM: { - int c = FIX2INT(sub); - char *p = RSTRING_PTR(str) + pos; - char *pbeg = RSTRING_PTR(str); - - if (pos == RSTRING_LEN(str)) { - if (pos == 0) return Qnil; - --p; - } - while (pbeg <= p) { - if ((unsigned char)*p == c) - return LONG2NUM((char*)p - RSTRING_PTR(str)); - p--; - } - return Qnil; - } } return Qnil; } @@ -1462,7 +1549,7 @@ rb_str_match_m(int argc, VALUE *argv, VALUE str) return rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv); } -static char +static int succ_char(char *s) { char c = *s; @@ -1515,8 +1602,9 @@ succ_char(char *s) VALUE rb_str_succ(VALUE orig) { + rb_encoding *enc; VALUE str; - char *sbeg, *s; + char *sbeg, *s, *e; int c = -1; long n = 0; @@ -1524,10 +1612,13 @@ rb_str_succ(VALUE orig) OBJ_INFECT(str, orig); if 
(RSTRING_LEN(str) == 0) return str; + enc = rb_enc_get(orig); sbeg = RSTRING_PTR(str); s = sbeg + RSTRING_LEN(str) - 1; + e = RSTRING_END(str); while (sbeg <= s) { - if (ISALNUM(*s)) { + unsigned int cc = rb_enc_codepoint(s, e, enc); + if (rb_enc_isalnum(cc, enc)) { if ((c = succ_char(s)) == 0) break; n = s - sbeg; } @@ -1642,13 +1733,9 @@ rb_str_aref(VALUE str, VALUE indx) idx = FIX2LONG(indx); num_index: - if (idx < 0) { - idx = RSTRING_LEN(str) + idx; - } - if (idx < 0 || RSTRING_LEN(str) <= idx) { - return Qnil; - } - return rb_str_substr(str, idx, 1); + str = rb_str_substr(str, idx, 1); + if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil; + return str; case T_REGEXP: return rb_str_subpat(str, indx, 0); @@ -1664,14 +1751,14 @@ rb_str_aref(VALUE str, VALUE indx) long beg, len; VALUE tmp; - switch (rb_range_beg_len(indx, &beg, &len, RSTRING_LEN(str), 0)) { + len = str_strlen(str, rb_enc_get(str)); + switch (rb_range_beg_len(indx, &beg, &len, len, 0)) { case Qfalse: break; case Qnil: return Qnil; default: tmp = rb_str_substr(str, beg, len); - OBJ_INFECT(tmp, indx); return tmp; } } @@ -1745,27 +1832,8 @@ rb_str_aref_m(int argc, VALUE *argv, VALUE str) } static void -rb_str_splice(VALUE str, long beg, long len, VALUE val) +rb_str_splice_0(VALUE str, long beg, long len, VALUE val) { - if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len); - - StringValue(val); - rb_str_modify(str); - - if (RSTRING_LEN(str) < beg) { - out_of_range: - rb_raise(rb_eIndexError, "index %ld out of string", beg); - } - if (beg < 0) { - if (-beg > RSTRING_LEN(str)) { - goto out_of_range; - } - beg += RSTRING_LEN(str); - } - if (RSTRING_LEN(str) < beg + len) { - len = RSTRING_LEN(str) - beg; - } - if (len < RSTRING_LEN(val)) { /* expand string */ RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1); @@ -1776,7 +1844,7 @@ rb_str_splice(VALUE str, long beg, long len, VALUE val) RSTRING_PTR(str) + beg + len, RSTRING_LEN(str) - (beg + len)); } - if (RSTRING_LEN(str) < 
beg && len < 0) { + if (RSTRING_LEN(val) < beg && len < 0) { MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len); } if (RSTRING_LEN(val) > 0) { @@ -1789,6 +1857,41 @@ rb_str_splice(VALUE str, long beg, long len, VALUE val) OBJ_INFECT(str, val); } +static void +rb_str_splice(VALUE str, long beg, long len, VALUE val) +{ + long slen; + char *p, *e; + rb_encoding *enc; + + if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len); + + StringValue(val); + rb_str_modify(str); + enc = rb_enc_check(str, val); + slen = str_strlen(str, enc); + + if (slen < beg) { + out_of_range: + rb_raise(rb_eIndexError, "index %ld out of string", beg); + } + if (beg < 0) { + if (-beg > slen) { + goto out_of_range; + } + beg += slen; + } + if (slen < beg + len) { + len = slen - beg; + } + p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc); + e = str_nth(p, RSTRING_END(str), len, enc); + /* error check */ + beg = p - RSTRING_PTR(str); /* physical position */ + len = e - p; /* physical length */ + return rb_str_splice_0(str, beg, len, val); +} + void rb_str_update(VALUE str, long beg, long len, VALUE val) { @@ -1822,7 +1925,8 @@ rb_str_subpat_set(VALUE str, VALUE re, int nth, VALUE val) } end = RMATCH(match)->END(nth); len = end - start; - rb_str_splice(str, start, len, val); + rb_enc_check(str, val); + rb_str_splice_0(str, start, len, val); } static VALUE @@ -1834,16 +1938,7 @@ rb_str_aset(VALUE str, VALUE indx, VALUE val) case T_FIXNUM: idx = FIX2LONG(indx); num_index: - if (RSTRING_LEN(str) <= idx) { - out_of_range: - rb_raise(rb_eIndexError, "index %ld out of string", idx); - } - if (idx < 0) { - if (-idx > RSTRING_LEN(str)) - goto out_of_range; - idx += RSTRING_LEN(str); - } - rb_str_splice(str, idx, 1, val); + rb_str_splice(str, idx, 1, val); return val; case T_REGEXP: @@ -1855,14 +1950,15 @@ rb_str_aset(VALUE str, VALUE indx, VALUE val) if (beg < 0) { rb_raise(rb_eIndexError, "string not matched"); } - rb_str_splice(str, beg, RSTRING_LEN(indx), val); + beg = 
rb_str_sublen(str, beg); + rb_str_splice(str, beg, str_strlen(indx, 0), val); return val; default: /* check if indx is Range */ { long beg, len; - if (rb_range_beg_len(indx, &beg, &len, RSTRING_LEN(str), 2)) { + if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) { rb_str_splice(str, beg, len, val); return val; } @@ -2352,6 +2448,7 @@ rb_str_replace(VALUE str, VALUE str2) } OBJ_INFECT(str, str2); + rb_enc_copy(str, str2); return str; } @@ -2394,33 +2491,6 @@ rb_str_chr(VALUE str) return rb_str_substr(str, 0, 1); } -/* - * call-seq: - * str.reverse! => str - * - * Reverses str in place. - */ - -static VALUE -rb_str_reverse_bang(VALUE str) -{ - char *s, *e; - char c; - - if (RSTRING_LEN(str) > 1) { - rb_str_modify(str); - s = RSTRING_PTR(str); - e = s + RSTRING_LEN(str) - 1; - while (s < e) { - c = *s; - *s++ = *e; - *e-- = c; - } - } - return str; -} - - /* * call-seq: * str.reverse => new_str @@ -2433,24 +2503,76 @@ rb_str_reverse_bang(VALUE str) static VALUE rb_str_reverse(VALUE str) { + rb_encoding *enc; VALUE obj; char *s, *e, *p; if (RSTRING_LEN(str) <= 1) return rb_str_dup(str); - + enc = rb_enc_get(str); obj = rb_str_new5(str, 0, RSTRING_LEN(str)); - s = RSTRING_PTR(str); e = s + RSTRING_LEN(str) - 1; - p = RSTRING_PTR(obj); + s = RSTRING_PTR(str); e = RSTRING_END(str); + p = RSTRING_END(obj); - while (e >= s) { - *p++ = *e--; + if (RSTRING_LEN(str) > 1) { + if (rb_enc_mbmaxlen(enc) == 1) { + while (s < e) { + *--p = *s++; + } + } + else { + while (s < e) { + int clen = rb_enc_mbclen(s, enc); + + if (clen == 0) { + rb_raise(rb_eArgError, "invalid mbstring sequence"); + } + p -= clen; + memcpy(p, s, clen); + s += clen; + } + } } + STR_SET_LEN(obj, RSTRING_LEN(str)); OBJ_INFECT(obj, str); + rb_enc_associate(obj, enc); return obj; } +/* + * call-seq: + * str.reverse! => str + * + * Reverses str in place. 
+ */ + +static VALUE +rb_str_reverse_bang(VALUE str) +{ + rb_encoding *enc = rb_enc_get(str); + char *s, *e, c; + + if (RSTRING_LEN(str) > 1) { + rb_str_modify(str); + s = RSTRING_PTR(str); + e = RSTRING_END(str) - 1; + + if (rb_enc_mbmaxlen(enc) == 1) { + while (s < e) { + c = *s; + *s++ = *e; + *e-- = c; + } + } + else { + rb_str_shared_replace(str, rb_str_reverse(str)); + } + } + return str; +} + + /* * call-seq: * str.include? other_str => true or false @@ -2469,12 +2591,6 @@ rb_str_include(VALUE str, VALUE arg) { long i; - if (FIXNUM_P(arg)) { - if (memchr(RSTRING_PTR(str), FIX2INT(arg), RSTRING_LEN(str))) - return Qtrue; - return Qfalse; - } - StringValue(arg); i = rb_str_index(str, arg, 0); @@ -2561,7 +2677,22 @@ rb_str_to_s(VALUE str) return str; } -#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{')) +static void +str_cat_char(VALUE str, int c, rb_encoding *enc) +{ + char s[16]; + int n = rb_enc_codelen(c, enc); + + rb_enc_mbcput(c, s, enc); + rb_str_buf_cat(str, s, n); +} + +static void +prefix_escape(VALUE str, int c, rb_encoding *enc) +{ + str_cat_char(str, '\\', enc); + str_cat_char(str, c, enc); +} /* * call-seq: @@ -2578,69 +2709,71 @@ rb_str_to_s(VALUE str) VALUE rb_str_inspect(VALUE str) { + rb_encoding *enc = rb_enc_get(str); char *p, *pend; - VALUE result = rb_str_buf_new2("\""); - char s[5]; + VALUE result = rb_str_buf_new2(""); - p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str); + str_cat_char(result, '"', enc); + p = RSTRING_PTR(str); pend = RSTRING_END(str); while (p < pend) { - char c = *p++; - if (ismbchar(c) && p < pend) { - int len = mbclen(c); - rb_str_buf_cat(result, p - 1, len); - p += len - 1; - } - else if (c == '"'|| c == '\\' || (c == '#' && IS_EVSTR(p, pend))) { - s[0] = '\\'; s[1] = c; - rb_str_buf_cat(result, s, 2); - } - else if (ISPRINT(c)) { - s[0] = c; - rb_str_buf_cat(result, s, 1); + int c = rb_enc_codepoint(p, pend, enc); + int n = rb_enc_codelen(c, enc); + int cc; + + p += n; + if (c == '"'|| 
c == '\\' || + (c == '#' && (cc = rb_enc_codepoint(p,pend,enc), + (cc == '$' || cc == '@' || cc == '{')))) { + prefix_escape(result, c, enc); } else if (c == '\n') { - s[0] = '\\'; s[1] = 'n'; - rb_str_buf_cat(result, s, 2); + prefix_escape(result, 'n', enc); } else if (c == '\r') { - s[0] = '\\'; s[1] = 'r'; - rb_str_buf_cat(result, s, 2); + prefix_escape(result, 'r', enc); } else if (c == '\t') { - s[0] = '\\'; s[1] = 't'; - rb_str_buf_cat(result, s, 2); + prefix_escape(result, 't', enc); } else if (c == '\f') { - s[0] = '\\'; s[1] = 'f'; - rb_str_buf_cat(result, s, 2); + prefix_escape(result, 'f', enc); } else if (c == '\013') { - s[0] = '\\'; s[1] = 'v'; - rb_str_buf_cat(result, s, 2); + prefix_escape(result, 'v', enc); } else if (c == '\010') { - s[0] = '\\'; s[1] = 'b'; - rb_str_buf_cat(result, s, 2); + prefix_escape(result, 'b', enc); } else if (c == '\007') { - s[0] = '\\'; s[1] = 'a'; - rb_str_buf_cat(result, s, 2); + prefix_escape(result, 'a', enc); } else if (c == 033) { - s[0] = '\\'; s[1] = 'e'; - rb_str_buf_cat(result, s, 2); + prefix_escape(result, 'e', enc); + } + else if (rb_enc_isprint(c, enc)) { + char buf[5]; + + rb_enc_mbcput(c, buf, enc); + rb_str_buf_cat(result, buf, n); } else { - sprintf(s, "\\%03o", c & 0377); - rb_str_buf_cat2(result, s); + char buf[5]; + char *s = buf; + + sprintf(buf, "\\%03o", c & 0377); + while (*s) { + str_cat_char(result, *s++, enc); + } } } - rb_str_buf_cat2(result, "\""); + str_cat_char(result, '"', enc); OBJ_INFECT(result, str); + rb_enc_associate(result, enc); return result; } +#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{')) /* * call-seq: @@ -2653,6 +2786,7 @@ rb_str_inspect(VALUE str) VALUE rb_str_dump(VALUE str) { + rb_encoding *enc = rb_enc_from_index(0); long len; char *p, *pend; char *q, *qend; @@ -2675,7 +2809,7 @@ rb_str_dump(VALUE str) break; default: - if (ISPRINT(c)) { + if (rb_enc_isprint(c, enc)) { len++; } else { @@ -2701,9 +2835,6 @@ rb_str_dump(VALUE str) if 
(IS_EVSTR(p, pend)) *q++ = '\\'; *q++ = '#'; } - else if (ISPRINT(c)) { - *q++ = c; - } else if (c == '\n') { *q++ = '\\'; *q++ = 'n'; @@ -2736,6 +2867,9 @@ rb_str_dump(VALUE str) *q++ = '\\'; *q++ = 'e'; } + else if (rb_enc_isprint(c, enc)) { + *q++ = c; + } else { *q++ = '\\'; sprintf(q, "%03o", c&0xff); @@ -2745,6 +2879,8 @@ rb_str_dump(VALUE str) *q++ = '"'; OBJ_INFECT(result, str); + /* result from dump is ASCII */ + rb_enc_associate(result, enc); return result; } @@ -2761,20 +2897,22 @@ rb_str_dump(VALUE str) static VALUE rb_str_upcase_bang(VALUE str) { + rb_encoding *enc; char *s, *send; int modify = 0; rb_str_modify(str); - s = RSTRING_PTR(str); send = s + RSTRING_LEN(str); + enc = rb_enc_get(str); + s = RSTRING_PTR(str); send = RSTRING_END(str); while (s < send) { - if (ismbchar(*s)) { - s+=mbclen(*s) - 1; - } - else if (ISLOWER(*s)) { - *s = toupper(*s); + int c = rb_enc_codepoint(s, send, enc); + + if (rb_enc_islower(c, enc)) { + /* assuming toupper returns codepoint with same size */ + rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); modify = 1; } - s++; + s += rb_enc_codelen(c, enc); } if (modify) return str; @@ -2815,20 +2953,22 @@ rb_str_upcase(VALUE str) static VALUE rb_str_downcase_bang(VALUE str) { + rb_encoding *enc; char *s, *send; int modify = 0; rb_str_modify(str); - s = RSTRING_PTR(str); send = s + RSTRING_LEN(str); + enc = rb_enc_get(str); + s = RSTRING_PTR(str); send = RSTRING_END(str); while (s < send) { - if (ismbchar(*s)) { - s+=mbclen(*s) - 1; - } - else if (ISUPPER(*s)) { - *s = tolower(*s); + int c = rb_enc_codepoint(s, send, enc); + + if (rb_enc_isupper(c, enc)) { + /* assuming toupper returns codepoint with same size */ + rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); modify = 1; } - s++; + s += rb_enc_codelen(c, enc); } if (modify) return str; @@ -2874,24 +3014,29 @@ rb_str_downcase(VALUE str) static VALUE rb_str_capitalize_bang(VALUE str) { + rb_encoding *enc; char *s, *send; int modify = 0; + int c; rb_str_modify(str); + enc = 
rb_enc_get(str); if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil; - s = RSTRING_PTR(str); send = s + RSTRING_LEN(str); - if (ISLOWER(*s)) { - *s = toupper(*s); + s = RSTRING_PTR(str); send = RSTRING_END(str); + + c = rb_enc_codepoint(s, send, enc); + if (rb_enc_islower(c, enc)) { + rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); modify = 1; } - while (++s < send) { - if (ismbchar(*s)) { - s+=mbclen(*s) - 1; - } - else if (ISUPPER(*s)) { - *s = tolower(*s); + s += rb_enc_codelen(c, enc); + while (s < send) { + c = rb_enc_codepoint(s, send, enc); + if (rb_enc_isupper(c, enc)) { + rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); modify = 1; } + s += rb_enc_codelen(c, enc); } if (modify) return str; return Qnil; @@ -2932,24 +3077,27 @@ rb_str_capitalize(VALUE str) static VALUE rb_str_swapcase_bang(VALUE str) { + rb_encoding *enc; char *s, *send; int modify = 0; rb_str_modify(str); - s = RSTRING_PTR(str); send = s + RSTRING_LEN(str); + enc = rb_enc_get(str); + s = RSTRING_PTR(str); send = RSTRING_END(str); while (s < send) { - if (ismbchar(*s)) { - s+=mbclen(*s) - 1; - } - else if (ISUPPER(*s)) { - *s = tolower(*s); + int c = rb_enc_codepoint(s, send, enc); + + if (rb_enc_isupper(c, enc)) { + /* assuming toupper returns codepoint with same size */ + rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc); modify = 1; } - else if (ISLOWER(*s)) { - *s = toupper(*s); + else if (rb_enc_islower(c, enc)) { + /* assuming toupper returns codepoint with same size */ + rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc); modify = 1; } - s++; + s += rb_enc_codelen(c, enc); } if (modify) return str; @@ -2985,24 +3133,21 @@ struct tr { }; static int -trnext(struct tr *t) +trnext(struct tr *t, rb_encoding *enc) { for (;;) { if (!t->gen) { if (t->p == t->pend) return -1; - if (t->p < t->pend - 1 && *t->p == '\\') { - t->p++; - } - t->now = *(USTR)t->p++; + t->now = rb_enc_codepoint(t->p, t->pend, enc); + t->p += rb_enc_codelen(t->now, enc); if (t->p < t->pend - 1 && *t->p == '-') { t->p++; 
if (t->p < t->pend) { - if (t->now > *(USTR)t->p) { - t->p++; - continue; - } + int c = rb_enc_codepoint(t->p, t->pend, enc); + t->p += rb_enc_codelen(c, enc); + if (t->now > c) continue; t->gen = 1; - t->max = *(USTR)t->p++; + t->max = c; } } return t->now; @@ -3022,11 +3167,12 @@ static VALUE rb_str_delete_bang(int,VALUE*,VALUE); static VALUE tr_trans(VALUE str, VALUE src, VALUE repl, int sflag) { + rb_encoding *enc; struct tr trsrc, trrepl; int cflag = 0; - int trans[256]; - int i, c, modify = 0; + int c, last, modify = 0; char *s, *send; + VALUE hash; StringValue(src); StringValue(repl); @@ -3039,74 +3185,139 @@ tr_trans(VALUE str, VALUE src, VALUE repl, int sflag) if (RSTRING_LEN(repl) == 0) { return rb_str_delete_bang(1, &src, str); } + enc = rb_enc_check(str, src); + if (rb_enc_check(str, repl) != enc) { + rb_raise(rb_eArgError, "character encodings differ"); + } trrepl.p = RSTRING_PTR(repl); trrepl.pend = trrepl.p + RSTRING_LEN(repl); trsrc.gen = trrepl.gen = 0; trsrc.now = trrepl.now = 0; trsrc.max = trrepl.max = 0; + hash = rb_hash_new(); if (cflag) { - for (i=0; i<256; i++) { - trans[i] = 1; + while ((c = trnext(&trsrc, enc)) >= 0) { + rb_hash_aset(hash, INT2NUM(c), Qtrue); } - while ((c = trnext(&trsrc)) >= 0) { - trans[c & 0xff] = -1; - } - while ((c = trnext(&trrepl)) >= 0) + while ((c = trnext(&trrepl, enc)) >= 0) /* retrieve last replacer */; - for (i=0; i<256; i++) { - if (trans[i] >= 0) { - trans[i] = trrepl.now; - } - } + last = trrepl.now; } else { int r; - for (i=0; i<256; i++) { - trans[i] = -1; - } - while ((c = trnext(&trsrc)) >= 0) { - r = trnext(&trrepl); + while ((c = trnext(&trsrc, enc)) >= 0) { + r = trnext(&trrepl, enc); if (r == -1) r = trrepl.now; - trans[c & 0xff] = r; + rb_hash_aset(hash, INT2NUM(c), INT2NUM(r)); } } rb_str_modify(str); - s = RSTRING_PTR(str); send = s + RSTRING_LEN(str); + s = RSTRING_PTR(str); send = RSTRING_END(str); if (sflag) { - char *t = s; - int c0, last = -1; + int clen, tlen, max = RSTRING_LEN(str); + int 
offset, save = -1; + char *buf = ALLOC_N(char, max), *t = buf; + VALUE v; + if (cflag) tlen = rb_enc_codelen(last, enc); while (s < send) { - c0 = *s++; - if ((c = trans[c0 & 0xff]) >= 0) { - if (last == c) continue; - last = c; - *t++ = c & 0xff; + c = rb_enc_codepoint(s, send, enc); + tlen = clen = rb_enc_codelen(c, enc); + + s += clen; + v = rb_hash_aref(hash, INT2NUM(c)); + if (!NIL_P(v)) { + if (!cflag) { + c = NUM2INT(v); + if (save == c) continue; + save = c; + tlen = rb_enc_codelen(c, enc); + modify = 1; + } + } + else if (cflag) { + save = c = last; modify = 1; } else { - last = -1; - *t++ = c0; + save = -1; } + while (t - buf + tlen >= max) { + offset = t - buf; + max *= 2; + REALLOC_N(buf, char, max); + t = buf + offset; + } + rb_enc_mbcput(c, t, enc); + t += tlen; } - if (RSTRING_LEN(str) > (t - RSTRING_PTR(str))) { - STR_SET_LEN(str, (t - RSTRING_PTR(str))); - modify = 1; - *t = '\0'; - } + *t = '\0'; + RSTRING(str)->as.heap.ptr = buf; + RSTRING(str)->as.heap.len = t - buf; + STR_SET_NOEMBED(str); + RSTRING(str)->as.heap.aux.capa = max; } - else { + else if (rb_enc_mbmaxlen(enc) == 1) { while (s < send) { - if ((c = trans[*s & 0xff]) >= 0) { - *s = c & 0xff; + VALUE v = rb_hash_aref(hash, INT2FIX(*s)); + if (!NIL_P(v)) { + if (cflag) { + *s = last; + } + else { + c = FIX2INT(v); + *s = c & 0xff; + } modify = 1; } s++; } } + else { + int clen, tlen, max = RSTRING_LEN(str) * 1.2; + int offset; + char *buf = ALLOC_N(char, max), *t = buf; + VALUE v; + + if (cflag) tlen = rb_enc_codelen(last, enc); + while (s < send) { + c = rb_enc_codepoint(s, send, enc); + tlen = clen = rb_enc_codelen(c, enc); + + v = rb_hash_aref(hash, INT2NUM(c)); + if (!NIL_P(v)) { + if (!cflag) { + c = NUM2INT(v); + tlen = rb_enc_codelen(c, enc); + modify = 1; + } + } + else if (cflag) { + c = last; + modify = 1; + } + while (t - buf + tlen >= max) { + offset = t - buf; + max *= 2; + REALLOC_N(buf, char, max); + t = buf + offset; + } + if (s != t) rb_enc_mbcput(c, t, enc); + s += 
clen; + t += tlen; + } + if (!STR_EMBED_P(str)) { + free(RSTRING(str)->as.heap.ptr); + } + *t = '\0'; + RSTRING(str)->as.heap.ptr = buf; + RSTRING(str)->as.heap.len = t - buf; + STR_SET_NOEMBED(str); + RSTRING(str)->as.heap.aux.capa = max; + } if (modify) return str; return Qnil; @@ -3155,34 +3366,32 @@ rb_str_tr(VALUE str, VALUE src, VALUE repl) } static void -tr_setup_table(VALUE str, char table[256], int init) +tr_setup_table(VALUE str, VALUE *tablep, VALUE *ctablep, rb_encoding *enc) { - char buf[256]; struct tr tr; - int i, c; - int cflag = 0; + int c; + VALUE table, ptable; tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str); tr.gen = tr.now = tr.max = 0; + table = rb_hash_new(); if (RSTRING_LEN(str) > 1 && RSTRING_PTR(str)[0] == '^') { - cflag = 1; tr.p++; + ptable = *ctablep; + *ctablep = table; + } + else { + ptable = *tablep; + *tablep = table; } - if (init) { - for (i=0; i<256; i++) { - table[i] = 1; + while ((c = trnext(&tr, enc)) >= 0) { + VALUE key = INT2NUM(c); + + if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) { + rb_hash_aset(table, key, Qtrue); } } - for (i=0; i<256; i++) { - buf[i] = cflag; - } - while ((c = trnext(&tr)) >= 0) { - buf[c & 0xff] = !cflag; - } - for (i=0; i<256; i++) { - table[i] = table[i] && buf[i]; - } } @@ -3197,10 +3406,10 @@ tr_setup_table(VALUE str, char table[256], int init) static VALUE rb_str_delete_bang(int argc, VALUE *argv, VALUE str) { + rb_encoding *enc; char *s, *send, *t; - char squeez[256]; + VALUE del = 0, nodel = 0; int modify = 0; - int init = 1; int i; if (argc < 1) { @@ -3210,20 +3419,28 @@ rb_str_delete_bang(int argc, VALUE *argv, VALUE str) VALUE s = argv[i]; StringValue(s); - tr_setup_table(s, squeez, init); - init = 0; + enc = rb_enc_check(str, s); + tr_setup_table(s, &del, &nodel, enc); } rb_str_modify(str); s = t = RSTRING_PTR(str); if (!s || RSTRING_LEN(str) == 0) return Qnil; - send = s + RSTRING_LEN(str); + send = RSTRING_END(str); while (s < send) { - if (squeez[*s & 0xff]) + int c = 
rb_enc_codepoint(s, send, enc); + int clen = rb_enc_codelen(c, enc); + VALUE v = INT2NUM(c); + + if ((del && !NIL_P(rb_hash_aref(del, v))) && + (!nodel || NIL_P(rb_hash_aref(nodel, v)))) { modify = 1; - else - *t++ = *s; - s++; + } + else { + if (t != s) rb_enc_mbcput(c, t, enc); + t += clen; + } + s += clen; } *t = '\0'; STR_SET_LEN(str, t - RSTRING_PTR(str)); @@ -3267,37 +3484,43 @@ rb_str_delete(int argc, VALUE *argv, VALUE str) static VALUE rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str) { - char squeez[256]; + rb_encoding *enc; + VALUE del = 0, nodel = 0; char *s, *send, *t; - int c, save, modify = 0; - int init = 1; + int save, modify = 0; int i; if (argc == 0) { - for (i=0; i<256; i++) { - squeez[i] = 1; - } + enc = rb_enc_get(str); } else { for (i=0; i= 0) { regs = RMATCH(rb_backref_get())->regs; if (start == end && BEG(0) == END(0)) { @@ -3550,11 +3784,12 @@ rb_str_split_m(int argc, VALUE *argv, VALUE str) break; } else if (last_null == 1) { - rb_ary_push(result, rb_str_substr(str, beg, mbclen2(RSTRING_PTR(str)[beg],spat))); + rb_ary_push(result, rb_str_substr(str, beg, + rb_enc_mbclen(RSTRING_PTR(str)+beg,enc))); beg = start; } else { - start += mbclen2(RSTRING_PTR(str)[start],spat); + start += rb_enc_mbclen(RSTRING_PTR(str)+start,enc); last_null = 1; continue; } @@ -3652,9 +3887,10 @@ rb_str_split(VALUE str, const char *sep0) static VALUE rb_str_each_line(int argc, VALUE *argv, VALUE str) { + rb_encoding *enc; VALUE rs; int newline; - char *p = RSTRING_PTR(str), *pend = p + RSTRING_LEN(str), *s; + char *p = RSTRING_PTR(str), *pend = p + RSTRING_LEN(str), *s = p; char *ptr = p; long len = RSTRING_LEN(str), rslen; VALUE line; @@ -3662,7 +3898,6 @@ rb_str_each_line(int argc, VALUE *argv, VALUE str) if (rb_scan_args(argc, argv, "01", &rs) == 0) { rs = rb_rs; } - RETURN_ENUMERATOR(str, argc, argv); if (NIL_P(rs)) { @@ -3670,28 +3905,28 @@ rb_str_each_line(int argc, VALUE *argv, VALUE str) return str; } StringValue(rs); + enc = rb_enc_check(str, rs); 
rslen = RSTRING_LEN(rs); if (rslen == 0) { newline = '\n'; } else { - newline = RSTRING_PTR(rs)[rslen-1]; + newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc); } - for (s = p, p += rslen; p < pend; p++) { - if (rslen == 0 && *p == '\n') { - if (*++p != '\n') continue; - while (*p == '\n') p++; - } - if (RSTRING_PTR(str) < p && p[-1] == newline && - (rslen <= 1 || - rb_memcmp(RSTRING_PTR(rs), p-rslen, rslen) == 0)) { - line = rb_str_new5(str, s, p - s); + while (p < pend) { + int c = rb_enc_codepoint(p, pend, enc); + int n = rb_enc_codelen(c, enc); + + if (c == newline && + (rslen <= 1 || rb_memcmp(RSTRING_PTR(rs), p, rslen) == 0)) { + line = rb_str_new5(str, s, p - s + (rslen ? rslen : n)); OBJ_INFECT(line, str); rb_yield(line); str_mod_check(str, ptr, len); - s = p; + s = p + n; } + p += n; } if (s != pend) { @@ -3744,6 +3979,44 @@ rb_str_each_byte(VALUE str) } +/* + * Document-method: chars + * call-seq: + * str.chars => anEnumerator + * str.chars {|substr| block } => str + * + * Returns an enumerator that gives each character in the string. + * If a block is given, it iterates over each character in the string. + * + * "foo".lines.to_a #=> ["f","o","o"] + */ + +/* + * Document-method: each_char + * call-seq: + * str.each_char {|cstr| block } => str + * + * Passes each character in str to the given block. 
+ * + * "hello".each_char {|c| print c, ' ' } + * + * produces: + * + * h e l l o + */ + +static VALUE +rb_str_each_char(VALUE str) +{ + int i, len = str_strlen(str, 0); + + RETURN_ENUMERATOR(str, 0, 0); + for (i=0; i str or nil @@ -3918,13 +4191,21 @@ rb_str_chomp(int argc, VALUE *argv, VALUE str) static VALUE rb_str_lstrip_bang(VALUE str) { + rb_encoding *enc; char *s, *t, *e; + rb_str_modify(str); + enc = rb_enc_get(str); s = RSTRING_PTR(str); if (!s || RSTRING_LEN(str) == 0) return Qnil; - e = t = s + RSTRING_LEN(str); + e = t = RSTRING_END(str); /* remove spaces at head */ - while (s < t && ISSPACE(*s)) s++; + while (s < e) { + int cc = rb_enc_codepoint(s, e, enc); + + if (!rb_enc_isspace(cc, enc)) break; + s += rb_enc_codelen(cc, enc); + } if (s > RSTRING_PTR(str)) { rb_str_modify(str); @@ -3972,21 +4253,30 @@ rb_str_lstrip(VALUE str) static VALUE rb_str_rstrip_bang(VALUE str) { + rb_encoding *enc; char *s, *t, *e; + int space_seen = Qfalse; - s = RSTRING_PTR(str); + rb_str_modify(str); + enc = rb_enc_get(str); + s = t = RSTRING_PTR(str); if (!s || RSTRING_LEN(str) == 0) return Qnil; - e = t = s + RSTRING_LEN(str); - - /* remove trailing '\0's */ - while (s < t && t[-1] == '\0') t--; - - /* remove trailing spaces */ - while (s < t && ISSPACE(*(t-1))) t--; + e = RSTRING_END(str); + while (s < e) { + int cc = rb_enc_codepoint(s, e, enc); + if (!cc || rb_enc_isspace(cc, enc)) { + if (!space_seen) t = s; + space_seen = Qtrue; + } + else { + space_seen = Qfalse; + } + s += rb_enc_codelen(cc, enc); + } if (t < e) { rb_str_modify(str); - STR_SET_LEN(str, t-s); + STR_SET_LEN(str, t-RSTRING_PTR(str)); RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0'; return str; } @@ -4054,10 +4344,12 @@ rb_str_strip(VALUE str) static VALUE scan_once(VALUE str, VALUE pat, long *start) { + rb_encoding *enc; VALUE result, match; struct re_registers *regs; long i; + enc = rb_enc_check(str, pat); if (rb_reg_search(pat, str, *start, 0) >= 0) { match = rb_backref_get(); regs = RMATCH(match)->regs; 
@@ -4066,7 +4358,7 @@ scan_once(VALUE str, VALUE pat, long *start) * Always consume at least one character of the input string */ if (RSTRING_LEN(str) > END(0)) - *start = END(0)+mbclen2(RSTRING_PTR(str)[END(0)],pat); + *start = END(0)+rb_enc_mbclen(RSTRING_PTR(str)+END(0),enc); else *start = END(0)+1; } @@ -4251,7 +4543,7 @@ rb_str_intern(VALUE s) if (OBJ_TAINTED(str) && rb_safe_level() >= 1) { rb_raise(rb_eSecurityError, "Insecure: can't intern tainted string"); } - id = rb_intern2(RSTRING_PTR(str), RSTRING_LEN(str)); + id = rb_intern_str(str); return ID2SYM(id); } @@ -4335,65 +4627,81 @@ rb_str_sum(int argc, VALUE *argv, VALUE str) static VALUE rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag) { + rb_encoding *enc; VALUE w; - long width, flen = 0; + long width, len, flen = 1, fclen = 1; VALUE res; - char *p, *pend, *f = " "; - long n; - VALUE pad; + char *p, *f = " "; + long n, llen, rlen; + volatile VALUE pad; rb_scan_args(argc, argv, "11", &w, &pad); + enc = rb_enc_get(str); width = NUM2LONG(w); if (argc == 2) { StringValue(pad); + rb_enc_check(str, pad); f = RSTRING_PTR(pad); flen = RSTRING_LEN(pad); + fclen = str_strlen(pad, enc); if (flen == 0) { rb_raise(rb_eArgError, "zero width padding"); } } - if (width < 0 || RSTRING_LEN(str) >= width) return rb_str_dup(str); - res = rb_str_new5(str, 0, width); +#if 0 + else if (!m17n_asciicompat(enc)) { + rb_raise(rb_eArgError, "character encodings differ"); + } +#endif + len = str_strlen(str, enc); + if (width < 0 || len >= width) return rb_str_dup(str); + n = width - len; + llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2); + rlen = n - llen; + res = rb_str_new5(str, 0, RSTRING_LEN(str)+n*flen/fclen+2); p = RSTRING_PTR(res); - if (jflag != 'l') { - n = width - RSTRING_LEN(str); - pend = p + ((jflag == 'r') ? 
n : n/2); + while (llen) { if (flen <= 1) { - while (p < pend) { - *p++ = *f; - } + *p++ = *f; + llen--; + } + else if (llen > fclen) { + memcpy(p,f,flen); + p += flen; + llen -= fclen; } else { - char *q = f; - while (p + flen <= pend) { - memcpy(p,f,flen); - p += flen; - } - while (p < pend) { - *p++ = *q++; - } + char *fp = str_nth(f, f+flen, llen, enc); + n = fp - f; + memcpy(p,f,n); + p+=n; + break; } } - memcpy(p, RSTRING_PTR(str), RSTRING_LEN(str)+1); - if (jflag != 'r') { - p += RSTRING_LEN(str); pend = RSTRING_PTR(res) + width; + memcpy(p, RSTRING_PTR(str), RSTRING_LEN(str)); + p+=RSTRING_LEN(str); + while (rlen) { if (flen <= 1) { - while (p < pend) { - *p++ = *f; - } + *p++ = *f; + rlen--; + } + else if (rlen > fclen) { + memcpy(p,f,flen); + p += flen; + rlen -= fclen; } else { - while (p + flen <= pend) { - memcpy(p,f,flen); - p += flen; - } - while (p < pend) { - *p++ = *f++; - } + char *fp = str_nth(f, f+flen, rlen, enc); + n = fp - f; + memcpy(p,f,n); + p+=n; + break; } } + *p = '\0'; + STR_SET_LEN(res, p-RSTRING_PTR(res)); OBJ_INFECT(res, str); - if (flen > 0) OBJ_INFECT(res, pad); + if (!NIL_P(pad)) OBJ_INFECT(res, pad); return res; } @@ -4493,6 +4801,7 @@ rb_str_partition(VALUE str, VALUE sep) failed: return rb_ary_new3(3, str, rb_str_new(0,0),rb_str_new(0,0)); } + pos = rb_str_sublen(str, pos); if (regex) { sep = rb_str_subpat(str, sep, 0); if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed; @@ -4534,11 +4843,13 @@ rb_str_rpartition(VALUE str, VALUE sep) rb_raise(rb_eTypeError, "type mismatch: %s given", rb_obj_classname(sep)); } + pos = rb_str_sublen(str, pos); pos = rb_str_rindex(str, sep, pos); } if (pos < 0) { return rb_ary_new3(3, rb_str_new(0,0),rb_str_new(0,0), str); } + pos = rb_str_sublen(str, pos); if (regex) { sep = rb_reg_nth_match(0, rb_backref_get()); } @@ -4563,6 +4874,7 @@ rb_str_start_with(int argc, VALUE *argv, VALUE str) for (i=0; i