From 9962aad7b0184e385b40c26c5a109bff7abbe43c Mon Sep 17 00:00:00 2001 From: naruse Date: Sat, 10 Aug 2013 20:44:10 +0000 Subject: [PATCH] * file.c (rb_str_normalize_ospath): HFS Plus (Mac OS Extended) uses a variant of Normal Form D in which U+2000 through U+2FFF, U+F900 through U+FAFF, and U+2F800 through U+2FAFF are not decomposed (this avoids problems with round trip conversions from old Mac text encodings). http://developer.apple.com/library/mac/qa/qa1173/_index.html Therefore fix r42457 to exclude the range. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@42498 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 10 +++ dir.c | 4 +- ext/-test-/string/depend | 1 + ext/-test-/string/extconf.rb | 1 + ext/-test-/string/normalize.c | 18 +++++ file.c | 43 +++++++++++- internal.h | 5 ++ test/-ext-/string/test_normalize.rb | 105 ++++++++++++++++++++++++++++ 8 files changed, 183 insertions(+), 4 deletions(-) create mode 100644 ext/-test-/string/normalize.c create mode 100644 test/-ext-/string/test_normalize.rb diff --git a/ChangeLog b/ChangeLog index 3e74589bb1..131ae62709 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +Sun Aug 11 04:48:14 2013 NARUSE, Yui + + * file.c (rb_str_normalize_ospath): + HFS Plus (Mac OS Extended) uses a variant of Normal Form D in which + U+2000 through U+2FFF, U+F900 through U+FAFF, and U+2F800 through + U+2FAFF are not decomposed (this avoids problems with round trip + conversions from old Mac text encodings). + http://developer.apple.com/library/mac/qa/qa1173/_index.html + Therefore fix r42457 to exclude the range. + Sun Aug 11 03:26:07 2013 Tanaka Akira * bignum.c (bitsize): Fix a conditional expression. 
diff --git a/dir.c b/dir.c
index 7e9c659f9e..cf9aeafee4 100644
--- a/dir.c
+++ b/dir.c
@@ -84,8 +84,6 @@ char *strchr(char*,char);
 #include
 #include
 
-VALUE rb_str_normalize_ospath(const char *ptr, long len);
-
 static inline int
 is_hfs(DIR *dirp)
 {
diff --git a/ext/-test-/string/depend b/ext/-test-/string/depend
index aad1fab427..86617ff289 100644
--- a/ext/-test-/string/depend
+++ b/ext/-test-/string/depend
@@ -2,3 +2,4 @@ $(OBJS): $(HDRS) $(ruby_headers) \
   $(hdrdir)/ruby/encoding.h \
   $(hdrdir)/ruby/oniguruma.h
 qsort.o: $(hdrdir)/ruby/util.h
+normalize.o: $(top_srcdir)/internal.h
diff --git a/ext/-test-/string/extconf.rb b/ext/-test-/string/extconf.rb
index 42c10b994b..10d33cbab9 100644
--- a/ext/-test-/string/extconf.rb
+++ b/ext/-test-/string/extconf.rb
@@ -1,3 +1,4 @@
+$INCFLAGS << " -I$(topdir) -I$(top_srcdir)"
 $srcs = Dir[File.join($srcdir, "*.{#{SRC_EXT.join(%q{,})}}")]
 inits = $srcs.map {|s| File.basename(s, ".*")}
 inits.delete("init")
diff --git a/ext/-test-/string/normalize.c b/ext/-test-/string/normalize.c
new file mode 100644
index 0000000000..22bb6d7887
--- /dev/null
+++ b/ext/-test-/string/normalize.c
@@ -0,0 +1,22 @@
+#include "ruby.h"
+#include "internal.h"
+
+#ifdef __APPLE__
+/* Test hook: returns rb_str_normalize_ospath() applied to the bytes of
+ * the receiver. */
+static VALUE
+normalize_ospath(VALUE str)
+{
+    return rb_str_normalize_ospath(RSTRING_PTR(str), RSTRING_LEN(str));
+}
+#else
+/* internal.h declares rb_str_normalize_ospath() only under __APPLE__. */
+#define normalize_ospath rb_f_notimplement
+#endif
+
+/* Defines the zero-argument method "normalize_ospath" on klass. */
+void
+Init_normalize(VALUE klass)
+{
+    rb_define_method(klass, "normalize_ospath", normalize_ospath, 0);
+}
diff --git a/file.c b/file.c
index 3923ddeec7..347b7e6d0e 100644
--- a/file.c
+++ b/file.c
@@ -245,7 +245,7 @@ rb_str_encode_ospath(VALUE path)
 
 #ifdef __APPLE__
 VALUE
-rb_str_normalize_ospath(const char *ptr, long
len)
+rb_str_normalize_ospath0(const char *ptr, long len)
 {
     VALUE str;
     CFIndex buflen = 0;
@@ -267,6 +267,63 @@ rb_str_normalize_ospath(const char *ptr, long len)
     CFRelease(s);
     return str;
 }
+
+/*
+ * Normalizes the UTF-8 byte sequence ptr[0, len) and returns the result
+ * as a new UTF-8 String.
+ *
+ * Characters in U+2000..U+2FFF, U+F900..U+FAFF and U+2F800..U+2FAFF are
+ * copied through untouched; each run of other characters between them is
+ * passed to rb_str_normalize_ospath0().  A byte that does not start a
+ * valid character is replaced with U+FFFD.
+ */
+VALUE
+rb_str_normalize_ospath(const char *ptr, long len)
+{
+    const char *p = ptr;
+    const char *e = ptr + len;
+    const char *p1 = p; /* start of the run not yet flushed to str */
+    VALUE str = rb_str_buf_new(len);
+    rb_encoding *enc = rb_utf8_encoding();
+    rb_enc_associate(str, enc);
+
+    while (p < e) {
+        int l, c;
+        int r = rb_enc_precise_mbclen(p, e, enc);
+        if (!MBCLEN_CHARFOUND_P(r)) {
+            /* Invalid byte; shall not happen, but if it does, flush the
+             * pending run, emit U+FFFD and restart the run after the byte. */
+            if (p - p1 > 0) {
+                rb_str_append(str, rb_str_normalize_ospath0(p1, p-p1));
+            }
+            rb_str_cat2(str, "\xEF\xBF\xBD");
+            p += 1;
+            p1 = p;
+            continue;
+        }
+        l = MBCLEN_CHARFOUND_LEN(r);
+        c = rb_enc_mbc_to_codepoint(p, e, enc);
+        if ((0x2000 <= c && c <= 0x2FFF) || (0xF900 <= c && c <= 0xFAFF) ||
+            (0x2F800 <= c && c <= 0x2FAFF)) {
+            /* Excluded range: flush the pending run, then copy this
+             * character as is and start a new run after it. */
+            if (p - p1 > 0) {
+                rb_str_append(str, rb_str_normalize_ospath0(p1, p-p1));
+            }
+            rb_str_cat(str, p, l);
+            p += l;
+            p1 = p;
+        }
+        else {
+            p += l;
+        }
+    }
+    if (p - p1 > 0) {
+        rb_str_append(str, rb_str_normalize_ospath0(p1, p-p1));
+    }
+
+    return str;
+}
 #endif
 
 static long
diff --git a/internal.h b/internal.h
index a4cf43b939..d52b2e9442 100644
--- a/internal.h
+++ b/internal.h
@@ -513,6 +513,11 @@ VALUE rb_big_mul_karatsuba(VALUE x, VALUE y);
 VALUE rb_big_mul_toom3(VALUE x, VALUE y);
 VALUE rb_big_sq_fast(VALUE x);
 
+/* file.c */
+#ifdef __APPLE__
+VALUE rb_str_normalize_ospath(const char *ptr, long len);
+#endif
+
 /* io.c */
 void rb_maygvl_fd_fix_cloexec(int fd);
 
diff --git a/test/-ext-/string/test_normalize.rb b/test/-ext-/string/test_normalize.rb
new file mode 100644
index 0000000000..da24391110
--- /dev/null
+++ b/test/-ext-/string/test_normalize.rb
@@ -0,0 +1,105 @@
+require 'test/unit'
+require "-test-/string/string"
+require "tempfile"
+
+class Test_StringNormalize < Test::Unit::TestCase
+=begin
+  def test_normalize_all
+    exclude = [
+      #0x340, 0x341, 0x343, 0x344
+    ]
+    (0x0080..0xFFFD).each do |n|
+
next if 0xD800 <= n && n <= 0xDFFF
+      next if exclude.include? n
+      code = n.to_s(16)
+      Tempfile.create("#{code}-#{n.chr(Encoding::UTF_8)}-") do |tempfile|
+        ary = Dir.glob(File.expand_path("../#{code}-*", tempfile.path))
+        assert_equal 1, ary.size
+        result = ary[0]
+        rn = result[/\/\h+-(.+?)-/, 1]
+        #assert_equal tempfile.path, result, "#{rn.dump} is not U+#{n.to_s(16)}"
+        r2 = Bug::String.new(result ).normalize_ospath
+        rn2 = r2[/\/\h+-(.+?)-/, 1]
+        if tempfile.path == result
+          if tempfile.path == r2
+          else
+            puts "U+#{n.to_s(16)} shouldn't be r2#{rn2.dump}"
+          end
+        else
+          if tempfile.path == r2
+            # puts "U+#{n.to_s(16)} shouldn't be r#{rn.dump}"
+          elsif result == r2
+            puts "U+#{n.to_s(16)} shouldn't be #{rn.dump}"
+          else
+            puts "U+#{n.to_s(16)} shouldn't be r#{rn.dump} r2#{rn2.dump}"
+          end
+        end
+      end
+    end
+  end
+=end
+
+  def test_normalize # each row: expected result, then the source string
+    %[
+      \u304C \u304B\u3099
+      \u3077 \u3075\u309A
+      \u308F\u3099 \u308F\u3099
+      \u30F4 \u30A6\u3099
+      \u30DD \u30DB\u309A
+      \u30AB\u303A \u30AB\u303A
+      \u00C1 A\u0301
+      B\u030A B\u030A
+      \u0386 \u0391\u0301
+      \u03D3 \u03D2\u0301
+      \u0401 \u0415\u0308
+      \u2260 =\u0338
+    ].scan(/(\S+)\s+(\S+)/) do |expected, src|
+      result = Bug::String.new(src).normalize_ospath
+      assert_equal expected, result, "#{expected.dump} is expected but #{src.dump}"
+    end
+  rescue NotImplementedError => e # report a skip instead of passing vacuously
+    skip e.message
+  end
+
+  def test_not_normalize_kc # every input must come back unchanged
+    %[
+      \u2460
+      \u2162
+      \u3349
+      \u33A1
+      \u337B
+      \u2116
+      \u33CD
+      \u2121
+      \u32A4
+      \u3231
+    ].split.each do |src|
+      result = Bug::String.new(src).normalize_ospath
+      assert_equal src, result, "#{src.dump} is expected not to be normalized, but #{result.dump}"
+    end
+  rescue NotImplementedError => e # report a skip instead of passing vacuously
+    skip e.message
+  end
+
+  def test_dont_normalize_hfsplus # every input must come back unchanged
+    %[
+      \u2190\u0338
+      \u219A
+      \u212B
+      \uF90A
+      \uF9F4
+      \uF961 \uF9DB
+      \uF96F \uF3AA
+      \uF915 \uF95C \uF9BF
+      \uFA0C
+      \uFA10
+      \uFA19
+      \uFA26
+    ].split.each do |src|
+      result = Bug::String.new(src).normalize_ospath
+      assert_equal src, result, "#{src.dump} is expected not to be normalized, but #{result.dump}"
+    end
+  rescue NotImplementedError => e # report a skip instead of passing vacuously
+    skip e.message
+  end
+end