1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00

* lib/csv.rb: Upgrading output encoding as needed. [ruby-core:33135]

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@29808 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
jeg2 2010-11-16 23:55:29 +00:00
parent 6cba0f0663
commit 40e5b39336
3 changed files with 62 additions and 38 deletions

View file

@ -1,3 +1,7 @@
Wed Nov 17 08:54:04 2010 James Edward Gray II <jeg2@ruby-lang.org>
* lib/csv.rb: Upgrading output encoding as needed. [ruby-core:33135]
Tue Nov 16 22:30:39 2010 Yusuke Endoh <mame@tsg.ne.jp> Tue Nov 16 22:30:39 2010 Yusuke Endoh <mame@tsg.ne.jp>
* vm_insnhelper.c (vm_throw): remove fear of undefined behavior :-) * vm_insnhelper.c (vm_throw): remove fear of undefined behavior :-)

View file

@ -1562,9 +1562,10 @@ class CSV
options = DEFAULT_OPTIONS.merge(options) options = DEFAULT_OPTIONS.merge(options)
# create the IO object we will read from # create the IO object we will read from
@io = if data.is_a? String then StringIO.new(data) else data end @io = data.is_a?(String) ? StringIO.new(data) : data
# honor the IO encoding if we can, otherwise default to ASCII-8BIT # honor the IO encoding if we can, otherwise default to ASCII-8BIT
@encoding = raw_encoding || Encoding.default_internal || Encoding.default_external @encoding = raw_encoding || Encoding.default_internal ||
Encoding.default_external
# #
# prepare for building safe regular expressions in the target encoding, # prepare for building safe regular expressions in the target encoding,
# if we can transcode the needed characters # if we can transcode the needed characters
@ -1711,7 +1712,15 @@ class CSV
@headers = row if header_row? @headers = row if header_row?
@lineno += 1 @lineno += 1
@io << row.map(&@quote).join(@col_sep) + @row_sep # quote and separate output = row.map(&@quote).join(@col_sep) + @row_sep # quote and separate
if @io.is_a?(StringIO) and
output.encoding != raw_encoding and
( compatible_encoding = Encoding.compatible?( @io.string.encoding,
output.encoding ) )
@io = StringIO.new(@io.string.force_encoding(compatible_encoding))
@io.seek(0, IO::SEEK_END)
end
@io << output
self # for chaining self # for chaining
end end
@ -2043,11 +2052,13 @@ class CSV
@row_sep = @row_sep.to_s.encode(@encoding) @row_sep = @row_sep.to_s.encode(@encoding)
# establish quoting rules # establish quoting rules
@force_quotes = options.delete(:force_quotes) @force_quotes = options.delete(:force_quotes)
do_quote = lambda do |field| do_quote = lambda do |field|
@quote_char + field = String(field)
String(field).gsub(@quote_char, @quote_char * 2) + encoded_quote = @quote_char.encode(field.encoding)
@quote_char encoded_quote +
field.gsub(encoded_quote, encoded_quote * 2) +
encoded_quote
end end
quotable_chars = encode_str("\r\n", @col_sep, @quote_char) quotable_chars = encode_str("\r\n", @col_sep, @quote_char)
@quote = if @force_quotes @quote = if @force_quotes
@ -2297,6 +2308,7 @@ class CSV
end end
private private
def raw_encoding def raw_encoding
if @io.respond_to? :internal_encoding if @io.respond_to? :internal_encoding
@io.internal_encoding || @io.external_encoding @io.internal_encoding || @io.external_encoding

View file

@ -15,43 +15,43 @@ class TestEncodings < Test::Unit::TestCase
def setup def setup
@temp_csv_path = File.join(File.dirname(__FILE__), "temp.csv") @temp_csv_path = File.join(File.dirname(__FILE__), "temp.csv")
end end
def teardown def teardown
File.unlink(@temp_csv_path) if File.exist? @temp_csv_path File.unlink(@temp_csv_path) if File.exist? @temp_csv_path
end end
######################################## ########################################
### Hand Test Some Popular Encodings ### ### Hand Test Some Popular Encodings ###
######################################## ########################################
def test_parses_utf8_encoding def test_parses_utf8_encoding
assert_parses( [ %w[ one two … ], assert_parses( [ %w[ one two … ],
%w[ 1 … 3 ], %w[ 1 … 3 ],
%w[ … 5 6 ] ], "UTF-8" ) %w[ … 5 6 ] ], "UTF-8" )
end end
def test_parses_latin1_encoding def test_parses_latin1_encoding
assert_parses( [ %w[ one two Résumé ], assert_parses( [ %w[ one two Résumé ],
%w[ 1 Résumé 3 ], %w[ 1 Résumé 3 ],
%w[ Résumé 5 6 ] ], "ISO-8859-1" ) %w[ Résumé 5 6 ] ], "ISO-8859-1" )
end end
def test_parses_utf16be_encoding def test_parses_utf16be_encoding
assert_parses( [ %w[ one two … ], assert_parses( [ %w[ one two … ],
%w[ 1 … 3 ], %w[ 1 … 3 ],
%w[ … 5 6 ] ], "UTF-16BE" ) %w[ … 5 6 ] ], "UTF-16BE" )
end end
def test_parses_shift_jis_encoding def test_parses_shift_jis_encoding
assert_parses( [ %w[ 一 二 三 ], assert_parses( [ %w[ 一 二 三 ],
%w[ 四 五 六 ], %w[ 四 五 六 ],
%w[ 七 八 九 ] ], "Shift_JIS" ) %w[ 七 八 九 ] ], "Shift_JIS" )
end end
########################################################### ###########################################################
### Try Simple Reading for All Non-dummy Ruby Encodings ### ### Try Simple Reading for All Non-dummy Ruby Encodings ###
########################################################### ###########################################################
def test_reading_with_most_encodings def test_reading_with_most_encodings
each_encoding do |encoding| each_encoding do |encoding|
begin begin
@ -62,7 +62,7 @@ class TestEncodings < Test::Unit::TestCase
end end
end end
end end
def test_regular_expression_escaping def test_regular_expression_escaping
each_encoding do |encoding| each_encoding do |encoding|
begin begin
@ -73,18 +73,18 @@ class TestEncodings < Test::Unit::TestCase
end end
end end
end end
####################################################################### #######################################################################
### Stress Test ASCII Compatible and Non-ASCII Compatible Encodings ### ### Stress Test ASCII Compatible and Non-ASCII Compatible Encodings ###
####################################################################### #######################################################################
def test_auto_line_ending_detection def test_auto_line_ending_detection
# arrange data to place a \r at the end of CSV's read ahead point # arrange data to place a \r at the end of CSV's read ahead point
encode_for_tests([["a" * 509]], row_sep: "\r\n") do |data| encode_for_tests([["a" * 509]], row_sep: "\r\n") do |data|
assert_equal("\r\n".encode(data.encoding), CSV.new(data).row_sep) assert_equal("\r\n".encode(data.encoding), CSV.new(data).row_sep)
end end
end end
def test_csv_chars_are_transcoded def test_csv_chars_are_transcoded
encode_for_tests([%w[abc def]]) do |data| encode_for_tests([%w[abc def]]) do |data|
%w[col_sep row_sep quote_char].each do |csv_char| %w[col_sep row_sep quote_char].each do |csv_char|
@ -93,7 +93,7 @@ class TestEncodings < Test::Unit::TestCase
end end
end end
end end
def test_parser_works_with_encoded_headers def test_parser_works_with_encoded_headers
encode_for_tests([%w[one two three], %w[1 2 3]]) do |data| encode_for_tests([%w[one two three], %w[1 2 3]]) do |data|
parsed = CSV.parse(data, headers: true) parsed = CSV.parse(data, headers: true)
@ -105,7 +105,7 @@ class TestEncodings < Test::Unit::TestCase
end end
end end
end end
def test_built_in_converters_transcode_to_utf_8_then_convert def test_built_in_converters_transcode_to_utf_8_then_convert
encode_for_tests([%w[one two three], %w[1 2 3]]) do |data| encode_for_tests([%w[one two three], %w[1 2 3]]) do |data|
parsed = CSV.parse(data, converters: :integer) parsed = CSV.parse(data, converters: :integer)
@ -114,7 +114,7 @@ class TestEncodings < Test::Unit::TestCase
assert_equal([1, 2, 3], parsed[1]) assert_equal([1, 2, 3], parsed[1])
end end
end end
def test_built_in_header_converters_transcode_to_utf_8_then_convert def test_built_in_header_converters_transcode_to_utf_8_then_convert
encode_for_tests([%w[one two three], %w[1 2 3]]) do |data| encode_for_tests([%w[one two three], %w[1 2 3]]) do |data|
parsed = CSV.parse( data, headers: true, parsed = CSV.parse( data, headers: true,
@ -125,7 +125,7 @@ class TestEncodings < Test::Unit::TestCase
"Wrong data encoding." ) "Wrong data encoding." )
end end
end end
def test_open_allows_you_to_set_encodings def test_open_allows_you_to_set_encodings
encode_for_tests([%w[abc def]]) do |data| encode_for_tests([%w[abc def]]) do |data|
# read and write in encoding # read and write in encoding
@ -136,7 +136,7 @@ class TestEncodings < Test::Unit::TestCase
"Wrong data encoding." ) "Wrong data encoding." )
end end
end end
# read and write with transcoding # read and write with transcoding
File.open(@temp_csv_path, "wb:UTF-32BE:#{data.encoding.name}") do |f| File.open(@temp_csv_path, "wb:UTF-32BE:#{data.encoding.name}") do |f|
f << data f << data
@ -149,7 +149,7 @@ class TestEncodings < Test::Unit::TestCase
end end
end end
end end
def test_foreach_allows_you_to_set_encodings def test_foreach_allows_you_to_set_encodings
encode_for_tests([%w[abc def]]) do |data| encode_for_tests([%w[abc def]]) do |data|
# read and write in encoding # read and write in encoding
@ -158,7 +158,7 @@ class TestEncodings < Test::Unit::TestCase
assert( row.all? { |f| f.encoding == data.encoding }, assert( row.all? { |f| f.encoding == data.encoding },
"Wrong data encoding." ) "Wrong data encoding." )
end end
# read and write with transcoding # read and write with transcoding
File.open(@temp_csv_path, "wb:UTF-32BE:#{data.encoding.name}") do |f| File.open(@temp_csv_path, "wb:UTF-32BE:#{data.encoding.name}") do |f|
f << data f << data
@ -170,7 +170,7 @@ class TestEncodings < Test::Unit::TestCase
end end
end end
end end
def test_read_allows_you_to_set_encodings def test_read_allows_you_to_set_encodings
encode_for_tests([%w[abc def]]) do |data| encode_for_tests([%w[abc def]]) do |data|
# read and write in encoding # read and write in encoding
@ -178,7 +178,7 @@ class TestEncodings < Test::Unit::TestCase
rows = CSV.read(@temp_csv_path, encoding: data.encoding.name) rows = CSV.read(@temp_csv_path, encoding: data.encoding.name)
assert( rows.flatten.all? { |f| f.encoding == data.encoding }, assert( rows.flatten.all? { |f| f.encoding == data.encoding },
"Wrong data encoding." ) "Wrong data encoding." )
# read and write with transcoding # read and write with transcoding
File.open(@temp_csv_path, "wb:UTF-32BE:#{data.encoding.name}") do |f| File.open(@temp_csv_path, "wb:UTF-32BE:#{data.encoding.name}") do |f|
f << data f << data
@ -189,11 +189,11 @@ class TestEncodings < Test::Unit::TestCase
"Wrong data encoding." ) "Wrong data encoding." )
end end
end end
################################# #################################
### Write CSV in any Encoding ### ### Write CSV in any Encoding ###
################################# #################################
def test_can_write_csv_in_any_encoding def test_can_write_csv_in_any_encoding
each_encoding do |encoding| each_encoding do |encoding|
# test generate_line with encoding hint # test generate_line with encoding hint
@ -204,11 +204,11 @@ class TestEncodings < Test::Unit::TestCase
next next
end end
assert_equal(encoding, csv.encoding) assert_equal(encoding, csv.encoding)
# test generate_line with encoding guessing from fields # test generate_line with encoding guessing from fields
csv = %w[abc d|ef].map { |f| f.encode(encoding) }.to_csv(col_sep: "|") csv = %w[abc d|ef].map { |f| f.encode(encoding) }.to_csv(col_sep: "|")
assert_equal(encoding, csv.encoding) assert_equal(encoding, csv.encoding)
# writing to files # writing to files
data = encode_ary([%w[abc d,ef], %w[123 456 ]], encoding) data = encode_ary([%w[abc d,ef], %w[123 456 ]], encoding)
CSV.open(@temp_csv_path, "wb:#{encoding.name}") do |f| CSV.open(@temp_csv_path, "wb:#{encoding.name}") do |f|
@ -217,9 +217,17 @@ class TestEncodings < Test::Unit::TestCase
assert_equal(data, CSV.read(@temp_csv_path, encoding: encoding.name)) assert_equal(data, CSV.read(@temp_csv_path, encoding: encoding.name))
end end
end end
# Regression test for [ruby-core:33135]: a row that mixes a US-ASCII field
# with a UTF-8 field must produce CSV output in the wider (UTF-8) encoding.
def test_encoding_is_upgraded_during_writing_as_needed
  ascii_field = "foo".force_encoding("US-ASCII")
  utf8_field = "\u3042"
  row = [ascii_field, utf8_field]

  # Sanity-check the fixtures before exercising CSV.
  assert_equal("US-ASCII", row.first.encoding.name)
  assert_equal("UTF-8", row.last.encoding.name)

  # Array#join on the same row is expected to yield UTF-8 ...
  assert_equal("UTF-8", row.join.encoding.name)
  # ... and CSV generation is expected to yield the same encoding.
  assert_equal("UTF-8", row.to_csv.encoding.name)
end
private private
def assert_parses(fields, encoding, options = { }) def assert_parses(fields, encoding, options = { })
encoding = Encoding.find(encoding) unless encoding.is_a? Encoding encoding = Encoding.find(encoding) unless encoding.is_a? Encoding
fields = encode_ary(fields, encoding) fields = encode_ary(fields, encoding)
@ -229,11 +237,11 @@ class TestEncodings < Test::Unit::TestCase
assert_equal(encoding, field.encoding, "Field[#{i + 1}] was transcoded.") assert_equal(encoding, field.encoding, "Field[#{i + 1}] was transcoded.")
end end
end end
def encode_ary(ary, encoding) def encode_ary(ary, encoding)
ary.map { |row| row.map { |field| field.encode(encoding) } } ary.map { |row| row.map { |field| field.encode(encoding) } }
end end
def ary_to_data(ary, options = { }) def ary_to_data(ary, options = { })
encoding = ary.flatten.first.encoding encoding = ary.flatten.first.encoding
quote_char = (options[:quote_char] || '"').encode(encoding) quote_char = (options[:quote_char] || '"').encode(encoding)
@ -245,12 +253,12 @@ class TestEncodings < Test::Unit::TestCase
}.join(col_sep) + row_sep }.join(col_sep) + row_sep
}.join.encode(encoding) }.join.encode(encoding)
end end
def encode_for_tests(data, options = { }) def encode_for_tests(data, options = { })
yield ary_to_data(encode_ary(data, "UTF-8"), options) yield ary_to_data(encode_ary(data, "UTF-8"), options)
yield ary_to_data(encode_ary(data, "UTF-16BE"), options) yield ary_to_data(encode_ary(data, "UTF-16BE"), options)
end end
def each_encoding def each_encoding
Encoding.list.each do |encoding| Encoding.list.each do |encoding|
next if encoding.dummy? # skip "dummy" encodings next if encoding.dummy? # skip "dummy" encodings