diff --git a/activesupport/CHANGELOG b/activesupport/CHANGELOG index e3c0c8a71b..9ea01ada2e 100644 --- a/activesupport/CHANGELOG +++ b/activesupport/CHANGELOG @@ -1,5 +1,7 @@ *SVN* +* Ensure Chars#tidy_bytes only tidies broken bytes. Closes #6397 [Manfred Stienstra] + * Add 'unloadable', a method used to mark any constant as requiring an unload after each request. [Nicholas Seckar] * Make core_ext/string/access.rb multibyte safe. Closes #6388 [Manfred Stienstra] diff --git a/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb b/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb index 6c8eb88702..5b64734297 100644 --- a/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb +++ b/activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb @@ -259,13 +259,18 @@ module ActiveSupport::Multibyte::Handlers g_unpack(str).length end - # Strips all the non-utf-8 bytes from the string resulting in a valid utf-8 string + # Replaces all the non-utf-8 bytes with their iso-8859-1 or cp1252 equivalents, resulting in a valid utf-8 string def tidy_bytes(str) - str.unpack('C*').map { |n| - n < 128 ? n.chr : - n < 160 ? [UCD.cp1252[n] || n].pack('U') : - n < 192 ? "\xC2" + n.chr : "\xC3" + (n-64).chr - }.join + str.split(//u).map do |c| + if !UTF8_PAT.match(c) + n = c.unpack('C')[0] + n < 128 ? n.chr : + n < 160 ? [UCD.cp1252[n] || n].pack('U') : + n < 192 ? 
"\xC2" + n.chr : "\xC3" + (n-64).chr + else + c + end + end.join end protected diff --git a/activesupport/test/multibyte_handler_test.rb b/activesupport/test/multibyte_handler_test.rb index 06bc904c04..95d6faec7c 100644 --- a/activesupport/test/multibyte_handler_test.rb +++ b/activesupport/test/multibyte_handler_test.rb @@ -228,7 +228,10 @@ module UTF8HandlingTest def test_tidy_bytes result = [0xb8, 0x17e, 0x8, 0x2c6, 0xa5].pack('U*') assert_equal result, @handler.tidy_bytes(@bytestring) - assert_equal "a#{result}a", @handler.tidy_bytes('a' + @bytestring + 'a') + assert_equal "a#{result}a", @handler.tidy_bytes('a' + @bytestring + 'a'), + 'tidy_bytes should leave surrounding characters intact' + assert_equal "é#{result}é", @handler.tidy_bytes('é' + @bytestring + 'é'), + 'tidy_bytes should leave surrounding characters intact' assert_nothing_raised { @handler.tidy_bytes(@bytestring).unpack('U*') } assert_equal "\xC3\xA7", @handler.tidy_bytes("\xE7") # iso_8859_1: small c cedilla @@ -236,7 +239,7 @@ module UTF8HandlingTest assert_equal "\xE2\x80\x9C", @handler.tidy_bytes("\x93") # win_1252: left smart quote assert_equal "\xE2\x82\xAC", @handler.tidy_bytes("\x80") # win_1252: euro assert_equal "\x00", @handler.tidy_bytes("\x00") # null char - assert_equal [0xef, 0xbf, 0xbd].pack('U*'), @handler.tidy_bytes("\xef\xbf\xbd") # invalid char + assert_equal [0xfffd].pack('U'), @handler.tidy_bytes("\xef\xbf\xbd") # invalid char end protected