mirror of
https://github.com/rails/rails.git
synced 2022-11-09 12:12:34 -05:00
Ensure Chars#tidy_bytes only tidies broken bytes. Closes #6397 [Manfred Stienstra]
git-svn-id: http://svn-commit.rubyonrails.org/rails/trunk@5316 5ecf4fe2-1ee6-0310-87b1-e25e094e27de
This commit is contained in:
parent
2d33676691
commit
911f3db00a
3 changed files with 18 additions and 8 deletions
|
@ -1,5 +1,7 @@
|
||||||
*SVN*
|
*SVN*
|
||||||
|
|
||||||
|
* Ensure Chars#tidy_bytes only tidies broken bytes. Closes #6397 [Manfred Stienstra]
|
||||||
|
|
||||||
* Add 'unloadable', a method used to mark any constant as requiring an unload after each request. [Nicholas Seckar]
|
* Add 'unloadable', a method used to mark any constant as requiring an unload after each request. [Nicholas Seckar]
|
||||||
|
|
||||||
* Make core_ext/string/access.rb multibyte safe. Closes #6388 [Manfred Stienstra]
|
* Make core_ext/string/access.rb multibyte safe. Closes #6388 [Manfred Stienstra]
|
||||||
|
|
|
@ -259,13 +259,18 @@ module ActiveSupport::Multibyte::Handlers
|
||||||
g_unpack(str).length
|
g_unpack(str).length
|
||||||
end
|
end
|
||||||
|
|
||||||
# Strips all the non-utf-8 bytes from the string resulting in a valid utf-8 string
|
# Replaces all the non-utf-8 bytes with their iso-8859-1 or cp1252 equivalents, resulting in a valid utf-8 string
|
||||||
def tidy_bytes(str)
|
def tidy_bytes(str)
|
||||||
str.unpack('C*').map { |n|
|
str.split(//u).map do |c|
|
||||||
|
if !UTF8_PAT.match(c)
|
||||||
|
n = c.unpack('C')[0]
|
||||||
n < 128 ? n.chr :
|
n < 128 ? n.chr :
|
||||||
n < 160 ? [UCD.cp1252[n] || n].pack('U') :
|
n < 160 ? [UCD.cp1252[n] || n].pack('U') :
|
||||||
n < 192 ? "\xC2" + n.chr : "\xC3" + (n-64).chr
|
n < 192 ? "\xC2" + n.chr : "\xC3" + (n-64).chr
|
||||||
}.join
|
else
|
||||||
|
c
|
||||||
|
end
|
||||||
|
end.join
|
||||||
end
|
end
|
||||||
|
|
||||||
protected
|
protected
|
||||||
|
|
|
@ -228,7 +228,10 @@ module UTF8HandlingTest
|
||||||
def test_tidy_bytes
|
def test_tidy_bytes
|
||||||
result = [0xb8, 0x17e, 0x8, 0x2c6, 0xa5].pack('U*')
|
result = [0xb8, 0x17e, 0x8, 0x2c6, 0xa5].pack('U*')
|
||||||
assert_equal result, @handler.tidy_bytes(@bytestring)
|
assert_equal result, @handler.tidy_bytes(@bytestring)
|
||||||
assert_equal "a#{result}a", @handler.tidy_bytes('a' + @bytestring + 'a')
|
assert_equal "a#{result}a", @handler.tidy_bytes('a' + @bytestring + 'a'),
|
||||||
|
'tidy_bytes should leave surrounding characters intact'
|
||||||
|
assert_equal "é#{result}é", @handler.tidy_bytes('é' + @bytestring + 'é'),
|
||||||
|
'tidy_bytes should leave surrounding characters intact'
|
||||||
assert_nothing_raised { @handler.tidy_bytes(@bytestring).unpack('U*') }
|
assert_nothing_raised { @handler.tidy_bytes(@bytestring).unpack('U*') }
|
||||||
|
|
||||||
assert_equal "\xC3\xA7", @handler.tidy_bytes("\xE7") # iso_8859_1: small c cedilla
|
assert_equal "\xC3\xA7", @handler.tidy_bytes("\xE7") # iso_8859_1: small c cedilla
|
||||||
|
@ -236,7 +239,7 @@ module UTF8HandlingTest
|
||||||
assert_equal "\xE2\x80\x9C", @handler.tidy_bytes("\x93") # win_1252: left smart quote
|
assert_equal "\xE2\x80\x9C", @handler.tidy_bytes("\x93") # win_1252: left smart quote
|
||||||
assert_equal "\xE2\x82\xAC", @handler.tidy_bytes("\x80") # win_1252: euro
|
assert_equal "\xE2\x82\xAC", @handler.tidy_bytes("\x80") # win_1252: euro
|
||||||
assert_equal "\x00", @handler.tidy_bytes("\x00") # null char
|
assert_equal "\x00", @handler.tidy_bytes("\x00") # null char
|
||||||
assert_equal [0xef, 0xbf, 0xbd].pack('U*'), @handler.tidy_bytes("\xef\xbf\xbd") # invalid char
|
assert_equal [0xfffd].pack('U'), @handler.tidy_bytes("\xef\xbf\xbd") # invalid char
|
||||||
end
|
end
|
||||||
|
|
||||||
protected
|
protected
|
||||||
|
|
Loading…
Reference in a new issue