Use URI.get_encoding to look up encodings.

Use the (undocumented) URI.get_encoding method introduced in Ruby 2.1 to
look up encodings by the aliases specified in HTML5. This means that the
behavior will differ slightly between versions of Ruby, but the
encodings selected are largely compatible.

For example, `ISO-8859-1` is an alias for `Windows-1252` per the HTML5
specification, while in Ruby versions < 2.1 it will be used as is. These
two encodings are largely compatible, and the alias exists because some
servers return `charset=ISO-8859-1` when they are actually using
`Windows-1252`.

Other aliases that differ include `shift_jis` (rendered as
`Windows-31J`) and `euc-jp` (rendered as `CP51932`).
This commit is contained in:
Andy Brody 2015-11-16 15:23:08 -08:00
parent f6c6485ee9
commit de03c9d4d1
4 changed files with 132 additions and 12 deletions

View File

@ -49,21 +49,29 @@ module RestClient
private
# Automatically set the encoding of the response object based on the
# presence of a Content-Type... charset header.
#
# If a charset is found and represents a valid encoding, call
# force_encoding on the response to alter it to the correct representation.
#
# @param [Response] response
#
# @return [Encoding,nil]
#
def self.fix_encoding(response)
charset = RestClient::Utils.get_encoding_from_headers(response.headers)
encoding = nil
return unless charset
begin
encoding = Encoding.find(charset) if charset
rescue ArgumentError
encoding = RestClient::Utils.find_encoding(charset)
if encoding
response.force_encoding(encoding)
elsif RestClient.log
RestClient.log << "No such encoding: #{charset.inspect}"
end
return unless encoding
response.force_encoding(encoding)
response
encoding
end
def body_truncated(length)

View File

@ -29,6 +29,36 @@ module RestClient
nil
end
# Look up the Encoding object that corresponds to an encoding name.
#
# When URI.get_encoding() is available (ruby 2.1+), it is used so that the
# encoding names and aliases specified by HTML5 are honoured. Otherwise the
# lookup falls back to Encoding.find().
#
# Be aware that the HTML5 specification maps some valid encodings onto
# other, similar ones. For example, `ISO-8859-1` is treated as
# `Windows-1252` even though the two differ in certain control characters.
#
# @param [String] name A string encoding name, such as "utf-8"
# @return [Encoding, nil] the matching Encoding, or nil when the
#   Encoding.find fallback reports an unknown encoding name
#
# @see Encoding.find
# @see URI.get_encoding
# @see https://encoding.spec.whatwg.org/#concept-encoding-get
#
def self.find_encoding(name)
  # Prefer the HTML5-aware lookup whenever this Ruby provides it.
  return URI.get_encoding(name) if URI.respond_to?(:get_encoding)

  begin
    Encoding.find(name)
  rescue ArgumentError => lookup_error
    # Only an unknown name is translated to nil; any other ArgumentError
    # is re-raised unchanged.
    return nil if lookup_error.message.include?('unknown encoding name')
    raise
  end
end
# Parse semi-colon separated, potentially quoted header string iteratively.
#
# @private

View File

@ -61,7 +61,7 @@ describe RestClient do
body = "\xfe".force_encoding('ASCII-8BIT')
stub_request(:get, "www.example.com").to_return(
:body => body, :status => 200, :headers => {
'Content-Type' => 'application/octet-stream; charset=binary'
'Content-Type' => 'application/octet-stream'
})
response = RestClient.get "www.example.com"
response.encoding.should eq Encoding::BINARY
@ -71,7 +71,7 @@ describe RestClient do
response.valid_encoding?.should eq true
end
it 'handles euc-jp' do
it 'handles CP51932 / EUC-JP' do
body = "\xA4\xA2\xA4\xA4\xA4\xA6\xA4\xA8\xA4\xAA".
force_encoding(Encoding::BINARY)
body_utf8 = 'あいうえお'
@ -82,12 +82,48 @@ describe RestClient do
'Content-Type' => 'text/plain; charset=EUC-JP'
})
response = RestClient.get 'www.example.com'
response.encoding.should eq Encoding::EUC_JP
# URI.get_encoding turns EUC-JP into CP51932, though I'm not sure why.
[Encoding::EUC_JP, Encoding::CP51932].should include(response.encoding)
response.valid_encoding?.should eq true
response.length.should eq 5
response.encode('utf-8').should eq body_utf8
end
it 'handles big5 traditional chinese' do
  # Bytes served with a Big5 charset header; they should transcode to the
  # two-character UTF-8 string below.
  big5_bytes = "\xB5e\xAE\xD1"
  expected_utf8 = '畫書'
  expected_utf8.encoding.should eq Encoding::UTF_8

  response_headers = { 'Content-Type' => 'text/plain; charset=Big5' }
  stub_request(:get, 'www.example.com').to_return(
    :body => big5_bytes, :status => 200, :headers => response_headers
  )

  response = RestClient.get 'www.example.com'

  response.encoding.should eq Encoding::Big5
  response.valid_encoding?.should eq true
  response.length.should eq 2
  response.encode('utf-8').should eq expected_utf8
end
it 'handles gbk simplified chinese' do
  # Bytes served with a GBK charset header; they should transcode to the
  # two-character UTF-8 string below.
  gbk_bytes = "\xBB\xAD\xCA\xE9"
  expected_utf8 = '画书'
  expected_utf8.encoding.should eq Encoding::UTF_8

  response_headers = { 'Content-Type' => 'text/plain; charset=GBK' }
  stub_request(:get, 'www.example.com').to_return(
    :body => gbk_bytes, :status => 200, :headers => response_headers
  )

  response = RestClient.get 'www.example.com'

  response.encoding.should eq Encoding::GBK
  response.valid_encoding?.should eq true
  response.length.should eq 2
  response.encode('utf-8').should eq expected_utf8
end
it 'defaults to Encoding.default_external' do
stub_request(:get, 'www.example.com').to_return(
body: 'abc', status: 200, headers: {

View File

@ -33,6 +33,52 @@ describe RestClient::Utils do
end
end
describe '.find_encoding' do
  it 'finds various normal encoding names' do
    # Names expected to resolve to the same Encoding on every Ruby version.
    expected = {
      'utf-8' => Encoding::UTF_8,
      'big5' => Encoding::Big5,
      'euc-kr' => Encoding::EUC_KR,
      'WINDOWS-1252' => Encoding::Windows_1252,
      'windows-31j' => Encoding::Windows_31J,
    }

    expected.each do |label, encoding|
      RestClient::Utils.find_encoding(label).should eq encoding
    end
  end

  it 'returns nil on failures' do
    ['nonexistent', 'utf-99'].each do |label|
      RestClient::Utils.find_encoding(label).should be_nil
    end
  end

  it 'uses URI.get_encoding if available', if: RUBY_VERSION >= '2.1' do
    # Expected results when the HTML5 aliases are applied.
    expected = {
      'utf8' => Encoding::UTF_8,
      'utf-16' => Encoding::UTF_16LE,
      'latin1' => Encoding::Windows_1252,
      'iso-8859-1' => Encoding::Windows_1252,
      'shift_jis' => Encoding::Windows_31J,
      'euc-jp' => Encoding::CP51932,
    }

    expected.each do |label, encoding|
      RestClient::Utils.find_encoding(label).should eq encoding
    end
  end

  it 'uses Encoding.find if URI.get_encoding unavailable', if: RUBY_VERSION < '2.1' do
    # Expected results when only Encoding.find is consulted.
    expected = {
      'utf8' => nil,
      'utf-16' => Encoding::UTF_16,
      'latin1' => nil,
      'iso-8859-1' => Encoding::ISO_8859_1,
      'shift_jis' => Encoding::Shift_JIS,
      'euc-jp' => Encoding::EUC_JP,
    }

    expected.each do |label, encoding|
      RestClient::Utils.find_encoding(label).should eq encoding
    end
  end
end
describe '.cgi_parse_header' do
it 'parses headers' do
RestClient::Utils.cgi_parse_header('text/plain').