mirror of
https://github.com/rest-client/rest-client.git
synced 2022-11-09 13:49:40 -05:00
Use URI.get_encoding to look up encodings.
Use the (undocumented) URI.get_encoding method, introduced in Ruby 2.1, to look up encodings by the labels and aliases specified in HTML5. As a result, behavior differs slightly between Ruby versions, but the selected encodings are largely compatible. For example, `ISO-8859-1` is an alias for `Windows-1252` per the HTML5 specification, whereas on Ruby versions < 2.1 it is used as is. The two encodings are largely compatible, and the alias exists because some servers return `charset=ISO-8859-1` when they are actually using `Windows-1252`. Other aliases that differ include `shift_jis` (rendered as `Windows-31J`) and `euc-jp` (rendered as `CP51932`).
This commit is contained in:
parent
f6c6485ee9
commit
de03c9d4d1
4 changed files with 132 additions and 12 deletions
|
@ -49,21 +49,29 @@ module RestClient
|
|||
|
||||
private
|
||||
|
||||
# Automatically set the encoding of the response object based on the
|
||||
# presence of a Content-Type... charset header.
|
||||
#
|
||||
# If a charset is found and represents a valid encoding, call
|
||||
# force_encoding on the response to alter it to the correct representation.
|
||||
#
|
||||
# @param [Response] response
|
||||
#
|
||||
# @return [Encoding,nil]
|
||||
#
|
||||
def self.fix_encoding(response)
|
||||
charset = RestClient::Utils.get_encoding_from_headers(response.headers)
|
||||
encoding = nil
|
||||
return unless charset
|
||||
|
||||
begin
|
||||
encoding = Encoding.find(charset) if charset
|
||||
rescue ArgumentError
|
||||
encoding = RestClient::Utils.find_encoding(charset)
|
||||
|
||||
if encoding
|
||||
response.force_encoding(encoding)
|
||||
elsif RestClient.log
|
||||
RestClient.log << "No such encoding: #{charset.inspect}"
|
||||
end
|
||||
|
||||
return unless encoding
|
||||
|
||||
response.force_encoding(encoding)
|
||||
|
||||
response
|
||||
encoding
|
||||
end
|
||||
|
||||
def body_truncated(length)
|
||||
|
|
|
@ -29,6 +29,36 @@ module RestClient
|
|||
nil
|
||||
end
|
||||
|
||||
# Return the Encoding for a String encoding name.
#
# When URI.get_encoding is available (Ruby 2.1+), it is used so that the
# encoding names and aliases specified by HTML5 are supported. Otherwise the
# name is looked up with Encoding.find.
#
# Note that the HTML5 specification indicates that certain valid encodings
# be treated as other similar encodings. For example, `ISO-8859-1` is
# rendered as `Windows-1252` even though it differs in certain control
# characters.
#
# @param [String] name A string encoding name, such as "utf-8"
# @return [Encoding, nil] the matching Encoding, or nil if none is found
# @raise [ArgumentError] re-raised from Encoding.find for failures other
#   than an unknown encoding name (Encoding.find fallback path only)
#
# @see Encoding.find
# @see URI.get_encoding
# @see https://encoding.spec.whatwg.org/#concept-encoding-get
#
def self.find_encoding(name)
  # Prefer the HTML5-aware lookup whenever this Ruby provides it.
  return URI.get_encoding(name) if URI.respond_to?(:get_encoding)

  begin
    Encoding.find(name)
  rescue ArgumentError => e
    # Only an unknown name maps to nil; any other ArgumentError propagates.
    raise unless e.message.include?('unknown encoding name')

    nil
  end
end
|
||||
|
||||
# Parse semi-colon separated, potentially quoted header string iteratively.
|
||||
#
|
||||
# @private
|
||||
|
|
|
@ -61,7 +61,7 @@ describe RestClient do
|
|||
body = "\xfe".force_encoding('ASCII-8BIT')
|
||||
stub_request(:get, "www.example.com").to_return(
|
||||
:body => body, :status => 200, :headers => {
|
||||
'Content-Type' => 'application/octet-stream; charset=binary'
|
||||
'Content-Type' => 'application/octet-stream'
|
||||
})
|
||||
response = RestClient.get "www.example.com"
|
||||
response.encoding.should eq Encoding::BINARY
|
||||
|
@ -71,7 +71,7 @@ describe RestClient do
|
|||
response.valid_encoding?.should eq true
|
||||
end
|
||||
|
||||
it 'handles euc-jp' do
|
||||
it 'handles CP51932 / EUC-JP' do
|
||||
body = "\xA4\xA2\xA4\xA4\xA4\xA6\xA4\xA8\xA4\xAA".
|
||||
force_encoding(Encoding::BINARY)
|
||||
body_utf8 = 'あいうえお'
|
||||
|
@ -82,12 +82,48 @@ describe RestClient do
|
|||
'Content-Type' => 'text/plain; charset=EUC-JP'
|
||||
})
|
||||
response = RestClient.get 'www.example.com'
|
||||
response.encoding.should eq Encoding::EUC_JP
|
||||
|
||||
# URI.get_encoding turns EUC-JP into CP51932, though I'm not sure why.
|
||||
[Encoding::EUC_JP, Encoding::CP51932].should include(response.encoding)
|
||||
|
||||
response.valid_encoding?.should eq true
|
||||
response.length.should eq 5
|
||||
response.encode('utf-8').should eq body_utf8
|
||||
end
|
||||
|
||||
# A response declared as charset=Big5 should be tagged as Big5, be valid in
# that encoding, and transcode to the expected UTF-8 text.
it 'handles big5 traditional chinese' do
  body = "\xB5e\xAE\xD1"
  body_utf8 = '畫書'
  # Sanity check: the expected literal must itself be UTF-8.
  body_utf8.encoding.should eq Encoding::UTF_8

  stub_request(:get, 'www.example.com').to_return(
    :body => body, :status => 200, :headers => {
      'Content-Type' => 'text/plain; charset=Big5'
    })
  response = RestClient.get 'www.example.com'
  response.encoding.should eq Encoding::Big5
  response.valid_encoding?.should eq true
  # Length is counted in characters, so two characters are expected.
  response.length.should eq 2
  response.encode('utf-8').should eq body_utf8
end
|
||||
|
||||
# A response declared as charset=GBK should be tagged as GBK, be valid in
# that encoding, and transcode to the expected UTF-8 text.
it 'handles gbk simplified chinese' do
  body = "\xBB\xAD\xCA\xE9"
  body_utf8 = '画书'
  # Sanity check: the expected literal must itself be UTF-8.
  body_utf8.encoding.should eq Encoding::UTF_8

  stub_request(:get, 'www.example.com').to_return(
    :body => body, :status => 200, :headers => {
      'Content-Type' => 'text/plain; charset=GBK'
    })
  response = RestClient.get 'www.example.com'
  response.encoding.should eq Encoding::GBK
  response.valid_encoding?.should eq true
  # Length is counted in characters, so two characters are expected.
  response.length.should eq 2
  response.encode('utf-8').should eq body_utf8
end
|
||||
|
||||
|
||||
it 'defaults to Encoding.default_external' do
|
||||
stub_request(:get, 'www.example.com').to_return(
|
||||
body: 'abc', status: 200, headers: {
|
||||
|
|
|
@ -33,6 +33,52 @@ describe RestClient::Utils do
|
|||
end
|
||||
end
|
||||
|
||||
# Specs for RestClient::Utils.find_encoding.
#
# The last two examples are mutually exclusive and are gated on whether
# URI.get_encoding exists, which is the same condition find_encoding itself
# branches on, rather than on a lexicographic RUBY_VERSION string comparison.
describe '.find_encoding' do
  it 'finds various normal encoding names' do
    {
      'utf-8' => Encoding::UTF_8,
      'big5' => Encoding::Big5,
      'euc-kr' => Encoding::EUC_KR,
      'WINDOWS-1252' => Encoding::Windows_1252,
      'windows-31j' => Encoding::Windows_31J,
    }.each_pair do |name, enc|
      RestClient::Utils.find_encoding(name).should eq enc
    end
  end

  it 'returns nil on failures' do
    %w{nonexistent utf-99}.each do |name|
      RestClient::Utils.find_encoding(name).should be_nil
    end
  end

  # HTML5 alias behavior: several names resolve to a different encoding
  # than the literal name suggests.
  it 'uses URI.get_encoding if available', if: URI.respond_to?(:get_encoding) do
    {
      'utf8' => Encoding::UTF_8,
      'utf-16' => Encoding::UTF_16LE,
      'latin1' => Encoding::Windows_1252,
      'iso-8859-1' => Encoding::Windows_1252,
      'shift_jis' => Encoding::Windows_31J,
      'euc-jp' => Encoding::CP51932,
    }.each_pair do |name, enc|
      RestClient::Utils.find_encoding(name).should eq enc
    end
  end

  # Plain Encoding.find behavior: names are used as is, and names Ruby does
  # not recognise yield nil.
  it 'uses Encoding.find if URI.get_encoding unavailable', if: !URI.respond_to?(:get_encoding) do
    {
      'utf8' => nil,
      'utf-16' => Encoding::UTF_16,
      'latin1' => nil,
      'iso-8859-1' => Encoding::ISO_8859_1,
      'shift_jis' => Encoding::Shift_JIS,
      'euc-jp' => Encoding::EUC_JP,
    }.each_pair do |name, enc|
      RestClient::Utils.find_encoding(name).should eq enc
    end
  end
end
|
||||
|
||||
describe '.cgi_parse_header' do
|
||||
it 'parses headers' do
|
||||
RestClient::Utils.cgi_parse_header('text/plain').
|
||||
|
|
Loading…
Reference in a new issue