# lib/restclient/response.rb
#
# Apply the charset declared in the response's Content-Type header to the
# response object itself.
#
# The charset name is read from the response headers and resolved through
# RestClient::Utils.find_encoding. When it resolves, force_encoding is
# called on the response, which relabels it in place. When it does not
# resolve, the response is left untouched and, if a RestClient log is
# configured, a "No such encoding" line is appended to it.
#
# NOTE(review): the diff context shows a bare `private` above this
# definition. A bare `private` does not apply to `def self.` methods, so
# this method is still publicly callable -- confirm whether
# `private_class_method` was intended.
#
# @param [Response] response
#
# @return [Encoding,nil] the encoding that was applied, or nil when the
#   headers carry no charset or the charset could not be resolved
#
def self.fix_encoding(response)
  declared = RestClient::Utils.get_encoding_from_headers(response.headers)
  return unless declared

  resolved = RestClient::Utils.find_encoding(declared)

  unless resolved
    # Unknown charset: leave the response as-is and only report it.
    RestClient.log << "No such encoding: #{declared.inspect}" if RestClient.log
    return resolved
  end

  response.force_encoding(resolved)
  resolved
end
# lib/restclient/utils.rb
#
# Return the Encoding for a String encoding name.
#
# Lookup happens in two steps:
#
# 1. If this Ruby provides URI.get_encoding, ask it first so that the
#    encoding names and aliases specified by HTML5 are supported.
# 2. Otherwise, or when URI.get_encoding does not know the name, fall back
#    to Encoding.find so that every name Ruby itself knows (for example
#    `binary` or `UTF-32BE`, which are not HTML5 labels) still resolves.
#
# Note that the HTML5 specification indicates that certain valid encodings
# be treated as other similar encodings. For example, `ISO-8859-1` is
# rendered as `Windows-1252` even though it differs in certain control
# characters.
#
# This method never raises for a name that cannot be resolved: nil, names
# of the wrong type, malformed names and unknown names all yield nil.
#
# NOTE(review): Encoding.find also accepts the special aliases `external`,
# `internal`, `locale` and `filesystem`; confirm whether those should be
# honoured when the name comes from a remote server.
#
# @param [String] name A string encoding name, such as "utf-8"
# @return [Encoding, nil]
#
# @see Encoding.find
# @see URI.get_encoding
# @see https://encoding.spec.whatwg.org/#concept-encoding-get
#
def self.find_encoding(name)
  if URI.respond_to?(:get_encoding)
    encoding = URI.get_encoding(name)
    return encoding if encoding
  end

  begin
    Encoding.find(name)
  rescue ArgumentError, TypeError
    # ArgumentError: unknown or malformed name; TypeError: not a name at all.
    nil
  end
end
# spec/integration/integration_spec.rb
#
# Charset handling examples for multi-byte Chinese encodings. Each example
# serves raw bytes with a Content-Type charset and checks that the response
# is relabelled with that encoding and transcodes to the expected UTF-8 text.

it 'handles big5 traditional chinese' do
  # Raw Big5 bytes. Tagged BINARY, like the other byte-literal bodies in
  # this file, so the fixture is not an invalid UTF-8 string.
  body = "\xB5e\xAE\xD1".force_encoding(Encoding::BINARY)
  body_utf8 = '畫書'
  body_utf8.encoding.should eq Encoding::UTF_8

  stub_request(:get, 'www.example.com').to_return(
    :body => body, :status => 200, :headers => {
      'Content-Type' => 'text/plain; charset=Big5'
  })
  response = RestClient.get 'www.example.com'
  response.encoding.should eq Encoding::Big5
  response.valid_encoding?.should eq true
  response.length.should eq 2
  response.encode('utf-8').should eq body_utf8
end

it 'handles gbk simplified chinese' do
  # Raw GBK bytes, tagged BINARY for the same reason as above.
  body = "\xBB\xAD\xCA\xE9".force_encoding(Encoding::BINARY)
  body_utf8 = '画书'
  body_utf8.encoding.should eq Encoding::UTF_8

  stub_request(:get, 'www.example.com').to_return(
    :body => body, :status => 200, :headers => {
      'Content-Type' => 'text/plain; charset=GBK'
  })
  response = RestClient.get 'www.example.com'
  response.encoding.should eq Encoding::GBK
  response.valid_encoding?.should eq true
  response.length.should eq 2
  response.encode('utf-8').should eq body_utf8
end
# spec/unit/utils_spec.rb
#
# Examples for RestClient::Utils.find_encoding.
#
# The two lookup-specific examples are gated on the capability that
# find_encoding itself checks (URI.respond_to?(:get_encoding)) rather than
# on a comparison of RUBY_VERSION strings, which is lexical ('10.0' sorts
# before '2.1') and only approximates the real condition.
describe '.find_encoding' do
  it 'finds various normal encoding names' do
    {
      'utf-8' => Encoding::UTF_8,
      'big5' => Encoding::Big5,
      'euc-kr' => Encoding::EUC_KR,
      'WINDOWS-1252' => Encoding::Windows_1252,
      'windows-31j' => Encoding::Windows_31J,
    }.each_pair do |name, enc|
      RestClient::Utils.find_encoding(name).should eq enc
    end
  end

  it 'returns nil on failures' do
    %w[nonexistent utf-99].each do |name|
      RestClient::Utils.find_encoding(name).should be_nil
    end
  end

  # HTML5 labels and aliases: only resolvable through URI.get_encoding.
  it 'uses URI.get_encoding if available', if: URI.respond_to?(:get_encoding) do
    {
      'utf8' => Encoding::UTF_8,
      'utf-16' => Encoding::UTF_16LE,
      'latin1' => Encoding::Windows_1252,
      'iso-8859-1' => Encoding::Windows_1252,
      'shift_jis' => Encoding::Windows_31J,
      'euc-jp' => Encoding::CP51932,
    }.each_pair do |name, enc|
      RestClient::Utils.find_encoding(name).should eq enc
    end
  end

  # Without URI.get_encoding only Ruby's own names and aliases resolve.
  it 'uses Encoding.find if URI.get_encoding unavailable', unless: URI.respond_to?(:get_encoding) do
    {
      'utf8' => nil,
      'utf-16' => Encoding::UTF_16,
      'latin1' => nil,
      'iso-8859-1' => Encoding::ISO_8859_1,
      'shift_jis' => Encoding::Shift_JIS,
      'euc-jp' => Encoding::EUC_JP,
    }.each_pair do |name, enc|
      RestClient::Utils.find_encoding(name).should eq enc
    end
  end
end