mirror of
https://github.com/ruby/ruby.git
synced 2022-11-09 12:17:21 -05:00
[ruby/net-http] Add HTTP#response_body_encoding for setting response body encoding
This adds an opt-in accessor for setting the encoding of response bodies. Setting the accessor to a String or Encoding instance forces the specified encoding. Setting it to true makes the library try to detect the encoding of the response body, either from the Content-Type header (if it specifies a charset) or by scanning the document for a <meta> tag that specifies the encoding. The default is false, in which case no encoding is forced (same as before the patch). Implements [Feature #2567] Implements [Feature #15517] https://github.com/ruby/net-http/commit/6233e6b7c1 Co-authored-by: Yui Naruse <naruse@ruby-lang.org>
This commit is contained in:
parent
4bd38e8120
commit
ebb4378237
4 changed files with 462 additions and 0 deletions
|
@ -698,6 +698,7 @@ module Net #:nodoc:
|
|||
@continue_timeout = nil
|
||||
@max_retries = 1
|
||||
@debug_output = nil
|
||||
@response_body_encoding = false
|
||||
|
||||
@proxy_from_env = false
|
||||
@proxy_uri = nil
|
||||
|
@ -745,6 +746,18 @@ module Net #:nodoc:
|
|||
# The local port used to establish the connection.
|
||||
attr_accessor :local_port
|
||||
|
||||
# The encoding to use for the response body. If Encoding, uses the
|
||||
# specified encoding. If other true value, tries to detect the response
|
||||
# body encoding.
|
||||
attr_reader :response_body_encoding
|
||||
|
||||
# Set the encoding to use for the response body. If given a String, find
|
||||
# the related Encoding.
|
||||
# Stores the encoding policy for response bodies. A String is treated as an
# encoding name and resolved through Encoding.find (which raises for unknown
# names); any other value is stored as given.
def response_body_encoding=(value)
  @response_body_encoding =
    case value
    when String then Encoding.find(value)
    else value
    end
end
|
||||
|
||||
attr_writer :proxy_from_env
|
||||
attr_writer :proxy_address
|
||||
attr_writer :proxy_port
|
||||
|
@ -1592,6 +1605,7 @@ module Net #:nodoc:
|
|||
begin
|
||||
res = HTTPResponse.read_new(@socket)
|
||||
res.decode_content = req.decode_content
|
||||
res.body_encoding = @response_body_encoding
|
||||
end while res.kind_of?(HTTPInformation)
|
||||
|
||||
res.uri = req.uri
|
||||
|
|
|
@ -84,6 +84,7 @@ class Net::HTTPResponse
|
|||
@read = false
|
||||
@uri = nil
|
||||
@decode_content = false
|
||||
@body_encoding = false
|
||||
end
|
||||
|
||||
# The HTTP version supported by the server.
|
||||
|
@ -106,6 +107,18 @@ class Net::HTTPResponse
|
|||
# Accept-Encoding header from the user.
|
||||
attr_accessor :decode_content
|
||||
|
||||
# The encoding to use for the response body. If Encoding, use that encoding.
|
||||
# If other true value, attempt to detect the appropriate encoding, and use
|
||||
# that.
|
||||
attr_reader :body_encoding
|
||||
|
||||
# Set the encoding to use for the response body. If given a String, find
|
||||
# the related Encoding.
|
||||
# Records how the body should be encoded. Encoding names given as a String
# are looked up with Encoding.find; every other value is kept untouched.
def body_encoding=(value)
  resolved = value.is_a?(String) ? Encoding.find(value) : value
  @body_encoding = resolved
end
|
||||
|
||||
# Short description of the response: class name, status code, status message
# and whether the body has been read.
def inspect
  summary = "#{self.class} #{@code} #{@message}"
  "#<#{summary} readbody=#{@read}>"
end
|
||||
|
@ -214,6 +227,17 @@ class Net::HTTPResponse
|
|||
end
|
||||
@read = true
|
||||
|
||||
case enc = @body_encoding
|
||||
when Encoding, false, nil
|
||||
# Encoding: force given encoding
|
||||
# false/nil: do not force encoding
|
||||
else
|
||||
# other value: detect encoding from body
|
||||
enc = detect_encoding(@body)
|
||||
end
|
||||
|
||||
@body.force_encoding(enc) if enc
|
||||
|
||||
@body
|
||||
end
|
||||
|
||||
|
@ -245,6 +269,141 @@ class Net::HTTPResponse
|
|||
|
||||
private
|
||||
|
||||
# :nodoc:
|
||||
# Works out which encoding should be applied to the body +str+.
#
# Sources are consulted in this order, the first hit wins:
# 1. the +encoding+ argument, when given;
# 2. the +charset+ entry of type_params;
# 3. a byte order mark (check_bom);
# 4. depending on content_type:
#    * XML types: the +encoding+ pseudo-attribute of the XML declaration,
#      falling back to UTF-8 when there is no (matching) declaration;
#    * HTML: sniff_encoding.
#
# Returns an encoding name (String) or an Encoding, or nil when nothing
# could be determined.
def detect_encoding(str, encoding=nil)
  return encoding if encoding

  charset = type_params['charset']
  return charset if charset

  bom_encoding = check_bom(str)
  return bom_encoding if bom_encoding

  case content_type&.downcase
  when %r{text/x(?:ht)?ml|application/(?:[^+]+\+)?xml}
    # An XML declaration starts with "<?xml"; without the "?" the pattern
    # can never match a real document and the declared encoding is ignored.
    /\A<\?xml[ \t\r\n]+
      version[ \t\r\n]*=[ \t\r\n]*(?:"[0-9.]+"|'[0-9.]*')[ \t\r\n]+
      encoding[ \t\r\n]*=[ \t\r\n]*
      (?:"([A-Za-z][\-A-Za-z0-9._]*)"|'([A-Za-z][\-A-Za-z0-9._]*)')/x =~ str
    $1 || $2 || Encoding::UTF_8
  when %r{text/html.*}
    sniff_encoding(str)
  end
end
|
||||
|
||||
# :nodoc:
|
||||
# Partial take on the HTML encoding sniffing algorithm
# (http://www.w3.org/TR/html5/parsing.html#determining-the-character-encoding).
#
# A charset announced by a <meta> element (scanning_meta) wins. Otherwise
# the bytes themselves decide: pure ASCII yields US-ASCII, bytes that form
# valid UTF-8 yield UTF-8. The "last visited page", frequency analysis and
# implementation-defined/user-specified steps of the algorithm are not
# implemented, so nil is returned when none of the above applies.
def sniff_encoding(str, encoding=nil)
  declared = scanning_meta(str)
  return declared if declared
  return Encoding::US_ASCII if str.ascii_only?

  # Probe a copy so the caller's string keeps its current encoding.
  probe = str.dup.force_encoding(Encoding::UTF_8)
  probe.valid_encoding? ? Encoding::UTF_8 : nil
end
|
||||
|
||||
# :nodoc:
|
||||
# Returns the Encoding announced by a byte order mark at the start of +str+
# (UTF-16BE, UTF-16LE or UTF-8), or nil when +str+ does not start with one.
def check_bom(str)
  # Compare raw bytes. String#== is false for two strings that hold the same
  # non-ASCII bytes but carry different encodings, so BOM literals written
  # in the source encoding would never equal the head of a binary body.
  head = str.byteslice(0, 3).b
  return Encoding::UTF_8 if head == "\xEF\xBB\xBF".b

  case head.byteslice(0, 2)
  when "\xFE\xFF".b then Encoding::UTF_16BE
  when "\xFF\xFE".b then Encoding::UTF_16LE
  end
end
|
||||
|
||||
# :nodoc:
|
||||
# Looks at the first <meta ...> tag in +str+ and returns the Encoding it
# declares, or nil when there is no such tag, the tag declares nothing
# usable, or the declared name is unknown to Ruby.
#
# Attributes are read with get_attribute; a +content+ value is parsed with
# extracting_encodings_from_meta_elements. A charset taken from +content+
# only counts when the tag also has http-equiv="content-type".
def scanning_meta(str)
  require 'strscan'
  ss = StringScanner.new(str)
  return nil unless ss.scan_until(/<meta[\t\n\f\r ]*/)

  attrs = {} # attribute names already processed; repeats are ignored
  got_pragma = false
  need_pragma = nil
  charset = nil

  # step: Attributes
  while attr = get_attribute(ss)
    name, value = *attr
    next if attrs[name]
    attrs[name] = true
    case name
    when 'http-equiv'
      got_pragma = true if value == 'content-type'
    when 'content'
      encoding = extracting_encodings_from_meta_elements(value)
      # Only a content attribute that actually supplied the charset makes
      # the pragma mandatory; otherwise an earlier charset="..." attribute
      # on the same tag would be discarded.
      if encoding && charset.nil?
        charset = encoding
        need_pragma = true
      end
    when 'charset'
      need_pragma = false
      charset = value
    end
  end

  # step: Processing
  return if need_pragma.nil?
  return if need_pragma && !got_pragma

  charset =
    begin
      Encoding.find(charset)
    rescue StandardError
      nil # unknown or malformed encoding name
    end
  return unless charset

  # The tag was located by scanning ASCII-compatible bytes, so the document
  # cannot really be UTF-16 in any byte order.
  utf16_family = [Encoding::UTF_16, Encoding::UTF_16BE, Encoding::UTF_16LE]
  charset = Encoding::UTF_8 if utf16_family.include?(charset)
  charset # tentative
end
|
||||
|
||||
# Reads the next attribute of a tag from the StringScanner +ss+.
#
# Returns [name, value], both lower-cased; value is '' for attributes
# without a value or with an empty one. Returns nil when the tag ends
# (the closing '>' is consumed) or when nothing that could be an attribute
# name is left, e.g. because the input is truncated.
def get_attribute(ss)
  ss.scan(/[\t\n\f\r \/]*/)
  if ss.peek(1) == '>'
    ss.getch
    return nil
  end

  name = ss.scan(/[^=\t\n\f\r \/>]*/)
  # End of input or a stray '=': stop scanning instead of raising on
  # malformed markup received from the network.
  return nil if name.nil? || name.empty?
  name = name.downcase

  ss.skip(/[\t\n\f\r ]*/)
  # Only look at the next byte. Consuming it here would swallow the '>' that
  # ends the tag or the first character of the following attribute.
  return [name, ''] unless ss.peek(1) == '='
  ss.getch
  ss.skip(/[\t\n\f\r ]*/)

  value =
    case ss.peek(1)
    when '"'
      ss.getch
      quoted = ss.scan(/[^"]*/) # '*' so that an empty value ("") yields ''
      ss.getch
      quoted
    when "'"
      ss.getch
      quoted = ss.scan(/[^']*/)
      ss.getch
      quoted
    when '>'
      ''
    else
      ss.scan(/[^\t\n\f\r >]+/) # nil at end of input
    end

  [name, (value || '').downcase]
end
|
||||
|
||||
# Extracts the charset name from the +content+ value of a <meta> element,
# e.g. "text/html; charset=utf-8" gives "utf-8". See
# http://dev.w3.org/html5/spec/fetching-resources.html#algorithm-for-extracting-an-encoding-from-a-meta-element
#
# Whitespace is allowed on both sides of the '='. Returns nil when there is
# no charset parameter, when it has no value, or when a quoted value is not
# terminated.
def extracting_encodings_from_meta_elements(value)
  pattern = /charset[\t\n\f\r ]*=[\t\n\f\r ]*(?:"([^"]*)"|'([^']*)'|["']|\z|([^\t\n\f\r ;]+))/i
  found = pattern.match(value)
  found && (found[1] || found[2] || found[3])
end
|
||||
|
||||
##
|
||||
# Checks for a supported Content-Encoding header and yields an Inflate
|
||||
# wrapper for this response's socket when zlib is present. If the
|
||||
|
|
|
@ -1294,3 +1294,57 @@ class TestNetHTTPLocalBind < Test::Unit::TestCase
|
|||
end
|
||||
end
|
||||
|
||||
# Exercises Net::HTTP#response_body_encoding against the test server provided
# by TestNetHTTPUtils. Every test fetches "/fe", whose body is always
# "hello\u1234", and checks the bytes and the encoding tag of the result.
class TestNetHTTPForceEncoding < Test::Unit::TestCase
  CONFIG = {
    'host' => 'localhost',
    'proxy_host' => nil,
    'proxy_port' => nil,
  }

  include TestNetHTTPUtils

  # Mounts the "/fe" handler (sending +content_type+ as Content-Type when
  # given), performs a GET with response_body_encoding set to +force_enc+
  # and returns the response.
  def fe_request(force_enc, content_type=nil)
    @server.mount_proc('/fe') do |_req, reply|
      reply['Content-Type'] = content_type if content_type
      reply.body = "hello\u1234"
    end

    client = Net::HTTP.new(config('host'), config('port'))
    client.local_host = Addrinfo.tcp(config('host'), config('port')).ip_address
    assert_not_nil(client.local_host)
    assert_nil(client.local_port)

    client.response_body_encoding = force_enc
    client.get('/fe')
  end

  # false: the body is left as binary.
  def test_response_body_encoding_false
    response = fe_request(false)
    assert_equal("hello\u1234".b, response.body)
    assert_equal(Encoding::ASCII_8BIT, response.body.encoding)
  end

  # true, but the response carries nothing to detect from: still binary.
  def test_response_body_encoding_true_without_content_type
    response = fe_request(true)
    assert_equal("hello\u1234".b, response.body)
    assert_equal(Encoding::ASCII_8BIT, response.body.encoding)
  end

  # true with a charset in Content-Type: that charset is applied.
  def test_response_body_encoding_true_with_content_type
    response = fe_request(true, 'text/html; charset=utf-8')
    assert_equal("hello\u1234", response.body)
    assert_equal(Encoding::UTF_8, response.body.encoding)
  end

  # An encoding name given as a String is applied.
  def test_response_body_encoding_string_without_content_type
    response = fe_request('utf-8')
    assert_equal("hello\u1234", response.body)
    assert_equal(Encoding::UTF_8, response.body.encoding)
  end

  # An Encoding object is applied.
  def test_response_body_encoding_encoding_without_content_type
    response = fe_request(Encoding::UTF_8)
    assert_equal("hello\u1234", response.body)
    assert_equal(Encoding::UTF_8, response.body.encoding)
  end
end
|
||||
|
|
|
@ -54,6 +54,241 @@ EOS
|
|||
assert_equal 'hello', body
|
||||
end
|
||||
|
||||
# body_encoding left at its default: the body stays binary.
def test_read_body_body_encoding_false
  body = read_body_with_encoding("hello\u1234")

  assert_equal "hello\u1234".b, body
  assert_equal Encoding::ASCII_8BIT, body.encoding
end

# An Encoding object is applied to the body.
def test_read_body_body_encoding_encoding
  body = read_body_with_encoding("hello\u1234", body_encoding: Encoding.find('utf-8'))

  assert_equal "hello\u1234", body
  assert_equal Encoding::UTF_8, body.encoding
end

# An encoding name given as a String is applied to the body.
def test_read_body_body_encoding_string
  body = read_body_with_encoding("hello\u1234", body_encoding: 'utf-8')

  assert_equal "hello\u1234", body
  assert_equal Encoding::UTF_8, body.encoding
end

# Detection requested, but there is no Content-Type to detect from.
def test_read_body_body_encoding_true_without_content_type_header
  body = read_body_with_encoding("hello\u1234", body_encoding: true)

  assert_equal "hello\u1234".b, body
  assert_equal Encoding::ASCII_8BIT, body.encoding
end

# Detection picks up charset=utf-8 from the Content-Type header.
def test_read_body_body_encoding_true_with_utf8_content_type_header
  body = read_body_with_encoding("hello\u1234",
                                 content_type: 'text/plain; charset=utf-8',
                                 body_encoding: true)

  assert_equal "hello\u1234", body
  assert_equal Encoding::UTF_8, body.encoding
end

# Detection picks up charset=iso-8859-1 from the Content-Type header.
def test_read_body_body_encoding_true_with_iso_8859_1_content_type_header
  body = read_body_with_encoding("hello\u1234",
                                 content_type: 'text/plain; charset=iso-8859-1',
                                 body_encoding: true)

  assert_equal "hello\u1234".force_encoding("ISO-8859-1"), body
  assert_equal Encoding::ISO_8859_1, body.encoding
end

# Detection reads <meta charset="utf-8"> from an HTML body.
def test_read_body_body_encoding_true_with_utf8_meta_charset
  res_body = "<html><meta charset=\"utf-8\">hello\u1234</html>"
  body = read_body_with_encoding(res_body, content_type: 'text/html', body_encoding: true)

  assert_equal res_body, body
  assert_equal Encoding::UTF_8, body.encoding
end

# Detection reads <meta charset="iso-8859-1"> from an HTML body.
def test_read_body_body_encoding_true_with_iso8859_1_meta_charset
  res_body = "<html><meta charset=\"iso-8859-1\">hello\u1234</html>"
  body = read_body_with_encoding(res_body, content_type: 'text/html', body_encoding: true)

  assert_equal res_body.force_encoding("ISO-8859-1"), body
  assert_equal Encoding::ISO_8859_1, body.encoding
end

# Detection reads the charset from a <meta http-equiv=content-type> pragma.
def test_read_body_body_encoding_true_with_utf8_meta_content_charset
  res_body = "<meta http-equiv='content-type' content='text/html; charset=UTF-8'>hello\u1234</html>"
  body = read_body_with_encoding(res_body, content_type: 'text/html', body_encoding: true)

  assert_equal res_body, body
  assert_equal Encoding::UTF_8, body.encoding
end

# Same pragma form, declaring ISO-8859-1.
def test_read_body_body_encoding_true_with_iso8859_1_meta_content_charset
  res_body = "<meta http-equiv='content-type' content='text/html; charset=ISO-8859-1'>hello\u1234</html>"
  body = read_body_with_encoding(res_body, content_type: 'text/html', body_encoding: true)

  assert_equal res_body.force_encoding("ISO-8859-1"), body
  assert_equal Encoding::ISO_8859_1, body.encoding
end

# Shared driver for the body_encoding tests above.
#
# Wraps +payload+ in a "200 OK" response (Connection: close, a Content-Length
# equal to the payload's byte size and, when +content_type+ is given, a
# Content-Type header), hands it to the test case's dummy_io helper, parses
# it with Net::HTTPResponse.read_new and returns what read_body yields.
# +body_encoding+ is assigned to the response only when it is not nil, so the
# default call exercises the response's own default.
def read_body_with_encoding(payload, content_type: nil, body_encoding: nil)
  head = ['HTTP/1.1 200 OK', 'Connection: close', "Content-Length: #{payload.bytesize}"]
  head << "Content-Type: #{content_type}" if content_type
  io = dummy_io("#{head.join("\n")}\n\n#{payload}\n")

  res = Net::HTTPResponse.read_new(io)
  res.body_encoding = body_encoding unless body_encoding.nil?

  body = nil
  res.reading_body(io, true) { body = res.read_body }
  body
end
|
||||
|
||||
def test_read_body_block
|
||||
io = dummy_io(<<EOS)
|
||||
HTTP/1.1 200 OK
|
||||
|
|
Loading…
Reference in a new issue