1
0
Fork 0
mirror of https://github.com/ruby/ruby.git synced 2022-11-09 12:17:21 -05:00

[ruby/net-http] Add HTTP#response_body_encoding for setting response body encoding

This adds an opt-in way to set the encoding of response bodies.
Setting the accessor to a String or an Encoding instance forces the
specified encoding.  Setting it to true makes Net::HTTP try to detect
the encoding of the response body, either from the Content-Type header
(if it specifies a charset) or by scanning the document for a <meta>
tag that specifies the encoding.  The default is false, in which case
no encoding is forced (the same behavior as before the patch).

Implements [Feature #2567]
Implements [Feature #15517]

https://github.com/ruby/net-http/commit/6233e6b7c1

Co-authored-by: Yui Naruse <naruse@ruby-lang.org>
This commit is contained in:
Jeremy Evans 2022-04-11 08:17:19 -07:00 committed by git
parent 4bd38e8120
commit ebb4378237
4 changed files with 462 additions and 0 deletions

View file

@ -698,6 +698,7 @@ module Net #:nodoc:
@continue_timeout = nil
@max_retries = 1
@debug_output = nil
@response_body_encoding = false
@proxy_from_env = false
@proxy_uri = nil
@ -745,6 +746,18 @@ module Net #:nodoc:
# The local port used to establish the connection.
attr_accessor :local_port
# The encoding to use for the response body. If Encoding, uses the
# specified encoding. If other true value, tries to detect the response
# body encoding.
attr_reader :response_body_encoding
# Stores the encoding to apply to response bodies. A String is resolved to
# the matching Encoding through Encoding.find (which raises for unknown
# names); every other value is stored unchanged.
def response_body_encoding=(value)
  @response_body_encoding = value.is_a?(String) ? Encoding.find(value) : value
end
attr_writer :proxy_from_env
attr_writer :proxy_address
attr_writer :proxy_port
@ -1592,6 +1605,7 @@ module Net #:nodoc:
begin
res = HTTPResponse.read_new(@socket)
res.decode_content = req.decode_content
res.body_encoding = @response_body_encoding
end while res.kind_of?(HTTPInformation)
res.uri = req.uri

View file

@ -84,6 +84,7 @@ class Net::HTTPResponse
@read = false
@uri = nil
@decode_content = false
@body_encoding = false
end
# The HTTP version supported by the server.
@ -106,6 +107,18 @@ class Net::HTTPResponse
# Accept-Encoding header from the user.
attr_accessor :decode_content
# The encoding to use for the response body. If Encoding, use that encoding.
# If other true value, attempt to detect the appropriate encoding, and use
# that.
attr_reader :body_encoding
# Stores the encoding to apply to this response's body. A String is resolved
# to the matching Encoding through Encoding.find (which raises for unknown
# names); every other value is stored unchanged.
def body_encoding=(value)
  @body_encoding = value.is_a?(String) ? Encoding.find(value) : value
end
# Returns a short human-readable summary of the response: its class, the
# values of @code and @message, and the current value of @read.
def inspect
"#<#{self.class} #{@code} #{@message} readbody=#{@read}>"
end
@ -214,6 +227,17 @@ class Net::HTTPResponse
end
@read = true
case enc = @body_encoding
when Encoding, false, nil
# Encoding: force given encoding
# false/nil: do not force encoding
else
# other value: detect encoding from body
enc = detect_encoding(@body)
end
@body.force_encoding(enc) if enc
@body
end
@ -245,6 +269,141 @@ class Net::HTTPResponse
private
# :nodoc:
# Works out which encoding should be applied to the body +str+.
#
# The sources below are consulted in order and the first hit wins:
# 1. the +encoding+ argument, when given;
# 2. the charset parameter of the Content-Type header (type_params);
# 3. a byte order mark at the start of +str+ (check_bom);
# 4. for XML content types, the encoding named in the XML declaration,
#    falling back to UTF-8 when there is no such declaration;
# 5. for text/html, whatever sniff_encoding reports.
#
# Returns an Encoding, an encoding name String, or nil when nothing could be
# determined.
def detect_encoding(str, encoding=nil)
  return encoding if encoding

  charset = type_params['charset']
  return charset if charset

  bom_encoding = check_bom(str)
  return bom_encoding if bom_encoding

  case content_type&.downcase
  when %r{text/x(?:ht)?ml|application/(?:[^+]+\+)?xml}
    # An XML declaration starts with the literal "<?xml". The "?" has to be
    # escaped in the pattern; without it the declaration never matched and
    # the declared encoding was silently replaced by UTF-8.
    xml_declaration = /\A<\?xml[ \t\r\n]+
      version[ \t\r\n]*=[ \t\r\n]*(?:"[0-9.]+"|'[0-9.]*')[ \t\r\n]+
      encoding[ \t\r\n]*=[ \t\r\n]*
      (?:"([A-Za-z][\-A-Za-z0-9._]*)"|'([A-Za-z][\-A-Za-z0-9._]*)')/x
    declaration = xml_declaration.match(str)
    declaration ? (declaration[1] || declaration[2]) : Encoding::UTF_8
  when %r{text/html.*}
    sniff_encoding(str)
  end
end
# :nodoc:
# Partial implementation of the HTML encoding sniffing algorithm, see
# http://www.w3.org/TR/html5/parsing.html#determining-the-character-encoding
#
# A charset found by scanning_meta takes precedence. Otherwise pure ASCII
# content is reported as US-ASCII and content whose bytes form valid UTF-8 as
# UTF-8. The "last visited page" and "frequency analysis" steps (6 and 7) and
# the implementation-defined default (step 8) are not implemented, so nil is
# returned when none of the checks succeed. The +encoding+ argument is
# accepted but not used.
def sniff_encoding(str, encoding=nil)
  declared = scanning_meta(str)
  return declared if declared
  return Encoding::US_ASCII if str.ascii_only?

  # Work on a copy so the caller's string keeps its current encoding.
  utf8_view = str.dup.force_encoding(Encoding::UTF_8)
  utf8_view.valid_encoding? ? Encoding::UTF_8 : nil
end
# :nodoc:
# Looks for a byte order mark at the start of +str+.
#
# Returns Encoding::UTF_16BE, Encoding::UTF_16LE or Encoding::UTF_8 for the
# corresponding BOM, and nil when +str+ does not start with one.
def check_bom(str)
  # Compare raw bytes. Two strings holding the same non-ASCII bytes are not
  # == when their encodings differ, so comparing a slice of a binary
  # (ASCII-8BIT) body against a plain "\xFE\xFF" literal never matched.
  head = str.byteslice(0, 3).b
  if head.start_with?("\xFE\xFF".b)
    Encoding::UTF_16BE
  elsif head.start_with?("\xFF\xFE".b)
    Encoding::UTF_16LE
  elsif head == "\xEF\xBB\xBF".b
    Encoding::UTF_8
  end
end
# :nodoc:
# Prescans +str+ for <meta> elements that declare a character encoding,
# following the HTML "prescan a byte stream to determine its encoding"
# algorithm.
#
# Every <meta> element is examined in document order (not only the first
# one), so a declaration that follows e.g. a viewport <meta> is still found.
# "<meta" must be followed by whitespace or "/" so that unrelated tags such
# as <metadata> are not mistaken for it, and the tag name is matched
# case-insensitively.
#
# Returns the declared Encoding, or nil when no usable declaration exists.
# This method does not raise on malformed or truncated markup.
def scanning_meta(str)
  require 'strscan' # loaded lazily: only needed when an HTML body is sniffed
  ss = StringScanner.new(str)
  while ss.scan_until(/<meta(?=[\t\n\f\r \/])/i)
    seen = {} # attribute names already handled; later duplicates are ignored
    got_pragma = false
    need_pragma = nil
    charset = nil

    # step: Attributes
    while attr = get_attribute(ss)
      name, value = attr
      next if seen[name]
      seen[name] = true
      case name
      when 'http-equiv'
        got_pragma = true if value == 'content-type'
      when 'content'
        encoding = extracting_encodings_from_meta_elements(value)
        # A content attribute only counts when it actually carries a charset
        # and no charset has been recorded for this element yet.
        if encoding && charset.nil?
          charset = encoding
          need_pragma = true
        end
      when 'charset'
        charset = value
        need_pragma = false
      end
    end

    # step: Processing
    next if need_pragma.nil?
    next if need_pragma && !got_pragma
    begin
      found = Encoding.find(charset)
    rescue ArgumentError
      next # unknown or empty encoding label: try the next <meta> element
    end
    next unless found
    # The declaration was readable as ASCII-compatible bytes, so the document
    # cannot really be UTF-16; the HTML algorithm maps these to UTF-8.
    if [Encoding::UTF_16, Encoding::UTF_16BE, Encoding::UTF_16LE].include?(found)
      found = Encoding::UTF_8
    end
    return found # tentative
  end
  nil
end

# :nodoc:
# Reads one attribute from the StringScanner +ss+, which must be positioned
# inside a tag.
#
# Returns [name, value] with both strings downcased (value is '' when the
# attribute has none), or nil when the tag ends (">", which is consumed) or
# the input is exhausted.
def get_attribute(ss)
  ss.skip(/[\t\n\f\r \/]*/)
  return nil if ss.eos?
  if ss.peek(1) == '>'
    ss.getch
    return nil
  end

  # A leading "=" belongs to the name; elsewhere "=" terminates it. After the
  # checks above this always consumes at least one byte.
  name = ss.scan(/=?[^=\t\n\f\r \/>]*/).downcase
  ss.skip(/[\t\n\f\r ]*/)
  # Only consume "=" when it is really there, so that the first byte of the
  # following attribute is not swallowed after a valueless attribute.
  return [name, ''] unless ss.skip(/=/)

  ss.skip(/[\t\n\f\r ]*/)
  value =
    case ss.peek(1)
    when '"'
      ss.getch
      quoted = ss.scan(/[^"]*/) # "*" so that an empty value yields '' not nil
      ss.getch
      quoted
    when "'"
      ss.getch
      quoted = ss.scan(/[^']*/)
      ss.getch
      quoted
    when '>', ''
      # Tag closes (left for the next call to consume) or input ends.
      ''
    else
      ss.scan(/[^\t\n\f\r >]+/)
    end
  [name, value.downcase]
end
# Extracts the charset label from the value of a <meta content="..."> attribute.
# http://dev.w3.org/html5/spec/fetching-resources.html#algorithm-for-extracting-an-encoding-from-a-meta-element
#
# Whitespace is allowed on both sides of the "=" sign, as the algorithm
# requires. Returns the label as a String, or nil when +value+ has no charset
# or the charset is empty, unterminated or missing.
def extracting_encodings_from_meta_elements(value)
  charset_pattern =
    /charset[\t\n\f\r ]*=[\t\n\f\r ]*(?:"([^"]*)"|'([^']*)'|["']|\z|([^\t\n\f\r ;]+))/i
  found = charset_pattern.match(value)
  # Groups: double-quoted, single-quoted, unquoted. The bare-quote and
  # end-of-string alternatives capture nothing and therefore yield nil.
  found && (found[1] || found[2] || found[3])
end
##
# Checks for a supported Content-Encoding header and yields an Inflate
# wrapper for this response's socket when zlib is present. If the

View file

@ -1294,3 +1294,57 @@ class TestNetHTTPLocalBind < Test::Unit::TestCase
end
end
# Checks Net::HTTP#response_body_encoding through real requests: the /fe
# handler always answers with the body "hello\u1234" and each test inspects
# the encoding of the body returned by the client.
class TestNetHTTPForceEncoding < Test::Unit::TestCase
  CONFIG = {
    'host' => 'localhost',
    'proxy_host' => nil,
    'proxy_port' => nil,
  }

  include TestNetHTTPUtils

  # Mounts the /fe handler (sending +content_type+ as Content-Type when
  # given), issues a GET with response_body_encoding set to +force_enc+ and
  # returns the response.
  def fe_request(force_enc, content_type=nil)
    @server.mount_proc('/fe') do |_request, response|
      response['Content-Type'] = content_type if content_type
      response.body = "hello\u1234"
    end

    client = Net::HTTP.new(config('host'), config('port'))
    client.local_host = Addrinfo.tcp(config('host'), config('port')).ip_address
    assert_not_nil(client.local_host)
    assert_nil(client.local_port)
    client.response_body_encoding = force_enc
    client.get('/fe')
  end

  # false: the body is left as binary.
  def test_response_body_encoding_false
    response = fe_request(false)
    assert_equal("hello\u1234".b, response.body)
    assert_equal(Encoding::ASCII_8BIT, response.body.encoding)
  end

  # true, but no Content-Type is sent: the body stays binary.
  def test_response_body_encoding_true_without_content_type
    response = fe_request(true)
    assert_equal("hello\u1234".b, response.body)
    assert_equal(Encoding::ASCII_8BIT, response.body.encoding)
  end

  # true with a charset in Content-Type: that charset is applied.
  def test_response_body_encoding_true_with_content_type
    response = fe_request(true, 'text/html; charset=utf-8')
    assert_equal("hello\u1234", response.body)
    assert_equal(Encoding::UTF_8, response.body.encoding)
  end

  # An encoding name given as a String is applied.
  def test_response_body_encoding_string_without_content_type
    response = fe_request('utf-8')
    assert_equal("hello\u1234", response.body)
    assert_equal(Encoding::UTF_8, response.body.encoding)
  end

  # An Encoding object is applied.
  def test_response_body_encoding_encoding_without_content_type
    response = fe_request(Encoding::UTF_8)
    assert_equal("hello\u1234", response.body)
    assert_equal(Encoding::UTF_8, response.body.encoding)
  end
end

View file

@ -54,6 +54,241 @@ EOS
assert_equal 'hello', body
end
# With body_encoding left untouched, read_body must return the raw bytes
# tagged as ASCII-8BIT.
def test_read_body_body_encoding_false
  body = "hello\u1234"
  # The empty line after the last header terminates the header section;
  # without it the body line would be parsed as a malformed header.
  io = dummy_io(<<EOS)
HTTP/1.1 200 OK
Connection: close
Content-Length: #{body.bytesize}

#{body}
EOS

  res = Net::HTTPResponse.read_new(io)

  body = nil

  res.reading_body io, true do
    body = res.read_body
  end

  assert_equal "hello\u1234".b, body
  assert_equal Encoding::ASCII_8BIT, body.encoding
end
# An Encoding object assigned to body_encoding is applied to the body.
def test_read_body_body_encoding_encoding
  body = "hello\u1234"
  # The empty line after the last header terminates the header section;
  # without it the body line would be parsed as a malformed header.
  io = dummy_io(<<EOS)
HTTP/1.1 200 OK
Connection: close
Content-Length: #{body.bytesize}

#{body}
EOS

  res = Net::HTTPResponse.read_new(io)
  res.body_encoding = Encoding.find('utf-8')

  body = nil

  res.reading_body io, true do
    body = res.read_body
  end

  assert_equal "hello\u1234", body
  assert_equal Encoding::UTF_8, body.encoding
end
# An encoding name given as a String to body_encoding is applied to the body.
def test_read_body_body_encoding_string
  body = "hello\u1234"
  # The empty line after the last header terminates the header section;
  # without it the body line would be parsed as a malformed header.
  io = dummy_io(<<EOS)
HTTP/1.1 200 OK
Connection: close
Content-Length: #{body.bytesize}

#{body}
EOS

  res = Net::HTTPResponse.read_new(io)
  res.body_encoding = 'utf-8'

  body = nil

  res.reading_body io, true do
    body = res.read_body
  end

  assert_equal "hello\u1234", body
  assert_equal Encoding::UTF_8, body.encoding
end
# Detection is requested (body_encoding = true) but the response carries no
# Content-Type, so the body is expected to stay ASCII-8BIT.
def test_read_body_body_encoding_true_without_content_type_header
  body = "hello\u1234"
  # The empty line after the last header terminates the header section;
  # without it the body line would be parsed as a malformed header.
  io = dummy_io(<<EOS)
HTTP/1.1 200 OK
Connection: close
Content-Length: #{body.bytesize}

#{body}
EOS

  res = Net::HTTPResponse.read_new(io)
  res.body_encoding = true

  body = nil

  res.reading_body io, true do
    body = res.read_body
  end

  assert_equal "hello\u1234".b, body
  assert_equal Encoding::ASCII_8BIT, body.encoding
end
# Detection is requested and Content-Type declares charset=utf-8, so the
# body is expected to come back as UTF-8.
def test_read_body_body_encoding_true_with_utf8_content_type_header
  body = "hello\u1234"
  # The empty line after the last header terminates the header section;
  # without it the body line would be parsed as a malformed header.
  io = dummy_io(<<EOS)
HTTP/1.1 200 OK
Connection: close
Content-Length: #{body.bytesize}
Content-Type: text/plain; charset=utf-8

#{body}
EOS

  res = Net::HTTPResponse.read_new(io)
  res.body_encoding = true

  body = nil

  res.reading_body io, true do
    body = res.read_body
  end

  assert_equal "hello\u1234", body
  assert_equal Encoding::UTF_8, body.encoding
end
# Detection is requested and Content-Type declares charset=iso-8859-1, so
# the same bytes are expected to come back tagged as ISO-8859-1.
def test_read_body_body_encoding_true_with_iso_8859_1_content_type_header
  body = "hello\u1234"
  # The empty line after the last header terminates the header section;
  # without it the body line would be parsed as a malformed header.
  io = dummy_io(<<EOS)
HTTP/1.1 200 OK
Connection: close
Content-Length: #{body.bytesize}
Content-Type: text/plain; charset=iso-8859-1

#{body}
EOS

  res = Net::HTTPResponse.read_new(io)
  res.body_encoding = true

  body = nil

  res.reading_body io, true do
    body = res.read_body
  end

  assert_equal "hello\u1234".force_encoding("ISO-8859-1"), body
  assert_equal Encoding::ISO_8859_1, body.encoding
end
# Detection is requested for a text/html response without a header charset;
# the <meta charset="utf-8"> tag in the document is expected to be used.
def test_read_body_body_encoding_true_with_utf8_meta_charset
  res_body = "<html><meta charset=\"utf-8\">hello\u1234</html>"
  # The empty line after the last header terminates the header section;
  # without it the body line would be parsed as a malformed header.
  io = dummy_io(<<EOS)
HTTP/1.1 200 OK
Connection: close
Content-Length: #{res_body.bytesize}
Content-Type: text/html

#{res_body}
EOS

  res = Net::HTTPResponse.read_new(io)
  res.body_encoding = true

  body = nil

  res.reading_body io, true do
    body = res.read_body
  end

  assert_equal res_body, body
  assert_equal Encoding::UTF_8, body.encoding
end
# Detection is requested for a text/html response without a header charset;
# the <meta charset="iso-8859-1"> tag in the document is expected to be used.
def test_read_body_body_encoding_true_with_iso8859_1_meta_charset
  res_body = "<html><meta charset=\"iso-8859-1\">hello\u1234</html>"
  # The empty line after the last header terminates the header section;
  # without it the body line would be parsed as a malformed header.
  io = dummy_io(<<EOS)
HTTP/1.1 200 OK
Connection: close
Content-Length: #{res_body.bytesize}
Content-Type: text/html

#{res_body}
EOS

  res = Net::HTTPResponse.read_new(io)
  res.body_encoding = true

  body = nil

  res.reading_body io, true do
    body = res.read_body
  end

  assert_equal res_body.force_encoding("ISO-8859-1"), body
  assert_equal Encoding::ISO_8859_1, body.encoding
end
# Detection is requested for a text/html response without a header charset;
# the charset inside the <meta http-equiv='content-type' content='...'>
# pragma (UTF-8) is expected to be used.
def test_read_body_body_encoding_true_with_utf8_meta_content_charset
  res_body = "<meta http-equiv='content-type' content='text/html; charset=UTF-8'>hello\u1234</html>"
  # The empty line after the last header terminates the header section;
  # without it the body line would be parsed as a malformed header.
  io = dummy_io(<<EOS)
HTTP/1.1 200 OK
Connection: close
Content-Length: #{res_body.bytesize}
Content-Type: text/html

#{res_body}
EOS

  res = Net::HTTPResponse.read_new(io)
  res.body_encoding = true

  body = nil

  res.reading_body io, true do
    body = res.read_body
  end

  assert_equal res_body, body
  assert_equal Encoding::UTF_8, body.encoding
end
# Detection is requested for a text/html response without a header charset;
# the charset inside the <meta http-equiv='content-type' content='...'>
# pragma (ISO-8859-1) is expected to be used.
def test_read_body_body_encoding_true_with_iso8859_1_meta_content_charset
  res_body = "<meta http-equiv='content-type' content='text/html; charset=ISO-8859-1'>hello\u1234</html>"
  # The empty line after the last header terminates the header section;
  # without it the body line would be parsed as a malformed header.
  io = dummy_io(<<EOS)
HTTP/1.1 200 OK
Connection: close
Content-Length: #{res_body.bytesize}
Content-Type: text/html

#{res_body}
EOS

  res = Net::HTTPResponse.read_new(io)
  res.body_encoding = true

  body = nil

  res.reading_body io, true do
    body = res.read_body
  end

  assert_equal res_body.force_encoding("ISO-8859-1"), body
  assert_equal Encoding::ISO_8859_1, body.encoding
end
def test_read_body_block
io = dummy_io(<<EOS)
HTTP/1.1 200 OK