diff --git a/ChangeLog b/ChangeLog index 62fbae0afb..5c2d7f8b39 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,22 @@ +Thu Mar 18 00:00:58 2010 NARUSE, Yui + + * lib/uri/common.rb (URI#{en,de}code_www_form_component): + renamed from URI#{en,de}code_www_component. [ruby-dev:40672] + + * lib/uri/common.rb (URI#encode_www_form_component): %-encoded + element should have always two hex. + + * lib/uri/common.rb (URI#encode_www_form_component): + better treatment for ASCII incompatible encodings and + encodings whose lead byte may use 7bit. + + * lib/uri/common.rb (URI#decode_www_form_component): add %20. + + * lib/uri/common.rb (URI#decode_www_form_component): add + result's encoding as 2nd argument. + + * lib/uri/common.rb (URI#decode_www_form): added. + Wed Mar 17 16:25:53 2010 Nobuyoshi Nakada * hash.c (rb_hash_aset): allow recursive key. [ruby-core:24648] diff --git a/lib/uri/common.rb b/lib/uri/common.rb index b4de6ad40c..d9aa15a4c3 100644 --- a/lib/uri/common.rb +++ b/lib/uri/common.rb @@ -729,49 +729,60 @@ module URI # # This refers http://www.w3.org/TR/html5/forms.html#url-encoded-form-data # - # See URI.decode_www_component(str), URI.encode_www_form(enum) - def self.encode_www_component(str) + # See URI.decode_www_form_component, URI.encode_www_form + def self.encode_www_form_component(str) if TBLENCWWWCOMP_.empty? 256.times do |i| case i when 0x20 TBLENCWWWCOMP_[' '] = '+' - when 0x2A, 0x2D, 0x2E, 0x30..0x39, 0x41..0x5A, 0x5F, 0x61..0x7A + # when 0x2A, 0x2D, 0x2E, 0x30..0x39, 0x41..0x5A, 0x5F, 0x61..0x7A else - TBLENCWWWCOMP_[i.chr] = '%%%X' % i + TBLENCWWWCOMP_[i.chr] = '%%%02X' % i end end TBLENCWWWCOMP_.freeze end - str = str.to_s.dup - enc = str.encoding - str.force_encoding(Encoding::ASCII_8BIT) - str.gsub!(/[^*\-.0-9A-Z_a-z]/, TBLENCWWWCOMP_) - str.force_encoding(enc) + str = str.to_s + case str.encoding + when Encoding::ASCII_8BIT, Encoding::US_ASCII, Encoding::UTF_8 + str = str.dup.force_encoding(Encoding::ASCII_8BIT) + str.gsub!(/[^*\-.0-9A-Z_a-z]/, TBLENCWWWCOMP_) + when Encoding::UTF_16BE, Encoding::UTF_16LE, Encoding::UTF_32BE, Encoding::UTF_32LE + reg = Regexp.new('[^*\-.0-9A-Z_a-z]+'.encode(str.encoding)) + str = str.gsub(reg){ + $&.force_encoding(Encoding::ASCII_8BIT).gsub(/./, TBLENCWWWCOMP_). + force_encoding(str.encoding) + } + else + if str.encoding.ascii_compatible? + str = str.gsub(/[^*\-.0-9A-Z_a-z]+/){ + $&.force_encoding(Encoding::ASCII_8BIT).gsub(/./, TBLENCWWWCOMP_)} + else + str = str.force_encoding(Encoding::ASCII_8BIT).gsub(/./, TBLENCWWWCOMP_) + end + end + str.force_encoding(Encoding::US_ASCII) end # Decode given +str+ of URL-encoded form data. # # This decods + to SP. # - # See URI.encode_www_component(str) - def self.decode_www_component(str) + # See URI.encode_www_form_component, URI.decode_www_form + def self.decode_www_form_component(str, enc=Encoding::UTF_8) if TBLDECWWWCOMP_.empty? 256.times do |i| - case i - when 0x20 - TBLDECWWWCOMP_['+'] = ' ' - else - h, l = i>>4, i&15 - TBLDECWWWCOMP_['%%%X%X' % [h, l]] = i.chr - TBLDECWWWCOMP_['%%%x%X' % [h, l]] = i.chr - TBLDECWWWCOMP_['%%%X%x' % [h, l]] = i.chr - TBLDECWWWCOMP_['%%%x%x' % [h, l]] = i.chr - end + h, l = i>>4, i&15 + TBLDECWWWCOMP_['%%%X%X' % [h, l]] = i.chr + TBLDECWWWCOMP_['%%%x%X' % [h, l]] = i.chr + TBLDECWWWCOMP_['%%%X%x' % [h, l]] = i.chr + TBLDECWWWCOMP_['%%%x%x' % [h, l]] = i.chr + TBLDECWWWCOMP_['+'] = ' ' if i == 0x20 end TBLDECWWWCOMP_.freeze end - str.gsub(/\+|%\h\h/, TBLDECWWWCOMP_) + str.gsub(/\+|%\h\h/, TBLDECWWWCOMP_).force_encoding(Encoding::UTF_8) end # Generate URL-encoded form data from given +enum+. @@ -779,7 +790,7 @@ module URI # This generates application/x-www-form-urlencoded data defined in HTML5 # from given an Enumerable object. # - # This internally uses URI.encode_www_component(str). + # This internally uses URI.encode_www_form_component(str). # # This doesn't convert encodings of give items, so convert them before call # this method if you want to send data as other than original encoding or @@ -789,7 +800,7 @@ module URI # # This refers http://www.w3.org/TR/html5/forms.html#url-encoded-form-data # - # See URI.encode_www_component(str) + # See URI.encode_www_form_component, URI.decode_www_form def self.encode_www_form(enum) str = nil enum.each do |k,v| @@ -798,12 +809,43 @@ module URI else str = ''.force_encoding(Encoding::US_ASCII) end - str << encode_www_component(k) + str << encode_www_form_component(k) str << '=' - str << encode_www_component(v) + str << encode_www_form_component(v) end str end + + # Decode URL-encoded form data from given +str+. + # + # This decodes application/x-www-form-urlencoded data + # and returns array of key-value array. + # This internally uses URI.decode_www_form_component. + # + # _charset_ hack is not supported now because the mapping from given charset + # to Ruby's encoding is not clear yet. + # see also http://www.w3.org/TR/html5/syntax.html#character-encodings-0 + # + # This refers http://www.w3.org/TR/html5/forms.html#url-encoded-form-data + # + # ary = URI.decode_www_form("a=1&a=2&b=3") + # p ary #=> [['a', '1'], ['a', '2'], ['b', '3']] + # p ary.assoc('a').last #=> '1' + # p ary.assoc('b').last #=> '3' + # p ary.rassoc('a').last #=> '2' + # p Hash[ary] # => {"a"=>"2", "b"=>"3"} + # + # See URI.decode_www_form_component, URI.encode_www_form + def self.decode_www_form(str, enc=Encoding::UTF_8) + ary = [] + unless /\A\??(?[^=;&]*=[^;&]*(?:[;&][^=;&]*=[^;&]*)*)\z/ =~ str + raise ArgumentError, "invalid data of application/x-www-form-urlencoded (#{str})" + end + query.scan(/([^=;&]+)=([^;&]*)/) do + ary << [decode_www_form_component($1, enc), decode_www_form_component($2, enc)] + end + ary + end end module Kernel diff --git a/test/uri/test_common.rb b/test/uri/test_common.rb index 9a5fd6751f..730210e8b3 100644 --- a/test/uri/test_common.rb +++ b/test/uri/test_common.rb @@ -50,16 +50,22 @@ class TestCommon < Test::Unit::TestCase assert_raise(NoMethodError) { Object.new.URI("http://www.ruby-lang.org/") } end - def test_encode_www_component - assert_equal("+%21%22%23%24%25%26%27%28%29*%2B%2C-.%2F09%3A%3B%3C%3D%3E%3F%40" \ + def test_encode_www_form_component + assert_equal("%00+%21%22%23%24%25%26%27%28%29*%2B%2C-.%2F09%3A%3B%3C%3D%3E%3F%40" \ "AZ%5B%5C%5D%5E_%60az%7B%7C%7D%7E", - URI.encode_www_component(" !\"\#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~")) + URI.encode_www_form_component("\x00 !\"\#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~")) + assert_equal("%95%41", URI.encode_www_form_component( + "\x95\x41".force_encoding(Encoding::Shift_JIS))) + assert_equal("%30%42", URI.encode_www_form_component( + "\x30\x42".force_encoding(Encoding::UTF_16BE))) + assert_equal("%30%42", URI.encode_www_form_component( + "\x30\x42".force_encoding(Encoding::ISO_2022_JP))) end - def test_decode_www_component - assert_equal(" !\"\#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~", - URI.decode_www_component( - "+%21%22%23%24%25%26%27%28%29*%2B%2C-.%2F09%3A%3B%3C%3D%3E%3F%40" \ + def test_decode_www_form_component + assert_equal(" !\"\#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~", + URI.decode_www_form_component( + "%20+%21%22%23%24%25%26%27%28%29*%2B%2C-.%2F09%3A%3B%3C%3D%3E%3F%40" \ "AZ%5B%5C%5D%5E_%60az%7B%7C%7D%7E")) end @@ -74,6 +80,12 @@ class TestCommon < Test::Unit::TestCase assert_equal(expected, URI.encode_www_form([["a", "1"], ["\u3042", "\u6F22"]])) assert_equal(expected, URI.encode_www_form([[:a, 1], [:"\u3042", "\u6F22"]])) end + + def test_decode_www_form + assert_equal([%w[a 1], %w[a 2]], URI.decode_www_form("a=1&a=2")) + assert_equal([%w[a 1], ["\u3042", "\u6F22"]], + URI.decode_www_form("a=1&%E3%81%82=%E6%BC%A2")) + end end