Avoid unnecessary force_encoding operations

They're costly. This will also avoid some edge cases where
charlock_holmes assigns a weird encoding to a perfectly valid UTF-8
string.
This commit is contained in:
Alejandro Rodríguez 2017-06-14 12:11:03 -04:00
parent 00c15cc27c
commit 520866a0d0
2 changed files with 19 additions and 4 deletions

View file

@ -14,9 +14,9 @@ module Gitlab
ENCODING_CONFIDENCE_THRESHOLD = 50
def encode!(message)
return nil unless message.respond_to? :force_encoding
return nil unless message.respond_to?(:force_encoding)
return message if message.encoding == Encoding::UTF_8 && message.valid_encoding?
# if message is utf-8 encoding, just return it
message.force_encoding("UTF-8")
return message if message.valid_encoding?
@ -50,6 +50,9 @@ module Gitlab
end
def encode_utf8(message)
return nil unless message.is_a?(String)
return message if message.encoding == Encoding::UTF_8 && message.valid_encoding?
detect = CharlockHolmes::EncodingDetector.detect(message)
if detect && detect[:encoding]
begin

View file

@ -6,6 +6,9 @@ describe Gitlab::EncodingHelper do
describe '#encode!' do
[
["nil", nil, nil],
["empty string", "".encode("ASCII-8BIT"), "".encode("UTF-8")],
["invalid utf-8 encoded string", "my bad string\xE5".force_encoding("UTF-8"), "my bad string"],
[
'leaves ascii only string as is',
'ascii only string',
@ -81,6 +84,9 @@ describe Gitlab::EncodingHelper do
describe '#encode_utf8' do
[
["nil", nil, nil],
["empty string", "".encode("ASCII-8BIT"), "".encode("UTF-8")],
["invalid utf-8 encoded string", "my bad string\xE5".force_encoding("UTF-8"), "my bad stringå"],
[
"encodes valid utf8 encoded string to utf8",
"λ, λ, λ".encode("UTF-8"),
@ -95,12 +101,18 @@ describe Gitlab::EncodingHelper do
"encodes valid ISO-8859-1 encoded string to utf8",
"Rüby ist eine Programmiersprache. Wir verlängern den text damit ICU die Sprache erkennen kann.".encode("ISO-8859-1", "UTF-8"),
"Rüby ist eine Programmiersprache. Wir verlängern den text damit ICU die Sprache erkennen kann.".encode("UTF-8")
],
[
# Test case from https://gitlab.com/gitlab-org/gitlab-ce/issues/39227
"Equifax branch name",
"refs/heads/Equifax".encode("UTF-8"),
"refs/heads/Equifax".encode("UTF-8")
]
].each do |description, test_string, xpect|
it description do
r = ext_class.encode_utf8(test_string.force_encoding('UTF-8'))
r = ext_class.encode_utf8(test_string)
expect(r).to eq(xpect)
expect(r.encoding.name).to eq('UTF-8')
expect(r.encoding.name).to eq('UTF-8') if xpect
end
end