Avoid unnecessary force_encoding
operations
They're costly. This will also avoid some edge cases where charlock_holmes assigns a weird encoding to a perfectly valid UTF-8 string.
This commit is contained in:
parent
00c15cc27c
commit
520866a0d0
2 changed files with 19 additions and 4 deletions
|
@ -14,9 +14,9 @@ module Gitlab
|
|||
ENCODING_CONFIDENCE_THRESHOLD = 50
|
||||
|
||||
def encode!(message)
|
||||
return nil unless message.respond_to? :force_encoding
|
||||
return nil unless message.respond_to?(:force_encoding)
|
||||
return message if message.encoding == Encoding::UTF_8 && message.valid_encoding?
|
||||
|
||||
# if message is utf-8 encoding, just return it
|
||||
message.force_encoding("UTF-8")
|
||||
return message if message.valid_encoding?
|
||||
|
||||
|
@ -50,6 +50,9 @@ module Gitlab
|
|||
end
|
||||
|
||||
def encode_utf8(message)
|
||||
return nil unless message.is_a?(String)
|
||||
return message if message.encoding == Encoding::UTF_8 && message.valid_encoding?
|
||||
|
||||
detect = CharlockHolmes::EncodingDetector.detect(message)
|
||||
if detect && detect[:encoding]
|
||||
begin
|
||||
|
|
|
@ -6,6 +6,9 @@ describe Gitlab::EncodingHelper do
|
|||
|
||||
describe '#encode!' do
|
||||
[
|
||||
["nil", nil, nil],
|
||||
["empty string", "".encode("ASCII-8BIT"), "".encode("UTF-8")],
|
||||
["invalid utf-8 encoded string", "my bad string\xE5".force_encoding("UTF-8"), "my bad string"],
|
||||
[
|
||||
'leaves ascii only string as is',
|
||||
'ascii only string',
|
||||
|
@ -81,6 +84,9 @@ describe Gitlab::EncodingHelper do
|
|||
|
||||
describe '#encode_utf8' do
|
||||
[
|
||||
["nil", nil, nil],
|
||||
["empty string", "".encode("ASCII-8BIT"), "".encode("UTF-8")],
|
||||
["invalid utf-8 encoded string", "my bad string\xE5".force_encoding("UTF-8"), "my bad stringå"],
|
||||
[
|
||||
"encodes valid utf8 encoded string to utf8",
|
||||
"λ, λ, λ".encode("UTF-8"),
|
||||
|
@ -95,12 +101,18 @@ describe Gitlab::EncodingHelper do
|
|||
"encodes valid ISO-8859-1 encoded string to utf8",
|
||||
"Rüby ist eine Programmiersprache. Wir verlängern den text damit ICU die Sprache erkennen kann.".encode("ISO-8859-1", "UTF-8"),
|
||||
"Rüby ist eine Programmiersprache. Wir verlängern den text damit ICU die Sprache erkennen kann.".encode("UTF-8")
|
||||
],
|
||||
[
|
||||
# Test case from https://gitlab.com/gitlab-org/gitlab-ce/issues/39227
|
||||
"Equifax branch name",
|
||||
"refs/heads/Equifax".encode("UTF-8"),
|
||||
"refs/heads/Equifax".encode("UTF-8")
|
||||
]
|
||||
].each do |description, test_string, xpect|
|
||||
it description do
|
||||
r = ext_class.encode_utf8(test_string.force_encoding('UTF-8'))
|
||||
r = ext_class.encode_utf8(test_string)
|
||||
expect(r).to eq(xpect)
|
||||
expect(r.encoding.name).to eq('UTF-8')
|
||||
expect(r.encoding.name).to eq('UTF-8') if xpect
|
||||
end
|
||||
end
|
||||
|
||||
|
|
Loading…
Reference in a new issue