mirror of
https://github.com/rails/rails.git
synced 2022-11-09 12:12:34 -05:00
replace edit distance implementation with one written from scratch to avoid license issues
This commit is contained in:
parent
3aae3db170
commit
bf14453af9
1 changed files with 23 additions and 106 deletions
|
@ -1,112 +1,29 @@
|
|||
#
|
||||
# Levenshtein distance algorithm implementation for Ruby, with UTF-8 support
|
||||
#
|
||||
# Author:: Paul BATTLEY (pbattley @ gmail.com)
|
||||
# Version:: 1.3
|
||||
# Date:: 2005-04-19
|
||||
#
|
||||
# == About
|
||||
#
|
||||
# The Levenshtein distance is a measure of how similar two strings s and t are,
|
||||
# calculated as the number of deletions/insertions/substitutions needed to
|
||||
# transform s into t. The greater the distance, the more the strings differ.
|
||||
#
|
||||
# The Levenshtein distance is also sometimes referred to as the
|
||||
# easier-to-pronounce-and-spell 'edit distance'.
|
||||
#
|
||||
# == Revision history
|
||||
#
|
||||
# * 2005-05-19 1.3 Repairing an oversight, distance can now be called via
|
||||
# Levenshtein.distance(s, t)
|
||||
# * 2005-05-04 1.2 Now uses just one 1-dimensional array. I think this is as
|
||||
# far as optimisation can go.
|
||||
# * 2005-05-04 1.1 Now storing only the current and previous rows of the matrix
|
||||
# instead of the whole lot.
|
||||
#
|
||||
# == Licence
|
||||
#
|
||||
# Copyright (c) 2005 Paul Battley
|
||||
#
|
||||
# Usage of the works is permitted provided that this instrument is retained
|
||||
# with the works, so that any entity that uses the works is notified of this
|
||||
# instrument.
|
||||
#
|
||||
# DISCLAIMER: THE WORKS ARE WITHOUT WARRANTY.
|
||||
#
|
||||
|
||||
module Levenshtein
|
||||
# Based on the pseudocode in http://en.wikipedia.org/wiki/Levenshtein_distance.
|
||||
def self.distance(s1, s2)
|
||||
s = s1.unpack('U*')
|
||||
t = s2.unpack('U*')
|
||||
m = s.length
|
||||
n = t.length
|
||||
|
||||
#
|
||||
# Calculate the Levenshtein distance between two strings +str1+ and +str2+.
|
||||
# +str1+ and +str2+ should be ASCII or UTF-8.
|
||||
#
|
||||
def distance(str1, str2)
|
||||
s = str1.unpack('U*')
|
||||
t = str2.unpack('U*')
|
||||
n = s.length
|
||||
m = t.length
|
||||
return m if (0 == n)
|
||||
return n if (0 == m)
|
||||
|
||||
d = (0..m).to_a
|
||||
x = nil
|
||||
# matrix initialization
|
||||
d = []
|
||||
0.upto(m) { |i| d << [i] }
|
||||
0.upto(n) { |j| d[0][j] = j }
|
||||
|
||||
(0...n).each do |i|
|
||||
e = i+1
|
||||
(0...m).each do |j|
|
||||
cost = (s[i] == t[j]) ? 0 : 1
|
||||
x = [
|
||||
d[j+1] + 1, # insertion
|
||||
e + 1, # deletion
|
||||
d[j] + cost # substitution
|
||||
].min
|
||||
d[j] = e
|
||||
e = x
|
||||
end
|
||||
d[m] = x
|
||||
end
|
||||
|
||||
return x
|
||||
# distance computation
|
||||
1.upto(m) do |i|
|
||||
1.upto(n) do |j|
|
||||
cost = s[i] == t[j] ? 0 : 1
|
||||
d[i][j] = [
|
||||
d[i-1][j] + 1, # deletion
|
||||
d[i][j-1] + 1, # insertion
|
||||
d[i-1][j-1] + cost, # substitution
|
||||
].min
|
||||
end
|
||||
end
|
||||
|
||||
extend self
|
||||
end
|
||||
|
||||
if (__FILE__ == $0)
|
||||
require 'test/unit'
|
||||
|
||||
class LevenshteinTest < Test::Unit::TestCase
|
||||
include Levenshtein
|
||||
|
||||
EXPECTED = [
|
||||
# Easy ones
|
||||
['test', 'test', 0],
|
||||
['test', 'tent', 1],
|
||||
['gumbo', 'gambol', 2],
|
||||
['kitten', 'sitting', 3],
|
||||
# Empty strings
|
||||
['foo', '', 3],
|
||||
['', '', 0],
|
||||
['a', '', 1],
|
||||
# UTF-8
|
||||
["f\303\266o", 'foo', 1],
|
||||
["fran\303\247ais", 'francais', 1],
|
||||
["fran\303\247ais", "fran\303\246ais", 1],
|
||||
["\347\247\201\343\201\256\345\220\215\345\211\215\343\201\257"<<
|
||||
"\343\203\235\343\203\274\343\203\253\343\201\247\343\201\231",
|
||||
"\343\201\274\343\201\217\343\201\256\345\220\215\345\211\215\343\201"<<
|
||||
"\257\343\203\235\343\203\274\343\203\253\343\201\247\343\201\231",
|
||||
2], # Japanese
|
||||
# Edge cases
|
||||
['a', 'a', 0],
|
||||
['0123456789', 'abcdefghijklmnopqrstuvwxyz', 26]
|
||||
]
|
||||
|
||||
def test_known_distances
|
||||
EXPECTED.each do |a,b,x|
|
||||
assert_equal(x, distance(a, b))
|
||||
assert_equal(x, distance(b, a))
|
||||
end
|
||||
end
|
||||
end
|
||||
# all done
|
||||
return d[m][n]
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in a new issue