<div dir="ltr">I've begun work on this and my implementation in D passes all the std.algorithm unit tests, but because it now uses a single array instead of a matrix, path() no longer provides the correct answer.<br><br>
I'm working on trying to amend it so that there is consistency.</div><div class="gmail_extra"><br><br><div class="gmail_quote">On Thu, May 22, 2014 at 12:26 PM, Max Klyga via Digitalmars-d <span dir="ltr"><<a href="mailto:digitalmars-d@puremagic.com" target="_blank">digitalmars-d@puremagic.com</a>></span> wrote:<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">You should make a pull request with this implementation adapted to std.algorithm interface<div class="HOEnZb"><div class="h5">
<br>
<br>
On 2014-05-22 09:49:09 +0000, Nordlöw said:<br>
<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
I justd discovered that the std.algorithm implementation of Levenshtein Distance requires O(m*n) memory usage.<br>
<br>
This is not neccessary. I have a C++-implementation of Damerau-Levenshtein that requires only O(3*min(m,n)). Is anybody interested in discussion modifying std.algorithm to make use of this?<br>
<br>
Here's C++ implementation:<br>
<br>
<br>
template<class T> inline void perm3_231(T& a, T& b, T& c) {<br>
T _t=a; a=b; b=c; c=_t;<br>
}<br>
template<class T> inline pure bool is_min(const T& a) {<br>
return a == pnw::minof<T>();<br>
}<br>
template<class T> inline pure bool is_max(const T& a) {<br>
return a == pnw::maxof<T>();<br>
}<br>
<br>
template<class T, class D = size_t><br>
inline pure<br>
D damerau_levenshtein(const T& s_, const T& t_,<br>
D max_distance = std::numeric_limits<D>::max(),<br>
D insert_weight = static_cast<D>(10),<br>
D delete_weight = static_cast<D>(7),<br>
D replace_weight = static_cast<D>(5),<br>
D transposition_weight = static_cast<D>(3))<br>
{<br>
// reorder s and t to minimize memory usage<br>
bool ook = s_.size() >= t_.size(); // argument ordering ok flag<br>
const T& s = ook ? s_ : t_; // assure \c s becomes the \em longest<br>
const T& t = ook ? t_ : s_; // assure \c t becomes the \em shortest<br>
<br>
const D m = s.size();<br>
const D n = t.size();<br>
<br>
if (m == 0) { return n; }<br>
if (n == 0) { return m; }<br>
<br>
// Adapt the algorithm to use less space, O(3*min(n,m)) instead of O(mn),<br>
// since it only requires that the previous row/column and current row/column be stored at<br>
// any one time.<br>
#ifdef HAVE_C99_VARIABLE_LENGTH_<u></u>ARRAYS<br>
D cc_[n+1], pc_[n+1], sc_[n+1]; // current, previous and second-previous column on stack<br>
#elif HAVE_CXX11_UNIQUE_PTR<br>
std::unique_ptr<D[]> cc_(new D[n+1]); // current column<br>
std::unique_ptr<D[]> pc_(new D[n+1]); // previous column<br>
std::unique_ptr<D[]> sc_(new D[n+1]); // second previous column<br>
#else<br>
auto cc_ = new D[n+1]; // current column<br>
auto pc_ = new D[n+1]; // previous column<br>
auto sc_ = new D[n+1]; // second previous column<br>
//std::vector<D> cc_(n+1), pc_(n+1), sc_(n+1); // current, previous and second previous column<br>
#endif<br>
D * cc = &cc_[0], * pc = &pc_[0], * sc = &sc_[0]; // pointers for efficient swapping<br>
<br>
// initialize previous column<br>
for (D i = 0; i < n+1; ++i) { pc[i] = i * insert_weight; }<br>
<br>
// second previous column \c sc will be defined in second \c i iteration in outer-loop<br>
<br>
const auto D_max = std::numeric_limits<D>::max();<br>
<br>
// Computing the Levenshtein distance is based on the observation that if we<br>
// reserve a matrix to hold the Levenshtein distances between all prefixes<br>
// of the first string and all prefixes of the second, then we can compute<br>
// the values in the matrix by flood filling the matrix, and thus find the<br>
// distance between the two full strings as the last value computed.<br>
// This algorithm, an example of bottom-up dynamic programming, is<br>
// discussed, with variants, in the 1974 article The String-to-string<br>
// correction problem by Robert A. Wagner and Michael J.Fischer.<br>
for (D i = 0; i < m; ++i) {<br>
cc[0] = i+insert_weight;<br>
auto tmin = D_max; // row/column total min<br>
for (D j = 0; j < n; ++j) {<br>
// TODO Use sub_dist<br>
//auto sub_dist = damerau_levenshtein(s[i], t[j]); // recurse if for example T is an std::vector<std::string><br>
cc[j+1] = pnw::min(pc[j+1] + insert_weight, // insertion<br>
cc[j] + delete_weight, // deletion<br>
pc[j] + (s[i] == t[j] ? 0 : replace_weight)); // substitution<br>
<br>
// transposition<br>
if (not is_max(transposition_weight)) { // if transposition should be allowed<br>
if (i > 0 and j > 0 and // we need at least two characters<br>
s[i-1] == t[j] and // and first must be equal second<br>
s[i] == t[j-1] // and vice versa<br>
) {<br>
cc[j+1] = std::min(cc[j+1],<br>
sc[j-1] + transposition_weight);<br>
}<br>
}<br>
<br>
if (not is_max(max_distance)) {<br>
tmin = std::min(tmin, cc[j+1]);<br>
}<br>
}<br>
<br>
if ((not is_max(max_distance)) and<br>
tmin >= max_distance) {<br>
// if no element is smaller than \p max_distance<br>
return max_distance;<br>
}<br>
<br>
if (transposition_weight) {<br>
perm3_231(pc, cc, sc); // rotate pointers<br>
} else {<br>
std::swap(cc, pc);<br>
}<br>
}<br>
<br>
#if !(defined(HAVE_C99_VARIABLE_<u></u>LENGTH_ARRAYS) || defined(HAVE_CXX11_UNIQUE_PTR)<u></u>)<br>
delete [] cc_;<br>
delete [] pc_;<br>
delete [] sc_;<br>
#endif<br>
return pc[n];<br>
}<br>
<br>
<br>
/*! Get \em Levenshtein (Edit) Distance (LD) metric between the \em sequences \p s and \p t.<br>
* Computing LD is also called Optimal String Alignment (OSA).<br>
*/<br>
template<class T, class D = size_t><br>
inline pure<br>
D levenshtein(const T& s, const T& t,<br>
D max_distance = std::numeric_limits<D>::max(),<br>
D insert_weight = static_cast<D>(10),<br>
D delete_weight = static_cast<D>(7),<br>
D replace_weight = static_cast<D>(5))<br>
{<br>
return damerau_levenshtein(s, t, max_distance, insert_weight, delete_weight, replace_weight,<br>
std::numeric_limits<D>::max())<u></u>;<br>
}<br>
<br>
/*! Get \em Levenshtein (Edit) Distance (LD) metric between the \em arrays \p s and \p t.<br>
* Computing LD is also called Optimal String Alignment (OSA).<br>
*/<br>
template<class D = size_t><br>
inline pure<br>
D levenshtein(const char * s, const char * t,<br>
D max_distance = std::numeric_limits<D>::max(),<br>
D insert_weight = static_cast<D>(10),<br>
D delete_weight = static_cast<D>(7),<br>
D replace_weight = static_cast<D>(5))<br>
{<br>
return levenshtein(csc(s),<br>
csc(t),<br>
max_distance, insert_weight, delete_weight, replace_weight);<br>
}<br>
<br>
/* ---------------------------- Group Separator ---------------------------- */<br>
<br>
template<class T, class D = size_t><br>
inline pure<br>
D test_levenshtein_symmetry(<u></u>const T& s, const T& t,<br>
D max_distance = std::numeric_limits<D>::max())<br>
{<br>
D st = levenshtein(s, t, max_distance, static_cast<D>(1),static_cast<<u></u>D>(1),static_cast<D>(1));<br>
D ts = levenshtein(t, s, max_distance, static_cast<D>(1),static_cast<<u></u>D>(1),static_cast<D>(1));<br>
bool sym = (st == ts); // symmetry<br>
return sym ? st : std::numeric_limits<D>::max();<br>
}<br>
</blockquote>
<br>
<br>
</div></div></blockquote></div><br></div>