<div dir="ltr">I've begun work on this and my implementation in D passes all the std.algorithm unit tests, but because it now uses a single array instead of a matrix, path() no longer provides the correct answer.<br><br>

I'm working on trying to amend it so that there is consistency.</div><div class="gmail_extra"><br><br><div class="gmail_quote">On Thu, May 22, 2014 at 12:26 PM, Max Klyga via Digitalmars-d <span dir="ltr"><<a href="mailto:digitalmars-d@puremagic.com" target="_blank">digitalmars-d@puremagic.com</a>></span> wrote:<br>

<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">You should make a pull request with this implementation adapted to std.algorithm interface<div class="HOEnZb"><div class="h5">

<br>

<br>

On 2014-05-22 09:49:09 +0000, Nordlöw said:<br>

<br>

<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">

I justd discovered that the std.algorithm implementation of Levenshtein Distance requires O(m*n) memory usage.<br>

<br>

This is not neccessary. I have a C++-implementation of Damerau-Levenshtein that requires only O(3*min(m,n)). Is anybody interested in discussion modifying std.algorithm to make use of this?<br>

<br>

Here's C++ implementation:<br>

<br>

<br>

template<class T> inline void perm3_231(T& a, T& b, T& c) {<br>

     T _t=a; a=b; b=c; c=_t;<br>

}<br>

template<class T> inline pure bool is_min(const T& a) {<br>

     return a == pnw::minof<T>();<br>

}<br>

template<class T> inline pure bool is_max(const T& a) {<br>

     return a == pnw::maxof<T>();<br>

}<br>

<br>

template<class T, class D = size_t><br>

inline pure<br>

D damerau_levenshtein(const T& s_, const T& t_,<br>

                       D max_distance = std::numeric_limits<D>::max(),<br>

                       D insert_weight = static_cast<D>(10),<br>

                       D delete_weight = static_cast<D>(7),<br>

                       D replace_weight = static_cast<D>(5),<br>

                       D transposition_weight = static_cast<D>(3))<br>

{<br>

     // reorder s and t to minimize memory usage<br>

     bool ook = s_.size() >= t_.size(); // argument ordering ok flag<br>

     const T& s = ook ? s_ : t_; // assure \c s becomes the \em longest<br>

     const T& t = ook ? t_ : s_; // assure \c t becomes the \em shortest<br>

<br>

     const D m = s.size();<br>

     const D n = t.size();<br>

<br>

     if (m == 0) { return n; }<br>

     if (n == 0) { return m; }<br>

<br>

     // Adapt the algorithm to use less space, O(3*min(n,m)) instead of O(mn),<br>

     // since it only requires that the previous row/column and current row/column be stored at<br>

     // any one time.<br>

#ifdef HAVE_C99_VARIABLE_LENGTH_<u></u>ARRAYS<br>

     D cc_[n+1], pc_[n+1], sc_[n+1]; // current, previous and second-previous column on stack<br>

#elif HAVE_CXX11_UNIQUE_PTR<br>

     std::unique_ptr<D[]> cc_(new D[n+1]);      // current column<br>

     std::unique_ptr<D[]> pc_(new D[n+1]);      // previous column<br>

     std::unique_ptr<D[]> sc_(new D[n+1]);      // second previous column<br>

#else<br>

     auto cc_ = new D[n+1];      // current column<br>

     auto pc_ = new D[n+1];      // previous column<br>

     auto sc_ = new D[n+1];      // second previous column<br>

     //std::vector<D> cc_(n+1), pc_(n+1), sc_(n+1); // current, previous and second previous column<br>

#endif<br>

     D * cc = &cc_[0], * pc = &pc_[0], * sc = &sc_[0]; // pointers for efficient swapping<br>

<br>

     // initialize previous column<br>

     for (D i = 0; i < n+1; ++i) { pc[i] = i * insert_weight; }<br>

<br>

     // second previous column \c sc will be defined in second \c i iteration in outer-loop<br>

<br>

     const auto D_max = std::numeric_limits<D>::max();<br>

<br>

     // Computing the Levenshtein distance is based on the observation that if we<br>

     // reserve a matrix to hold the Levenshtein distances between all prefixes<br>

     // of the first string and all prefixes of the second, then we can compute<br>

     // the values in the matrix by flood filling the matrix, and thus find the<br>

     // distance between the two full strings as the last value computed.<br>

     // This algorithm, an example of bottom-up dynamic programming, is<br>

     // discussed, with variants, in the 1974 article The String-to-string<br>

     // correction problem by Robert A. Wagner and Michael J.Fischer.<br>

     for (D i = 0; i < m; ++i) {<br>

         cc[0] = i+insert_weight;<br>

         auto tmin = D_max; // row/column total min<br>

         for (D j = 0; j < n; ++j) {<br>

             // TODO Use sub_dist<br>

             //auto sub_dist = damerau_levenshtein(s[i], t[j]); // recurse if for example T is an std::vector<std::string><br>

             cc[j+1] = pnw::min(pc[j+1] + insert_weight,         // insertion<br>

                                cc[j] + delete_weight,           // deletion<br>

                                pc[j] + (s[i] == t[j] ? 0 : replace_weight)); // substitution<br>

<br>

             // transposition<br>

             if (not is_max(transposition_weight)) { // if transposition should be allowed<br>

                 if (i > 0 and j > 0 and // we need at least two characters<br>

                     s[i-1] == t[j] and  // and first must be equal second<br>

                     s[i]   == t[j-1]    // and vice versa<br>

                     ) {<br>

                     cc[j+1] = std::min(cc[j+1],<br>

                                        sc[j-1] + transposition_weight);<br>

                 }<br>

             }<br>

<br>

             if (not is_max(max_distance)) {<br>

                 tmin = std::min(tmin, cc[j+1]);<br>

             }<br>

         }<br>

<br>

         if ((not is_max(max_distance)) and<br>

             tmin >= max_distance) {<br>

             // if no element is smaller than \p max_distance<br>

             return max_distance;<br>

         }<br>

<br>

         if (transposition_weight) {<br>

             perm3_231(pc, cc, sc); // rotate pointers<br>

         } else {<br>

             std::swap(cc, pc);<br>

         }<br>

     }<br>

<br>

#if !(defined(HAVE_C99_VARIABLE_<u></u>LENGTH_ARRAYS) || defined(HAVE_CXX11_UNIQUE_PTR)<u></u>)<br>

     delete [] cc_;<br>

     delete [] pc_;<br>

     delete [] sc_;<br>

#endif<br>

     return pc[n];<br>

}<br>

<br>

<br>

/*! Get \em Levenshtein (Edit) Distance (LD) metric between the \em sequences \p s and \p t.<br>

  * Computing LD is also called Optimal String Alignment (OSA).<br>

  */<br>

template<class T, class D = size_t><br>

inline pure<br>

D levenshtein(const T& s, const T& t,<br>

               D max_distance = std::numeric_limits<D>::max(),<br>

               D insert_weight = static_cast<D>(10),<br>

               D delete_weight = static_cast<D>(7),<br>

               D replace_weight = static_cast<D>(5))<br>

{<br>

     return damerau_levenshtein(s, t, max_distance, insert_weight, delete_weight, replace_weight,<br>

                                std::numeric_limits<D>::max())<u></u>;<br>

}<br>

<br>

/*! Get \em Levenshtein (Edit) Distance (LD) metric between the \em arrays \p s and \p t.<br>

  * Computing LD is also called Optimal String Alignment (OSA).<br>

  */<br>

template<class D = size_t><br>

inline pure<br>

D levenshtein(const char * s, const char * t,<br>

               D max_distance = std::numeric_limits<D>::max(),<br>

               D insert_weight = static_cast<D>(10),<br>

               D delete_weight = static_cast<D>(7),<br>

               D replace_weight = static_cast<D>(5))<br>

{<br>

     return levenshtein(csc(s),<br>

                        csc(t),<br>

                        max_distance, insert_weight, delete_weight, replace_weight);<br>

}<br>

<br>

/* ---------------------------- Group Separator ---------------------------- */<br>

<br>

template<class T, class D = size_t><br>

inline pure<br>

D test_levenshtein_symmetry(<u></u>const T& s, const T& t,<br>

                             D max_distance = std::numeric_limits<D>::max())<br>

{<br>

     D st = levenshtein(s, t, max_distance, static_cast<D>(1),static_cast<<u></u>D>(1),static_cast<D>(1));<br>

     D ts = levenshtein(t, s, max_distance, static_cast<D>(1),static_cast<<u></u>D>(1),static_cast<D>(1));<br>

     bool sym = (st == ts); // symmetry<br>

     return sym ? st : std::numeric_limits<D>::max();<br>

}<br>

</blockquote>

<br>

<br>

</div></div></blockquote></div><br></div>