Unnecessary Large Memory Usage of Levenshtein Distance in Phobos

Thu May 22 02:49:09 PDT 2014

I justd discovered that the std.algorithm implementation of 
Levenshtein Distance requires O(m*n) memory usage.

This is not neccessary. I have a C++-implementation of 
Damerau-Levenshtein that requires only O(3*min(m,n)). Is anybody 
interested in discussion modifying std.algorithm to make use of 
this?

Here's C++ implementation:

template<class T> inline void perm3_231(T& a, T& b, T& c) {
     T _t=a; a=b; b=c; c=_t;
}
template<class T> inline pure bool is_min(const T& a) {
     return a == pnw::minof<T>();
}
template<class T> inline pure bool is_max(const T& a) {
     return a == pnw::maxof<T>();
}

template<class T, class D = size_t>
inline pure
D damerau_levenshtein(const T& s_, const T& t_,
                       D max_distance = 
std::numeric_limits<D>::max(),
                       D insert_weight = static_cast<D>(10),
                       D delete_weight = static_cast<D>(7),
                       D replace_weight = static_cast<D>(5),
                       D transposition_weight = static_cast<D>(3))
{
     // reorder s and t to minimize memory usage
     bool ook = s_.size() >= t_.size(); // argument ordering ok 
flag
     const T& s = ook ? s_ : t_; // assure \c s becomes the \em 
longest
     const T& t = ook ? t_ : s_; // assure \c t becomes the \em 
shortest

     const D m = s.size();
     const D n = t.size();

     if (m == 0) { return n; }
     if (n == 0) { return m; }

     // Adapt the algorithm to use less space, O(3*min(n,m)) 
instead of O(mn),
     // since it only requires that the previous row/column and 
current row/column be stored at
     // any one time.
#ifdef HAVE_C99_VARIABLE_LENGTH_ARRAYS
     D cc_[n+1], pc_[n+1], sc_[n+1]; // current, previous and 
second-previous column on stack
#elif HAVE_CXX11_UNIQUE_PTR
     std::unique_ptr<D[]> cc_(new D[n+1]);      // current column
     std::unique_ptr<D[]> pc_(new D[n+1]);      // previous column
     std::unique_ptr<D[]> sc_(new D[n+1]);      // second previous 
column
#else
     auto cc_ = new D[n+1];      // current column
     auto pc_ = new D[n+1];      // previous column
     auto sc_ = new D[n+1];      // second previous column
     //std::vector<D> cc_(n+1), pc_(n+1), sc_(n+1); // current, 
previous and second previous column
#endif
     D * cc = &cc_[0], * pc = &pc_[0], * sc = &sc_[0]; // pointers 
for efficient swapping

     // initialize previous column
     for (D i = 0; i < n+1; ++i) { pc[i] = i * insert_weight; }

     // second previous column \c sc will be defined in second \c 
i iteration in outer-loop

     const auto D_max = std::numeric_limits<D>::max();

     // Computing the Levenshtein distance is based on the 
observation that if we
     // reserve a matrix to hold the Levenshtein distances between 
all prefixes
     // of the first string and all prefixes of the second, then 
we can compute
     // the values in the matrix by flood filling the matrix, and 
thus find the
     // distance between the two full strings as the last value 
computed.
     // This algorithm, an example of bottom-up dynamic 
programming, is
     // discussed, with variants, in the 1974 article The 
String-to-string
     // correction problem by Robert A. Wagner and Michael 
J.Fischer.
     for (D i = 0; i < m; ++i) {
         cc[0] = i+insert_weight;
         auto tmin = D_max; // row/column total min
         for (D j = 0; j < n; ++j) {
             // TODO Use sub_dist
             //auto sub_dist = damerau_levenshtein(s[i], t[j]); // 
recurse if for example T is an std::vector<std::string>
             cc[j+1] = pnw::min(pc[j+1] + insert_weight,         
// insertion
                                cc[j] + delete_weight,           
// deletion
                                pc[j] + (s[i] == t[j] ? 0 : 
replace_weight)); // substitution

             // transposition
             if (not is_max(transposition_weight)) { // if 
transposition should be allowed
                 if (i > 0 and j > 0 and // we need at least two 
characters
                     s[i-1] == t[j] and  // and first must be 
equal second
                     s[i]   == t[j-1]    // and vice versa
                     ) {
                     cc[j+1] = std::min(cc[j+1],
                                        sc[j-1] + 
transposition_weight);
                 }
             }

             if (not is_max(max_distance)) {
                 tmin = std::min(tmin, cc[j+1]);
             }
         }

         if ((not is_max(max_distance)) and
             tmin >= max_distance) {
             // if no element is smaller than \p max_distance
             return max_distance;
         }

         if (transposition_weight) {
             perm3_231(pc, cc, sc); // rotate pointers
         } else {
             std::swap(cc, pc);
         }
     }

#if !(defined(HAVE_C99_VARIABLE_LENGTH_ARRAYS) || 
defined(HAVE_CXX11_UNIQUE_PTR))
     delete [] cc_;
     delete [] pc_;
     delete [] sc_;
#endif
     return pc[n];
}

/*! Get \em Levenshtein (Edit) Distance (LD) metric between the 
\em sequences \p s and \p t.
  * Computing LD is also called Optimal String Alignment (OSA).
  */
template<class T, class D = size_t>
inline pure
D levenshtein(const T& s, const T& t,
               D max_distance = std::numeric_limits<D>::max(),
               D insert_weight = static_cast<D>(10),
               D delete_weight = static_cast<D>(7),
               D replace_weight = static_cast<D>(5))
{
     return damerau_levenshtein(s, t, max_distance, insert_weight, 
delete_weight, replace_weight,
                                std::numeric_limits<D>::max());
}

/*! Get \em Levenshtein (Edit) Distance (LD) metric between the 
\em arrays \p s and \p t.
  * Computing LD is also called Optimal String Alignment (OSA).
  */
template<class D = size_t>
inline pure
D levenshtein(const char * s, const char * t,
               D max_distance = std::numeric_limits<D>::max(),
               D insert_weight = static_cast<D>(10),
               D delete_weight = static_cast<D>(7),
               D replace_weight = static_cast<D>(5))
{
     return levenshtein(csc(s),
                        csc(t),
                        max_distance, insert_weight, 
delete_weight, replace_weight);
}

/* ---------------------------- Group Separator 
---------------------------- */

template<class T, class D = size_t>
inline pure
D test_levenshtein_symmetry(const T& s, const T& t,
                             D max_distance = 
std::numeric_limits<D>::max())
{
     D st = levenshtein(s, t, max_distance, 
static_cast<D>(1),static_cast<D>(1),static_cast<D>(1));
     D ts = levenshtein(t, s, max_distance, 
static_cast<D>(1),static_cast<D>(1),static_cast<D>(1));
     bool sym = (st == ts); // symmetry
     return sym ? st : std::numeric_limits<D>::max();
}