Unnecessary Large Memory Usage of Levenshtein Distance in Phobos

Max Klyga via Digitalmars-d digitalmars-d at puremagic.com
Thu May 22 04:26:23 PDT 2014


You should make a pull request with this implementation adapted to 
std.algorithm interface

On 2014-05-22 09:49:09 +0000, Nordlöw said:

> I justd discovered that the std.algorithm implementation of Levenshtein 
> Distance requires O(m*n) memory usage.
> 
> This is not neccessary. I have a C++-implementation of 
> Damerau-Levenshtein that requires only O(3*min(m,n)). Is anybody 
> interested in discussion modifying std.algorithm to make use of this?
> 
> Here's C++ implementation:
> 
> 
> template<class T> inline void perm3_231(T& a, T& b, T& c) {
>      T _t=a; a=b; b=c; c=_t;
> }
> template<class T> inline pure bool is_min(const T& a) {
>      return a == pnw::minof<T>();
> }
> template<class T> inline pure bool is_max(const T& a) {
>      return a == pnw::maxof<T>();
> }
> 
> template<class T, class D = size_t>
> inline pure
> D damerau_levenshtein(const T& s_, const T& t_,
>                        D max_distance = std::numeric_limits<D>::max(),
>                        D insert_weight = static_cast<D>(10),
>                        D delete_weight = static_cast<D>(7),
>                        D replace_weight = static_cast<D>(5),
>                        D transposition_weight = static_cast<D>(3))
> {
>      // reorder s and t to minimize memory usage
>      bool ook = s_.size() >= t_.size(); // argument ordering ok flag
>      const T& s = ook ? s_ : t_; // assure \c s becomes the \em longest
>      const T& t = ook ? t_ : s_; // assure \c t becomes the \em shortest
> 
>      const D m = s.size();
>      const D n = t.size();
> 
>      if (m == 0) { return n; }
>      if (n == 0) { return m; }
> 
>      // Adapt the algorithm to use less space, O(3*min(n,m)) instead of O(mn),
>      // since it only requires that the previous row/column and current 
> row/column be stored at
>      // any one time.
> #ifdef HAVE_C99_VARIABLE_LENGTH_ARRAYS
>      D cc_[n+1], pc_[n+1], sc_[n+1]; // current, previous and 
> second-previous column on stack
> #elif HAVE_CXX11_UNIQUE_PTR
>      std::unique_ptr<D[]> cc_(new D[n+1]);      // current column
>      std::unique_ptr<D[]> pc_(new D[n+1]);      // previous column
>      std::unique_ptr<D[]> sc_(new D[n+1]);      // second previous column
> #else
>      auto cc_ = new D[n+1];      // current column
>      auto pc_ = new D[n+1];      // previous column
>      auto sc_ = new D[n+1];      // second previous column
>      //std::vector<D> cc_(n+1), pc_(n+1), sc_(n+1); // current, 
> previous and second previous column
> #endif
>      D * cc = &cc_[0], * pc = &pc_[0], * sc = &sc_[0]; // pointers for 
> efficient swapping
> 
>      // initialize previous column
>      for (D i = 0; i < n+1; ++i) { pc[i] = i * insert_weight; }
> 
>      // second previous column \c sc will be defined in second \c i 
> iteration in outer-loop
> 
>      const auto D_max = std::numeric_limits<D>::max();
> 
>      // Computing the Levenshtein distance is based on the observation 
> that if we
>      // reserve a matrix to hold the Levenshtein distances between all prefixes
>      // of the first string and all prefixes of the second, then we can compute
>      // the values in the matrix by flood filling the matrix, and thus find the
>      // distance between the two full strings as the last value computed.
>      // This algorithm, an example of bottom-up dynamic programming, is
>      // discussed, with variants, in the 1974 article The String-to-string
>      // correction problem by Robert A. Wagner and Michael J.Fischer.
>      for (D i = 0; i < m; ++i) {
>          cc[0] = i+insert_weight;
>          auto tmin = D_max; // row/column total min
>          for (D j = 0; j < n; ++j) {
>              // TODO Use sub_dist
>              //auto sub_dist = damerau_levenshtein(s[i], t[j]); // 
> recurse if for example T is an std::vector<std::string>
>              cc[j+1] = pnw::min(pc[j+1] + insert_weight,         // insertion
>                                 cc[j] + delete_weight,           // deletion
>                                 pc[j] + (s[i] == t[j] ? 0 : 
> replace_weight)); // substitution
> 
>              // transposition
>              if (not is_max(transposition_weight)) { // if 
> transposition should be allowed
>                  if (i > 0 and j > 0 and // we need at least two characters
>                      s[i-1] == t[j] and  // and first must be equal second
>                      s[i]   == t[j-1]    // and vice versa
>                      ) {
>                      cc[j+1] = std::min(cc[j+1],
>                                         sc[j-1] + transposition_weight);
>                  }
>              }
> 
>              if (not is_max(max_distance)) {
>                  tmin = std::min(tmin, cc[j+1]);
>              }
>          }
> 
>          if ((not is_max(max_distance)) and
>              tmin >= max_distance) {
>              // if no element is smaller than \p max_distance
>              return max_distance;
>          }
> 
>          if (transposition_weight) {
>              perm3_231(pc, cc, sc); // rotate pointers
>          } else {
>              std::swap(cc, pc);
>          }
>      }
> 
> #if !(defined(HAVE_C99_VARIABLE_LENGTH_ARRAYS) || 
> defined(HAVE_CXX11_UNIQUE_PTR))
>      delete [] cc_;
>      delete [] pc_;
>      delete [] sc_;
> #endif
>      return pc[n];
> }
> 
> 
> /*! Get \em Levenshtein (Edit) Distance (LD) metric between the \em 
> sequences \p s and \p t.
>   * Computing LD is also called Optimal String Alignment (OSA).
>   */
> template<class T, class D = size_t>
> inline pure
> D levenshtein(const T& s, const T& t,
>                D max_distance = std::numeric_limits<D>::max(),
>                D insert_weight = static_cast<D>(10),
>                D delete_weight = static_cast<D>(7),
>                D replace_weight = static_cast<D>(5))
> {
>      return damerau_levenshtein(s, t, max_distance, insert_weight, 
> delete_weight, replace_weight,
>                                 std::numeric_limits<D>::max());
> }
> 
> /*! Get \em Levenshtein (Edit) Distance (LD) metric between the \em 
> arrays \p s and \p t.
>   * Computing LD is also called Optimal String Alignment (OSA).
>   */
> template<class D = size_t>
> inline pure
> D levenshtein(const char * s, const char * t,
>                D max_distance = std::numeric_limits<D>::max(),
>                D insert_weight = static_cast<D>(10),
>                D delete_weight = static_cast<D>(7),
>                D replace_weight = static_cast<D>(5))
> {
>      return levenshtein(csc(s),
>                         csc(t),
>                         max_distance, insert_weight, delete_weight, 
> replace_weight);
> }
> 
> /* ---------------------------- Group Separator ---------------------------- */
> 
> template<class T, class D = size_t>
> inline pure
> D test_levenshtein_symmetry(const T& s, const T& t,
>                              D max_distance = std::numeric_limits<D>::max())
> {
>      D st = levenshtein(s, t, max_distance, 
> static_cast<D>(1),static_cast<D>(1),static_cast<D>(1));
>      D ts = levenshtein(t, s, max_distance, 
> static_cast<D>(1),static_cast<D>(1),static_cast<D>(1));
>      bool sym = (st == ts); // symmetry
>      return sym ? st : std::numeric_limits<D>::max();
> }




More information about the Digitalmars-d mailing list