Unnecessary Large Memory Usage of Levenshtein Distance in Phobos

James Carr via Digitalmars-d digitalmars-d at puremagic.com
Thu May 22 09:25:47 PDT 2014


I've begun work on this and my implementation in D passes all the
std.algorithm unit tests, but because it now uses a single array instead of
a matrix, path() no longer provides the correct answer.

I'm working on trying to amend it so that there is consistency.


On Thu, May 22, 2014 at 12:26 PM, Max Klyga via Digitalmars-d <
digitalmars-d at puremagic.com> wrote:

> You should make a pull request with this implementation adapted to
> std.algorithm interface
>
>
> On 2014-05-22 09:49:09 +0000, Nordlöw said:
>
>  I justd discovered that the std.algorithm implementation of Levenshtein
>> Distance requires O(m*n) memory usage.
>>
>> This is not neccessary. I have a C++-implementation of
>> Damerau-Levenshtein that requires only O(3*min(m,n)). Is anybody interested
>> in discussion modifying std.algorithm to make use of this?
>>
>> Here's C++ implementation:
>>
>>
>> template<class T> inline void perm3_231(T& a, T& b, T& c) {
>>      T _t=a; a=b; b=c; c=_t;
>> }
>> template<class T> inline pure bool is_min(const T& a) {
>>      return a == pnw::minof<T>();
>> }
>> template<class T> inline pure bool is_max(const T& a) {
>>      return a == pnw::maxof<T>();
>> }
>>
>> template<class T, class D = size_t>
>> inline pure
>> D damerau_levenshtein(const T& s_, const T& t_,
>>                        D max_distance = std::numeric_limits<D>::max(),
>>                        D insert_weight = static_cast<D>(10),
>>                        D delete_weight = static_cast<D>(7),
>>                        D replace_weight = static_cast<D>(5),
>>                        D transposition_weight = static_cast<D>(3))
>> {
>>      // reorder s and t to minimize memory usage
>>      bool ook = s_.size() >= t_.size(); // argument ordering ok flag
>>      const T& s = ook ? s_ : t_; // assure \c s becomes the \em longest
>>      const T& t = ook ? t_ : s_; // assure \c t becomes the \em shortest
>>
>>      const D m = s.size();
>>      const D n = t.size();
>>
>>      if (m == 0) { return n; }
>>      if (n == 0) { return m; }
>>
>>      // Adapt the algorithm to use less space, O(3*min(n,m)) instead of
>> O(mn),
>>      // since it only requires that the previous row/column and current
>> row/column be stored at
>>      // any one time.
>> #ifdef HAVE_C99_VARIABLE_LENGTH_ARRAYS
>>      D cc_[n+1], pc_[n+1], sc_[n+1]; // current, previous and
>> second-previous column on stack
>> #elif HAVE_CXX11_UNIQUE_PTR
>>      std::unique_ptr<D[]> cc_(new D[n+1]);      // current column
>>      std::unique_ptr<D[]> pc_(new D[n+1]);      // previous column
>>      std::unique_ptr<D[]> sc_(new D[n+1]);      // second previous column
>> #else
>>      auto cc_ = new D[n+1];      // current column
>>      auto pc_ = new D[n+1];      // previous column
>>      auto sc_ = new D[n+1];      // second previous column
>>      //std::vector<D> cc_(n+1), pc_(n+1), sc_(n+1); // current, previous
>> and second previous column
>> #endif
>>      D * cc = &cc_[0], * pc = &pc_[0], * sc = &sc_[0]; // pointers for
>> efficient swapping
>>
>>      // initialize previous column
>>      for (D i = 0; i < n+1; ++i) { pc[i] = i * insert_weight; }
>>
>>      // second previous column \c sc will be defined in second \c i
>> iteration in outer-loop
>>
>>      const auto D_max = std::numeric_limits<D>::max();
>>
>>      // Computing the Levenshtein distance is based on the observation
>> that if we
>>      // reserve a matrix to hold the Levenshtein distances between all
>> prefixes
>>      // of the first string and all prefixes of the second, then we can
>> compute
>>      // the values in the matrix by flood filling the matrix, and thus
>> find the
>>      // distance between the two full strings as the last value computed.
>>      // This algorithm, an example of bottom-up dynamic programming, is
>>      // discussed, with variants, in the 1974 article The String-to-string
>>      // correction problem by Robert A. Wagner and Michael J.Fischer.
>>      for (D i = 0; i < m; ++i) {
>>          cc[0] = i+insert_weight;
>>          auto tmin = D_max; // row/column total min
>>          for (D j = 0; j < n; ++j) {
>>              // TODO Use sub_dist
>>              //auto sub_dist = damerau_levenshtein(s[i], t[j]); //
>> recurse if for example T is an std::vector<std::string>
>>              cc[j+1] = pnw::min(pc[j+1] + insert_weight,         //
>> insertion
>>                                 cc[j] + delete_weight,           //
>> deletion
>>                                 pc[j] + (s[i] == t[j] ? 0 :
>> replace_weight)); // substitution
>>
>>              // transposition
>>              if (not is_max(transposition_weight)) { // if transposition
>> should be allowed
>>                  if (i > 0 and j > 0 and // we need at least two
>> characters
>>                      s[i-1] == t[j] and  // and first must be equal second
>>                      s[i]   == t[j-1]    // and vice versa
>>                      ) {
>>                      cc[j+1] = std::min(cc[j+1],
>>                                         sc[j-1] + transposition_weight);
>>                  }
>>              }
>>
>>              if (not is_max(max_distance)) {
>>                  tmin = std::min(tmin, cc[j+1]);
>>              }
>>          }
>>
>>          if ((not is_max(max_distance)) and
>>              tmin >= max_distance) {
>>              // if no element is smaller than \p max_distance
>>              return max_distance;
>>          }
>>
>>          if (transposition_weight) {
>>              perm3_231(pc, cc, sc); // rotate pointers
>>          } else {
>>              std::swap(cc, pc);
>>          }
>>      }
>>
>> #if !(defined(HAVE_C99_VARIABLE_LENGTH_ARRAYS) ||
>> defined(HAVE_CXX11_UNIQUE_PTR))
>>      delete [] cc_;
>>      delete [] pc_;
>>      delete [] sc_;
>> #endif
>>      return pc[n];
>> }
>>
>>
>> /*! Get \em Levenshtein (Edit) Distance (LD) metric between the \em
>> sequences \p s and \p t.
>>   * Computing LD is also called Optimal String Alignment (OSA).
>>   */
>> template<class T, class D = size_t>
>> inline pure
>> D levenshtein(const T& s, const T& t,
>>                D max_distance = std::numeric_limits<D>::max(),
>>                D insert_weight = static_cast<D>(10),
>>                D delete_weight = static_cast<D>(7),
>>                D replace_weight = static_cast<D>(5))
>> {
>>      return damerau_levenshtein(s, t, max_distance, insert_weight,
>> delete_weight, replace_weight,
>>                                 std::numeric_limits<D>::max());
>> }
>>
>> /*! Get \em Levenshtein (Edit) Distance (LD) metric between the \em
>> arrays \p s and \p t.
>>   * Computing LD is also called Optimal String Alignment (OSA).
>>   */
>> template<class D = size_t>
>> inline pure
>> D levenshtein(const char * s, const char * t,
>>                D max_distance = std::numeric_limits<D>::max(),
>>                D insert_weight = static_cast<D>(10),
>>                D delete_weight = static_cast<D>(7),
>>                D replace_weight = static_cast<D>(5))
>> {
>>      return levenshtein(csc(s),
>>                         csc(t),
>>                         max_distance, insert_weight, delete_weight,
>> replace_weight);
>> }
>>
>> /* ---------------------------- Group Separator
>> ---------------------------- */
>>
>> template<class T, class D = size_t>
>> inline pure
>> D test_levenshtein_symmetry(const T& s, const T& t,
>>                              D max_distance =
>> std::numeric_limits<D>::max())
>> {
>>      D st = levenshtein(s, t, max_distance, static_cast<D>(1),static_cast<
>> D>(1),static_cast<D>(1));
>>      D ts = levenshtein(t, s, max_distance, static_cast<D>(1),static_cast<
>> D>(1),static_cast<D>(1));
>>      bool sym = (st == ts); // symmetry
>>      return sym ? st : std::numeric_limits<D>::max();
>> }
>>
>
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.puremagic.com/pipermail/digitalmars-d/attachments/20140522/8b70548c/attachment.html>


More information about the Digitalmars-d mailing list