Unnecessary Large Memory Usage of Levenshtein Distance in Phobos
Max Klyga via Digitalmars-d
digitalmars-d at puremagic.com
Thu May 22 04:26:23 PDT 2014
You should make a pull request with this implementation adapted to
std.algorithm interface
On 2014-05-22 09:49:09 +0000, Nordlöw said:
> I justd discovered that the std.algorithm implementation of Levenshtein
> Distance requires O(m*n) memory usage.
>
> This is not neccessary. I have a C++-implementation of
> Damerau-Levenshtein that requires only O(3*min(m,n)). Is anybody
> interested in discussion modifying std.algorithm to make use of this?
>
> Here's C++ implementation:
>
>
> template<class T> inline void perm3_231(T& a, T& b, T& c) {
> T _t=a; a=b; b=c; c=_t;
> }
> template<class T> inline pure bool is_min(const T& a) {
> return a == pnw::minof<T>();
> }
> template<class T> inline pure bool is_max(const T& a) {
> return a == pnw::maxof<T>();
> }
>
> template<class T, class D = size_t>
> inline pure
> D damerau_levenshtein(const T& s_, const T& t_,
> D max_distance = std::numeric_limits<D>::max(),
> D insert_weight = static_cast<D>(10),
> D delete_weight = static_cast<D>(7),
> D replace_weight = static_cast<D>(5),
> D transposition_weight = static_cast<D>(3))
> {
> // reorder s and t to minimize memory usage
> bool ook = s_.size() >= t_.size(); // argument ordering ok flag
> const T& s = ook ? s_ : t_; // assure \c s becomes the \em longest
> const T& t = ook ? t_ : s_; // assure \c t becomes the \em shortest
>
> const D m = s.size();
> const D n = t.size();
>
> if (m == 0) { return n; }
> if (n == 0) { return m; }
>
> // Adapt the algorithm to use less space, O(3*min(n,m)) instead of O(mn),
> // since it only requires that the previous row/column and current
> row/column be stored at
> // any one time.
> #ifdef HAVE_C99_VARIABLE_LENGTH_ARRAYS
> D cc_[n+1], pc_[n+1], sc_[n+1]; // current, previous and
> second-previous column on stack
> #elif HAVE_CXX11_UNIQUE_PTR
> std::unique_ptr<D[]> cc_(new D[n+1]); // current column
> std::unique_ptr<D[]> pc_(new D[n+1]); // previous column
> std::unique_ptr<D[]> sc_(new D[n+1]); // second previous column
> #else
> auto cc_ = new D[n+1]; // current column
> auto pc_ = new D[n+1]; // previous column
> auto sc_ = new D[n+1]; // second previous column
> //std::vector<D> cc_(n+1), pc_(n+1), sc_(n+1); // current,
> previous and second previous column
> #endif
> D * cc = &cc_[0], * pc = &pc_[0], * sc = &sc_[0]; // pointers for
> efficient swapping
>
> // initialize previous column
> for (D i = 0; i < n+1; ++i) { pc[i] = i * insert_weight; }
>
> // second previous column \c sc will be defined in second \c i
> iteration in outer-loop
>
> const auto D_max = std::numeric_limits<D>::max();
>
> // Computing the Levenshtein distance is based on the observation
> that if we
> // reserve a matrix to hold the Levenshtein distances between all prefixes
> // of the first string and all prefixes of the second, then we can compute
> // the values in the matrix by flood filling the matrix, and thus find the
> // distance between the two full strings as the last value computed.
> // This algorithm, an example of bottom-up dynamic programming, is
> // discussed, with variants, in the 1974 article The String-to-string
> // correction problem by Robert A. Wagner and Michael J.Fischer.
> for (D i = 0; i < m; ++i) {
> cc[0] = i+insert_weight;
> auto tmin = D_max; // row/column total min
> for (D j = 0; j < n; ++j) {
> // TODO Use sub_dist
> //auto sub_dist = damerau_levenshtein(s[i], t[j]); //
> recurse if for example T is an std::vector<std::string>
> cc[j+1] = pnw::min(pc[j+1] + insert_weight, // insertion
> cc[j] + delete_weight, // deletion
> pc[j] + (s[i] == t[j] ? 0 :
> replace_weight)); // substitution
>
> // transposition
> if (not is_max(transposition_weight)) { // if
> transposition should be allowed
> if (i > 0 and j > 0 and // we need at least two characters
> s[i-1] == t[j] and // and first must be equal second
> s[i] == t[j-1] // and vice versa
> ) {
> cc[j+1] = std::min(cc[j+1],
> sc[j-1] + transposition_weight);
> }
> }
>
> if (not is_max(max_distance)) {
> tmin = std::min(tmin, cc[j+1]);
> }
> }
>
> if ((not is_max(max_distance)) and
> tmin >= max_distance) {
> // if no element is smaller than \p max_distance
> return max_distance;
> }
>
> if (transposition_weight) {
> perm3_231(pc, cc, sc); // rotate pointers
> } else {
> std::swap(cc, pc);
> }
> }
>
> #if !(defined(HAVE_C99_VARIABLE_LENGTH_ARRAYS) ||
> defined(HAVE_CXX11_UNIQUE_PTR))
> delete [] cc_;
> delete [] pc_;
> delete [] sc_;
> #endif
> return pc[n];
> }
>
>
> /*! Get \em Levenshtein (Edit) Distance (LD) metric between the \em
> sequences \p s and \p t.
> * Computing LD is also called Optimal String Alignment (OSA).
> */
> template<class T, class D = size_t>
> inline pure
> D levenshtein(const T& s, const T& t,
> D max_distance = std::numeric_limits<D>::max(),
> D insert_weight = static_cast<D>(10),
> D delete_weight = static_cast<D>(7),
> D replace_weight = static_cast<D>(5))
> {
> return damerau_levenshtein(s, t, max_distance, insert_weight,
> delete_weight, replace_weight,
> std::numeric_limits<D>::max());
> }
>
> /*! Get \em Levenshtein (Edit) Distance (LD) metric between the \em
> arrays \p s and \p t.
> * Computing LD is also called Optimal String Alignment (OSA).
> */
> template<class D = size_t>
> inline pure
> D levenshtein(const char * s, const char * t,
> D max_distance = std::numeric_limits<D>::max(),
> D insert_weight = static_cast<D>(10),
> D delete_weight = static_cast<D>(7),
> D replace_weight = static_cast<D>(5))
> {
> return levenshtein(csc(s),
> csc(t),
> max_distance, insert_weight, delete_weight,
> replace_weight);
> }
>
> /* ---------------------------- Group Separator ---------------------------- */
>
> template<class T, class D = size_t>
> inline pure
> D test_levenshtein_symmetry(const T& s, const T& t,
> D max_distance = std::numeric_limits<D>::max())
> {
> D st = levenshtein(s, t, max_distance,
> static_cast<D>(1),static_cast<D>(1),static_cast<D>(1));
> D ts = levenshtein(t, s, max_distance,
> static_cast<D>(1),static_cast<D>(1),static_cast<D>(1));
> bool sym = (st == ts); // symmetry
> return sym ? st : std::numeric_limits<D>::max();
> }
More information about the Digitalmars-d
mailing list