Unnecessary Large Memory Usage of Levenshtein Distance in Phobos
James Carr via Digitalmars-d
digitalmars-d at puremagic.com
Thu May 22 09:25:47 PDT 2014
I've begun work on this and my implementation in D passes all the
std.algorithm unit tests, but because it now uses a single array instead of
a matrix, path() no longer provides the correct answer.
I'm working on trying to amend it so that there is consistency.
On Thu, May 22, 2014 at 12:26 PM, Max Klyga via Digitalmars-d <
digitalmars-d at puremagic.com> wrote:
> You should make a pull request with this implementation adapted to
> std.algorithm interface
>
>
> On 2014-05-22 09:49:09 +0000, Nordlöw said:
>
> I justd discovered that the std.algorithm implementation of Levenshtein
>> Distance requires O(m*n) memory usage.
>>
>> This is not neccessary. I have a C++-implementation of
>> Damerau-Levenshtein that requires only O(3*min(m,n)). Is anybody interested
>> in discussion modifying std.algorithm to make use of this?
>>
>> Here's C++ implementation:
>>
>>
>> template<class T> inline void perm3_231(T& a, T& b, T& c) {
>> T _t=a; a=b; b=c; c=_t;
>> }
>> template<class T> inline pure bool is_min(const T& a) {
>> return a == pnw::minof<T>();
>> }
>> template<class T> inline pure bool is_max(const T& a) {
>> return a == pnw::maxof<T>();
>> }
>>
>> template<class T, class D = size_t>
>> inline pure
>> D damerau_levenshtein(const T& s_, const T& t_,
>> D max_distance = std::numeric_limits<D>::max(),
>> D insert_weight = static_cast<D>(10),
>> D delete_weight = static_cast<D>(7),
>> D replace_weight = static_cast<D>(5),
>> D transposition_weight = static_cast<D>(3))
>> {
>> // reorder s and t to minimize memory usage
>> bool ook = s_.size() >= t_.size(); // argument ordering ok flag
>> const T& s = ook ? s_ : t_; // assure \c s becomes the \em longest
>> const T& t = ook ? t_ : s_; // assure \c t becomes the \em shortest
>>
>> const D m = s.size();
>> const D n = t.size();
>>
>> if (m == 0) { return n; }
>> if (n == 0) { return m; }
>>
>> // Adapt the algorithm to use less space, O(3*min(n,m)) instead of
>> O(mn),
>> // since it only requires that the previous row/column and current
>> row/column be stored at
>> // any one time.
>> #ifdef HAVE_C99_VARIABLE_LENGTH_ARRAYS
>> D cc_[n+1], pc_[n+1], sc_[n+1]; // current, previous and
>> second-previous column on stack
>> #elif HAVE_CXX11_UNIQUE_PTR
>> std::unique_ptr<D[]> cc_(new D[n+1]); // current column
>> std::unique_ptr<D[]> pc_(new D[n+1]); // previous column
>> std::unique_ptr<D[]> sc_(new D[n+1]); // second previous column
>> #else
>> auto cc_ = new D[n+1]; // current column
>> auto pc_ = new D[n+1]; // previous column
>> auto sc_ = new D[n+1]; // second previous column
>> //std::vector<D> cc_(n+1), pc_(n+1), sc_(n+1); // current, previous
>> and second previous column
>> #endif
>> D * cc = &cc_[0], * pc = &pc_[0], * sc = &sc_[0]; // pointers for
>> efficient swapping
>>
>> // initialize previous column
>> for (D i = 0; i < n+1; ++i) { pc[i] = i * insert_weight; }
>>
>> // second previous column \c sc will be defined in second \c i
>> iteration in outer-loop
>>
>> const auto D_max = std::numeric_limits<D>::max();
>>
>> // Computing the Levenshtein distance is based on the observation
>> that if we
>> // reserve a matrix to hold the Levenshtein distances between all
>> prefixes
>> // of the first string and all prefixes of the second, then we can
>> compute
>> // the values in the matrix by flood filling the matrix, and thus
>> find the
>> // distance between the two full strings as the last value computed.
>> // This algorithm, an example of bottom-up dynamic programming, is
>> // discussed, with variants, in the 1974 article The String-to-string
>> // correction problem by Robert A. Wagner and Michael J.Fischer.
>> for (D i = 0; i < m; ++i) {
>> cc[0] = i+insert_weight;
>> auto tmin = D_max; // row/column total min
>> for (D j = 0; j < n; ++j) {
>> // TODO Use sub_dist
>> //auto sub_dist = damerau_levenshtein(s[i], t[j]); //
>> recurse if for example T is an std::vector<std::string>
>> cc[j+1] = pnw::min(pc[j+1] + insert_weight, //
>> insertion
>> cc[j] + delete_weight, //
>> deletion
>> pc[j] + (s[i] == t[j] ? 0 :
>> replace_weight)); // substitution
>>
>> // transposition
>> if (not is_max(transposition_weight)) { // if transposition
>> should be allowed
>> if (i > 0 and j > 0 and // we need at least two
>> characters
>> s[i-1] == t[j] and // and first must be equal second
>> s[i] == t[j-1] // and vice versa
>> ) {
>> cc[j+1] = std::min(cc[j+1],
>> sc[j-1] + transposition_weight);
>> }
>> }
>>
>> if (not is_max(max_distance)) {
>> tmin = std::min(tmin, cc[j+1]);
>> }
>> }
>>
>> if ((not is_max(max_distance)) and
>> tmin >= max_distance) {
>> // if no element is smaller than \p max_distance
>> return max_distance;
>> }
>>
>> if (transposition_weight) {
>> perm3_231(pc, cc, sc); // rotate pointers
>> } else {
>> std::swap(cc, pc);
>> }
>> }
>>
>> #if !(defined(HAVE_C99_VARIABLE_LENGTH_ARRAYS) ||
>> defined(HAVE_CXX11_UNIQUE_PTR))
>> delete [] cc_;
>> delete [] pc_;
>> delete [] sc_;
>> #endif
>> return pc[n];
>> }
>>
>>
>> /*! Get \em Levenshtein (Edit) Distance (LD) metric between the \em
>> sequences \p s and \p t.
>> * Computing LD is also called Optimal String Alignment (OSA).
>> */
>> template<class T, class D = size_t>
>> inline pure
>> D levenshtein(const T& s, const T& t,
>> D max_distance = std::numeric_limits<D>::max(),
>> D insert_weight = static_cast<D>(10),
>> D delete_weight = static_cast<D>(7),
>> D replace_weight = static_cast<D>(5))
>> {
>> return damerau_levenshtein(s, t, max_distance, insert_weight,
>> delete_weight, replace_weight,
>> std::numeric_limits<D>::max());
>> }
>>
>> /*! Get \em Levenshtein (Edit) Distance (LD) metric between the \em
>> arrays \p s and \p t.
>> * Computing LD is also called Optimal String Alignment (OSA).
>> */
>> template<class D = size_t>
>> inline pure
>> D levenshtein(const char * s, const char * t,
>> D max_distance = std::numeric_limits<D>::max(),
>> D insert_weight = static_cast<D>(10),
>> D delete_weight = static_cast<D>(7),
>> D replace_weight = static_cast<D>(5))
>> {
>> return levenshtein(csc(s),
>> csc(t),
>> max_distance, insert_weight, delete_weight,
>> replace_weight);
>> }
>>
>> /* ---------------------------- Group Separator
>> ---------------------------- */
>>
>> template<class T, class D = size_t>
>> inline pure
>> D test_levenshtein_symmetry(const T& s, const T& t,
>> D max_distance =
>> std::numeric_limits<D>::max())
>> {
>> D st = levenshtein(s, t, max_distance, static_cast<D>(1),static_cast<
>> D>(1),static_cast<D>(1));
>> D ts = levenshtein(t, s, max_distance, static_cast<D>(1),static_cast<
>> D>(1),static_cast<D>(1));
>> bool sym = (st == ts); // symmetry
>> return sym ? st : std::numeric_limits<D>::max();
>> }
>>
>
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.puremagic.com/pipermail/digitalmars-d/attachments/20140522/8b70548c/attachment.html>
More information about the Digitalmars-d
mailing list