/**This library implements a basic power-of-two fast Fourier transform (FFT)
 * and inverse FFT in relatively simple, readable, portable D code.  It's
 * not blindingly fast, but it's not unreasonably slow either.  A size 2 ^^ 20
 * double[] -> Complex!double FFT takes about 790 milliseconds on my Athlon XP
 * 5200+.  For comparison, the (probably heavily optimized and compiled with a
 * compiler that's better at optimizing floating point arithmetic) R statistics
 * package does it in about 610 milliseconds on the same machine.
 *
 * The algorithm used under the hood is the radix-2 Cooley-Tukey algorithm.
 *
 * References:  http://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm
 *
 * Copyright (C) 2010 David Simcha
 *
 * License:
 * Boost Software License - Version 1.0 - August 17th, 2003
 *
 * Permission is hereby granted, free of charge, to any person or organization
 * obtaining a copy of the software and accompanying documentation covered by
 * this license (the "Software") to use, reproduce, display, distribute,
 * execute, and transmit the Software, and to prepare derivative works of the
 * Software, and to permit third-parties to whom the Software is furnished to
 * do so, all subject to the following:
 *
 * The copyright notices in the Software and this entire statement, including
 * the above license grant, this restriction and the following disclaimer,
 * must be included in all copies of the Software, in whole or in part, and
 * all derivative works of the Software, unless such copies or derivative
 * works are solely in the form of machine-executable object code generated by
 * a source language processor.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

import std.math, std.stdio, std.contracts, std.range,
    std.perf, std.algorithm, std.complex, std.traits, std.random,
    std.intrinsic;

version(unittest) {
    void main() {}
}

void main() { doTiming(); }

void doTiming() {
    auto arr = new double[2 ^^ 20];
    foreach(ref elem; arr) {
        elem = uniform(-1.0, 1.0);
    }

    auto pc = new PerformanceCounter;
    pc.start;

    auto foo = fft(arr);

    pc.stop;
    writeln(pc.milliseconds, " ms for forward transform.");

    pc.start;
    auto bar = inverseFft(foo);
    pc.stop;
    writeln(pc.milliseconds, " ms for inverse transform.");
    enforce(approxEqual(map!"a.re"(bar), arr));
    writeln("Inverse is approx. equal original.");
}

/**Compute the Fourier transform of range using the O(N log N) Cooley-Tukey
 * Algorithm.  range must be a random-access range with a size
 * that is a power of two.  The contents of range can be either numeric types,
 * which will be interpreted as pure real values, or complex types with
 * properties or members .re and .im that can be read.
 *
 * Returns:  An array of complex numbers representing the transformed data in
 *           the frequency domain.
 */
Complex!F[] fft(F = double, R)(R range) {
    enforce(isPowerOfTwo(range.length),
        "Can only do FFTs on ranges with a size that is a power of two.");
    Complex!F[] ret;
    if(range.length == 0) {
        return ret;
    }

    ret.length = range.length;

    fft(range,  ret);
    return ret;
}

/**Same as the overload, but allows for the results to be stored in a user-
 * provided buffer.  The buffer must be of the same length as range, must be
 * a random-access range, must have slicing, and must contain elements that are
 * complex-like.  This means that they must have a .re and a .im member or
 * property that can be both read and written and is a floating point number.
 */
void fft(Ret, R)(R range, Ret buf)
if(isRandomAccessRange!Ret && isComplexLike!(ElementType!Ret) && hasSlicing!Ret) {
    enforce(buf.length == range.length);
    enforce(isPowerOfTwo(range.length),
        "Can only do FFTs on ranges with a size that is a power of two.");

    static if(is(R : Stride!R)) {
        return fftImpl(range, buf);
    } else {
        return fftImpl(Stride!R(range, 1), buf);
    }
}

/**Computes the inverse Fourier transform of a range.  The range must be a
 * random access range, have a power of two size, and contain
 * elements that are either of type std.complex.Complex or have essentially
 * the same compile-time interface.
 *
 * Returns:  The time-domain signal.
 */
Complex!F[] inverseFft(F = double, R)(R range)
if(isRandomAccessRange!R && isComplexLike!(ElementType!R)) {
    enforce(isPowerOfTwo(range.length),
        "Can only do inverse FFTs on ranges with a size that is a power of two.");
    Complex!F[] ret;
    if(range.length == 0) {
        return ret;
    }

    ret.length = range.length;
    inverseFft(range, ret);
    return ret;
}

/**Inverse FFT that allows a user-supplied buffer to be provided.  The buffer
 * must be a random access range with slicing, and its elements
 * must be some complex-like type.
 */
void inverseFft(Ret, R)(R range, Ret buf)
if(isRandomAccessRange!Ret && isComplexLike!(ElementType!Ret) && hasSlicing!Ret) {
    enforce(isPowerOfTwo(range.length),
        "Can only do inverse FFTs on ranges with a size that is a power of two.");

    // BUGS:  This should be done w/o copying to an array first.  On the next
    // version of DMD this will be do-able easily because new and improved Map
    // will be in std.algorithm.
    auto swapped = array(map!swapRealImag(range));
    fft(swapped,  buf);
    foreach(ref elem; buf) {
        elem = swapRealImag(elem) / buf.length;
    }
}

unittest {
    // Test values from R.
    auto arr = [1,2,3,4,5,6,7,8];
    auto fft1 = fft(arr);
    assert(approxEqual(map!"a.re"(fft1),
        [36.0, -4, -4, -4, -4, -4, -4, -4]));
    assert(approxEqual(map!"a.im"(fft1),
        [0, 9.6568, 4, 1.6568, 0, -1.6568, -4, -9.6568]));

    alias Complex!float C;
    auto arr2 = [C(1,2), C(3,4), C(5,6), C(7,8), C(9,10),
        C(11,12), C(13,14), C(15,16)];
    auto fft2 = fft(arr2);
    assert(approxEqual(map!"a.re"(fft2),
        [64.0, -27.3137, -16, -11.3137, -8, -4.6862, 0, 11.3137]));
    assert(approxEqual(map!"a.im"(fft2),
        [72, 11.3137, 0, -4.686, -8, -11.3137, -16, -27.3137]));

    auto inv1 = inverseFft(fft1);
    assert(approxEqual(map!"a.re"(inv1), arr));
    assert(reduce!max(map!"a.im"(inv1)) < 1e-10);

    auto inv2 = inverseFft(fft2);
    assert(approxEqual(map!"a.re"(inv2), map!"a.re"(arr2)));
    assert(approxEqual(map!"a.im"(inv2), map!"a.im"(arr2)));
}

// Swaps the real and imaginary parts of a complex number.  This is useful
// for inverse FFTs.
C swapRealImag(C)(C input) {
    return C(input.im, input.re);
}

private:

// The reason I couldn't use std.algorithm was b/c its stride length isn't
// modifiable on the fly.
struct Stride(R) {
    R range;
    size_t _nSteps;
    size_t _length;
    alias ElementType!(R) E;

    this(R range, size_t nStepsIn) {
        this.range = range;
       _nSteps = nStepsIn;
       _length = (range.length + _nSteps - 1) / nSteps;
    }

    size_t length() const @property {
        return _length;
    }

    E opIndex(size_t index) {
        return range[index * _nSteps];
    }

    E front() {
        return range[0];
    }

    void popFront() {
        static if(hasSlicing!R) {
            if(range.length >= _nSteps) {
                range = range[_nSteps..$];
                _length--;
            } else {
                range.length = 0;
                _length = 0;
            }
        } else {
            foreach(i; 0.._nSteps) {
                if(range.empty) {
                    break;
                }
                range.popFront();
            }

            _length--;
        }
    }

    bool empty() const @property {
        return length == 0;
    }

    size_t nSteps() const @property {
        return _nSteps;
    }

    size_t nSteps(size_t newVal) @property {
        _nSteps = newVal;
        _length = (range.length + _nSteps - 1) / nSteps;
        return newVal;
    }
}

///////////////////////////////////////////////////////////////////////////////
// The following code caches sine and cosine terms for small sub-FFTs.  For
// larger FFTs it's faster to compute these terms on the fly because
// of cache effects and because they're not used as often, but for smaller ones,
// we use CTFE and store pre-computed values in an immutable array.

// This cache size seems fairly optimial because:
//
// 1.  It will fit in most L1 data caches.
//     (It leads to a total cache size of 32K)
//
// 2.  DMD blows up if I go up to 8192 and uses several hundred megs of RAM.
//
// 3.  It seems empirically like bigger caches don't give much, if any, speedup.
enum nCache = 4096;

double[nCache] computeCache() pure nothrow {
    // This computes a negative sine cache by shifting sines by PI / 2.  It's
    // meant to be evaluated at compile time.
    double[nCache] ret;

    foreach(i; 0..ret.length) {
        ret[(i + nCache / 2) % nCache] = sin(i * 2 * PI / nCache);
    }
    return ret;
}

immutable double[nCache] trigCache = computeCache();

double negSinFromCache(size_t index) pure nothrow {
    return trigCache[index];
}

double cosFromCache(size_t index) pure nothrow {
    // cos is just -sin shifted by PI * 3 / 4.  Trusting the compiler to
    // optimize the readable but inefficient but clearly compiler-optimizable
    // index calculation.
    return trigCache[(index + nCache * 3 / 4) % nCache];
}

///////////////////////////////////////////////////////////////////////////////

void fftImpl(Ret, R)(Stride!R range, Ret buf) {
    if(range.length <= 2) {
        slowFourier2(range, buf);
        return;
    }

    auto left = range;
    auto right = range;
    right.popFront;
    left.nSteps = left.nSteps * 2;
    right.nSteps = right.nSteps * 2;
    assert(left.length == right.length);

    auto leftBuf = buf[0..$ / 2];
    auto rightBuf = buf[$ / 2..$];

    fftImpl(left, leftBuf);
    fftImpl(right, rightBuf);

    immutable halfLen = range.length / 2;
    void doLoop(bool useCache)() {
        static if(useCache) {
            assert(range.length <= nCache);
            immutable multiplier = nCache / range.length;
        } else {
            immutable step = 2.0 * PI / range.length;
        }

        // Using ptrdiff_t here b/c converting a signed int to a float is actually
        // measurably faster than converting an unsigned int to a float in this
        // context.
        foreach(ptrdiff_t k; 0..halfLen) {

            static if(useCache) {
                immutable cosTwiddle = cosFromCache(k * multiplier);
                immutable sinTwiddle = negSinFromCache(k * multiplier);
            } else {
                immutable loc = step * k;
                immutable cosTwiddle = cos(loc);
                immutable sinTwiddle = -sin(loc);
            }
            immutable upperIndex = k + halfLen;

            immutable realLower = buf[k].re;
            immutable imagLower = buf[k].im;
            immutable realUpper = buf[upperIndex].re;
            immutable imagUpper = buf[upperIndex].im;

            immutable realAdd = cosTwiddle * realUpper - sinTwiddle * imagUpper;
            immutable imagAdd = sinTwiddle * realUpper + cosTwiddle * imagUpper;

            buf[k].re += realAdd;
            buf[k].im += imagAdd;
            buf[upperIndex].re = realLower - realAdd;
            buf[upperIndex].im = imagLower - imagAdd;
        }
    }

    if(range.length > nCache) {
        doLoop!false();
    } else {
        doLoop!true();
    }
}

// Hard-coded base case for FFT of size 2.  This is actually a TON faster than
// using a generic slow DFT.
void slowFourier2(Ret, R)(R range, Complex!Ret[] buf) {
    buf[0] = range[0] + range[1];
    buf[1] = range[0] - range[1];
}

bool isPowerOfTwo(size_t num) {
    // BUGS:  std.intrinsic takes a uint, not a size_t.  Therefore, this
    //        won't work on 64-bit unless std.intrinsic is fixed.
    return bsr(num) == bsf(num);
}

template isComplexLike(T) {
    enum bool isComplexLike = is(typeof(T.init.re)) &&
        is(typeof(T.init.im));
}

unittest {
    static assert(isComplexLike!(Complex!double));
    static assert(!isComplexLike!(uint));
}