/**This library implements a basic power-of-two fast Fourier transform (FFT) * and inverse FFT in relatively simple, readable, portable D code. It's * not blindingly fast, but it's not unreasonably slow either. A size 2 ^^ 20 * double[] -> Complex!double FFT takes about 790 milliseconds on my Athlon XP * 5200+. For comparison, the (probably heavily optimized and compiled with a * compiler that's better at optimizing floating point arithmetic) R statistics * package does it in about 610 milliseconds on the same machine. * * The algorithm used under the hood is the radix-2 Cooley-Tukey algorithm. * * References: http://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm * * Copyright (C) 2010 David Simcha * * License: * Boost Software License - Version 1.0 - August 17th, 2003 * * Permission is hereby granted, free of charge, to any person or organization * obtaining a copy of the software and accompanying documentation covered by * this license (the "Software") to use, reproduce, display, distribute, * execute, and transmit the Software, and to prepare derivative works of the * Software, and to permit third-parties to whom the Software is furnished to * do so, all subject to the following: * * The copyright notices in the Software and this entire statement, including * the above license grant, this restriction and the following disclaimer, * must be included in all copies of the Software, in whole or in part, and * all derivative works of the Software, unless such copies or derivative * works are solely in the form of machine-executable object code generated by * a source language processor. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ import std.math, std.stdio, std.contracts, std.range, std.perf, std.algorithm, std.complex, std.traits, std.random, std.intrinsic; version(unittest) { void main() {} } void main() { doTiming(); } void doTiming() { auto arr = new double[2 ^^ 20]; foreach(ref elem; arr) { elem = uniform(-1.0, 1.0); } auto pc = new PerformanceCounter; pc.start; auto foo = fft(arr); pc.stop; writeln(pc.milliseconds, " ms for forward transform."); pc.start; auto bar = inverseFft(foo); pc.stop; writeln(pc.milliseconds, " ms for inverse transform."); enforce(approxEqual(map!"a.re"(bar), arr)); writeln("Inverse is approx. equal original."); } /**Compute the Fourier transform of range using the O(N log N) Cooley-Tukey * Algorithm. range must be a random-access range with a size * that is a power of two. The contents of range can be either numeric types, * which will be interpreted as pure real values, or complex types with * properties or members .re and .im that can be read. * * Returns: An array of complex numbers representing the transformed data in * the frequency domain. */ Complex!F[] fft(F = double, R)(R range) { enforce(isPowerOfTwo(range.length), "Can only do FFTs on ranges with a size that is a power of two."); Complex!F[] ret; if(range.length == 0) { return ret; } ret.length = range.length; fft(range, ret); return ret; } /**Same as the overload, but allows for the results to be stored in a user- * provided buffer. The buffer must be of the same length as range, must be * a random-access range, must have slicing, and must contain elements that are * complex-like. This means that they must have a .re and a .im member or * property that can be both read and written and is a floating point number. */ void fft(Ret, R)(R range, Ret buf) if(isRandomAccessRange!Ret && isComplexLike!(ElementType!Ret) && hasSlicing!Ret) { enforce(buf.length == range.length); enforce(isPowerOfTwo(range.length), "Can only do FFTs on ranges with a size that is a power of two."); static if(is(R : Stride!R)) { return fftImpl(range, buf); } else { return fftImpl(Stride!R(range, 1), buf); } } /**Computes the inverse Fourier transform of a range. The range must be a * random access range, have a power of two size, and contain * elements that are either of type std.complex.Complex or have essentially * the same compile-time interface. * * Returns: The time-domain signal. */ Complex!F[] inverseFft(F = double, R)(R range) if(isRandomAccessRange!R && isComplexLike!(ElementType!R)) { enforce(isPowerOfTwo(range.length), "Can only do inverse FFTs on ranges with a size that is a power of two."); Complex!F[] ret; if(range.length == 0) { return ret; } ret.length = range.length; inverseFft(range, ret); return ret; } /**Inverse FFT that allows a user-supplied buffer to be provided. The buffer * must be a random access range with slicing, and its elements * must be some complex-like type. */ void inverseFft(Ret, R)(R range, Ret buf) if(isRandomAccessRange!Ret && isComplexLike!(ElementType!Ret) && hasSlicing!Ret) { enforce(isPowerOfTwo(range.length), "Can only do inverse FFTs on ranges with a size that is a power of two."); // BUGS: This should be done w/o copying to an array first. On the next // version of DMD this will be do-able easily because new and improved Map // will be in std.algorithm. auto swapped = array(map!swapRealImag(range)); fft(swapped, buf); foreach(ref elem; buf) { elem = swapRealImag(elem) / buf.length; } } unittest { // Test values from R. auto arr = [1,2,3,4,5,6,7,8]; auto fft1 = fft(arr); assert(approxEqual(map!"a.re"(fft1), [36.0, -4, -4, -4, -4, -4, -4, -4])); assert(approxEqual(map!"a.im"(fft1), [0, 9.6568, 4, 1.6568, 0, -1.6568, -4, -9.6568])); alias Complex!float C; auto arr2 = [C(1,2), C(3,4), C(5,6), C(7,8), C(9,10), C(11,12), C(13,14), C(15,16)]; auto fft2 = fft(arr2); assert(approxEqual(map!"a.re"(fft2), [64.0, -27.3137, -16, -11.3137, -8, -4.6862, 0, 11.3137])); assert(approxEqual(map!"a.im"(fft2), [72, 11.3137, 0, -4.686, -8, -11.3137, -16, -27.3137])); auto inv1 = inverseFft(fft1); assert(approxEqual(map!"a.re"(inv1), arr)); assert(reduce!max(map!"a.im"(inv1)) < 1e-10); auto inv2 = inverseFft(fft2); assert(approxEqual(map!"a.re"(inv2), map!"a.re"(arr2))); assert(approxEqual(map!"a.im"(inv2), map!"a.im"(arr2))); } // Swaps the real and imaginary parts of a complex number. This is useful // for inverse FFTs. C swapRealImag(C)(C input) { return C(input.im, input.re); } private: // The reason I couldn't use std.algorithm was b/c its stride length isn't // modifiable on the fly. struct Stride(R) { R range; size_t _nSteps; size_t _length; alias ElementType!(R) E; this(R range, size_t nStepsIn) { this.range = range; _nSteps = nStepsIn; _length = (range.length + _nSteps - 1) / nSteps; } size_t length() const @property { return _length; } E opIndex(size_t index) { return range[index * _nSteps]; } E front() { return range[0]; } void popFront() { static if(hasSlicing!R) { if(range.length >= _nSteps) { range = range[_nSteps..$]; _length--; } else { range.length = 0; _length = 0; } } else { foreach(i; 0.._nSteps) { if(range.empty) { break; } range.popFront(); } _length--; } } bool empty() const @property { return length == 0; } size_t nSteps() const @property { return _nSteps; } size_t nSteps(size_t newVal) @property { _nSteps = newVal; _length = (range.length + _nSteps - 1) / nSteps; return newVal; } } /////////////////////////////////////////////////////////////////////////////// // The following code caches sine and cosine terms for small sub-FFTs. For // larger FFTs it's faster to compute these terms on the fly because // of cache effects and because they're not used as often, but for smaller ones, // we use CTFE and store pre-computed values in an immutable array. // This cache size seems fairly optimial because: // // 1. It will fit in most L1 data caches. // (It leads to a total cache size of 32K) // // 2. DMD blows up if I go up to 8192 and uses several hundred megs of RAM. // // 3. It seems empirically like bigger caches don't give much, if any, speedup. enum nCache = 4096; double[nCache] computeCache() pure nothrow { // This computes a negative sine cache by shifting sines by PI / 2. It's // meant to be evaluated at compile time. double[nCache] ret; foreach(i; 0..ret.length) { ret[(i + nCache / 2) % nCache] = sin(i * 2 * PI / nCache); } return ret; } immutable double[nCache] trigCache = computeCache(); double negSinFromCache(size_t index) pure nothrow { return trigCache[index]; } double cosFromCache(size_t index) pure nothrow { // cos is just -sin shifted by PI * 3 / 4. Trusting the compiler to // optimize the readable but inefficient but clearly compiler-optimizable // index calculation. return trigCache[(index + nCache * 3 / 4) % nCache]; } /////////////////////////////////////////////////////////////////////////////// void fftImpl(Ret, R)(Stride!R range, Ret buf) { if(range.length <= 2) { slowFourier2(range, buf); return; } auto left = range; auto right = range; right.popFront; left.nSteps = left.nSteps * 2; right.nSteps = right.nSteps * 2; assert(left.length == right.length); auto leftBuf = buf[0..$ / 2]; auto rightBuf = buf[$ / 2..$]; fftImpl(left, leftBuf); fftImpl(right, rightBuf); immutable halfLen = range.length / 2; void doLoop(bool useCache)() { static if(useCache) { assert(range.length <= nCache); immutable multiplier = nCache / range.length; } else { immutable step = 2.0 * PI / range.length; } // Using ptrdiff_t here b/c converting a signed int to a float is actually // measurably faster than converting an unsigned int to a float in this // context. foreach(ptrdiff_t k; 0..halfLen) { static if(useCache) { immutable cosTwiddle = cosFromCache(k * multiplier); immutable sinTwiddle = negSinFromCache(k * multiplier); } else { immutable loc = step * k; immutable cosTwiddle = cos(loc); immutable sinTwiddle = -sin(loc); } immutable upperIndex = k + halfLen; immutable realLower = buf[k].re; immutable imagLower = buf[k].im; immutable realUpper = buf[upperIndex].re; immutable imagUpper = buf[upperIndex].im; immutable realAdd = cosTwiddle * realUpper - sinTwiddle * imagUpper; immutable imagAdd = sinTwiddle * realUpper + cosTwiddle * imagUpper; buf[k].re += realAdd; buf[k].im += imagAdd; buf[upperIndex].re = realLower - realAdd; buf[upperIndex].im = imagLower - imagAdd; } } if(range.length > nCache) { doLoop!false(); } else { doLoop!true(); } } // Hard-coded base case for FFT of size 2. This is actually a TON faster than // using a generic slow DFT. void slowFourier2(Ret, R)(R range, Complex!Ret[] buf) { buf[0] = range[0] + range[1]; buf[1] = range[0] - range[1]; } bool isPowerOfTwo(size_t num) { // BUGS: std.intrinsic takes a uint, not a size_t. Therefore, this // won't work on 64-bit unless std.intrinsic is fixed. return bsr(num) == bsf(num); } template isComplexLike(T) { enum bool isComplexLike = is(typeof(T.init.re)) && is(typeof(T.init.im)); } unittest { static assert(isComplexLike!(Complex!double)); static assert(!isComplexLike!(uint)); }