/**This library implements a basic out-of-place power-of-two fast Fourier * transform (FFT) and inverse FFT in relatively simple, readable, portable * D code. It's not blindingly fast, but it's not unreasonably slow either. * A size 2 ^^ 20 double[] -> Complex!double FFT takes about 340 milliseconds * on my Athlon XP 5200+, plus about 70 milliseconds to build the lookup table, * which can be amortized over several FFTs. For comparison, the R statistics * package takes about 610 milliseconds for a size 2 ^^ 20 pure real FFT. * FFTW takes an amazing 53 milliseconds to do it, but takes 137 seconds (!) * to construct its plan, though the plan construction can be amortized over * several FFTs of the same size. * * The algorithm used under the hood is the radix-2 Cooley-Tukey algorithm. * * References: http://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm * * Copyright (C) 2010 David Simcha * * License: * Boost Software License - Version 1.0 - August 17th, 2003 * * Permission is hereby granted, free of charge, to any person or organization * obtaining a copy of the software and accompanying documentation covered by * this license (the "Software") to use, reproduce, display, distribute, * execute, and transmit the Software, and to prepare derivative works of the * Software, and to permit third-parties to whom the Software is furnished to * do so, all subject to the following: * * The copyright notices in the Software and this entire statement, including * the above license grant, this restriction and the following disclaimer, * must be included in all copies of the Software, in whole or in part, and * all derivative works of the Software, unless such copies or derivative * works are solely in the form of machine-executable object code generated by * a source language processor. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ import std.math, std.stdio, std.contracts, std.range, std.perf, std.algorithm, std.complex, std.traits, std.random, std.intrinsic, core.memory, core.cpuid, std.conv; void main() { doTiming(); } void doTiming() { auto pc = new PerformanceCounter; alias double fft_t; // Allows easy playing w/ precision effects. auto arr = new fft_t[2^^20]; foreach(ref elem; arr) { elem = uniform(-1.0, 1.0); } pc.start; auto fftObj = new Fft(arr.length); pc.stop; writeln(pc.milliseconds, " ms for lookup table construction."); pc.start; Complex!fft_t[] foo; foo = fftObj.fft!fft_t(arr); pc.stop; writeln(pc.milliseconds, " ms for forward transform."); pc.start; auto bar = fftObj.inverseFft!fft_t(foo); pc.stop; writeln(pc.milliseconds, " ms for inverse transform."); enforce(approxEqual(map!"a.re"(bar), arr)); writeln("Inverse is approx. equal original."); } /**A class for performing fast Fourier transforms. This class encapsulates * a large amount of state that is reusable when performing several FFTs of * the same size. This makes performing numerous FFTs of the same size faster * than a free function API would allow. However, a free function API is * provided for convenience if you need to perform a one-off FFT. */ final class Fft { private: // This is to make tweaking the speed/size vs. accuracy tradeoff easy, // though floats seem accurate enough for all practical purposes, since // they pass the "approxEqual(inverseFft(fft(arr)), arr)" test even for // size 2 ^^ 22. alias float lookup_t; immutable lookup_t[][] negSinLookup; void enforceSize(R)(R range) const { enforce(range.length == size, text( "FFT size mismatch. Expected ", size, ", got ", range.length)); } void fftImpl(Ret, R)(Stride!R range, Ret buf) const { assert(range.length >= 2); if(range.length == 2) { slowFourier2(range, buf); return; } assert(isPowerOfTwo(range.length)); immutable localLookup = negSinLookup[bsr(range.length)]; assert(localLookup.length == range.length); immutable cosMask = range.length - 1; immutable cosAdd = range.length * 3 / 4; lookup_t negSinFromLookup(size_t index) pure nothrow { return localLookup[index]; } lookup_t cosFromLookup(size_t index) pure nothrow { // cos is just -sin shifted by PI * 3 / 2. return localLookup[(index + cosAdd) & cosMask]; } auto left = range; auto right = range; right.popFront; left.nSteps = left.nSteps * 2; right.nSteps = right.nSteps * 2; assert(left.length == right.length); auto leftBuf = buf[0..$ / 2]; auto rightBuf = buf[$ / 2..$]; fftImpl(left, leftBuf); fftImpl(right, rightBuf); immutable halfLen = range.length / 2; foreach(k; 0..halfLen) { immutable cosTwiddle = cosFromLookup(k); immutable sinTwiddle = negSinFromLookup(k); immutable realLower = buf[k].re; immutable imagLower = buf[k].im; immutable upperIndex = k + halfLen; immutable realUpper = buf[upperIndex].re; immutable imagUpper = buf[upperIndex].im; immutable realAdd = cosTwiddle * realUpper - sinTwiddle * imagUpper; immutable imagAdd = sinTwiddle * realUpper + cosTwiddle * imagUpper; buf[k].re += realAdd; buf[k].im += imagAdd; buf[upperIndex].re = realLower - realAdd; buf[upperIndex].im = imagLower - imagAdd; } } public: /**Create an Fft object for computing fast Fourier transforms of the * provided size. */ this(size_t size) { /* Create a lookup table of all negative sine values at a resolution of * size and all smaller power of two resolutions. This may seem * inefficient, but having all the lookups be next to each other in * memory at every level of iteration is a huge win performance-wise. */ if(size == 0) { return; } enforce(isPowerOfTwo(size), "Can only do FFTs on ranges with a size that is a power of two."); auto table = new lookup_t[][bsf(size) + 1]; table[$ - 1] = (cast(lookup_t*) GC.malloc(lookup_t.sizeof * size, GC.BlkAttr.NO_SCAN)) [0..size]; immutable multiplier = 1.0 / size; auto lastRow = table[$ - 1]; lastRow[0] = 0; // -sin(0) == 0. foreach(ptrdiff_t i; 1..size) { // The hard coded cases are for improved accuracy and to prevent // annoying non-zeroness when stuff should be zero. if(i == size / 4) { lastRow[i] = -1; // -sin(pi / 2) == -1. } else if(i == size / 2) { lastRow[i] = 0; // -sin(pi) == 0. } else if(i == size * 3 / 4) { lastRow[i] = 1; // -sin(pi * 3 / 2) == 1 } else { lastRow[i] = -sin(multiplier * i * 2.0 * PI); } } // Fill in all the other rows with strided versions. foreach(i; 1..table.length - 1) { immutable strideLength = size / (2 ^^ i); table[i] = array( Stride!(lookup_t[])(lastRow, strideLength)); } negSinLookup = cast(immutable) table; } @property size_t size() const { return (negSinLookup is null) ? 0 : negSinLookup[$ - 1].length; } /**Compute the Fourier transform of range using the O(N log N) Cooley-Tukey * Algorithm. range must be a random-access range with slicing and a size * that is a power of two. The contents of range can be either numeric types, * which will be interpreted as pure real values, or complex types with * properties or members .re and .im that can be read. * * Returns: An array of complex numbers representing the transformed data in * the frequency domain. */ Complex!F[] fft(F = double, R)(R range) const { enforceSize(range); Complex!F[] ret; if(range.length == 0) { return ret; } // Don't waste time initializing the memory for ret. ret = (cast(Complex!(F)*) GC.malloc(range.length * (Complex!(F)).sizeof, GC.BlkAttr.NO_SCAN))[0..range.length]; fft(range, ret); return ret; } /**Same as the overload, but allows for the results to be stored in a user- * provided buffer. The buffer must be of the same length as range, must be * a random-access range, must have slicing, and must contain elements that are * complex-like. This means that they must have a .re and a .im member or * property that can be both read and written and is a floating point number. */ void fft(Ret, R)(R range, Ret buf) const if(isRandomAccessRange!Ret && isComplexLike!(ElementType!Ret) && hasSlicing!Ret) { enforce(buf.length == range.length); enforceSize(range); if(range.length == 0) { return; } else if(range.length == 1) { buf[0] = range[0]; return; } else if(range.length == 2) { slowFourier2(range, buf); return; } else { static if(is(R : Stride!R)) { return fftImpl(range, buf); } else { return fftImpl(Stride!R(range, 1), buf); } } } /**Computes the inverse Fourier transform of a range. The range must be a * random access range with slicing, have a power of two size, and contain * elements that are either of type std.complex.Complex or have essentially * the same compile-time interface. * * Returns: The time-domain signal. */ Complex!F[] inverseFft(F = double, R)(R range) const if(isRandomAccessRange!R && isComplexLike!(ElementType!R)) { enforceSize(range); Complex!F[] ret; if(range.length == 0) { return ret; } // Don't waste time initializing the memory for ret. ret = (cast(Complex!(F)*) GC.malloc(range.length * (Complex!(F)).sizeof, GC.BlkAttr.NO_SCAN))[0..range.length]; inverseFft(range, ret); return ret; } /**Inverse FFT that allows a user-supplied buffer to be provided. The buffer * must be a random access range with slicing, and its elements * must be some complex-like type. */ void inverseFft(Ret, R)(R range, Ret buf) const if(isRandomAccessRange!Ret && isComplexLike!(ElementType!Ret) && hasSlicing!Ret) { enforceSize(range); // BUGS: This should be done w/o copying to an array first. On the next // version of DMD this will be do-able easily because new and improved Map // will be in std.algorithm. auto swapped = array(map!swapRealImag(range)); fft(swapped, buf); immutable lenNeg1 = 1.0 / buf.length; foreach(ref elem; buf) { elem = swapRealImag(elem) * lenNeg1; } } } /**Convenience functions that creates an Fft object, runs the FFT or inverse * FFT and return the result. Useful for one-off FFTs. */ Complex!F[] fft(F = double, R)(R range) { return (new Fft(range.length)).fft!(F, R)(range); } /// void fft(Ret, R)(R range, Ret buf) { return (new Fft(range.length)).fft!(Ret, R)(range, buf); } /// Complex!F[] inverseFft(F = double, R)(R range) { return (new Fft(range.length)).inverseFft!(F, R)(range); } /// void inverseFft(Ret, R)(R range, Ret buf) { return (new Fft(range.length)).inverseFft!(Ret, R)(range, buf); } unittest { // Test values from R. auto arr = [1,2,3,4,5,6,7,8]; auto fft1 = fft(arr); assert(approxEqual(map!"a.re"(fft1), [36.0, -4, -4, -4, -4, -4, -4, -4])); assert(approxEqual(map!"a.im"(fft1), [0, 9.6568, 4, 1.6568, 0, -1.6568, -4, -9.6568])); alias Complex!float C; auto arr2 = [C(1,2), C(3,4), C(5,6), C(7,8), C(9,10), C(11,12), C(13,14), C(15,16)]; auto fft2 = fft(arr2); assert(approxEqual(map!"a.re"(fft2), [64.0, -27.3137, -16, -11.3137, -8, -4.6862, 0, 11.3137])); assert(approxEqual(map!"a.im"(fft2), [72, 11.3137, 0, -4.686, -8, -11.3137, -16, -27.3137])); auto inv1 = inverseFft(fft1); assert(approxEqual(map!"a.re"(inv1), arr)); assert(reduce!max(map!"a.im"(inv1)) < 1e-10); auto inv2 = inverseFft(fft2); assert(approxEqual(map!"a.re"(inv2), map!"a.re"(arr2))); assert(approxEqual(map!"a.im"(inv2), map!"a.im"(arr2))); } // Swaps the real and imaginary parts of a complex number. This is useful // for inverse FFTs. C swapRealImag(C)(C input) { return C(input.im, input.re); } private: // The reason I couldn't use std.algorithm was b/c its stride length isn't // modifiable on the fly. struct Stride(R) { R range; size_t _nSteps; size_t _length; alias ElementType!(R) E; this(R range, size_t nStepsIn) { this.range = range; _nSteps = nStepsIn; _length = (range.length + _nSteps - 1) / nSteps; } size_t length() const @property { return _length; } typeof(this) save() @property { auto ret = this; ret.range = ret.range.save; return ret; } E opIndex(size_t index) { return range[index * _nSteps]; } E front() { return range[0]; } void popFront() { if(range.length >= _nSteps) { range = range[_nSteps..$]; _length--; } else { range.length = 0; _length = 0; } } bool empty() const @property { return length == 0; } size_t nSteps() const @property { return _nSteps; } size_t nSteps(size_t newVal) @property { _nSteps = newVal; _length = (range.length + _nSteps - 1) / nSteps; return newVal; } } // Hard-coded base case for FFT of size 2. This is actually a TON faster than // using a generic slow DFT. This seems to be the best base case. (Size 1 // can be coded inline as buf[0] = range[0]. void slowFourier2(Ret, R)(R range, Ret buf) { assert(range.length == 2); assert(buf.length == 2); buf[0] = range[0] + range[1]; buf[1] = range[0] - range[1]; } // Hard-coded base case for FFT of size 4. Doesn't work as well as the size // 2 case. void slowFourier4(Ret, R)(R range, Ret buf) { alias ElementType!Ret C; assert(range.length == 4); assert(buf.length == 4); buf[0] = range[0] + range[1] + range[2] + range[3]; buf[1] = range[0] - range[1] * C(0, 1) - range[2] + range[3] * C(0, 1); buf[2] = range[0] - range[1] + range[2] - range[3]; buf[3] = range[0] + range[1] * C(0, 1) - range[2] - range[3] * C(0, 1); } bool isPowerOfTwo(size_t num) { // BUGS: std.intrinsic takes a uint, not a size_t. Therefore, this // won't work on 64-bit unless std.intrinsic is fixed. return bsr(num) == bsf(num); } size_t roundDownToPowerOf2(size_t num) { return num & (1 << bsr(num)); } unittest { assert(roundDownToPowerOf2(7) == 4); assert(roundDownToPowerOf2(4) == 4); } template isComplexLike(T) { enum bool isComplexLike = is(typeof(T.init.re)) && is(typeof(T.init.im)); } unittest { static assert(isComplexLike!(Complex!double)); static assert(!isComplexLike!(uint)); }