1 // Fast integer multiplication using FFT over the complex numbers.
2 // [Donald Ervin Knuth: The Art of Computer Programming, Vol. II:
3 // Seminumerical Algorithms, second edition. Section 4.3.3, p. 290-294.]
4 // Bruno Haible 6.5.1996, 24.-25.8.1996
6 // FFT in the complex domain has the drawback that it needs careful round-off
7 // error analysis. But for CPUs with good floating-point performance it might
8 // nevertheless be better than FFT mod Z/mZ.
10 // It is important to have precise round-off error estimates. Although
11 // (by laws of statistics) in the average the actual round-off error makes up
12 // only half of the bits provided for round-off protection, we cannot rely
13 // on this average behaviour, but have to produce correct results.
15 // Knuth's formula (42), p. 294, says:
16 // If we want to multiply l-bit words using an FFT(2^k), our floating point
17 // numbers shall have m >= 2(k+l) + k + log_2 k + 3.5 mantissa bits.
19 // Here is a more careful analysis, using absolute error estimates.
21 // 1. We assume floating point numbers with radix 2, with the properties:
22 // (i) Multiplication with 2^n and 2^-n is exact.
23 // (ii) Negation x -> -x is exact.
24 // (iii) Addition: When adding x and y, with |x| <= 2^a, |y| <= 2^a,
25 // the result |x+y| <= 2^(a+1) has an error <= e*2^(a+1-m).
26 // (iv) Multiplication: When multiplying x and y, with |x| <= 2^a,
27 // |y| <= 2^b, the result |x*y| <= 2^(a+b) has an error <= e*2^(a+b-m).
28 // Here e = 1 for a truncating arithmetic, but e = 1/2 for a rounding
29 // arithmetic like IEEE single and double floats.
30 // 2. Let's introduce some notation: err(x) means |x'-x| where x is the
31 // exact mathematical value and x' is its representation in the machine.
32 // 3. From 1. we get for real numbers x,y:
33 // (i) err(2^n*x) = 2^n * err(x),
34 // (ii) err(-x) = err(x),
35 // (iii) |x| <= 2^a, |y| <= 2^a, then
36 // err(x+y) <= err(x) + err(y) + e*2^(a+1-m),
37 // [or .... ............... + e*2^(a-m) if |x+y| <= 2^a].
38 // (iv) |x| <= 2^a, |y| <= 2^b, then
39 // err(x*y) <= 2^a * err(y) + 2^b * err(x) + e*2^(a+b-m).
40 // 4. Our complex arithmetic will be based on the formulas:
41 // (i) 2^n*(x+iy) = (2^n*x)+i(2^n*y)
42 // (ii) -(x+iy) = (-x)+i(-y)
43 // (iii) (x+iy)+(u+iv) = (x+u)+i(y+v)
44 // (iv) (x+iy)*(u+iv) = (x*u-y*v)+i(x*v+y*u)
45 // The notation err(z) means |z'-z|, as above, with |.| being the usual
46 // absolute value on complex numbers (_not_ the L^1 norm).
47 // 5. From 3. and 4. we get for complex numbers x,y:
48 // (i) err(2^n*x) = 2^n * err(x),
49 // (ii) err(-x) = err(x),
50 // (iii) |x| <= 2^a, |y| <= 2^a, then
51 // err(x+y) <= err(x) + err(y) + e*2^(a+3/2-m),
52 // (iv) |x| <= 2^a, |y| <= 2^b, then
53 // err(x*y) <= 2^a * err(y) + 2^b * err(x) + 3*e*2^(a+b+1/2-m).
54 // 6. We start out with precomputed roots of unity:
55 // |exp(2 pi i/2^n)| <= 1,
56 // err(exp(2 pi i/2^n)) <= e*2^(1/2-m), (even err(..)=0 for n=0,1,2),
57 // and compute exp(2 pi i * j/2^k) according to the binary digits of j.
58 // This way, each root of unity will be a product of at most k precomputed
59 // roots. If (j mod 2^(k-2)) has n bits, then exp(2 pi i * j/2^k) will
60 // be computed using n factors, i.e. n-1 complex multiplications, and by
62 // err(exp(2 pi i * j/2^k)) <= n*e*2^(1/2-m) + max(n-1,0)*3*e*2^(1/2-m)
63 // = max(4*n-3,0)*e*2^(1/2-m).
64 // Hence the maximum roots-of-unity error is (set n=k-2)
65 // err(w^j) <= (4*k-11)*e*2^(1/2-m),
66 // and the average roots-of-unity error is (set n=(k-2)/2)
67 // < 2*(k-2)*e*2^(1/2-m).
68 // 7. Now we start the FFT.
69 // Before the first step, x_i are integral, |x_i| < 2^l and err(x_i) = 0.
70 // After the first butterfly, which replaces (x(i1),x(i2)) by
71 // (x(i1) + x(i2), x(i1) - x(i2)), we have |x_i| < 2^(l+1) and err(x_i) = 0.
72 // Then, for each of the remaining k-1 steps, a butterfly replaces
73 // (x(i1),x(i2)) by (x(i1) + w^exp*x(i2), x(i1) - w^exp*x(i2)). Thus,
74 // after n steps we have |x_i| < 2^(l+n) and err(x_i) <= E_i where
77 // E_(n+1) = (E_n + 2^(l+n)*(4*k-11)*e*2^(1/2-m) + 3*e*2^(l+n+1/2-m))
78 // + E_n + e*2^(l+n+3/2-m)
79 // = 2*E_n + (4*k-6)*e*2^(l+n+1/2-m)
80 // hence E_n = (2^n-2)*(4*k-6)*e*2^(l+1/2-m).
81 // Setting n = k, we have proved that after the FFT ends, we have
82 // |x_i| < 2^(l+k) and err(x_i) <= (4*k-6)*e*2^(l+k+1/2-m).
83 // 8. The same error analysis holds for the y_i and their FFT. After we
84 // multiply z_i := x_i * y_i, we have
85 // |z_i| < 2^(2*l+2*k) and err(z_i) <= (8*k-9)*e*2^(2*l+2*k+1/2-m).
86 // 9. Then an inverse FFT on z_i is done, which is the same as an FFT
87 // followed by a permutation and a division by 2^k. After n steps of
88 // the FFT, we have |z_i| < 2^(2*l+2*k+n) and err(z_i) <= E_i where
89 // E_0 = (8*k-9)*e*2^(2*l+2*k+1/2-m),
90 // E_(n+1) = (E_n + 2^(2*l+2*k+n)*(4*k-11)*e*2^(1/2-m)
91 // + 3*e*2^(2*l+2*k+n+1/2-m))
92 // + E_n + e*2^(2*l+2*k+n+3/2-m)
93 // = 2*E_n + (4*k-6)*e*2^(2*l+2*k+n+1/2-m)
94 // hence E_n = 2^n*(8*k-9)*e*2^(2*l+2*k+1/2-m)
95 // + (2^n-1)*(4*k-6)*e*2^(2*l+2*k+1/2-m).
96 // So, after the FFT, we have (set n=k) |z_i| < 2^(2*l+3*k) and
97 // err(z_i) <= (12*k-15)*e*2^(2*l+3*k+1/2-m).
98 // Permutation doesn't change the estimates. After division by 2^k, we get
99 // |z_i| < 2^(2*l+2*k) and
100 // err(z_i) <= (12*k-15)*e*2^(2*l+2*k+1/2-m).
101 // 10. When converting the z_i back to integers, we know that z_i should be
102 // real, integral, and |z_i| < 2^(2*l+k). We can only guarantee that we
103 // can find the integral z_i from the floating-point computation if
104 // (12*k-15)*e*2^(2*l+2*k+1/2-m) < 1/2.
105 // 11. Assuming e = 1/2 and m = 53 (typical values for IEEE double arithmetic),
106 // we get the constraint 2*l < m - 1/2 - 2*k - log_2(12*k-15).
126 // Assuming e = 1/2 and m = 64 ("long double" arithmetic on i387/i486/i586),
127 // we get the constraint 2*l < m - 1/2 - 2*k - log_2(12*k-15).
155 #error "complex fft implemented only for intDsize==32"
159 #include "cln/floatparam.h"
160 #include "cln/exception.h"
162 #if defined(HAVE_LONGDOUBLE) && (long_double_mant_bits > double_mant_bits) && (defined(__i386__) || defined(__m68k__) || (defined(__sparc__) && 0))
163 // Only these CPUs have fast "long double"s in hardware.
164 // On SPARC, "long double"s are emulated in software and don't work.
165 typedef long double fftc_real;
166 #define fftc_real_mant_bits long_double_mant_bits
167 #define fftc_real_rounds long_double_rounds
169 typedef double fftc_real;
170 #define fftc_real_mant_bits double_mant_bits
171 #define fftc_real_rounds double_rounds
174 typedef struct fftc_complex
181 static const fftc_complex fftc_roots_of_1 [32+1] =
182 // roots_of_1[n] is a (2^n)th root of unity in C.
183 // Also roots_of_1[n-1] = roots_of_1[n]^2.
184 // For simplicity we choose roots_of_1[n] = exp(2 pi i/2^n).
186 #if (fftc_real_mant_bits == double_mant_bits)
187 // These values have 64 bit precision.
191 { 0.7071067811865475244, 0.7071067811865475244 },
192 { 0.9238795325112867561, 0.38268343236508977172 },
193 { 0.9807852804032304491, 0.19509032201612826784 },
194 { 0.99518472667219688623, 0.098017140329560601996 },
195 { 0.9987954562051723927, 0.049067674327418014254 },
196 { 0.9996988186962042201, 0.024541228522912288032 },
197 { 0.99992470183914454094, 0.0122715382857199260795 },
198 { 0.99998117528260114264, 0.0061358846491544753597 },
199 { 0.9999952938095761715, 0.00306795676296597627 },
200 { 0.99999882345170190993, 0.0015339801862847656123 },
201 { 0.99999970586288221914, 7.6699031874270452695e-4 },
202 { 0.99999992646571785114, 3.8349518757139558907e-4 },
203 { 0.9999999816164292938, 1.9174759731070330744e-4 },
204 { 0.9999999954041073129, 9.5873799095977345874e-5 },
205 { 0.99999999885102682754, 4.793689960306688455e-5 },
206 { 0.99999999971275670683, 2.3968449808418218729e-5 },
207 { 0.9999999999281891767, 1.1984224905069706422e-5 },
208 { 0.99999999998204729416, 5.9921124526424278428e-6 },
209 { 0.99999999999551182357, 2.9960562263346607504e-6 },
210 { 0.99999999999887795586, 1.4980281131690112288e-6 },
211 { 0.999999999999719489, 7.4901405658471572114e-7 },
212 { 0.99999999999992987223, 3.7450702829238412391e-7 },
213 { 0.99999999999998246807, 1.8725351414619534487e-7 },
214 { 0.999999999999995617, 9.36267570730980828e-8 },
215 { 0.99999999999999890425, 4.6813378536549092695e-8 },
216 { 0.9999999999999997261, 2.340668926827455276e-8 },
217 { 0.99999999999999993153, 1.1703344634137277181e-8 },
218 { 0.99999999999999998287, 5.8516723170686386908e-9 },
219 { 0.9999999999999999957, 2.925836158534319358e-9 },
220 { 0.9999999999999999989, 1.4629180792671596806e-9 }
221 #else (fftc_real_mant_bits > double_mant_bits)
222 // These values have 128 bit precision.
226 { 0.707106781186547524400844362104849039284L, 0.707106781186547524400844362104849039284L },
227 { 0.923879532511286756128183189396788286823L, 0.38268343236508977172845998403039886676L },
228 { 0.980785280403230449126182236134239036975L, 0.195090322016128267848284868477022240928L },
229 { 0.995184726672196886244836953109479921574L, 0.098017140329560601994195563888641845861L },
230 { 0.998795456205172392714771604759100694444L, 0.0490676743274180142549549769426826583147L },
231 { 0.99969881869620422011576564966617219685L, 0.0245412285229122880317345294592829250654L },
232 { 0.99992470183914454092164649119638322435L, 0.01227153828571992607940826195100321214037L },
233 { 0.999981175282601142656990437728567716173L, 0.00613588464915447535964023459037258091705L },
234 { 0.999995293809576171511580125700119899554L, 0.00306795676296597627014536549091984251894L },
235 { 0.99999882345170190992902571017152601905L, 0.001533980186284765612303697150264079079954L },
236 { 0.999999705862882219160228217738765677117L, 7.66990318742704526938568357948576643142e-4L },
237 { 0.99999992646571785114473148070738785695L, 3.83495187571395589072461681181381263396e-4L },
238 { 0.999999981616429293808346915402909714504L, 1.91747597310703307439909561989000933469e-4L },
239 { 0.99999999540410731289097193313960614896L, 9.58737990959773458705172109764763511872e-5L },
240 { 0.9999999988510268275626733077945541084L, 4.79368996030668845490039904946588727468e-5L },
241 { 0.99999999971275670684941397221864177609L, 2.39684498084182187291865771650218200947e-5L },
242 { 0.999999999928189176709775095883850490262L, 1.198422490506970642152156159698898480473e-5L },
243 { 0.99999999998204729417728262414778410738L, 5.99211245264242784287971180889086172999e-6L },
244 { 0.99999999999551182354431058417299732444L, 2.99605622633466075045481280835705981183e-6L },
245 { 0.999999999998877955886077016551752536504L, 1.49802811316901122885427884615536112069e-6L },
246 { 0.999999999999719488971519214794719584451L, 7.49014056584715721130498566730655637157e-7L },
247 { 0.99999999999992987224287980123972873676L, 3.74507028292384123903169179084633177398e-7L },
248 { 0.99999999999998246806071995015624773673L, 1.8725351414619534486882457659356361712e-7L },
249 { 0.999999999999995617015179987529456656217L, 9.3626757073098082799067286680885620193e-8L },
250 { 0.999999999999998904253794996881763834182L, 4.68133785365490926951155181385400969594e-8L },
251 { 0.99999999999999972606344874922040343793L, 2.34066892682745527595054934190348440379e-8L },
252 { 0.999999999999999931515862187305098514444L, 1.170334463413727718124621350323810379807e-8L },
253 { 0.999999999999999982878965546826274482047L, 5.8516723170686386908097901008341396944e-9L },
254 { 0.999999999999999995719741386706568611352L, 2.92583615853431935792823046906895590202e-9L },
255 { 0.999999999999999998929935346676642152265L, 1.46291807926715968052953216186596371037e-9L }
259 // Define this for (cheap) consistency checks.
262 // Define the algorithm of the backward FFT:
263 // Either FORWARD (a normal FFT followed by a permutation)
264 // or RECIPROOT (an FFT with reciprocal root of unity)
265 // or CLEVER (an FFT with reciprocal root of unity but clever computation
266 // of the reciprocals).
267 // Drawback of FORWARD: the permutation pass.
268 // Drawback of RECIPROOT: need all the powers of the root, not only half of them.
272 #define FFTC_BACKWARD CLEVER
274 static fftc_real fftc_pow2_table[64] = // table of powers of 2
333 144115188075855872.0,
334 288230376151711744.0,
335 576460752303423488.0,
336 1152921504606846976.0,
337 2305843009213693952.0,
338 4611686018427387904.0,
339 9223372036854775808.0
342 // For a constant expression n (0 <= n < 128), returns 2^n of type fftc_real.
343 #define fftc_pow2(n) \
344 (((n) & 64 ? (fftc_real)18446744073709551616.0 : (fftc_real)1.0) \
345 * ((n) & 32 ? (fftc_real)4294967296.0 : (fftc_real)1.0) \
346 * ((n) & 16 ? (fftc_real)65536.0 : (fftc_real)1.0) \
347 * ((n) & 8 ? (fftc_real)256.0 : (fftc_real)1.0) \
348 * ((n) & 4 ? (fftc_real)16.0 : (fftc_real)1.0) \
349 * ((n) & 2 ? (fftc_real)4.0 : (fftc_real)1.0) \
350 * ((n) & 1 ? (fftc_real)2.0 : (fftc_real)1.0) \
354 static inline void add (const fftc_complex& a, const fftc_complex& b, fftc_complex& r)
361 static inline void sub (const fftc_complex& a, const fftc_complex& b, fftc_complex& r)
368 static inline void mul (fftc_real a, const fftc_complex& b, fftc_complex& r)
375 static inline void mul (const fftc_complex& a, const fftc_complex& b, fftc_complex& r)
377 var fftc_real r_re = a.re * b.re - a.im * b.im;
378 var fftc_real r_im = a.re * b.im + a.im * b.re;
385 // Reverse an n-bit number x. n>0.
386 static uintC bit_reverse (uintL n, uintC x)
393 } while (!(--n == 0));
398 // Compute a complex convolution using FFT: z[0..N-1] := x[0..N-1] * y[0..N-1].
399 static void fftc_convolution (const uintL n, const uintC N, // N = 2^n
400 fftc_complex * x, // N numbers
401 fftc_complex * y, // N numbers
402 fftc_complex * z // N numbers result
406 #if (FFTC_BACKWARD == RECIPROOT) || defined(DEBUG_FFTC)
407 var fftc_complex* const w = cl_alloc_array(fftc_complex,N);
409 var fftc_complex* const w = cl_alloc_array(fftc_complex,(N>>1)+1);
412 // Initialize w[i] to w^i, w a primitive N-th root of unity.
413 w[0] = fftc_roots_of_1[0];
414 #if (FFTC_BACKWARD == RECIPROOT) || defined(DEBUG_FFTC)
417 for (j = n-1; j>=0; j--) {
418 var fftc_complex r_j = fftc_roots_of_1[n-j];
420 for (i = (2<<j); i < N; i += (2<<j))
421 mul(w[i],r_j, w[i+(1<<j)]);
427 for (j = n-1; j>=0; j--) {
428 var fftc_complex r_j = fftc_roots_of_1[n-j];
430 for (i = (2<<j); i < N>>1; i += (2<<j))
431 mul(w[i],r_j, w[i+(1<<j)]);
436 // Check that w is really a primitive N-th root of unity.
438 var fftc_real epsilon;
439 epsilon = (fftc_real)(n>=3 ? 4*n-11 : 0);
440 epsilon = epsilon / fftc_pow2(fftc_real_mant_bits);
441 if (fftc_real_rounds == rounds_to_nearest)
442 epsilon = 0.5*epsilon;
443 epsilon = 1.414*epsilon;
444 // epsilon = (4*k-11)*e*2^(1/2-m).
446 var fftc_complex& w_N = w[N>>1];
447 part = w_N.re - (fftc_real)(-1.0);
448 if (part > epsilon || part < -epsilon)
449 throw runtime_exception();
451 if (part > epsilon || part < -epsilon)
452 throw runtime_exception();
455 var fftc_complex w_N;
456 mul(w[N-1],fftc_roots_of_1[n], w_N);
457 // Since there was one more multiplication,
458 // we have to replace n by (n+1) in the epsilon above.
459 var fftc_real epsilon;
460 epsilon = (fftc_real)(4*n-7);
461 epsilon = epsilon / fftc_pow2(fftc_real_mant_bits);
462 if (fftc_real_rounds == rounds_to_nearest)
463 epsilon = 0.5*epsilon;
464 epsilon = 1.414*epsilon;
465 // epsilon = (4*k-7)*e*2^(1/2-m).
467 part = w_N.re - (fftc_real)1.0;
468 if (part > epsilon || part < -epsilon)
469 throw runtime_exception();
471 if (part > epsilon || part < -epsilon)
472 throw runtime_exception();
475 var bool squaring = (x == y);
476 // Do an FFT of length N on x.
480 var const uintC tmax = N>>1; // tmax = 2^(n-1)
481 for (var uintC t = 0; t < tmax; t++) {
483 var uintC i2 = i1 + tmax;
484 // Butterfly: replace (x(i1),x(i2)) by
485 // (x(i1) + x(i2), x(i1) - x(i2)).
486 var fftc_complex tmp;
488 sub(x[i1],tmp, x[i2]);
489 add(x[i1],tmp, x[i1]);
492 for (l = n-2; l>=0; l--) {
493 var const uintC smax = (uintC)1 << (n-1-l);
494 var const uintC tmax = (uintC)1 << l;
495 for (var uintC s = 0; s < smax; s++) {
496 var uintC exp = bit_reverse(n-1-l,s) << l;
497 for (var uintC t = 0; t < tmax; t++) {
498 var uintC i1 = (s << (l+1)) + t;
499 var uintC i2 = i1 + tmax;
500 // Butterfly: replace (x(i1),x(i2)) by
501 // (x(i1) + w^exp*x(i2), x(i1) - w^exp*x(i2)).
502 var fftc_complex tmp;
503 mul(x[i2],w[exp], tmp);
504 sub(x[i1],tmp, x[i2]);
505 add(x[i1],tmp, x[i1]);
510 // Do an FFT of length N on y.
514 var uintC const tmax = N>>1; // tmax = 2^(n-1)
515 for (var uintC t = 0; t < tmax; t++) {
517 var uintC i2 = i1 + tmax;
518 // Butterfly: replace (y(i1),y(i2)) by
519 // (y(i1) + y(i2), y(i1) - y(i2)).
520 var fftc_complex tmp;
522 sub(y[i1],tmp, y[i2]);
523 add(y[i1],tmp, y[i1]);
526 for (l = n-2; l>=0; l--) {
527 var const uintC smax = (uintC)1 << (n-1-l);
528 var const uintC tmax = (uintC)1 << l;
529 for (var uintC s = 0; s < smax; s++) {
530 var uintC exp = bit_reverse(n-1-l,s) << l;
531 for (var uintC t = 0; t < tmax; t++) {
532 var uintC i1 = (s << (l+1)) + t;
533 var uintC i2 = i1 + tmax;
534 // Butterfly: replace (y(i1),y(i2)) by
535 // (y(i1) + w^exp*y(i2), y(i1) - w^exp*y(i2)).
536 var fftc_complex tmp;
537 mul(y[i2],w[exp], tmp);
538 sub(y[i1],tmp, y[i2]);
539 add(y[i1],tmp, y[i1]);
544 // Multiply the transformed vectors into z.
545 for (i = 0; i < N; i++)
546 mul(x[i],y[i], z[i]);
547 // Undo an FFT of length N on z.
550 for (l = 0; l < n-1; l++) {
551 var const uintC smax = (uintC)1 << (n-1-l);
552 var const uintC tmax = (uintC)1 << l;
553 #if FFTC_BACKWARD != CLEVER
554 for (var uintC s = 0; s < smax; s++) {
555 var uintC exp = bit_reverse(n-1-l,s) << l;
556 #if FFTC_BACKWARD == RECIPROOT
558 exp = N - exp; // negate exp (use w^-1 instead of w)
560 for (var uintC t = 0; t < tmax; t++) {
561 var uintC i1 = (s << (l+1)) + t;
562 var uintC i2 = i1 + tmax;
563 // Inverse Butterfly: replace (z(i1),z(i2)) by
564 // ((z(i1)+z(i2))/2, (z(i1)-z(i2))/(2*w^exp)).
565 // Do the division by 2 later.
566 var fftc_complex sum;
567 var fftc_complex diff;
568 add(z[i1],z[i2], sum);
569 sub(z[i1],z[i2], diff);
571 mul(diff,w[exp], z[i2]);
574 #else // FFTC_BACKWARD == CLEVER: clever handling of negative exponents
575 /* s = 0, exp = 0 */ {
576 for (var uintC t = 0; t < tmax; t++) {
578 var uintC i2 = i1 + tmax;
579 // Inverse Butterfly: replace (z(i1),z(i2)) by
580 // ((z(i1)+z(i2))/2, (z(i1)-z(i2))/(2*w^exp)),
582 // Do the division by 2 later.
583 var fftc_complex sum;
584 var fftc_complex diff;
585 add(z[i1],z[i2], sum);
586 sub(z[i1],z[i2], diff);
591 for (var uintC s = 1; s < smax; s++) {
592 var uintC exp = bit_reverse(n-1-l,s) << l;
593 exp = (N>>1) - exp; // negate exp (use w^-1 instead of w)
594 for (var uintC t = 0; t < tmax; t++) {
595 var uintC i1 = (s << (l+1)) + t;
596 var uintC i2 = i1 + tmax;
597 // Inverse Butterfly: replace (z(i1),z(i2)) by
598 // ((z(i1)+z(i2))/2, (z(i1)-z(i2))/(2*w^exp)),
599 // with exp <-- (N/2 - exp).
600 // Do the division by 2 later.
601 var fftc_complex sum;
602 var fftc_complex diff;
603 add(z[i1],z[i2], sum);
604 sub(z[i2],z[i1], diff); // note that w^(N/2) = -1
606 mul(diff,w[exp], z[i2]);
612 var const fftc_real pow2 = (fftc_real)1 / (fftc_real)N;
613 var const uintC tmax = N>>1; // tmax = 2^(n-1)
614 for (var uintC t = 0; t < tmax; t++) {
616 var uintC i2 = i1 + tmax;
617 // Inverse Butterfly: replace (z(i1),z(i2)) by
618 // ((z(i1)+z(i2))/2, (z(i1)-z(i2))/2).
619 // Do all the divisions by 2 now.
620 var fftc_complex sum;
621 var fftc_complex diff;
622 add(z[i1],z[i2], sum);
623 sub(z[i1],z[i2], diff);
624 mul(pow2,sum, z[i1]);
625 mul(pow2,diff, z[i2]);
629 #if FFTC_BACKWARD == FORWARD
630 // Swap z[i] and z[N-i] for 0 < i < N/2.
631 for (i = (N>>1)-1; i > 0; i--) {
632 var fftc_complex tmp = z[i];
639 // For a given k >= 2, the maximum l is determined by
640 // 2*l < m - 1/2 - 2*k - log_2(12*k-15) - (1 if e=1.0, 0 if e=0.5).
641 // This is a decreasing function of k.
643 (int)((fftc_real_mant_bits \
645 - ((k)<=2 ? 4 : (k)<=3 ? 5 : (k)<=5 ? 6 : (k)<=8 ? 7 : (k)<=16 ? 8 : (k)<=31 ? 9 : 10) \
646 - (fftc_real_rounds == rounds_to_nearest ? 0 : 1)) \
648 static int max_l_table[32+1] =
649 { 0, 0, max_l(2), max_l(3), max_l(4), max_l(5), max_l(6),
650 max_l(7), max_l(8), max_l(9), max_l(10), max_l(11), max_l(12), max_l(13),
651 max_l(14), max_l(15), max_l(16), max_l(17), max_l(18), max_l(19), max_l(20),
652 max_l(21), max_l(22), max_l(23), max_l(24), max_l(25), max_l(26), max_l(27),
653 max_l(28), max_l(29), max_l(30), max_l(31), max_l(32)
656 // Split len uintD's below sourceptr into chunks of l bits, thus filling
657 // N complex numbers at x.
658 static void fill_factor (uintC N, fftc_complex* x, uintL l,
659 const uintD* sourceptr, uintC len)
662 if (max_l(2) > intDsize && l > intDsize) {
664 if (max_l(2) > 64 && l > 64) {
665 throw runtime_exception("FFT problem: l > 64 not supported by pow2_table");
667 var fftc_real carry = 0;
668 var sintL carrybits = 0; // number of bits in carry (>=0, <l)
671 var uintD digit = lsprefnext(sourceptr);
672 if (carrybits+intDsize >= l) {
673 x[i].re = carry + (fftc_real)(digit & bitm(l-carrybits)) * fftc_pow2_table[carrybits];
674 x[i].im = (fftc_real)0;
676 carry = (l-carrybits == intDsize ? (fftc_real)0 : (fftc_real)(digit >> (l-carrybits)));
677 carrybits = carrybits+intDsize-l;
679 carry = carry + (fftc_real)digit * fftc_pow2_table[carrybits];
680 carrybits = carrybits+intDsize;
686 x[i].im = (fftc_real)0;
690 throw runtime_exception();
691 } else if (max_l(2) >= intDsize && l == intDsize) {
694 throw runtime_exception();
695 for (i = 0; i < len; i++) {
696 var uintD digit = lsprefnext(sourceptr);
697 x[i].re = (fftc_real)digit;
698 x[i].im = (fftc_real)0;
702 var const uintD l_mask = bit(l)-1;
704 var sintL carrybits = 0; // number of bits in carry (>=0, <intDsize)
705 for (i = 0; i < N; i++) {
706 if (carrybits >= l) {
707 x[i].re = (fftc_real)(carry & l_mask);
708 x[i].im = (fftc_real)0;
715 var uintD digit = lsprefnext(sourceptr);
716 x[i].re = (fftc_real)((carry | (digit << carrybits)) & l_mask);
717 x[i].im = (fftc_real)0;
718 carry = digit >> (l-carrybits);
719 carrybits = intDsize - (l-carrybits);
722 while (carrybits > 0) {
724 throw runtime_exception();
725 x[i].re = (fftc_real)(carry & l_mask);
726 x[i].im = (fftc_real)0;
732 throw runtime_exception();
734 for ( ; i < N; i++) {
735 x[i].re = (fftc_real)0;
736 x[i].im = (fftc_real)0;
740 // Given a not too large floating point number, round it to the nearest integer.
741 static inline fftc_real fftc_fround (fftc_real x)
744 #if (fftc_real_rounds == rounds_to_nearest)
745 (x + (fftc_pow2(fftc_real_mant_bits-1)+fftc_pow2(fftc_real_mant_bits-2)))
746 - (fftc_pow2(fftc_real_mant_bits-1)+fftc_pow2(fftc_real_mant_bits-2));
747 #elif (fftc_real_rounds == rounds_to_infinity)
748 (fftc_pow2(fftc_real_mant_bits-1)+fftc_pow2(fftc_real_mant_bits-2))
749 - ((fftc_pow2(fftc_real_mant_bits-1)+fftc_pow2(fftc_real_mant_bits-2))
750 - (x + (fftc_real)0.5));
751 #else // rounds_to_zero, rounds_to_minus_infinity
752 ((x + (fftc_real)0.5)
753 + (fftc_pow2(fftc_real_mant_bits-1)+fftc_pow2(fftc_real_mant_bits-2)))
754 - (fftc_pow2(fftc_real_mant_bits-1)+fftc_pow2(fftc_real_mant_bits-2));
758 // Given a not too large floating point number, round it down.
759 static inline fftc_real fftc_ffloor (fftc_real x)
761 #if (fftc_real_rounds == rounds_to_nearest)
763 (x + (fftc_pow2(fftc_real_mant_bits-1)+fftc_pow2(fftc_real_mant_bits-2)))
764 - (fftc_pow2(fftc_real_mant_bits-1)+fftc_pow2(fftc_real_mant_bits-2));
768 return y - (fftc_real)1.0;
769 #elif (fftc_real_rounds == rounds_to_infinity)
771 (fftc_pow2(fftc_real_mant_bits-1)+fftc_pow2(fftc_real_mant_bits-2))
772 - ((fftc_pow2(fftc_real_mant_bits-1)+fftc_pow2(fftc_real_mant_bits-2))
774 #else // rounds_to_zero, rounds_to_minus_infinity
776 (x + (fftc_pow2(fftc_real_mant_bits-1)+fftc_pow2(fftc_real_mant_bits-2)))
777 - (fftc_pow2(fftc_real_mant_bits-1)+fftc_pow2(fftc_real_mant_bits-2));
781 // Combine the N complex numbers at z into uintD's below destptr.
782 // The z[i] are known to be approximately integers >= 0, < N*2^(2*l).
783 // Assumes room for floor(N*l/intDsize)+(1+ceiling((n+2*l)/intDsize)) uintD's
784 // below destptr. Fills len digits and returns (destptr lspop len).
785 static uintD* unfill_product (uintL n, uintC N, // N = 2^n
786 const fftc_complex * z, uintL l,
790 if (n + 2*l <= intDsize) {
791 // 2-digit carry is sufficient, l < intDsize
792 var uintD carry0 = 0;
793 var uintD carry1 = 0;
794 var uintL shift = 0; // shift next digit before adding it to the carry, >=0, <intDsize
795 for (i = 0; i < N; i++) {
796 // Fetch z[i].re and round it to the nearest uintD.
797 var uintD digit = (uintD)(z[i].re + (fftc_real)0.5);
799 carry1 += digit >> (intDsize-shift);
800 digit = digit << shift;
802 if ((carry0 += digit) < digit)
805 if (shift >= intDsize) {
806 lsprefnext(destptr) = carry0;
812 lsprefnext(destptr) = carry0;
813 lsprefnext(destptr) = carry1;
814 } else if (n + 2*l <= 2*intDsize) {
815 // 3-digit carry is sufficient, l < intDsize
817 var uintDD carry0 = 0;
818 var uintD carry1 = 0;
819 var uintL shift = 0; // shift next digit before adding it to the carry, >=0, <intDsize
820 for (i = 0; i < N; i++) {
821 // Fetch z[i].re and round it to the nearest uintDD.
822 var uintDD digit = (uintDD)(z[i].re + (fftc_real)0.5);
824 carry1 += (uintD)(digit >> (2*intDsize-shift));
825 digit = digit << shift;
827 if ((carry0 += digit) < digit)
830 if (shift >= intDsize) {
831 lsprefnext(destptr) = lowD(carry0);
832 carry0 = highlowDD(carry1,highD(carry0));
837 lsprefnext(destptr) = lowD(carry0);
838 lsprefnext(destptr) = highD(carry0);
839 lsprefnext(destptr) = carry1;
841 var uintD carry0 = 0;
842 var uintD carry1 = 0;
843 var uintD carry2 = 0;
844 var uintL shift = 0; // shift next digit before adding it to the carry, >=0, <intDsize
845 for (i = 0; i < N; i++) {
846 // Fetch z[i].re and round it to the nearest uintDD.
847 var fftc_real zi = z[i].re + (fftc_real)0.5;
848 var const fftc_real pow2_intDsize = fftc_pow2(intDsize);
849 var const fftc_real invpow2_intDsize = (fftc_real)1.0/pow2_intDsize;
850 var uintD digit1 = (uintD)(zi * invpow2_intDsize);
851 var uintD digit0 = (uintD)(zi - (fftc_real)digit1 * pow2_intDsize);
853 carry2 += digit1 >> (intDsize-shift);
854 digit1 = (digit1 << shift) | (digit0 >> (intDsize-shift));
855 digit0 = digit0 << shift;
857 if ((carry0 += digit0) < digit0)
858 if ((carry1 += 1) == 0)
860 if ((carry1 += digit1) < digit1)
863 if (shift >= intDsize) {
864 lsprefnext(destptr) = carry0;
871 lsprefnext(destptr) = carry0;
872 lsprefnext(destptr) = carry1;
873 lsprefnext(destptr) = carry2;
876 // 1-digit+1-float carry is sufficient
877 var uintD carry0 = 0;
878 var fftc_real carry1 = 0;
879 var uintL shift = 0; // shift next digit before adding it to the carry, >=0, <intDsize
880 for (i = 0; i < N; i++) {
881 // Fetch z[i].re and round it to the nearest integer.
882 var fftc_real digit = fftc_fround(z[i].re);
884 if (!(digit >= (fftc_real)0
885 && z[i].re > digit - (fftc_real)0.5
886 && z[i].re < digit + (fftc_real)0.5))
887 throw runtime_exception();
890 digit = digit * fftc_pow2_table[shift];
891 var fftc_real digit1 = fftc_ffloor(digit*((fftc_real)1.0/fftc_pow2(intDsize)));
892 var uintD digit0 = (uintD)(digit - digit1*fftc_pow2(intDsize));
894 if ((carry0 += digit0) < digit0)
895 carry1 += (fftc_real)1.0;
897 while (shift >= intDsize) {
898 lsprefnext(destptr) = carry0;
899 var fftc_real tmp = fftc_ffloor(carry1*((fftc_real)1.0/fftc_pow2(intDsize)));
900 carry0 = (uintD)(carry1 - tmp*fftc_pow2(intDsize));
905 if (carry0 > 0 || carry1 > (fftc_real)0.0) {
906 lsprefnext(destptr) = carry0;
907 while (carry1 > (fftc_real)0.0) {
908 var fftc_real tmp = fftc_ffloor(carry1*((fftc_real)1.0/fftc_pow2(intDsize)));
909 lsprefnext(destptr) = (uintD)(carry1 - tmp*fftc_pow2(intDsize));
917 static inline void mulu_fftcomplex_nocheck (const uintD* sourceptr1, uintC len1,
918 const uintD* sourceptr2, uintC len2,
920 // Es ist 2 <= len1 <= len2.
922 // We have to find parameters l and k such that
923 // ceiling(len1*intDsize/l) + ceiling(len2*intDsize/l) - 1 <= 2^k,
924 // and (12*k-15)*e*2^(2*l+2*k+1/2-m) < 1/2.
925 // Try primarily to minimize k. Minimizing l buys you nothing.
927 // Computing k: If len1 and len2 differ much, we'll split source2 -
928 // hence for the moment just substitute len1 for len2.
930 // First approximation of k: A necessary condition for
931 // 2*ceiling(len1*intDsize/l) - 1 <= 2^k
932 // is 2*len1*intDsize/l_max - 1 <= 2^k.
934 var const int l = max_l(2);
935 var uintC lhs = 2*ceiling(len1*intDsize,l) - 1; // >=1
939 integerlengthC(lhs-1, k=); // k>=2
941 // Try whether this k is ok or whether we have to increase k.
943 if (k >= sizeof(max_l_table)/sizeof(max_l_table[0])
944 || max_l_table[k] <= 0) {
945 throw runtime_exception("FFT problem: numbers too big, floating point precision not sufficient");
947 if (2*ceiling(len1*intDsize,max_l_table[k])-1 <= ((uintC)1 << k))
950 // We could try to reduce l, keeping the same k. But why should we?
951 // Calculate the number of pieces in which source2 will have to be
952 // split. Each of the pieces must satisfy
953 // ceiling(len1*intDsize/l) + ceiling(len2*intDsize/l) - 1 <= 2^k,
955 // Try once with k, once with k+1. Compare them.
957 var uintC remaining_k = ((uintC)1 << k) + 1 - ceiling(len1*intDsize,max_l_table[k]);
958 var uintC max_piecelen_k = floor(remaining_k*max_l_table[k],intDsize);
959 var uintC numpieces_k = ceiling(len2,max_piecelen_k);
960 var uintC remaining_k1 = ((uintC)1 << (k+1)) + 1 - ceiling(len1*intDsize,max_l_table[k+1]);
961 var uintC max_piecelen_k1 = floor(remaining_k1*max_l_table[k+1],intDsize);
962 var uintC numpieces_k1 = ceiling(len2,max_piecelen_k1);
963 if (numpieces_k <= 2*numpieces_k1) {
965 len2p = max_piecelen_k;
969 len2p = max_piecelen_k1;
972 var const uintL l = max_l_table[k];
973 var const uintL n = k;
974 var const uintC N = (uintC)1 << n;
976 var fftc_complex* const x = cl_alloc_array(fftc_complex,N);
977 var fftc_complex* const y = cl_alloc_array(fftc_complex,N);
979 var fftc_complex* const z = cl_alloc_array(fftc_complex,N);
981 var fftc_complex* const z = x; // put z in place of x - saves memory
983 var uintD* const tmpprod1 = cl_alloc_array(uintD,len1+1);
984 var uintC tmpprod_len = floor(l<<n,intDsize)+6;
985 var uintD* const tmpprod = cl_alloc_array(uintD,tmpprod_len);
986 var uintC destlen = len1+len2;
987 clear_loop_lsp(destptr,destlen);
993 var uintD* tmpptr = arrayLSDptr(tmpprod1,len1+1);
994 mulu_loop_lsp(lspref(sourceptr2,0),sourceptr1,tmpptr,len1);
995 if (addto_loop_lsp(tmpptr,destptr,len1+1))
996 if (inc_loop_lsp(destptr lspop (len1+1),destlen-(len1+1)))
997 throw runtime_exception();
999 var bool squaring = ((sourceptr1 == sourceptr2) && (len1 == len2p));
1001 fill_factor(N,x,l,sourceptr1,len1);
1004 fill_factor(N,y,l,sourceptr2,len2p);
1007 fftc_convolution(n,N, &x[0], &y[0], &z[0]);
1009 fftc_convolution(n,N, &x[0], &x[0], &z[0]);
1013 var fftc_real re_lo_limit = (fftc_real)(-0.5);
1014 var fftc_real re_hi_limit = (fftc_real)N * fftc_pow2_table[l] * fftc_pow2_table[l] + (fftc_real)0.5;
1015 var fftc_real im_lo_limit = (fftc_real)(-0.5);
1016 var fftc_real im_hi_limit = (fftc_real)0.5;
1017 for (var uintC i = 0; i < N; i++) {
1018 if (!(z[i].im > im_lo_limit
1019 && z[i].im < im_hi_limit))
1020 throw runtime_exception();
1021 if (!(z[i].re > re_lo_limit
1022 && z[i].re < re_hi_limit))
1023 throw runtime_exception();
1027 var uintD* tmpLSDptr = arrayLSDptr(tmpprod,tmpprod_len);
1028 var uintD* tmpMSDptr = unfill_product(n,N,z,l,tmpLSDptr);
1030 #if CL_DS_BIG_ENDIAN_P
1031 tmpLSDptr - tmpMSDptr;
1033 tmpMSDptr - tmpLSDptr;
1035 if (tmplen > tmpprod_len)
1036 throw runtime_exception();
1037 // Add result to destptr[-destlen..-1]:
1038 if (tmplen > destlen) {
1039 if (test_loop_msp(tmpMSDptr,tmplen-destlen))
1040 throw runtime_exception();
1043 if (addto_loop_lsp(tmpLSDptr,destptr,tmplen))
1044 if (inc_loop_lsp(destptr lspop tmplen,destlen-tmplen))
1045 throw runtime_exception();
1048 destptr = destptr lspop len2p;
1050 sourceptr2 = sourceptr2 lspop len2p;
1058 // Compute a checksum: number mod (2^intDsize-1).
1059 static uintD compute_checksum (const uintD* sourceptr, uintC len)
1061 var uintD tmp = ~(uintD)0; // -1-(sum mod 2^intDsize-1), always >0
1063 var uintD digit = lsprefnext(sourceptr);
1065 tmp -= digit; // subtract digit
1067 tmp -= digit+1; // subtract digit-(2^intDsize-1)
1068 } while (--len > 0);
1072 // Multiply two checksums modulo (2^intDsize-1).
1073 static inline uintD multiply_checksum (uintD checksum1, uintD checksum2)
1078 var uintDD cksum = muluD(checksum1,checksum2);
1079 cksum_hi = highD(cksum); checksum = lowD(cksum);
1081 muluD(checksum1,checksum2, cksum_hi =, checksum =);
1083 if ((checksum += cksum_hi) + 1 <= cksum_hi)
1090 static void mulu_fftcomplex (const uintD* sourceptr1, uintC len1,
1091 const uintD* sourceptr2, uintC len2,
1094 // Compute checksums of the arguments and multiply them.
1095 var uintD checksum1 = compute_checksum(sourceptr1,len1);
1096 var uintD checksum2 = compute_checksum(sourceptr2,len2);
1097 var uintD checksum = multiply_checksum(checksum1,checksum2);
1098 mulu_fftcomplex_nocheck(sourceptr1,len1,sourceptr2,len2,destptr);
1099 if (!(checksum == compute_checksum(destptr,len1+len2))) {
1100 throw runtime_exception("FFT problem: checksum error");