1 // Fast integer multiplication using FFT over the complex numbers,
2 // exploiting symmetry.
3 // [Donald Ervin Knuth: The Art of Computer Programming, Vol. II:
4 // Seminumerical Algorithms, second edition. Section 4.3.3, p. 290-294.]
5 // Bruno Haible 6.5.1996, 24.-25.8.1996, 31.8.1996
7 // FFT in the complex domain has the drawback that it needs careful round-off
8 // error analysis. But for CPUs with good floating-point performance it might
9 // nevertheless be better than FFT mod Z/mZ.
11 // The usual FFT(2^n) computes the values of a polynomial p(z) mod (z^(2^n)-1)
12 // at the (2^n)-th roots of unity. For our purposes, we start out with
13 // polynomials with real coefficients. So the values at z_j = exp(2 pi i j/N)
14 // and z_-j = exp(- 2 pi i j/N) will be complex conjugates of each other,
15 // which means that there exists a polynomial r(z) of degree <= 1 with real
16 // coefficients such that p(z_j) = r(z_j) and p(z_-j) = r(z_-j). This
17 // implies that r(z) is the remainder of p(z) divided by
18 // (z - z_j) (z - z_-j) = (z^2 - 2 cos(2 pi j/N) z + 1).
20 // Based on this insight, we replace the usual n FFT steps
22 // (z^(2^n)-1) = prod(j=0..2^(n-m)-1, (z^(2^m) - exp(2 pi j/2^(n-m))))
25 // (z^(2^n)-1) = (z^(2^m)-1) * prod(j=1..2^(n-m)-1, factor_m[j](z))
27 // factor_m[j](z) = prod(k mod 2^n with k == j or k == -j mod 2^(n-m+1),
28 // (z - exp(2 pi i k/N)) )
29 // = prod(k=0..2^(m-1)-1,
30 // (z - exp(2 pi i (j+2^(n-m+1)k)/N))
31 // (z - exp(- 2 pi i (j+2^(n-m+1)k)/N)) )
32 // = (z^(2^(m-1)) - exp(2 pi i j 2^(m-1)/N))
33 // (z^(2^(m-1)) - exp(- 2 pi i j 2^(m-1)/N))
34 // = (z^(2^(m-1)) - exp(2 pi i j/2^(n-m+1)))
35 // (z^(2^(m-1)) - exp(- 2 pi i j/2^(n-m+1)))
36 // = (z^(2^m) - 2 cos(2 pi j/2^(n-m+1)) z^(2^(m-1)) + 1).
37 // The factors and the input are real polynomials, hence all intermediate
38 // and final remainders will be real as well.
40 // However, instead of storing
41 // p(z) mod (z^(2^m) - 2 cos(2 pi j/2^(n-m+1)) z^(2^(m-1)) + 1),
42 // we compute and store
43 // realpart and imagpart of p(z) mod (z^(2^(m-1)) - exp(2 pi i j 2^(m-1)/N)).
44 // This way, the round-off error estimates are better because we don't have
45 // to multiply with numbers > 1, and because during the final reverse FFT, we
46 // don't have to divide by numbers around 2^(-n).
48 // The usual FFT algorithm
49 // Input: polynomial p in x[0..2^n-1].
50 // for l = n-1..0: // step m=l+1 -> m=l
51 // for s in {0,..,2^(n-1-l)-1}:
52 // exp := bit_reverse(n-1-l,s)*2^l,
53 // // chinese remainder algorithm for (z^(2^(l+1)) - w^(2*exp)) =
54 // // = (z^(2^l) - w^exp) * (z^(2^l) - w^(exp+2^(n-1))).
55 // for t in {0,..,2^l-1}:
56 // i1 := s*2^(l+1) + t, i2 := s*2^(l+1) + 2^l + t,
57 // replace (x[i1],x[i2]) by (x[i1] + w^exp*x[i2], x[i1] - w^exp*x[i2])
60 // for j in {0..2^(n-m)-1}:
61 // p(z) mod (z^(2^m) - exp(2 pi i j/2^(n-m)))
62 // in x[bit_reverse(n-m,j)*2^m .. bit_reverse(n-m,j)*2^m+2^m-1].
63 // Output: p(z_j) in x[bit_reverse(n,j)].
64 // is thus replaced by the algorithm
65 // Input: polynomial p in x[0..2^n-1].
66 // for l = n-1..1: // step m=l+1 -> m=l
68 // // chinese remainder algorithm for
69 // // (z^(2^(l+1)) - 1) = (z^(2^l) - 1) * (z^(2^l) + 1).
70 // for t in {0,..,2^l-1}:
71 // i1 := t, i2 := 2^l + t,
72 // replace (x[i1],x[i2]) by (x[i1] + x[i2], x[i1] - x[i2])
73 // // chinese remainder algorithm for
74 // // (z^(2^l) + 1) = (z^(2^(l-1)) - i) * (z^(2^(l-1)) + i).
75 // // Nothing to do because of
76 // // a[0]z^0+...+a[2^l-1]z^(2^l-1) mod (z^(2^(l-1)) - i)
77 // // = (a[0]z^0+...+a[2^(l-1)-1]z^(2^(l-1)-1))
78 // // + i*(a[2^(l-1)]z^0+...+a[2^l-1]z^(2^(l-1)-1))
79 // // and because of the way we store the real parts and imaginary parts.
80 // for s in {1,..,2^(n-1-l)-1}:
81 // exp := shuffle(n-1-l,s)*2^(l-1),
82 // // chinese remainder algorithm for
83 // // (z^(2^l) - w^(2*exp))
84 // // = (z^(2^(l-1)) - w^exp) * conj(z^(2^(l-1)) - w^(2^(n-1)-exp)).
85 // for t in {0,..,2^(l-1)-1}:
86 // i1 := s*2^(l+1) + t, i2 := s*2^(l+1) + 2^(l-1) + t,
87 // i3 := s*2^(l+1) + 2^l + t, i4 := s*2^(l+1) + 2^l + 2^(l-1) + t,
88 // replace (x[i1],x[i2],x[i3],x[i4]) by
89 // (x[i1] + x[i2]*Re(w^exp) - x[i4]*Im(w^exp),
90 // x[i3] + x[i4]*Re(w^exp) + x[i2]*Im(w^exp),
91 // x[i1] - x[i2]*Re(w^exp) + x[i4]*Im(w^exp),
92 // -x[i3] + x[i4]*Re(w^exp) + x[i2]*Im(w^exp))
95 // p(z) mod (z^(2^m) - 1) in x[0..2^m-1],
96 // for j in {1,..,2^(n-m)-1}:
97 // p(z) mod (z^(2^(m-1)) - exp(2 pi i j/2^(n-m+1)))
98 // in x[invshuffle(n-m,j)*2^m + (0 .. 2^(m-1)-1)] (realpart)
99 // and x[invshuffle(n-m,j)*2^m+2^(m-1) + (0 .. 2^(m-1)-1)] (imagpart).
100 // Output: p(z) mod (z^2 - 1) in x[0],x[1],
101 // p(z) mod (z - exp(2 pi i j/2^n)) (0 < j < 2^(n-1))
102 // = x[2*invshuffle(n-1,j)] + i*x[2*invshuffle(n-1,j)+1],
103 // p(z) mod (z - exp(- 2 pi i j/2^n)) (0 < j < 2^(n-1))
104 // = x[2*invshuffle(n-1,j)] - i*x[2*invshuffle(n-1,j)+1].
106 // The shuffle function is defined like this:
107 // shuffle(n,j) defined for n >= 0, 0 < j < 2^n, yields 0 < shuffle(n,j) < 2^n.
108 // Definition by splitting off the least significant bit:
110 // n > 0: shuffle(n,1) = 2^(n-1),
111 // n > 0, 0 < j < 2^(n-1): shuffle(n,2*j) = shuffle(n-1,j),
112 // n > 0, 0 < j < 2^(n-1): shuffle(n,2*j+1) = 2^n - shuffle(n-1,j).
113 // Its inverse function is defined like this:
114 // invshuffle(n,j) defined for n >= 0, 0 < j < 2^n, 0 < invshuffle(n,j) < 2^n.
115 // Definition by splitting off the most significant bit:
117 // n > 0, 0 < j < 2^(n-1): invshuffle(n,j) = invshuffle(n-1,j)*2,
118 // n > 0, j = 2^(n-1): invshuffle(n,j) = 1,
119 // n > 0, 2^(n-1) < j < 2^n: invshuffle(n,j) = invshuffle(n-1,2^n-j)*2+1.
120 // Note that shuffle(n,.) and invshuffle(n,.) are _not_ the same permutation
123 // It is important to have precise round-off error estimates. Although
124 // (by laws of statistics) in the average the actual round-off error makes up
125 // only half of the bits provided for round-off protection, we cannot rely
126 // on this average behaviour, but have to produce correct results.
128 // Knuth's formula (42), p. 294, says:
129 // If we want to multiply l-bit words using an FFT(2^k), our floating point
130 // numbers shall have m >= 2(k+l) + k + log_2 k + 3.5 mantissa bits.
132 // Here is a more careful analysis, using absolute error estimates.
134 // 1. We assume floating point numbers with radix 2, with the properties:
135 // (i) Multiplication with 2^n and 2^-n is exact.
136 // (ii) Negation x -> -x is exact.
137 // (iii) Addition: When adding x and y, with |x| <= 2^a, |y| <= 2^a,
138 // the result |x+y| <= 2^(a+1) has an error <= e*2^(a+1-m).
139 // (iv) Multiplication: When multiplying x and y, with |x| <= 2^a,
140 // |y| <= 2^b, the result |x*y| <= 2^(a+b) has an error <= e*2^(a+b-m).
141 // Here e = 1 for a truncating arithmetic, but e = 1/2 for a rounding
142 // arithmetic like IEEE single and double floats.
143 // 2. Let's introduce some notation: err(x) means |x'-x| where x is the
144 // exact mathematical value and x' is its representation in the machine.
145 // 3. From 1. we get for real numbers x,y:
146 // (i) err(2^n*x) = 2^n * err(x),
147 // (ii) err(-x) = err(x),
148 // (iii) |x| <= 2^a, |y| <= 2^a, then
149 // err(x+y) <= err(x) + err(y) + e*2^(a+1-m),
150 // [or .... ............... + e*2^(a-m) if |x+y| <= 2^a].
151 // (iv) |x| <= 2^a, |y| <= 2^b, then
152 // err(x*y) <= 2^a * err(y) + 2^b * err(x) + e*2^(a+b-m).
153 // 4. Our complex arithmetic will be based on the formulas:
154 // (i) 2^n*(x+iy) = (2^n*x)+i(2^n*y)
155 // (ii) -(x+iy) = (-x)+i(-y)
156 // (iii) (x+iy)+(u+iv) = (x+u)+i(y+v)
157 // (iv) (x+iy)*(u+iv) = (x*u-y*v)+i(x*v+y*u)
158 // The notation err(z) means |z'-z|, as above, with |.| being the usual
159 // absolute value on complex numbers (_not_ the L^1 norm).
160 // 5. From 3. and 4. we get for complex numbers x,y:
161 // (i) err(2^n*x) = 2^n * err(x),
162 // (ii) err(-x) = err(x),
163 // (iii) |x| <= 2^a, |y| <= 2^a, then
164 // err(x+y) <= err(x) + err(y) + e*2^(a+3/2-m),
165 // (iv) |x| <= 2^a, |y| <= 2^b, then
166 // err(x*y) <= 2^a * err(y) + 2^b * err(x) + 3*e*2^(a+b+1/2-m).
167 // 6. We start out with precomputed roots of unity:
168 // |exp(2 pi i/2^n)| <= 1,
169 // err(exp(2 pi i/2^n)) <= e*2^(1/2-m), (even err(..)=0 for n=0,1,2),
170 // and compute exp(2 pi i * j/2^k) according to the binary digits of j.
171 // This way, each root of unity will be a product of at most k precomputed
172 // roots. If (j mod 2^(k-2)) has n bits, then exp(2 pi i * j/2^k) will
173 // be computed using n factors, i.e. n-1 complex multiplications, and by
175 // err(exp(2 pi i * j/2^k)) <= n*e*2^(1/2-m) + max(n-1,0)*3*e*2^(1/2-m)
176 // = max(4*n-3,0)*e*2^(1/2-m).
177 // Hence the maximum roots-of-unity error is (set n=k-2)
178 // err(w^j) <= (4*k-11)*e*2^(1/2-m),
179 // and the average roots-of-unity error is (set n=(k-2)/2)
180 // < 2*(k-2)*e*2^(1/2-m).
181 // 7. Now we start the FFT.
182 // Before the first step, x_i are integral, |x_i| < 2^l and err(x_i) = 0.
183 // After the first butterfly, which replaces (x(i1),x(i2)) by
184 // (x(i1) + x(i2), x(i1) - x(i2)), we have |x_i| < 2^(l+1) and err(x_i) = 0.
185 // Then, for each of the remaining k-1 steps, a butterfly replaces
186 // (x(i1),x(i2)) by (x(i1) + w^exp*x(i2), x(i1) - w^exp*x(i2)). Thus,
187 // after n steps we have |x_i| < 2^(l+n) and err(x_i) <= E_i where
190 // E_(n+1) = (E_n + 2^(l+n)*(4*k-11)*e*2^(1/2-m) + 3*e*2^(l+n+1/2-m))
191 // + E_n + e*2^(l+n+3/2-m)
192 // = 2*E_n + (4*k-6)*e*2^(l+n+1/2-m)
193 // hence E_n = (2^n-2)*(4*k-6)*e*2^(l+1/2-m).
194 // Setting n = k, we have proved that after the FFT ends, we have
195 // |x_i| < 2^(l+k) and err(x_i) <= (4*k-6)*e*2^(l+k+1/2-m).
196 // 8. The same error analysis holds for the y_i and their FFT. After we
197 // multiply z_i := x_i * y_i, we have
198 // |z_i| < 2^(2*l+2*k) and err(z_i) <= (8*k-9)*e*2^(2*l+2*k+1/2-m).
199 // 9. Then an inverse FFT on z_i is done, which is the same as an FFT
200 // followed by a permutation and a division by 2^k. After n steps of
201 // the FFT, we have |z_i| < 2^(2*l+2*k+n) and err(z_i) <= E_i where
202 // E_0 = (8*k-9)*e*2^(2*l+2*k+1/2-m),
203 // E_(n+1) = (E_n + 2^(2*l+2*k+n)*(4*k-11)*e*2^(1/2-m)
204 // + 3*e*2^(2*l+2*k+n+1/2-m))
205 // + E_n + e*2^(2*l+2*k+n+3/2-m)
206 // = 2*E_n + (4*k-6)*e*2^(2*l+2*k+n+1/2-m)
207 // hence E_n = 2^n*(8*k-9)*e*2^(2*l+2*k+1/2-m)
208 // + (2^n-1)*(4*k-6)*e*2^(2*l+2*k+1/2-m).
209 // So, after the FFT, we have (set n=k) |z_i| < 2^(2*l+3*k) and
210 // err(z_i) <= (12*k-15)*e*2^(2*l+3*k+1/2-m).
211 // Permutation doesn't change the estimates. After division by 2^k, we get
212 // |z_i| < 2^(2*l+2*k) and
213 // err(z_i) <= (12*k-15)*e*2^(2*l+2*k+1/2-m).
214 // 10. When converting the z_i back to integers, we know that z_i should be
215 // real, integral, and |z_i| < 2^(2*l+k). We can only guarantee that we
216 // can find the integral z_i from the floating-point computation if
217 // (12*k-15)*e*2^(2*l+2*k+1/2-m) < 1/2.
218 // 11. Assuming e = 1/2 and m = 53 (typical values for IEEE double arithmetic),
219 // we get the constraint 2*l < m - 1/2 - 2*k - log_2(12*k-15).
239 // Assuming e = 1/2 and m = 64 ("long double" arithmetic on i387/i486/i586),
240 // we get the constraint 2*l < m - 1/2 - 2*k - log_2(12*k-15).
268 #error "complex symmetric fft implemented only for intDsize==32"
272 #include "cln/floatparam.h"
274 #include "cln/abort.h"
277 #if defined(HAVE_LONGDOUBLE) && (long_double_mant_bits > double_mant_bits) && (defined(__i386__) || defined(__m68k__) || (defined(__sparc__) && 0))
278 // Only these CPUs have fast "long double"s in hardware.
279 // On SPARC, "long double"s are emulated in software and don't work.
280 typedef long double fftcs_real;
281 #define fftcs_real_mant_bits long_double_mant_bits
282 #define fftcs_real_rounds long_double_rounds
284 typedef double fftcs_real;
285 #define fftcs_real_mant_bits double_mant_bits
286 #define fftcs_real_rounds double_rounds
289 typedef struct fftcs_complex {
294 static const fftcs_complex fftcs_roots_of_1 [32+1] =
295 // roots_of_1[n] is a (2^n)th root of unity in C.
296 // Also roots_of_1[n-1] = roots_of_1[n]^2.
297 // For simplicity we choose roots_of_1[n] = exp(2 pi i/2^n).
299 #if (fftcs_real_mant_bits == double_mant_bits)
300 // These values have 64 bit precision.
304 { 0.7071067811865475244, 0.7071067811865475244 },
305 { 0.9238795325112867561, 0.38268343236508977172 },
306 { 0.9807852804032304491, 0.19509032201612826784 },
307 { 0.99518472667219688623, 0.098017140329560601996 },
308 { 0.9987954562051723927, 0.049067674327418014254 },
309 { 0.9996988186962042201, 0.024541228522912288032 },
310 { 0.99992470183914454094, 0.0122715382857199260795 },
311 { 0.99998117528260114264, 0.0061358846491544753597 },
312 { 0.9999952938095761715, 0.00306795676296597627 },
313 { 0.99999882345170190993, 0.0015339801862847656123 },
314 { 0.99999970586288221914, 7.6699031874270452695e-4 },
315 { 0.99999992646571785114, 3.8349518757139558907e-4 },
316 { 0.9999999816164292938, 1.9174759731070330744e-4 },
317 { 0.9999999954041073129, 9.5873799095977345874e-5 },
318 { 0.99999999885102682754, 4.793689960306688455e-5 },
319 { 0.99999999971275670683, 2.3968449808418218729e-5 },
320 { 0.9999999999281891767, 1.1984224905069706422e-5 },
321 { 0.99999999998204729416, 5.9921124526424278428e-6 },
322 { 0.99999999999551182357, 2.9960562263346607504e-6 },
323 { 0.99999999999887795586, 1.4980281131690112288e-6 },
324 { 0.999999999999719489, 7.4901405658471572114e-7 },
325 { 0.99999999999992987223, 3.7450702829238412391e-7 },
326 { 0.99999999999998246807, 1.8725351414619534487e-7 },
327 { 0.999999999999995617, 9.36267570730980828e-8 },
328 { 0.99999999999999890425, 4.6813378536549092695e-8 },
329 { 0.9999999999999997261, 2.340668926827455276e-8 },
330 { 0.99999999999999993153, 1.1703344634137277181e-8 },
331 { 0.99999999999999998287, 5.8516723170686386908e-9 },
332 { 0.9999999999999999957, 2.925836158534319358e-9 },
333 { 0.9999999999999999989, 1.4629180792671596806e-9 }
334 #else (fftcs_real_mant_bits > double_mant_bits)
335 // These values have 128 bit precision.
339 { 0.707106781186547524400844362104849039284L, 0.707106781186547524400844362104849039284L },
340 { 0.923879532511286756128183189396788286823L, 0.38268343236508977172845998403039886676L },
341 { 0.980785280403230449126182236134239036975L, 0.195090322016128267848284868477022240928L },
342 { 0.995184726672196886244836953109479921574L, 0.098017140329560601994195563888641845861L },
343 { 0.998795456205172392714771604759100694444L, 0.0490676743274180142549549769426826583147L },
344 { 0.99969881869620422011576564966617219685L, 0.0245412285229122880317345294592829250654L },
345 { 0.99992470183914454092164649119638322435L, 0.01227153828571992607940826195100321214037L },
346 { 0.999981175282601142656990437728567716173L, 0.00613588464915447535964023459037258091705L },
347 { 0.999995293809576171511580125700119899554L, 0.00306795676296597627014536549091984251894L },
348 { 0.99999882345170190992902571017152601905L, 0.001533980186284765612303697150264079079954L },
349 { 0.999999705862882219160228217738765677117L, 7.66990318742704526938568357948576643142e-4L },
350 { 0.99999992646571785114473148070738785695L, 3.83495187571395589072461681181381263396e-4L },
351 { 0.999999981616429293808346915402909714504L, 1.91747597310703307439909561989000933469e-4L },
352 { 0.99999999540410731289097193313960614896L, 9.58737990959773458705172109764763511872e-5L },
353 { 0.9999999988510268275626733077945541084L, 4.79368996030668845490039904946588727468e-5L },
354 { 0.99999999971275670684941397221864177609L, 2.39684498084182187291865771650218200947e-5L },
355 { 0.999999999928189176709775095883850490262L, 1.198422490506970642152156159698898480473e-5L },
356 { 0.99999999998204729417728262414778410738L, 5.99211245264242784287971180889086172999e-6L },
357 { 0.99999999999551182354431058417299732444L, 2.99605622633466075045481280835705981183e-6L },
358 { 0.999999999998877955886077016551752536504L, 1.49802811316901122885427884615536112069e-6L },
359 { 0.999999999999719488971519214794719584451L, 7.49014056584715721130498566730655637157e-7L },
360 { 0.99999999999992987224287980123972873676L, 3.74507028292384123903169179084633177398e-7L },
361 { 0.99999999999998246806071995015624773673L, 1.8725351414619534486882457659356361712e-7L },
362 { 0.999999999999995617015179987529456656217L, 9.3626757073098082799067286680885620193e-8L },
363 { 0.999999999999998904253794996881763834182L, 4.68133785365490926951155181385400969594e-8L },
364 { 0.99999999999999972606344874922040343793L, 2.34066892682745527595054934190348440379e-8L },
365 { 0.999999999999999931515862187305098514444L, 1.170334463413727718124621350323810379807e-8L },
366 { 0.999999999999999982878965546826274482047L, 5.8516723170686386908097901008341396944e-9L },
367 { 0.999999999999999995719741386706568611352L, 2.92583615853431935792823046906895590202e-9L },
368 { 0.999999999999999998929935346676642152265L, 1.46291807926715968052953216186596371037e-9L }
372 // Define this for (cheap) consistency checks.
375 static fftcs_real fftcs_pow2_table[64] = // table of powers of 2
434 144115188075855872.0,
435 288230376151711744.0,
436 576460752303423488.0,
437 1152921504606846976.0,
438 2305843009213693952.0,
439 4611686018427387904.0,
440 9223372036854775808.0
443 // For a constant expression n (0 <= n < 128), returns 2^n of type fftcs_real.
444 #define fftcs_pow2(n) \
445 (((n) & 64 ? (fftcs_real)18446744073709551616.0 : (fftcs_real)1.0) \
446 * ((n) & 32 ? (fftcs_real)4294967296.0 : (fftcs_real)1.0) \
447 * ((n) & 16 ? (fftcs_real)65536.0 : (fftcs_real)1.0) \
448 * ((n) & 8 ? (fftcs_real)256.0 : (fftcs_real)1.0) \
449 * ((n) & 4 ? (fftcs_real)16.0 : (fftcs_real)1.0) \
450 * ((n) & 2 ? (fftcs_real)4.0 : (fftcs_real)1.0) \
451 * ((n) & 1 ? (fftcs_real)2.0 : (fftcs_real)1.0) \
455 static inline void mul (const fftcs_complex& a, const fftcs_complex& b, fftcs_complex& r)
457 var fftcs_real r_re = a.re * b.re - a.im * b.im;
458 var fftcs_real r_im = a.re * b.im + a.im * b.re;
463 static uintL shuffle (uintL n, uintL x)
467 // Invariant: y + v*shuffle(n,x).
471 return y + (v << (n-1));
477 } while (!(--n == 0));
482 static uintL invshuffle (uintL n, uintL x)
486 // Invariant: y + v*invshuffle(n,x).
488 if (x == ((uintL)1 << (n-1)))
490 else if (x > ((uintL)1 << (n-1))) {
491 x = ((uintL)1 << n) - x;
495 } while (!(--n == 0));
500 // Compute a real convolution using FFT: z[0..N-1] := x[0..N-1] * y[0..N-1].
501 static void fftcs_convolution (const uintL n, const uintL N, // N = 2^n
502 fftcs_real * x, // N numbers
503 fftcs_real * y, // N numbers
504 fftcs_real * z // N numbers result
508 var fftcs_complex* const w = cl_alloc_array(fftcs_complex,N>>2);
510 // Initialize w[i] to w^i, w a primitive N-th root of unity.
511 w[0] = fftcs_roots_of_1[0];
514 for (j = n-3; j>=0; j--) {
515 var fftcs_complex r_j = fftcs_roots_of_1[n-j];
517 for (i = (2<<j); i < N>>2; i += (2<<j))
518 mul(w[i],r_j, w[i+(1<<j)]);
521 var bool squaring = (x == y);
522 // Do an FFT of length N on x.
525 for (l = n-1; l > 0; l--) {
527 var const uintL tmax = (uintL)1 << l;
528 for (var uintL t = 0; t < tmax; t++) {
530 var uintL i2 = i1 + tmax;
531 // replace (x[i1],x[i2]) by
532 // (x[i1] + x[i2], x[i1] - x[i2])
539 var const uintL smax = (uintL)1 << (n-1-l);
540 var const uintL tmax = (uintL)1 << (l-1);
541 for (var uintL s = 1; s < smax; s++) {
542 var uintL exp = shuffle(n-1-l,s) << (l-1);
543 for (var uintL t = 0; t < tmax; t++) {
544 var uintL i1 = (s << (l+1)) + t;
545 var uintL i2 = i1 + tmax;
546 var uintL i3 = i2 + tmax;
547 var uintL i4 = i3 + tmax;
548 // replace (x[i1],x[i2],x[i3],x[i4]) by
549 // (x[i1] + x[i2]*Re(w^exp) - x[i4]*Im(w^exp),
550 // x[i3] + x[i4]*Re(w^exp) + x[i2]*Im(w^exp),
551 // x[i1] - x[i2]*Re(w^exp) + x[i4]*Im(w^exp),
552 // -x[i3] + x[i4]*Re(w^exp) + x[i2]*Im(w^exp))
557 diff = x[i2] * w[exp].re - x[i4] * w[exp].im;
558 sum = x[i4] * w[exp].re + x[i2] * w[exp].im;
569 // replace (x[0],x[1]) by (x[0]+x[1], x[0]-x[1])
576 // Do an FFT of length N on y.
579 for (l = n-1; l > 0; l--) {
581 var const uintL tmax = (uintL)1 << l;
582 for (var uintL t = 0; t < tmax; t++) {
584 var uintL i2 = i1 + tmax;
585 // replace (y[i1],y[i2]) by
586 // (y[i1] + y[i2], y[i1] - y[i2])
593 var const uintL smax = (uintL)1 << (n-1-l);
594 var const uintL tmax = (uintL)1 << (l-1);
595 for (var uintL s = 1; s < smax; s++) {
596 var uintL exp = shuffle(n-1-l,s) << (l-1);
597 for (var uintL t = 0; t < tmax; t++) {
598 var uintL i1 = (s << (l+1)) + t;
599 var uintL i2 = i1 + tmax;
600 var uintL i3 = i2 + tmax;
601 var uintL i4 = i3 + tmax;
602 // replace (y[i1],y[i2],y[i3],y[i4]) by
603 // (y[i1] + y[i2]*Re(w^exp) - y[i4]*Im(w^exp),
604 // y[i3] + y[i4]*Re(w^exp) + y[i2]*Im(w^exp),
605 // y[i1] - y[i2]*Re(w^exp) + y[i4]*Im(w^exp),
606 // -y[i3] + y[i4]*Re(w^exp) + y[i2]*Im(w^exp))
611 diff = y[i2] * w[exp].re - y[i4] * w[exp].im;
612 sum = y[i4] * w[exp].re + y[i2] * w[exp].im;
623 // replace (y[0],y[1]) by (y[0]+y[1], y[0]-y[1])
630 // Multiply the transformed vectors into z.
632 // Multiplication mod (z-1).
634 // Multiplication mod (z+1).
636 for (i = 2; i < N; i += 2)
637 // Multiplication mod (z - exp(2 pi i j/2^n)), j = shuffle(n-1,i/2).
638 mul(*(fftcs_complex*)&x[i],*(fftcs_complex*)&y[i], *(fftcs_complex*)&z[i]);
640 // Undo an FFT of length N on z.
644 // replace (z[0],z[1]) by ((z[0]+z[1])/2, (z[0]-z[1])/2)
647 z[1] = (z[0] - tmp) * (fftcs_real)0.5;
648 z[0] = (z[0] + tmp) * (fftcs_real)0.5;
650 for (l = 1; l < n; l++) {
652 var const uintL tmax = (uintL)1 << l;
653 for (var uintL t = 0; t < tmax; t++) {
655 var uintL i2 = i1 + tmax;
656 // replace (z[i1],z[i2]) by
657 // ((z[i1]+z[i2])/2, (z[i1]-z[i2])/2)
658 // Do the division by 2 later.
665 var const uintL smax = (uintL)1 << (n-1-l);
666 var const uintL tmax = (uintL)1 << (l-1);
667 for (var uintL s = 1; s < smax; s++) {
668 var uintL exp = shuffle(n-1-l,s) << (l-1);
669 for (var uintL t = 0; t < tmax; t++) {
670 var uintL i1 = (s << (l+1)) + t;
671 var uintL i2 = i1 + tmax;
672 var uintL i3 = i2 + tmax;
673 var uintL i4 = i3 + tmax;
674 // replace (z[i1],z[i2],z[i3],z[i4]) by
676 // (z[i1]-z[i3])/2*Re(w^exp)+(z[i2]+z[i4])/2*Im(w^exp),
678 // (z[i2]+z[i4])/2*Re(w^exp)-(z[i1]-z[i3])/2*Im(w^exp))
679 // Do the division by 2 later.
680 var fftcs_real diff13;
681 var fftcs_real sum24;
684 diff13 = z[i1] - z[i3];
685 sum24 = z[i2] + z[i4];
686 tmp1 = z[i1] + z[i3];
687 tmp3 = z[i2] - z[i4];
689 z[i2] = diff13 * w[exp].re + sum24 * w[exp].im;
691 z[i4] = sum24 * w[exp].re - diff13 * w[exp].im;
695 // Do all divisions by 2 now.
697 var fftcs_real f = (fftcs_real)2.0 / (fftcs_real)N; // 2^-(n-1)
698 for (i = 0; i < N; i++)
704 // For a given k >= 2, the maximum l is determined by
705 // 2*l < m - 1/2 - 2*k - log_2(12*k-15) - (1 if e=1.0, 0 if e=0.5).
706 // This is a decreasing function of k.
708 (int)((fftcs_real_mant_bits \
710 - ((k)<=2 ? 4 : (k)<=3 ? 5 : (k)<=5 ? 6 : (k)<=8 ? 7 : (k)<=16 ? 8 : (k)<=31 ? 9 : 10) \
711 - (fftcs_real_rounds == rounds_to_nearest ? 0 : 1)) \
713 static int max_l_table[32+1] =
714 { 0, 0, max_l(2), max_l(3), max_l(4), max_l(5), max_l(6),
715 max_l(7), max_l(8), max_l(9), max_l(10), max_l(11), max_l(12), max_l(13),
716 max_l(14), max_l(15), max_l(16), max_l(17), max_l(18), max_l(19), max_l(20),
717 max_l(21), max_l(22), max_l(23), max_l(24), max_l(25), max_l(26), max_l(27),
718 max_l(28), max_l(29), max_l(30), max_l(31), max_l(32)
721 // Split len uintD's below sourceptr into chunks of l bits, thus filling
722 // N real numbers at x.
723 static void fill_factor (uintL N, fftcs_real* x, uintL l,
724 const uintD* sourceptr, uintL len)
727 if (max_l(2) > intDsize && l > intDsize) {
729 if (max_l(2) > 64 && l > 64) {
730 fprint(stderr, "FFT problem: l > 64 not supported by pow2_table\n");
733 var fftcs_real carry = 0;
734 var sintL carrybits = 0; // number of bits in carry (>=0, <l)
737 var uintD digit = lsprefnext(sourceptr);
738 if (carrybits+intDsize >= l) {
739 x[i] = carry + (fftcs_real)(digit & bitm(l-carrybits)) * fftcs_pow2_table[carrybits];
741 carry = (l-carrybits == intDsize ? (fftcs_real)0 : (fftcs_real)(digit >> (l-carrybits)));
742 carrybits = carrybits+intDsize-l;
744 carry = carry + (fftcs_real)digit * fftcs_pow2_table[carrybits];
745 carrybits = carrybits+intDsize;
755 } else if (max_l(2) >= intDsize && l == intDsize) {
759 for (i = 0; i < len; i++) {
760 var uintD digit = lsprefnext(sourceptr);
761 x[i] = (fftcs_real)digit;
765 var const uintD l_mask = bit(l)-1;
767 var sintL carrybits = 0; // number of bits in carry (>=0, <intDsize)
768 for (i = 0; i < N; i++) {
769 if (carrybits >= l) {
770 x[i] = (fftcs_real)(carry & l_mask);
777 var uintD digit = lsprefnext(sourceptr);
778 x[i] = (fftcs_real)((carry | (digit << carrybits)) & l_mask);
779 carry = digit >> (l-carrybits);
780 carrybits = intDsize - (l-carrybits);
783 while (carrybits > 0) {
786 x[i] = (fftcs_real)(carry & l_mask);
795 x[i] = (fftcs_real)0;
798 // Given a not too large floating point number, round it to the nearest integer.
799 static inline fftcs_real fftcs_fround (fftcs_real x)
802 #if (fftcs_real_rounds == rounds_to_nearest)
803 (x + (fftcs_pow2(fftcs_real_mant_bits-1)+fftcs_pow2(fftcs_real_mant_bits-2)))
804 - (fftcs_pow2(fftcs_real_mant_bits-1)+fftcs_pow2(fftcs_real_mant_bits-2));
805 #elif (fftcs_real_rounds == rounds_to_infinity)
806 (fftcs_pow2(fftcs_real_mant_bits-1)+fftcs_pow2(fftcs_real_mant_bits-2))
807 - ((fftcs_pow2(fftcs_real_mant_bits-1)+fftcs_pow2(fftcs_real_mant_bits-2))
808 - (x + (fftcs_real)0.5));
809 #else // rounds_to_zero, rounds_to_minus_infinity
810 ((x + (fftcs_real)0.5)
811 + (fftcs_pow2(fftcs_real_mant_bits-1)+fftcs_pow2(fftcs_real_mant_bits-2)))
812 - (fftcs_pow2(fftcs_real_mant_bits-1)+fftcs_pow2(fftcs_real_mant_bits-2));
816 // Given a not too large floating point number, round it down.
817 static inline fftcs_real fftcs_ffloor (fftcs_real x)
819 #if (fftcs_real_rounds == rounds_to_nearest)
821 (x + (fftcs_pow2(fftcs_real_mant_bits-1)+fftcs_pow2(fftcs_real_mant_bits-2)))
822 - (fftcs_pow2(fftcs_real_mant_bits-1)+fftcs_pow2(fftcs_real_mant_bits-2));
826 return y - (fftcs_real)1.0;
827 #elif (fftcs_real_rounds == rounds_to_infinity)
829 (fftcs_pow2(fftcs_real_mant_bits-1)+fftcs_pow2(fftcs_real_mant_bits-2))
830 - ((fftcs_pow2(fftcs_real_mant_bits-1)+fftcs_pow2(fftcs_real_mant_bits-2))
832 #else // rounds_to_zero, rounds_to_minus_infinity
834 (x + (fftcs_pow2(fftcs_real_mant_bits-1)+fftcs_pow2(fftcs_real_mant_bits-2)))
835 - (fftcs_pow2(fftcs_real_mant_bits-1)+fftcs_pow2(fftcs_real_mant_bits-2));
839 // Combine the N real numbers at z into uintD's below destptr.
840 // The z[i] are known to be approximately integers >= 0, < N*2^(2*l).
841 // Assumes room for floor(N*l/intDsize)+(1+ceiling((n+2*l)/intDsize)) uintD's
842 // below destptr. Fills len digits and returns (destptr lspop len).
843 static uintD* unfill_product (uintL n, uintL N, // N = 2^n
844 const fftcs_real * z, uintL l,
848 if (n + 2*l <= intDsize) {
849 // 2-digit carry is sufficient, l < intDsize
850 var uintD carry0 = 0;
851 var uintD carry1 = 0;
852 var uintL shift = 0; // shift next digit before adding it to the carry, >=0, <intDsize
853 for (i = 0; i < N; i++) {
854 // Fetch z[i] and round it to the nearest uintD.
855 var uintD digit = (uintD)(z[i] + (fftcs_real)0.5);
857 carry1 += digit >> (intDsize-shift);
858 digit = digit << shift;
860 if ((carry0 += digit) < digit)
863 if (shift >= intDsize) {
864 lsprefnext(destptr) = carry0;
870 lsprefnext(destptr) = carry0;
871 lsprefnext(destptr) = carry1;
872 } else if (n + 2*l <= 2*intDsize) {
873 // 3-digit carry is sufficient, l < intDsize
875 var uintDD carry0 = 0;
876 var uintD carry1 = 0;
877 var uintL shift = 0; // shift next digit before adding it to the carry, >=0, <intDsize
878 for (i = 0; i < N; i++) {
879 // Fetch z[i] and round it to the nearest uintDD.
880 var uintDD digit = (uintDD)(z[i] + (fftcs_real)0.5);
882 carry1 += (uintD)(digit >> (2*intDsize-shift));
883 digit = digit << shift;
885 if ((carry0 += digit) < digit)
888 if (shift >= intDsize) {
889 lsprefnext(destptr) = lowD(carry0);
890 carry0 = highlowDD(carry1,highD(carry0));
895 lsprefnext(destptr) = lowD(carry0);
896 lsprefnext(destptr) = highD(carry0);
897 lsprefnext(destptr) = carry1;
899 var uintD carry0 = 0;
900 var uintD carry1 = 0;
901 var uintD carry2 = 0;
902 var uintL shift = 0; // shift next digit before adding it to the carry, >=0, <intDsize
903 for (i = 0; i < N; i++) {
904 // Fetch z[i] and round it to the nearest uintDD.
905 var fftcs_real zi = z[i] + (fftcs_real)0.5;
906 var const fftcs_real pow2_intDsize = fftcs_pow2(intDsize);
907 var const fftcs_real invpow2_intDsize = (fftcs_real)1.0/pow2_intDsize;
908 var uintD digit1 = (uintD)(zi * invpow2_intDsize);
909 var uintD digit0 = (uintD)(zi - (fftcs_real)digit1 * pow2_intDsize);
911 carry2 += digit1 >> (intDsize-shift);
912 digit1 = (digit1 << shift) | (digit0 >> (intDsize-shift));
913 digit0 = digit0 << shift;
915 if ((carry0 += digit0) < digit0)
916 if ((carry1 += 1) == 0)
918 if ((carry1 += digit1) < digit1)
921 if (shift >= intDsize) {
922 lsprefnext(destptr) = carry0;
929 lsprefnext(destptr) = carry0;
930 lsprefnext(destptr) = carry1;
931 lsprefnext(destptr) = carry2;
934 // 1-digit+1-float carry is sufficient
935 var uintD carry0 = 0;
936 var fftcs_real carry1 = 0;
937 var uintL shift = 0; // shift next digit before adding it to the carry, >=0, <intDsize
938 for (i = 0; i < N; i++) {
939 // Fetch z[i] and round it to the nearest integer.
940 var fftcs_real digit = fftcs_fround(z[i]);
942 if (!(digit >= (fftcs_real)0
943 && z[i] > digit - (fftcs_real)0.5
944 && z[i] < digit + (fftcs_real)0.5))
948 digit = digit * fftcs_pow2_table[shift];
949 var fftcs_real digit1 = fftcs_ffloor(digit*((fftcs_real)1.0/fftcs_pow2(intDsize)));
950 var uintD digit0 = (uintD)(digit - digit1*fftcs_pow2(intDsize));
952 if ((carry0 += digit0) < digit0)
953 carry1 += (fftcs_real)1.0;
955 while (shift >= intDsize) {
956 lsprefnext(destptr) = carry0;
957 var fftcs_real tmp = fftcs_ffloor(carry1*((fftcs_real)1.0/fftcs_pow2(intDsize)));
958 carry0 = (uintD)(carry1 - tmp*fftcs_pow2(intDsize));
963 if (carry0 > 0 || carry1 > (fftcs_real)0.0) {
964 lsprefnext(destptr) = carry0;
965 while (carry1 > (fftcs_real)0.0) {
966 var fftcs_real tmp = fftcs_ffloor(carry1*((fftcs_real)1.0/fftcs_pow2(intDsize)));
967 lsprefnext(destptr) = (uintD)(carry1 - tmp*fftcs_pow2(intDsize));
975 static inline void mulu_fftcs_nocheck (const uintD* sourceptr1, uintC len1,
976 const uintD* sourceptr2, uintC len2,
978 // Es ist 2 <= len1 <= len2.
980 // We have to find parameters l and k such that
981 // ceiling(len1*intDsize/l) + ceiling(len2*intDsize/l) - 1 <= 2^k,
982 // and (12*k-15)*e*2^(2*l+2*k+1/2-m) < 1/2.
983 // Try primarily to minimize k. Minimizing l buys you nothing.
985 // Computing k: If len1 and len2 differ much, we'll split source2 -
986 // hence for the moment just substitute len1 for len2.
988 // First approximation of k: A necessary condition for
989 // 2*ceiling(len1*intDsize/l) - 1 <= 2^k
990 // is 2*len1*intDsize/l_max - 1 <= 2^k.
992 var const int l = max_l(2);
993 var uintL lhs = 2*ceiling((uintL)len1*intDsize,l) - 1; // >=1
997 integerlength32(lhs-1, k=); // k>=2
999 // Try whether this k is ok or whether we have to increase k.
1001 if (k >= sizeof(max_l_table)/sizeof(max_l_table[0])
1002 || max_l_table[k] <= 0) {
1003 fprint(stderr, "FFT problem: numbers too big, floating point precision not sufficient\n");
1006 if (2*ceiling((uintL)len1*intDsize,max_l_table[k])-1 <= ((uintL)1 << k))
1009 // We could try to reduce l, keeping the same k. But why should we?
1010 // Calculate the number of pieces in which source2 will have to be
1011 // split. Each of the pieces must satisfy
1012 // ceiling(len1*intDsize/l) + ceiling(len2*intDsize/l) - 1 <= 2^k,
1014 // Try once with k, once with k+1. Compare them.
1016 var uintL remaining_k = ((uintL)1 << k) + 1 - ceiling((uintL)len1*intDsize,max_l_table[k]);
1017 var uintL max_piecelen_k = floor(remaining_k*max_l_table[k],intDsize);
1018 var uintL numpieces_k = ceiling(len2,max_piecelen_k);
1019 var uintL remaining_k1 = ((uintL)1 << (k+1)) + 1 - ceiling((uintL)len1*intDsize,max_l_table[k+1]);
1020 var uintL max_piecelen_k1 = floor(remaining_k1*max_l_table[k+1],intDsize);
1021 var uintL numpieces_k1 = ceiling(len2,max_piecelen_k1);
1022 if (numpieces_k <= 2*numpieces_k1) {
1024 len2p = max_piecelen_k;
1028 len2p = max_piecelen_k1;
1031 var const uintL l = max_l_table[k];
1032 var const uintL n = k;
1033 var const uintL N = (uintL)1 << n;
1035 var fftcs_real* const x = cl_alloc_array(fftcs_real,N);
1036 var fftcs_real* const y = cl_alloc_array(fftcs_real,N);
1038 var fftcs_real* const z = cl_alloc_array(fftcs_real,N);
1040 var fftcs_real* const z = x; // put z in place of x - saves memory
1042 var uintD* const tmpprod1 = cl_alloc_array(uintD,len1+1);
1043 var uintL tmpprod_len = floor(l<<n,intDsize)+6;
1044 var uintD* const tmpprod = cl_alloc_array(uintD,tmpprod_len);
1045 var uintL destlen = len1+len2;
1046 clear_loop_lsp(destptr,destlen);
1052 var uintD* tmpptr = arrayLSDptr(tmpprod1,len1+1);
1053 mulu_loop_lsp(lspref(sourceptr2,0),sourceptr1,tmpptr,len1);
1054 if (addto_loop_lsp(tmpptr,destptr,len1+1))
1055 if (inc_loop_lsp(destptr lspop (len1+1),destlen-(len1+1)))
1058 var bool squaring = ((sourceptr1 == sourceptr2) && (len1 == len2p));
1060 fill_factor(N,x,l,sourceptr1,len1);
1063 fill_factor(N,y,l,sourceptr2,len2p);
1066 fftcs_convolution(n,N, &x[0], &y[0], &z[0]);
1068 fftcs_convolution(n,N, &x[0], &x[0], &z[0]);
1072 var fftcs_real re_lo_limit = (fftcs_real)(-0.5);
1073 var fftcs_real re_hi_limit = (fftcs_real)N * fftcs_pow2_table[l] * fftcs_pow2_table[l] + (fftcs_real)0.5;
1074 for (var uintL i = 0; i < N; i++)
1075 if (!(z[i] > re_lo_limit
1076 && z[i] < re_hi_limit))
1080 var uintD* tmpLSDptr = arrayLSDptr(tmpprod,tmpprod_len);
1081 var uintD* tmpMSDptr = unfill_product(n,N,z,l,tmpLSDptr);
1083 #if CL_DS_BIG_ENDIAN_P
1084 tmpLSDptr - tmpMSDptr;
1086 tmpMSDptr - tmpLSDptr;
1088 if (tmplen > tmpprod_len)
1090 // Add result to destptr[-destlen..-1]:
1091 if (tmplen > destlen) {
1092 if (test_loop_msp(tmpMSDptr,tmplen-destlen))
1096 if (addto_loop_lsp(tmpLSDptr,destptr,tmplen))
1097 if (inc_loop_lsp(destptr lspop tmplen,destlen-tmplen))
1101 destptr = destptr lspop len2p;
1103 sourceptr2 = sourceptr2 lspop len2p;
1111 // Compute a checksum: number mod (2^intDsize-1).
1112 static uintD compute_checksum (const uintD* sourceptr, uintC len)
1114 var uintD tmp = ~(uintD)0; // -1-(sum mod 2^intDsize-1), always >0
1116 var uintD digit = lsprefnext(sourceptr);
1118 tmp -= digit; // subtract digit
1120 tmp -= digit+1; // subtract digit-(2^intDsize-1)
1121 } while (--len > 0);
1125 // Multiply two checksums modulo (2^intDsize-1).
1126 static inline uintD multiply_checksum (uintD checksum1, uintD checksum2)
1131 var uintDD cksum = muluD(checksum1,checksum2);
1132 cksum_hi = highD(cksum); checksum = lowD(cksum);
1134 muluD(checksum1,checksum2, cksum_hi =, checksum =);
1136 if ((checksum += cksum_hi) + 1 <= cksum_hi)
1143 static void mulu_fftcs (const uintD* sourceptr1, uintC len1,
1144 const uintD* sourceptr2, uintC len2,
1147 // Compute checksums of the arguments and multiply them.
1148 var uintD checksum1 = compute_checksum(sourceptr1,len1);
1149 var uintD checksum2 = compute_checksum(sourceptr2,len2);
1150 var uintD checksum = multiply_checksum(checksum1,checksum2);
1151 mulu_fftcs_nocheck(sourceptr1,len1,sourceptr2,len2,destptr);
1152 if (!(checksum == compute_checksum(destptr,len1+len2))) {
1153 fprint(stderr, "FFT problem: checksum error\n");