]> www.ginac.de Git - cln.git/blob - src/base/digitseq/cl_DS_mul_fftr.h
* Add table of contents in TeX output.
[cln.git] / src / base / digitseq / cl_DS_mul_fftr.h
1 // Fast integer multiplication using FFT over the complex numbers, modified
2 // to work with real numbers only in the implementation.
3 // [Donald Ervin Knuth: The Art of Computer Programming, Vol. II:
4 //  Seminumerical Algorithms, second edition. Section 4.3.3, p. 290-294.]
5 // Bruno Haible 6.5.1996, 24.-25.8.1996, 27.-30.8.1996
6
7 // FFT in the complex domain has the drawback that it needs careful round-off
8 // error analysis. But for CPUs with good floating-point performance it might
9 // nevertheless be better than FFT mod Z/mZ.
10
11 // The usual FFT(2^n) computes the values of a polynomial p(z) mod (z^(2^n)-1)
12 // at the (2^n)-th roots of unity. For our purposes, we start out with
13 // polynomials with real coefficients. So the values at  z_j = exp(2 pi i j/N)
14 // and  z_-j = exp(- 2 pi i j/N)  will be complex conjugates of each other,
15 // which means that there exists a polynomial r(z) of degree <= 1 with real
16 // coefficients such that  p(z_j) = r(z_j)  and  p(z_-j) = r(z_-j). This
17 // implies that  r(z)  is the remainder of p(z) divided by
18 // (z - z_j) (z - z_-j) = (z^2 - 2 cos(2 pi j/N) z + 1).
19 //
20 // Based on this insight, we replace the usual n FFT steps
21 //    for m = n...0:
22 //      (z^(2^n)-1) = prod(j=0..2^(n-m)-1, (z^(2^m) - exp(2 pi j/2^(n-m))))
23 // by
24 //    for m = n...1:
25 //      (z^(2^n)-1) = (z^(2^m)-1) * prod(j=1..2^(n-m)-1, factor_m[j](z))
26 //      where
27 //      factor_m[j](z) = prod(k mod 2^n with k == j or k == -j mod 2^(n-m+1),
28 //                            (z - exp(2 pi i k/N)) )
29 //                     = prod(k=0..2^(m-1)-1,
30 //                            (z - exp(2 pi i (j+2^(n-m+1)k)/N))
31 //                            (z - exp(- 2 pi i (j+2^(n-m+1)k)/N)) )
32 //                     = (z^(2^(m-1)) - exp(2 pi i j 2^(m-1)/N))
33 //                       (z^(2^(m-1)) - exp(- 2 pi i j 2^(m-1)/N))
34 //                     = (z^(2^(m-1)) - exp(2 pi i j/2^(n-m+1)))
35 //                       (z^(2^(m-1)) - exp(- 2 pi i j/2^(n-m+1)))
36 //                     = (z^(2^m) - 2 cos(2 pi j/2^(n-m+1)) z^(2^(m-1)) + 1).
37 // The factors and the input are real polynomials, hence all intermediate
38 // and final remainders will be real as well.
39 //
40 // The usual FFT algorithm
41 //    Input: polynomial p in x[0..2^n-1].
42 //    for l = n-1..0: // step m=l+1 -> m=l
43 //      for s in {0,..,2^(n-1-l)-1}:
44 //        exp := bit_reverse(n-1-l,s)*2^l,
45 //        // chinese remainder algorithm for (z^(2^(l+1)) - w^(2*exp)) =
46 //        // = (z^(2^l) - w^exp) * (z^(2^l) - w^(exp+2^(n-1))).
47 //        for t in {0,..,2^l-1}:
48 //          i1 := s*2^(l+1) + t, i2 := s*2^(l+1) + 2^l + t,
49 //          replace (x[i1],x[i2]) by (x[i1] + w^exp*x[i2], x[i1] - w^exp*x[i2])
50 //    Invariant:
51 //      for m = n..0:
52 //        for j in {0..2^(n-m)-1}:
53 //          p(z) mod (z^(2^m) - exp(2 pi i j/2^(n-m)))
54 //          in x[bit_reverse(n-m,j)*2^m .. bit_reverse(n-m,j)*2^m+2^m-1].
55 //    Output: p(z_j) in x[bit_reverse(n,j)].
56 // is thus replaced by the algorithm
57 //    Input: polynomial p in x[0..2^n-1].
58 //    for l = n-1..1: // step m=l+1 -> m=l
59 //      for s in {0}:
60 //        // chinese remainder algorithm for (z^(2^(l+1)) - 1) =
61 //        // = (z^(2^l) - 1) * (z^(2^l) + 1).
62 //        for t in {0,..,2^l-1}:
63 //          i1 := t, i2 := 2^l + t,
64 //          replace (x[i1],x[i2]) by (x[i1] + x[i2], x[i1] - x[i2])
65 //      for s in {1,..,2^(n-1-l)-1}:
66 //        exp := shuffle(n-1-l,s)*2^(l-1),
67 //        // chinese remainder algorithm for
68 //        //   (z^(2^(l+1)) - 2 cos(2 pi 2*exp/N) z^(2^l) + 1) =
69 //        //   =   (z^(2^l) - 2 cos(2 pi exp/N) z^(2^(l-1)) + 1)
70 //        //     * (z^(2^l) - 2 cos(2 pi (2^(n-1)-exp)/N) z^(2^(l-1)) + 1)
71 //        gam := 2 cos(2 pi exp/N),
72 //        for t in {0,..,2^(l-1)-1}:
73 //          i1 := s*2^(l+1) + t, i2 := s*2^(l+1) + 2^(l-1) + t,
74 //          i3 := s*2^(l+1) + 2^l + t, i4 := s*2^(l+1) + 2^l + 2^(l-1) + t,
75 //          replace (x[i1],x[i2],x[i3],x[i4]) by
76 //          (x[i1]-x[i3] - x[i4]*gam,   x[i3]*gam + x[i2]+x[i4]*(gam^2-1),
77 //           x[i1]-x[i3] + x[i4]*gam, - x[i3]*gam + x[i2]+x[i4]*(gam^2-1))
78 //    Invariant:
79 //      for m = n..1:
80 //        p(z) mod (z^(2^m) - 1) in x[0..2^m-1],
81 //        for j in {1,..,2^(n-m)-1}:
82 //          p(z) mod (z^(2^m) - 2 cos(2 pi j/2^(n-m+1)) z^(2^(m-1)) + 1)
83 //          in x[invshuffle(n-m,j)*2^m .. invshuffle(n-m,j)*2^m+2^m-1].
84 //    Output: p(z) mod (z^2 - 1) in x[0],x[1],
85 //            p(z) mod (z^2 - 2 cos(2 pi j/2^n) z + 1)  (0 < j < 2^(n-1))
86 //            in x[2*invshuffle(n-1,j)],x[2*invshuffle(n-1,j)+1].
87 //
88 // The shuffle function is defined like this:
89 // shuffle(n,j) defined for n >= 0, 0 < j < 2^n, yields 0 < shuffle(n,j) < 2^n.
90 // Definition by splitting off the least significant bit:
91 //   n = 0: void.
92 //   n > 0: shuffle(n,1) = 2^(n-1),
93 //   n > 0, 0 < j < 2^(n-1): shuffle(n,2*j) = shuffle(n-1,j),
94 //   n > 0, 0 < j < 2^(n-1): shuffle(n,2*j+1) = 2^n - shuffle(n-1,j).
95 // Its inverse function is defined like this:
96 // invshuffle(n,j) defined for n >= 0, 0 < j < 2^n, 0 < invshuffle(n,j) < 2^n.
97 // Definition by splitting off the most significant bit:
98 //   n = 0: void.
99 //   n > 0, 0 < j < 2^(n-1): invshuffle(n,j) = invshuffle(n-1,j)*2,
100 //   n > 0, j = 2^(n-1): invshuffle(n,j) = 1,
101 //   n > 0, 2^(n-1) < j < 2^n: invshuffle(n,j) = invshuffle(n-1,2^n-j)*2+1.
102 // Note that shuffle(n,.) and invshuffle(n,.) are _not_ the same permutation
103 // for n>=4.
104
105 // It is important to have precise round-off error estimates. Although
106 // (by laws of statistics) in the average the actual round-off error makes up
107 // only half of the bits provided for round-off protection, we cannot rely
108 // on this average behaviour, but have to produce correct results.
109 //
110 // Knuth's formula (42), p. 294, says:
111 // If we want to multiply l-bit words using an FFT(2^k), our floating point
112 // numbers shall have m >= 2(k+l) + k + log_2 k + 3.5 mantissa bits.
113 //
114 // Here is a more careful analysis, using absolute error estimates.
115 //
116 // 1. We assume floating point numbers with radix 2, with the properties:
117 //    (i)   Multiplication with 2^n and 2^-n is exact.
118 //    (ii)  Negation x -> -x is exact.
119 //    (iii) Addition: When adding x and y, with |x| <= 2^a, |y| <= 2^a,
120 //          the result |x+y| <= 2^(a+1) has an error <= e*2^(a+1-m).
121 //    (iv)  Multiplication: When multiplying x and y, with |x| <= 2^a,
122 //          |y| <= 2^b, the result |x*y| <= 2^(a+b) has an error <= e*2^(a+b-m).
123 //    (v)   Division: When dividing x by y, with |x| <= 2^a,
124 //          2^b <= |y| <= 2^(b+1), the result |x/y| <= 2^(a-b) has an error
125 //          <= e*2^(a-b-m).
126 //    Here e = 1 for a truncating arithmetic, but e = 1/2 for a rounding
127 //    arithmetic like IEEE single and double floats.
128 // 2. Let's introduce some notation: err(x) means |x'-x| where x is the
129 //    exact mathematical value and x' is its representation in the machine.
130 // 3. From 1. we get for real numbers x,y:
131 //    (i)   err(2^n*x) = 2^n * err(x),
132 //    (ii)  err(-x) = err(x),
133 //    (iii) |x| <= 2^a, |y| <= 2^a, then
134 //          err(x+y) <= err(x) + err(y) + e*2^(a+1-m),
135 //          [or ....    ............... + e*2^(a-m) if |x+y| <= 2^a].
136 //    (iv)  |x| <= 2^a, |y| <= 2^b, then
137 //          err(x*y) <= 2^a * err(y) + 2^b * err(x) + e*2^(a+b-m).
138 //    (v)   |x| <= 2^a, 2^b <= |y| <= 2^(b+1), then
139 //          err(x/y) <= 2^(-b) * err(x) + 2^(a-2b) * err(y) + e*2^(a-b-m).
140 // 4. Our complex arithmetic will be based on the formulas:
141 //    (i)   2^n*(x+iy) = (2^n*x)+i(2^n*y)
142 //    (ii)  -(x+iy) = (-x)+i(-y)
143 //    (iii) (x+iy)+(u+iv) = (x+u)+i(y+v)
144 //    (iv)  (x+iy)*(u+iv) = (x*u-y*v)+i(x*v+y*u)
145 //    The notation err(z) means |z'-z|, as above, with |.| being the usual
146 //    absolute value on complex numbers (_not_ the L^1 norm).
147 // 5. From 3. and 4. we get for complex numbers x,y:
148 //    (i)   err(2^n*x) = 2^n * err(x),
149 //    (ii)  err(-x) = err(x),
150 //    (iii) |x| <= 2^a, |y| <= 2^a, then
151 //          err(x+y) <= err(x) + err(y) + e*2^(a+3/2-m),
152 //    (iv)  |x| <= 2^a, |y| <= 2^b, then
153 //          err(x*y) <= 2^a * err(y) + 2^b * err(x) + 3*e*2^(a+b+1/2-m).
154 // 6. We start out with precomputed roots of unity:
155 //          |exp(2 pi i/2^n)| <= 1,
156 //          err(exp(2 pi i/2^n)) <= e*2^(1/2-m), (even err(..)=0 for n=0,1,2),
157 //    and compute  exp(2 pi i * j/2^k)  according to the binary digits of j.
158 //    This way, each root of unity will be a product of at most k precomputed
159 //    roots. If (j mod 2^(k-2)) has n bits, then  exp(2 pi i * j/2^k)  will
160 //    be computed using n factors, i.e. n-1 complex multiplications, and by
161 //    5.iv. we'll have
162 //          err(exp(2 pi i * j/2^k)) <= n*e*2^(1/2-m) + max(n-1,0)*3*e*2^(1/2-m)
163 //                                    = max(4*n-3,0)*e*2^(1/2-m).
164 //    Hence the maximum roots-of-unity error is (set n=k-2)
165 //          err(w^j) <= (4*k-11)*e*2^(1/2-m),
166 //    and the average roots-of-unity error is (set n=(k-2)/2)
167 //          < 2*(k-2)*e*2^(1/2-m).
168 // 7. We use precomputed values of gam = 2*cos(2*pi*j/2^k) (0 < j < 2^(k-2)),
169 //    12*2^-k <= |gam| <= 2,
170 //          err(gam) <= (4*k-11)*e*2^(3/2-m),
171 //    and
172 //          err(gam-1) <= (4*k-11)*e*2^(3/2-m),
173 //          err(gam+1) <= (4*k-9)*e*2^(3/2-m),
174 //          err(gam^2-1) <= (20*k-51)*e*2^(3/2-m),
175 //          err(1/gam) <= (4*k-10)*e*(2*k-9/2-m).
176 // 8. Now we start the FFT.
177 //    Before the first step, x_i are integral, |x_i| < 2^l and err(x_i) = 0.
178 //    After the first butterfly, which replaces (x(i1),x(i2)) by
179 //    (x(i1) + x(i2), x(i1) - x(i2)), we have |x_i| < 2^(l+1) and err(x_i) = 0.
180 //    Then, for each of the remaining k-2 steps, a butterfly replaces
181 //    (x[i1],x[i2],x[i3],x[i4]) by
182 //    (x[i1]-x[i3] - x[i4]*gam,   x[i3]*gam + x[i2]+x[i4]*(gam^2-1),
183 //     x[i1]-x[i3] + x[i4]*gam, - x[i3]*gam + x[i2]+x[i4]*(gam^2-1)).
184 //    Thus, after n steps we have |x_i| < 2^(l+3n) and err(x_i) <= E_i where
185 //          E_0 = 0,
186 //          E_1 = 0,
187 //          E_(n+1) = ...
188 //
189 // This is just too complicated. We use fftc's round-off error estimates.
190 // As a consequence, sometimes the algorithm bails out, saying
191 // "FFT problem: checksum error"!!!
192
193
194 #if !(intDsize==32)
195 #error "real fft implemented only for intDsize==32"
196 #endif
197
198
199 #include "cln/floatparam.h"
200 #include "cln/exception.h"
201
202
203 #if defined(HAVE_LONGDOUBLE) && (long_double_mant_bits > double_mant_bits) && (defined(__i386__) || defined(__m68k__) || (defined(__sparc__) && 0))
204 // Only these CPUs have fast "long double"s in hardware.
205 // On SPARC, "long double"s are emulated in software and don't work.
206 typedef long double fftr_real;
207 #define fftr_real_mant_bits long_double_mant_bits
208 #define fftr_real_rounds long_double_rounds
209 #else
210 typedef double fftr_real;
211 #define fftr_real_mant_bits double_mant_bits
212 #define fftr_real_rounds double_rounds
213 #endif
214
215 // We need complex numbers for precomputing the roots of unity.
216 typedef struct fftr_complex {
217         fftr_real re;
218         fftr_real im;
219 } fftr_complex;
220
221 // For every integer exp, we need to precompute  gam = 2 cos(2 pi exp/N).
222 typedef union {
223         fftr_complex c; // exp(2 pi i exp/N)
224         struct {
225                 fftr_real d; // gam = 2 cos(2 pi exp/N)
226                 fftr_real e; // gam^2-1
227                 fftr_real f; // 1/gam
228         } gam;
229 } fftr_cosinus;
230
231 static const fftr_complex fftr_roots_of_1 [32+1] =
232   // roots_of_1[n] is a (2^n)th root of unity in C.
233   // Also roots_of_1[n-1] = roots_of_1[n]^2.
234   // For simplicity we choose  roots_of_1[n] = exp(2 pi i/2^n).
235   {
236    #if (fftr_real_mant_bits == double_mant_bits)
237     // These values have 64 bit precision.
238     { 1.0,                    0.0 },
239     { -1.0,                   0.0 },
240     { 0.0,                    1.0 },
241     { 0.7071067811865475244,  0.7071067811865475244 },
242     { 0.9238795325112867561,  0.38268343236508977172 },
243     { 0.9807852804032304491,  0.19509032201612826784 },
244     { 0.99518472667219688623, 0.098017140329560601996 },
245     { 0.9987954562051723927,  0.049067674327418014254 },
246     { 0.9996988186962042201,  0.024541228522912288032 },
247     { 0.99992470183914454094, 0.0122715382857199260795 },
248     { 0.99998117528260114264, 0.0061358846491544753597 },
249     { 0.9999952938095761715,  0.00306795676296597627 },
250     { 0.99999882345170190993, 0.0015339801862847656123 },
251     { 0.99999970586288221914, 7.6699031874270452695e-4 },
252     { 0.99999992646571785114, 3.8349518757139558907e-4 },
253     { 0.9999999816164292938,  1.9174759731070330744e-4 },
254     { 0.9999999954041073129,  9.5873799095977345874e-5 },
255     { 0.99999999885102682754, 4.793689960306688455e-5 },
256     { 0.99999999971275670683, 2.3968449808418218729e-5 },
257     { 0.9999999999281891767,  1.1984224905069706422e-5 },
258     { 0.99999999998204729416, 5.9921124526424278428e-6 },
259     { 0.99999999999551182357, 2.9960562263346607504e-6 },
260     { 0.99999999999887795586, 1.4980281131690112288e-6 },
261     { 0.999999999999719489,   7.4901405658471572114e-7 },
262     { 0.99999999999992987223, 3.7450702829238412391e-7 },
263     { 0.99999999999998246807, 1.8725351414619534487e-7 },
264     { 0.999999999999995617,   9.36267570730980828e-8 },
265     { 0.99999999999999890425, 4.6813378536549092695e-8 },
266     { 0.9999999999999997261,  2.340668926827455276e-8 },
267     { 0.99999999999999993153, 1.1703344634137277181e-8 },
268     { 0.99999999999999998287, 5.8516723170686386908e-9 },
269     { 0.9999999999999999957,  2.925836158534319358e-9 },
270     { 0.9999999999999999989,  1.4629180792671596806e-9 }
271    #else (fftr_real_mant_bits > double_mant_bits)
272     // These values have 128 bit precision.
273     { 1.0L,                                       0.0L },
274     { -1.0L,                                      0.0L },
275     { 0.0L,                                       1.0L },
276     { 0.707106781186547524400844362104849039284L, 0.707106781186547524400844362104849039284L },
277     { 0.923879532511286756128183189396788286823L, 0.38268343236508977172845998403039886676L },
278     { 0.980785280403230449126182236134239036975L, 0.195090322016128267848284868477022240928L },
279     { 0.995184726672196886244836953109479921574L, 0.098017140329560601994195563888641845861L },
280     { 0.998795456205172392714771604759100694444L, 0.0490676743274180142549549769426826583147L },
281     { 0.99969881869620422011576564966617219685L,  0.0245412285229122880317345294592829250654L },
282     { 0.99992470183914454092164649119638322435L,  0.01227153828571992607940826195100321214037L },
283     { 0.999981175282601142656990437728567716173L, 0.00613588464915447535964023459037258091705L },
284     { 0.999995293809576171511580125700119899554L, 0.00306795676296597627014536549091984251894L },
285     { 0.99999882345170190992902571017152601905L,  0.001533980186284765612303697150264079079954L },
286     { 0.999999705862882219160228217738765677117L, 7.66990318742704526938568357948576643142e-4L },
287     { 0.99999992646571785114473148070738785695L,  3.83495187571395589072461681181381263396e-4L },
288     { 0.999999981616429293808346915402909714504L, 1.91747597310703307439909561989000933469e-4L },
289     { 0.99999999540410731289097193313960614896L,  9.58737990959773458705172109764763511872e-5L },
290     { 0.9999999988510268275626733077945541084L,   4.79368996030668845490039904946588727468e-5L },
291     { 0.99999999971275670684941397221864177609L,  2.39684498084182187291865771650218200947e-5L },
292     { 0.999999999928189176709775095883850490262L, 1.198422490506970642152156159698898480473e-5L },
293     { 0.99999999998204729417728262414778410738L,  5.99211245264242784287971180889086172999e-6L },
294     { 0.99999999999551182354431058417299732444L,  2.99605622633466075045481280835705981183e-6L },
295     { 0.999999999998877955886077016551752536504L, 1.49802811316901122885427884615536112069e-6L },
296     { 0.999999999999719488971519214794719584451L, 7.49014056584715721130498566730655637157e-7L },
297     { 0.99999999999992987224287980123972873676L,  3.74507028292384123903169179084633177398e-7L },
298     { 0.99999999999998246806071995015624773673L,  1.8725351414619534486882457659356361712e-7L },
299     { 0.999999999999995617015179987529456656217L, 9.3626757073098082799067286680885620193e-8L },
300     { 0.999999999999998904253794996881763834182L, 4.68133785365490926951155181385400969594e-8L },
301     { 0.99999999999999972606344874922040343793L,  2.34066892682745527595054934190348440379e-8L },
302     { 0.999999999999999931515862187305098514444L, 1.170334463413727718124621350323810379807e-8L },
303     { 0.999999999999999982878965546826274482047L, 5.8516723170686386908097901008341396944e-9L },
304     { 0.999999999999999995719741386706568611352L, 2.92583615853431935792823046906895590202e-9L },
305     { 0.999999999999999998929935346676642152265L, 1.46291807926715968052953216186596371037e-9L }
306    #endif
307   };
308
309 // Define this for (cheap) consistency checks.
310 #define DEBUG_FFTR
311
312 static fftr_real fftr_pow2_table[64] = // table of powers of 2
313   {
314                       1.0,
315                       2.0,
316                       4.0,
317                       8.0,
318                      16.0,
319                      32.0,
320                      64.0,
321                     128.0,
322                     256.0,
323                     512.0,
324                    1024.0,
325                    2048.0,
326                    4096.0,
327                    8192.0,
328                   16384.0,
329                   32768.0,
330                   65536.0,
331                  131072.0,
332                  262144.0,
333                  524288.0,
334                 1048576.0,
335                 2097152.0,
336                 4194304.0,
337                 8388608.0,
338                16777216.0,
339                33554432.0,
340                67108864.0,
341               134217728.0,
342               268435456.0,
343               536870912.0,
344              1073741824.0,
345              2147483648.0,
346              4294967296.0,
347              8589934592.0,
348             17179869184.0,
349             34359738368.0,
350             68719476736.0,
351            137438953472.0,
352            274877906944.0,
353            549755813888.0,
354           1099511627776.0,
355           2199023255552.0,
356           4398046511104.0,
357           8796093022208.0,
358          17592186044416.0,
359          35184372088832.0,
360          70368744177664.0,
361         140737488355328.0,
362         281474976710656.0,
363         562949953421312.0,
364        1125899906842624.0,
365        2251799813685248.0,
366        4503599627370496.0,
367        9007199254740992.0,
368       18014398509481984.0,
369       36028797018963968.0,
370       72057594037927936.0,
371      144115188075855872.0,
372      288230376151711744.0,
373      576460752303423488.0,
374     1152921504606846976.0,
375     2305843009213693952.0,
376     4611686018427387904.0,
377     9223372036854775808.0
378   };
379
380 // For a constant expression n (0 <= n < 128), returns 2^n of type fftr_real.
381 #define fftr_pow2(n)  \
382   (((n) & 64 ? (fftr_real)18446744073709551616.0 : (fftr_real)1.0)      \
383    * ((n) & 32 ? (fftr_real)4294967296.0 : (fftr_real)1.0)              \
384    * ((n) & 16 ? (fftr_real)65536.0 : (fftr_real)1.0)                   \
385    * ((n) & 8 ? (fftr_real)256.0 : (fftr_real)1.0)                      \
386    * ((n) & 4 ? (fftr_real)16.0 : (fftr_real)1.0)                       \
387    * ((n) & 2 ? (fftr_real)4.0 : (fftr_real)1.0)                        \
388    * ((n) & 1 ? (fftr_real)2.0 : (fftr_real)1.0)                        \
389   )
390
391 // r := a * b
392 static inline void mul (const fftr_complex& a, const fftr_complex& b, fftr_complex& r)
393 {
394         var fftr_real r_re = a.re * b.re - a.im * b.im;
395         var fftr_real r_im = a.re * b.im + a.im * b.re;
396         r.re = r_re;
397         r.im = r_im;
398 }
399
400 static uintC shuffle (uintL n, uintC x)
401 {
402         var uintC y = 0;
403         var sintC v = 1;
404         // Invariant: y + v*shuffle(n,x).
405         do {
406                 if (x & 1)
407                         if (x == 1)
408                                 return y + (v << (n-1));
409                         else {
410                                 y = y + (v << n);
411                                 v = -v;
412                         }
413                 x >>= 1;
414         } while (!(--n == 0));
415         throw runtime_exception();
416 }
417
418 #if 0 // unused
419 static uintC invshuffle (uintL n, uintC x)
420 {
421         var uintC y = 0;
422         var uintC v = 1;
423         // Invariant: y + v*invshuffle(n,x).
424         do {
425                 if (x == ((uintC)1 << (n-1)))
426                         return y + v;
427                 else if (x > ((uintC)1 << (n-1))) {
428                         x = ((uintC)1 << n) - x;
429                         y = y+v;
430                 }
431                 v <<= 1;
432         } while (!(--n == 0));
433         throw runtime_exception();
434 }
435 #endif
436
437 // Compute a real convolution using FFT: z[0..N-1] := x[0..N-1] * y[0..N-1].
438 static void fftr_convolution (const uintL n, const uintC N, // N = 2^n
439                               fftr_real * x, // N numbers
440                               fftr_real * y, // N numbers
441                               fftr_real * z  // N numbers result
442                              )
443 {
444         CL_ALLOCA_STACK;
445         var fftr_cosinus* const w = cl_alloc_array(fftr_cosinus,N>>2);
446         var uintC i;
447         // Initialize w[exp].c to exp(2 pi i exp/N).
448         w[0].c = fftr_roots_of_1[0];
449         {
450                 var int j;
451                 for (j = n-3; j>=0; j--) {
452                         var fftr_complex r_j = fftr_roots_of_1[n-j];
453                         w[1<<j].c = r_j;
454                         for (i = (2<<j); i < N>>2; i += (2<<j))
455                                 mul(w[i].c,r_j, w[i+(1<<j)].c);
456                 }
457         }
458         // Initialize w[exp] to 2 cos(2 pi exp/N).
459         for (i = 0; i < N>>2; i++) {
460                 var fftr_real gam = w[i].c.re * (fftr_real)2.0;
461                 w[i].gam.d = gam;
462                 w[i].gam.e = (gam - (fftr_real)1.0) * (gam + (fftr_real)1.0);
463                 w[i].gam.f = (fftr_real)1.0 / gam;
464         }
465         var bool squaring = (x == y);
466         // Do an FFT of length N on x.
467         {
468                 var uintL l;
469                 for (l = n-1; l > 0; l--) {
470                         /* s = 0 */ {
471                                 var const uintC tmax = (uintC)1 << l;
472                                 for (var uintC t = 0; t < tmax; t++) {
473                                         var uintC i1 = t;
474                                         var uintC i2 = i1 + tmax;
475                                         // replace (x[i1],x[i2]) by
476                                         // (x[i1] + x[i2], x[i1] - x[i2])
477                                         var fftr_real tmp;
478                                         tmp = x[i2];
479                                         x[i2] = x[i1] - tmp;
480                                         x[i1] = x[i1] + tmp;
481                                 }
482                         }
483                         var const uintC smax = (uintC)1 << (n-1-l);
484                         var const uintC tmax = (uintC)1 << (l-1);
485                         for (var uintC s = 1; s < smax; s++) {
486                                 var uintC exp = shuffle(n-1-l,s) << (l-1);
487                                 for (var uintC t = 0; t < tmax; t++) {
488                                         var uintC i1 = (s << (l+1)) + t;
489                                         var uintC i2 = i1 + tmax;
490                                         var uintC i3 = i2 + tmax;
491                                         var uintC i4 = i3 + tmax;
492                                         // replace (x[i1],x[i2],x[i3],x[i4]) by
493                                         // (x[i1]-x[i3] - x[i4]*gam,
494                                         //  x[i3]*gam + x[i2]+x[i4]*(gam^2-1),
495                                         //  x[i1]-x[i3] + x[i4]*gam,
496                                         //  - x[i3]*gam + x[i2]+x[i4]*(gam^2-1))
497                                         var fftr_real sum13;
498                                         var fftr_real sum24;
499                                         var fftr_real tmp3;
500                                         var fftr_real tmp4;
501                                         sum13 = x[i1] - x[i3];
502                                         tmp3 = x[i3]*w[exp].gam.d;
503                                         tmp4 = x[i4]*w[exp].gam.d;
504                                         sum24 = x[i2]+x[i4]*w[exp].gam.e;
505                                         x[i1] = sum13 - tmp4;
506                                         x[i3] = sum13 + tmp4;
507                                         x[i2] = sum24 + tmp3;
508                                         x[i4] = sum24 - tmp3;
509                                 }
510                         }
511                 }
512                 /* l = 0 */ {
513                         // replace (x[0],x[1]) by (x[0]+x[1], x[0]-x[1])
514                         var fftr_real tmp;
515                         tmp = x[1];
516                         x[1] = x[0] - tmp;
517                         x[0] = x[0] + tmp;
518                 }
519         }
520         // Do an FFT of length N on y.
521         if (!squaring) {
522                 var uintL l;
523                 for (l = n-1; l > 0; l--) {
524                         /* s = 0 */ {
525                                 var const uintC tmax = (uintC)1 << l;
526                                 for (var uintC t = 0; t < tmax; t++) {
527                                         var uintC i1 = t;
528                                         var uintC i2 = i1 + tmax;
529                                         // replace (y[i1],y[i2]) by
530                                         // (y[i1] + y[i2], y[i1] - y[i2])
531                                         var fftr_real tmp;
532                                         tmp = y[i2];
533                                         y[i2] = y[i1] - tmp;
534                                         y[i1] = y[i1] + tmp;
535                                 }
536                         }
537                         var const uintC smax = (uintC)1 << (n-1-l);
538                         var const uintC tmax = (uintC)1 << (l-1);
539                         for (var uintC s = 1; s < smax; s++) {
540                                 var uintC exp = shuffle(n-1-l,s) << (l-1);
541                                 for (var uintC t = 0; t < tmax; t++) {
542                                         var uintC i1 = (s << (l+1)) + t;
543                                         var uintC i2 = i1 + tmax;
544                                         var uintC i3 = i2 + tmax;
545                                         var uintC i4 = i3 + tmax;
546                                         // replace (y[i1],y[i2],y[i3],y[i4]) by
547                                         // (y[i1]-y[i3] - y[i4]*gam,
548                                         //  y[i3]*gam + y[i2]+y[i4]*(gam^2-1),
549                                         //  y[i1]-y[i3] + y[i4]*gam,
550                                         //  - y[i3]*gam + y[i2]+y[i4]*(gam^2-1))
551                                         var fftr_real sum13;
552                                         var fftr_real sum24;
553                                         var fftr_real tmp3;
554                                         var fftr_real tmp4;
555                                         sum13 = y[i1] - y[i3];
556                                         tmp3 = y[i3]*w[exp].gam.d;
557                                         tmp4 = y[i4]*w[exp].gam.d;
558                                         sum24 = y[i2]+y[i4]*w[exp].gam.e;
559                                         y[i1] = sum13 - tmp4;
560                                         y[i3] = sum13 + tmp4;
561                                         y[i2] = sum24 + tmp3;
562                                         y[i4] = sum24 - tmp3;
563                                 }
564                         }
565                 }
566                 /* l = 0 */ {
567                         // replace (y[0],y[1]) by (y[0]+y[1], y[0]-y[1])
568                         var fftr_real tmp;
569                         tmp = y[1];
570                         y[1] = y[0] - tmp;
571                         y[0] = y[0] + tmp;
572                 }
573         }
574         // Multiply the transformed vectors into z.
575         {
576                 // Multiplication mod (z-1).
577                 z[0] = x[0] * y[0];
578                 // Multiplication mod (z+1).
579                 z[1] = x[1] * y[1];
580                 #if 0 // This needs w[1..2^(n-1)-1].
581                 var const uintC smax = (uintC)1 << (n-1);
582                 for (var uintC s = 1; s < smax; s++) {
583                         // Multiplication mod (z^2 - 2 cos(2 pi j/2^n) z + 1)
584                         // with j = shuffle(n-1,s).
585                         var uintC exp = shuffle(n-1,s);
586                         var fftr_real tmp0 = x[2*s] * y[2*s];
587                         var fftr_real tmp1 = x[2*s] * y[2*s+1] + x[2*s+1] * y[2*s];
588                         var fftr_real tmp2 = x[2*s+1] * y[2*s+1];
589                         z[2*s] = tmp0 - tmp2;
590                         z[2*s+1] = tmp1 + tmp2*w[exp].gam.d;
591                 }
592                 #else // This only needs w[1..2^(n-2)-1].
593                 // Multiplication mod (z^2+1).
594                 /* s = 1 */ {
595                         var fftr_real tmp0 = x[2] * y[2];
596                         var fftr_real tmp1 = x[2] * y[3] + x[3] * y[2];
597                         var fftr_real tmp2 = x[3] * y[3];
598                         z[2] = tmp0 - tmp2;
599                         z[3] = tmp1;
600                 }
601                 var const uintC smax = (uintC)1 << (n-2);
602                 for (var uintC s = 1; s < smax; s++) {
603                         var uintC exp = shuffle(n-2,s);
604                         // Multiplication mod (z^2 - 2 cos(2 pi j/2^n) z + 1)
605                         // with j = shuffle(n-1,2*s) = shuffle(n-2,s).
606                         var fftr_real gam = w[exp].gam.d;
607                         {
608                                 var fftr_real tmp0 = x[4*s] * y[4*s];
609                                 var fftr_real tmp1 = x[4*s] * y[4*s+1] + x[4*s+1] * y[4*s];
610                                 var fftr_real tmp2 = x[4*s+1] * y[4*s+1];
611                                 z[4*s] = tmp0 - tmp2;
612                                 z[4*s+1] = tmp1 + tmp2 * gam;
613                         }
614                         // Multiplication mod (z^2 - 2 cos(2 pi j/2^n) z + 1)
615                         // with j = shuffle(n-1,2*s+1) = 2^(n-1) - shuffle(n-2,s).
616                         {
617                                 var fftr_real tmp0 = x[4*s+2] * y[4*s+2];
618                                 var fftr_real tmp1 = x[4*s+2] * y[4*s+3] + x[4*s+3] * y[4*s+2];
619                                 var fftr_real tmp2 = x[4*s+3] * y[4*s+3];
620                                 z[4*s+2] = tmp0 - tmp2;
621                                 z[4*s+3] = tmp1 - tmp2 * gam;
622                         }
623                 }
624                 #endif
625         }
626         // Undo an FFT of length N on z.
627         {
628                 var uintL l;
629                 /* l = 0 */ {
630                         // replace (z[0],z[1]) by ((z[0]+z[1])/2, (z[0]-z[1])/2)
631                         var fftr_real tmp;
632                         tmp = z[1];
633                         z[1] = (z[0] - tmp) * (fftr_real)0.5;
634                         z[0] = (z[0] + tmp) * (fftr_real)0.5;
635                 }
636                 for (l = 1; l < n; l++) {
637                         /* s = 0 */ {
638                                 var const uintC tmax = (uintC)1 << l;
639                                 for (var uintC t = 0; t < tmax; t++) {
640                                         var uintC i1 = t;
641                                         var uintC i2 = i1 + tmax;
642                                         // replace (z[i1],z[i2]) by
643                                         // ((z[i1]+z[i2])/2, (z[i1]-z[i2])/2)
644                                         // Do the division by 2 later.
645                                         var fftr_real tmp;
646                                         tmp = z[i2];
647                                         z[i2] = z[i1] - tmp;
648                                         z[i1] = z[i1] + tmp;
649                                 }
650                         }
651                         var const uintC smax = (uintC)1 << (n-1-l);
652                         var const uintC tmax = (uintC)1 << (l-1);
653                         for (var uintC s = 1; s < smax; s++) {
654                                 var uintC exp = shuffle(n-1-l,s) << (l-1);
655                                 for (var uintC t = 0; t < tmax; t++) {
656                                         var uintC i1 = (s << (l+1)) + t;
657                                         var uintC i2 = i1 + tmax;
658                                         var uintC i3 = i2 + tmax;
659                                         var uintC i4 = i3 + tmax;
660                                         // replace (z[i1],z[i2],z[i3],z[i4]) by
661                                         // ((z[i1]+z[i3]+(z[i2]-z[i4])/gam)/2,
662                                         //  (z[i2]+z[i4]-(z[i3]-z[i1])/gam*(gam^2-1))/2,
663                                         //  (z[i2]-z[i4])/(gam*2),
664                                         //  (z[i3]-z[i1])/(gam*2))
665                                         // Do the division by 2 later.
666                                         var fftr_real sum13;
667                                         var fftr_real sum24;
668                                         var fftr_real tmp3;
669                                         var fftr_real tmp4;
670                                         sum13 = z[i1] + z[i3];
671                                         sum24 = z[i2] + z[i4];
672                                         tmp4 = (z[i3] - z[i1]) * w[exp].gam.f;
673                                         tmp3 = (z[i2] - z[i4]) * w[exp].gam.f;
674                                         z[i1] = sum13 + tmp3;
675                                         z[i2] = sum24 - tmp4*w[exp].gam.e;
676                                         z[i3] = tmp3;
677                                         z[i4] = tmp4;
678                                 }
679                         }
680                 }
681                 // Do all divisions by 2 now.
682                 {
683                         var fftr_real f = (fftr_real)2.0 / (fftr_real)N; // 2^-(n-1)
684                         for (i = 0; i < N; i++)
685                                 z[i] = z[i]*f;
686                 }
687         }
688 }
689
690 // For a given k >= 2, the maximum l is determined by
691 // 2*l < m - 1/2 - 2*k - log_2(12*k-15) - (1 if e=1.0, 0 if e=0.5).
692 // This is a decreasing function of k.
693 #define max_l(k)  \
694         (int)((fftr_real_mant_bits                      \
695                - 2*(k)                                  \
696                - ((k)<=2 ? 4 : (k)<=3 ? 5 : (k)<=5 ? 6 : (k)<=8 ? 7 : (k)<=16 ? 8 : (k)<=31 ? 9 : 10) \
697                - (fftr_real_rounds == rounds_to_nearest ? 0 : 1)) \
698               / 2)
699 static int max_l_table[32+1] =
700   {         0,         0,  max_l(2),  max_l(3),  max_l(4),  max_l(5),  max_l(6),
701      max_l(7),  max_l(8),  max_l(9), max_l(10), max_l(11), max_l(12), max_l(13),
702     max_l(14), max_l(15), max_l(16), max_l(17), max_l(18), max_l(19), max_l(20),
703     max_l(21), max_l(22), max_l(23), max_l(24), max_l(25), max_l(26), max_l(27),
704     max_l(28), max_l(29), max_l(30), max_l(31), max_l(32)
705   };
706
707 // Split len uintD's below sourceptr into chunks of l bits, thus filling
708 // N real numbers at x.
709 static void fill_factor (uintC N, fftr_real* x, uintL l,
710                          const uintD* sourceptr, uintC len)
711 {
712         var uintC i;
713         if (max_l(2) > intDsize && l > intDsize) {
714                 // l > intDsize
715                 if (max_l(2) > 64 && l > 64) {
716                         throw runtime_exception("FFT problem: l > 64 not supported by pow2_table");
717                 }
718                 var fftr_real carry = 0;
719                 var sintL carrybits = 0; // number of bits in carry (>=0, <l)
720                 i = 0;
721                 while (len > 0) {
722                         var uintD digit = lsprefnext(sourceptr);
723                         if (carrybits+intDsize >= l) {
724                                 x[i] = carry + (fftr_real)(digit & bitm(l-carrybits)) * fftr_pow2_table[carrybits];
725                                 i++;
726                                 carry = (l-carrybits == intDsize ? (fftr_real)0 : (fftr_real)(digit >> (l-carrybits)));
727                                 carrybits = carrybits+intDsize-l;
728                         } else {
729                                 carry = carry + (fftr_real)digit * fftr_pow2_table[carrybits];
730                                 carrybits = carrybits+intDsize;
731                         }
732                         len--;
733                 }
734                 if (carrybits > 0) {
735                         x[i] = carry;
736                         i++;
737                 }
738                 if (i > N)
739                         throw runtime_exception();
740         } else if (max_l(2) >= intDsize && l == intDsize) {
741                 // l = intDsize
742                 if (len > N)
743                         throw runtime_exception();
744                 for (i = 0; i < len; i++) {
745                         var uintD digit = lsprefnext(sourceptr);
746                         x[i] = (fftr_real)digit;
747                 }
748         } else {
749                 // l < intDsize
750                 var const uintD l_mask = bit(l)-1;
751                 var uintD carry = 0;
752                 var sintL carrybits = 0; // number of bits in carry (>=0, <intDsize)
753                 for (i = 0; i < N; i++) {
754                         if (carrybits >= l) {
755                                 x[i] = (fftr_real)(carry & l_mask);
756                                 carry >>= l;
757                                 carrybits -= l;
758                         } else {
759                                 if (len == 0)
760                                         break;
761                                 len--;
762                                 var uintD digit = lsprefnext(sourceptr);
763                                 x[i] = (fftr_real)((carry | (digit << carrybits)) & l_mask);
764                                 carry = digit >> (l-carrybits);
765                                 carrybits = intDsize - (l-carrybits);
766                         }
767                 }
768                 while (carrybits > 0) {
769                         if (!(i < N))
770                                 throw runtime_exception();
771                         x[i] = (fftr_real)(carry & l_mask);
772                         carry >>= l;
773                         carrybits -= l;
774                         i++;
775                 }
776                 if (len > 0)
777                         throw runtime_exception();
778         }
779         for ( ; i < N; i++)
780                 x[i] = (fftr_real)0;
781 }
782
783 // Given a not too large floating point number, round it to the nearest integer.
784 static inline fftr_real fftr_fround (fftr_real x)
785 {
786         return
787           #if (fftr_real_rounds == rounds_to_nearest)
788             (x + (fftr_pow2(fftr_real_mant_bits-1)+fftr_pow2(fftr_real_mant_bits-2)))
789             - (fftr_pow2(fftr_real_mant_bits-1)+fftr_pow2(fftr_real_mant_bits-2));
790           #elif (fftr_real_rounds == rounds_to_infinity)
791             (fftr_pow2(fftr_real_mant_bits-1)+fftr_pow2(fftr_real_mant_bits-2))
792             - ((fftr_pow2(fftr_real_mant_bits-1)+fftr_pow2(fftr_real_mant_bits-2))
793                - (x + (fftr_real)0.5));
794           #else // rounds_to_zero, rounds_to_minus_infinity
795             ((x + (fftr_real)0.5)
796              + (fftr_pow2(fftr_real_mant_bits-1)+fftr_pow2(fftr_real_mant_bits-2)))
797             - (fftr_pow2(fftr_real_mant_bits-1)+fftr_pow2(fftr_real_mant_bits-2));
798           #endif
799 }
800
801 // Given a not too large floating point number, round it down.
802 static inline fftr_real fftr_ffloor (fftr_real x)
803 {
804         #if (fftr_real_rounds == rounds_to_nearest)
805           var fftr_real y =
806             (x + (fftr_pow2(fftr_real_mant_bits-1)+fftr_pow2(fftr_real_mant_bits-2)))
807             - (fftr_pow2(fftr_real_mant_bits-1)+fftr_pow2(fftr_real_mant_bits-2));
808           if (y <= x)
809                 return y;
810           else
811                 return y - (fftr_real)1.0;
812         #elif (fftr_real_rounds == rounds_to_infinity)
813           return
814             (fftr_pow2(fftr_real_mant_bits-1)+fftr_pow2(fftr_real_mant_bits-2))
815             - ((fftr_pow2(fftr_real_mant_bits-1)+fftr_pow2(fftr_real_mant_bits-2))
816                - x);
817         #else // rounds_to_zero, rounds_to_minus_infinity
818           return
819             (x + (fftr_pow2(fftr_real_mant_bits-1)+fftr_pow2(fftr_real_mant_bits-2)))
820             - (fftr_pow2(fftr_real_mant_bits-1)+fftr_pow2(fftr_real_mant_bits-2));
821         #endif
822 }
823
824 // Combine the N real numbers at z into uintD's below destptr.
825 // The z[i] are known to be approximately integers >= 0, < N*2^(2*l).
826 // Assumes room for floor(N*l/intDsize)+(1+ceiling((n+2*l)/intDsize)) uintD's
827 // below destptr. Fills len digits and returns (destptr lspop len).
828 static uintD* unfill_product (uintL n, uintC N, // N = 2^n
829                               const fftr_real * z, uintL l,
830                               uintD* destptr)
831 {
832         var uintC i;
833         if (n + 2*l <= intDsize) {
834                 // 2-digit carry is sufficient, l < intDsize
835                 var uintD carry0 = 0;
836                 var uintD carry1 = 0;
837                 var uintL shift = 0; // shift next digit before adding it to the carry, >=0, <intDsize
838                 for (i = 0; i < N; i++) {
839                         // Fetch z[i] and round it to the nearest uintD.
840                         var uintD digit = (uintD)(z[i] + (fftr_real)0.5);
841                         if (shift > 0) {
842                                 carry1 += digit >> (intDsize-shift);
843                                 digit = digit << shift;
844                         }
845                         if ((carry0 += digit) < digit)
846                                 carry1 += 1;
847                         shift += l;
848                         if (shift >= intDsize) {
849                                 lsprefnext(destptr) = carry0;
850                                 carry0 = carry1;
851                                 carry1 = 0;
852                                 shift -= intDsize;
853                         }
854                 }
855                 lsprefnext(destptr) = carry0;
856                 lsprefnext(destptr) = carry1;
857         } else if (n + 2*l <= 2*intDsize) {
858                 // 3-digit carry is sufficient, l < intDsize
859                 #if HAVE_DD
860                 var uintDD carry0 = 0;
861                 var uintD carry1 = 0;
862                 var uintL shift = 0; // shift next digit before adding it to the carry, >=0, <intDsize
863                 for (i = 0; i < N; i++) {
864                         // Fetch z[i] and round it to the nearest uintDD.
865                         var uintDD digit = (uintDD)(z[i] + (fftr_real)0.5);
866                         if (shift > 0) {
867                                 carry1 += (uintD)(digit >> (2*intDsize-shift));
868                                 digit = digit << shift;
869                         }
870                         if ((carry0 += digit) < digit)
871                                 carry1 += 1;
872                         shift += l;
873                         if (shift >= intDsize) {
874                                 lsprefnext(destptr) = lowD(carry0);
875                                 carry0 = highlowDD(carry1,highD(carry0));
876                                 carry1 = 0;
877                                 shift -= intDsize;
878                         }
879                 }
880                 lsprefnext(destptr) = lowD(carry0);
881                 lsprefnext(destptr) = highD(carry0);
882                 lsprefnext(destptr) = carry1;
883                 #else
884                 var uintD carry0 = 0;
885                 var uintD carry1 = 0;
886                 var uintD carry2 = 0;
887                 var uintL shift = 0; // shift next digit before adding it to the carry, >=0, <intDsize
888                 for (i = 0; i < N; i++) {
889                         // Fetch z[i] and round it to the nearest uintDD.
890                         var fftr_real zi = z[i] + (fftr_real)0.5;
891                         var const fftr_real pow2_intDsize = fftr_pow2(intDsize);
892                         var const fftr_real invpow2_intDsize = (fftr_real)1.0/pow2_intDsize;
893                         var uintD digit1 = (uintD)(zi * invpow2_intDsize);
894                         var uintD digit0 = (uintD)(zi - (fftr_real)digit1 * pow2_intDsize);
895                         if (shift > 0) {
896                                 carry2 += digit1 >> (intDsize-shift);
897                                 digit1 = (digit1 << shift) | (digit0 >> (intDsize-shift));
898                                 digit0 = digit0 << shift;
899                         }
900                         if ((carry0 += digit0) < digit0)
901                                 if ((carry1 += 1) == 0)
902                                         carry2 += 1;
903                         if ((carry1 += digit1) < digit1)
904                                 carry2 += 1;
905                         shift += l;
906                         if (shift >= intDsize) {
907                                 lsprefnext(destptr) = carry0;
908                                 carry0 = carry1;
909                                 carry1 = carry2;
910                                 carry2 = 0;
911                                 shift -= intDsize;
912                         }
913                 }
914                 lsprefnext(destptr) = carry0;
915                 lsprefnext(destptr) = carry1;
916                 lsprefnext(destptr) = carry2;
917                 #endif
918         } else {
919                 // 1-digit+1-float carry is sufficient
920                 var uintD carry0 = 0;
921                 var fftr_real carry1 = 0;
922                 var uintL shift = 0; // shift next digit before adding it to the carry, >=0, <intDsize
923                 for (i = 0; i < N; i++) {
924                         // Fetch z[i] and round it to the nearest integer.
925                         var fftr_real digit = fftr_fround(z[i]);
926                         #ifdef DEBUG_FFTR
927                         if (!(digit >= (fftr_real)0
928                               && z[i] > digit - (fftr_real)0.5
929                               && z[i] < digit + (fftr_real)0.5))
930                                 throw runtime_exception();
931                         #endif
932                         if (shift > 0)
933                                 digit = digit * fftr_pow2_table[shift];
934                         var fftr_real digit1 = fftr_ffloor(digit*((fftr_real)1.0/fftr_pow2(intDsize)));
935                         var uintD digit0 = (uintD)(digit - digit1*fftr_pow2(intDsize));
936                         carry1 += digit1;
937                         if ((carry0 += digit0) < digit0)
938                                 carry1 += (fftr_real)1.0;
939                         shift += l;
940                         while (shift >= intDsize) {
941                                 lsprefnext(destptr) = carry0;
942                                 var fftr_real tmp = fftr_ffloor(carry1*((fftr_real)1.0/fftr_pow2(intDsize)));
943                                 carry0 = (uintD)(carry1 - tmp*fftr_pow2(intDsize));
944                                 carry1 = tmp;
945                                 shift -= intDsize;
946                         }
947                 }
948                 if (carry0 > 0 || carry1 > (fftr_real)0.0) {
949                         lsprefnext(destptr) = carry0;
950                         while (carry1 > (fftr_real)0.0) {
951                                 var fftr_real tmp = fftr_ffloor(carry1*((fftr_real)1.0/fftr_pow2(intDsize)));
952                                 lsprefnext(destptr) = (uintD)(carry1 - tmp*fftr_pow2(intDsize));
953                                 carry1 = tmp;
954                         }
955                 }
956         }
957         return destptr;
958 }
959
960 static inline void mulu_fftr_nocheck (const uintD* sourceptr1, uintC len1,
961                                       const uintD* sourceptr2, uintC len2,
962                                       uintD* destptr)
963 // Es ist 2 <= len1 <= len2.
964 {
965         // We have to find parameters l and k such that
966         // ceiling(len1*intDsize/l) + ceiling(len2*intDsize/l) - 1 <= 2^k,
967         // and (12*k-15)*e*2^(2*l+2*k+1/2-m) < 1/2.
968         // Try primarily to minimize k. Minimizing l buys you nothing.
969         var uintL k;
970         // Computing k: If len1 and len2 differ much, we'll split source2 -
971         // hence for the moment just substitute len1 for len2.
972         //
973         // First approximation of k: A necessary condition for
974         // 2*ceiling(len1*intDsize/l) - 1 <= 2^k
975         // is  2*len1*intDsize/l_max - 1 <= 2^k.
976         {
977                 var const int l = max_l(2);
978                 var uintC lhs = 2*ceiling(len1*intDsize,l) - 1; // >=1
979                 if (lhs < 3)
980                         k = 2;
981                 else
982                         integerlengthC(lhs-1, k=); // k>=2
983         }
984         // Try whether this k is ok or whether we have to increase k.
985         for ( ; ; k++) {
986                 if (k >= sizeof(max_l_table)/sizeof(max_l_table[0])
987                     || max_l_table[k] <= 0) {
988                         throw runtime_exception("FFT problem: numbers too big, floating point precision not sufficient");
989                 }
990                 if (2*ceiling(len1*intDsize,max_l_table[k])-1 <= ((uintC)1 << k))
991                         break;
992         }
993         // We could try to reduce l, keeping the same k. But why should we?
994         // Calculate the number of pieces in which source2 will have to be
995         // split. Each of the pieces must satisfy
996         // ceiling(len1*intDsize/l) + ceiling(len2*intDsize/l) - 1 <= 2^k,
997         var uintC len2p;
998         // Try once with k, once with k+1. Compare them.
999         {
1000                 var uintC remaining_k = ((uintC)1 << k) + 1 - ceiling(len1*intDsize,max_l_table[k]);
1001                 var uintC max_piecelen_k = floor(remaining_k*max_l_table[k],intDsize);
1002                 var uintC numpieces_k = ceiling(len2,max_piecelen_k);
1003                 var uintC remaining_k1 = ((uintC)1 << (k+1)) + 1 - ceiling(len1*intDsize,max_l_table[k+1]);
1004                 var uintC max_piecelen_k1 = floor(remaining_k1*max_l_table[k+1],intDsize);
1005                 var uintC numpieces_k1 = ceiling(len2,max_piecelen_k1);
1006                 if (numpieces_k <= 2*numpieces_k1) {
1007                         // keep k
1008                         len2p = max_piecelen_k;
1009                 } else {
1010                         // choose k+1
1011                         k = k+1;
1012                         len2p = max_piecelen_k1;
1013                 }
1014         }
1015         var const uintL l = max_l_table[k];
1016         var const uintL n = k;
1017         var const uintC N = (uintC)1 << n;
1018         CL_ALLOCA_STACK;
1019         var fftr_real* const x = cl_alloc_array(fftr_real,N);
1020         var fftr_real* const y = cl_alloc_array(fftr_real,N);
1021         #ifdef DEBUG_FFTR
1022         var fftr_real* const z = cl_alloc_array(fftr_real,N);
1023         #else
1024         var fftr_real* const z = x; // put z in place of x - saves memory
1025         #endif
1026         var uintD* const tmpprod1 = cl_alloc_array(uintD,len1+1);
1027         var uintC tmpprod_len = floor(l<<n,intDsize)+6;
1028         var uintD* const tmpprod = cl_alloc_array(uintD,tmpprod_len);
1029         var uintC destlen = len1+len2;
1030         clear_loop_lsp(destptr,destlen);
1031         do {
1032                 if (len2p > len2)
1033                         len2p = len2;
1034                 if (len2p == 1) {
1035                         // cheap case
1036                         var uintD* tmpptr = arrayLSDptr(tmpprod1,len1+1);
1037                         mulu_loop_lsp(lspref(sourceptr2,0),sourceptr1,tmpptr,len1);
1038                         if (addto_loop_lsp(tmpptr,destptr,len1+1))
1039                                 if (inc_loop_lsp(destptr lspop (len1+1),destlen-(len1+1)))
1040                                         throw runtime_exception();
1041                 } else {
1042                         var bool squaring = ((sourceptr1 == sourceptr2) && (len1 == len2p));
1043                         // Fill factor x.
1044                         fill_factor(N,x,l,sourceptr1,len1);
1045                         // Fill factor y.
1046                         if (!squaring)
1047                                 fill_factor(N,y,l,sourceptr2,len2p);
1048                         // Multiply.
1049                         if (!squaring)
1050                                 fftr_convolution(n,N, &x[0], &y[0], &z[0]);
1051                         else
1052                                 fftr_convolution(n,N, &x[0], &x[0], &z[0]);
1053                         #ifdef DEBUG_FFTR
1054                         // Check result.
1055                         {
1056                                 var fftr_real re_lo_limit = (fftr_real)(-0.5);
1057                                 var fftr_real re_hi_limit = (fftr_real)N * fftr_pow2_table[l] * fftr_pow2_table[l] + (fftr_real)0.5;
1058                                 for (var uintC i = 0; i < N; i++)
1059                                         if (!(z[i] > re_lo_limit
1060                                               && z[i] < re_hi_limit))
1061                                                 throw runtime_exception();
1062                         }
1063                         #endif
1064                         var uintD* tmpLSDptr = arrayLSDptr(tmpprod,tmpprod_len);
1065                         var uintD* tmpMSDptr = unfill_product(n,N,z,l,tmpLSDptr);
1066                         var uintC tmplen =
1067                           #if CL_DS_BIG_ENDIAN_P
1068                             tmpLSDptr - tmpMSDptr;
1069                           #else
1070                             tmpMSDptr - tmpLSDptr;
1071                           #endif
1072                         if (tmplen > tmpprod_len)
1073                           throw runtime_exception();
1074                         // Add result to destptr[-destlen..-1]:
1075                         if (tmplen > destlen) {
1076                                 if (test_loop_msp(tmpMSDptr,tmplen-destlen))
1077                                         throw runtime_exception();
1078                                 tmplen = destlen;
1079                         }
1080                         if (addto_loop_lsp(tmpLSDptr,destptr,tmplen))
1081                                 if (inc_loop_lsp(destptr lspop tmplen,destlen-tmplen))
1082                                         throw runtime_exception();
1083                 }
1084                 // Decrement len2.
1085                 destptr = destptr lspop len2p;
1086                 destlen -= len2p;
1087                 sourceptr2 = sourceptr2 lspop len2p;
1088                 len2 -= len2p;
1089         } while (len2 > 0);
1090 }
1091
1092 #ifndef _CHECKSUM
1093 #define _CHECKSUM
1094
1095 // Compute a checksum: number mod (2^intDsize-1).
1096 static uintD compute_checksum (const uintD* sourceptr, uintC len)
1097 {
1098         var uintD tmp = ~(uintD)0; // -1-(sum mod 2^intDsize-1), always >0
1099         do {
1100                 var uintD digit = lsprefnext(sourceptr);
1101                 if (digit < tmp)
1102                         tmp -= digit; // subtract digit
1103                 else
1104                         tmp -= digit+1; // subtract digit-(2^intDsize-1)
1105         } while (--len > 0);
1106         return ~tmp;
1107 }
1108
1109 // Multiply two checksums modulo (2^intDsize-1).
1110 static inline uintD multiply_checksum (uintD checksum1, uintD checksum2)
1111 {
1112         var uintD checksum;
1113         var uintD cksum_hi;
1114         #if HAVE_DD
1115         var uintDD cksum = muluD(checksum1,checksum2);
1116         cksum_hi = highD(cksum); checksum = lowD(cksum);
1117         #else
1118         muluD(checksum1,checksum2, cksum_hi =, checksum =);
1119         #endif
1120         if ((checksum += cksum_hi) + 1 <= cksum_hi)
1121                 checksum += 1;
1122         return checksum;
1123 }
1124
1125 #endif // _CHECKSUM
1126
1127 static void mulu_fftr (const uintD* sourceptr1, uintC len1,
1128                        const uintD* sourceptr2, uintC len2,
1129                        uintD* destptr)
1130 {
1131         // Compute checksums of the arguments and multiply them.
1132         var uintD checksum1 = compute_checksum(sourceptr1,len1);
1133         var uintD checksum2 = compute_checksum(sourceptr2,len2);
1134         var uintD checksum = multiply_checksum(checksum1,checksum2);
1135         mulu_fftr_nocheck(sourceptr1,len1,sourceptr2,len2,destptr);
1136         if (!(checksum == compute_checksum(destptr,len1+len2))) {
1137                 throw runtime_exception("FFT problem: checksum error.");
1138         }
1139 }