src/base/digitseq/cl_DS_mul_fftc.h

   1 // Fast integer multiplication using FFT over the complex numbers.
   2 // [Donald Ervin Knuth: The Art of Computer Programming, Vol. II:
   3 //  Seminumerical Algorithms, second edition. Section 4.3.3, p. 290-294.]
   4 // Bruno Haible 6.5.1996, 24.-25.8.1996
   5
   6 // FFT in the complex domain has the drawback that it needs careful round-off
   7 // error analysis. But for CPUs with good floating-point performance it might
   8 // nevertheless be better than FFT mod Z/mZ.
   9
  10 // It is important to have precise round-off error estimates. Although
  11 // (by laws of statistics) in the average the actual round-off error makes up
  12 // only half of the bits provided for round-off protection, we cannot rely
  13 // on this average behaviour, but have to produce correct results.
  14 //
  15 // Knuth's formula (42), p. 294, says:
  16 // If we want to multiply l-bit words using an FFT(2^k), our floating point
  17 // numbers shall have m >= 2(k+l) + k + log_2 k + 3.5 mantissa bits.
  18 //
  19 // Here is a more careful analysis, using absolute error estimates.
  20 //
  21 // 1. We assume floating point numbers with radix 2, with the properties:
  22 //    (i)   Multiplication with 2^n and 2^-n is exact.
  23 //    (ii)  Negation x -> -x is exact.
  24 //    (iii) Addition: When adding x and y, with |x| <= 2^a, |y| <= 2^a,
  25 //          the result |x+y| <= 2^(a+1) has an error <= e*2^(a+1-m).
  26 //    (iv)  Multiplication: When multiplying x and y, with |x| <= 2^a,
  27 //          |y| <= 2^b, the result |x*y| <= 2^(a+b) has an error <= e*2^(a+b-m).
  28 //    Here e = 1 for a truncating arithmetic, but e = 1/2 for a rounding
  29 //    arithmetic like IEEE single and double floats.
  30 // 2. Let's introduce some notation: err(x) means |x'-x| where x is the
  31 //    exact mathematical value and x' is its representation in the machine.
  32 // 3. From 1. we get for real numbers x,y:
  33 //    (i)   err(2^n*x) = 2^n * err(x),
  34 //    (ii)  err(-x) = err(x),
  35 //    (iii) |x| <= 2^a, |y| <= 2^a, then
  36 //          err(x+y) <= err(x) + err(y) + e*2^(a+1-m),
  37 //          [or ....    ............... + e*2^(a-m) if |x+y| <= 2^a].
  38 //    (iv)  |x| <= 2^a, |y| <= 2^b, then
  39 //          err(x*y) <= 2^a * err(y) + 2^b * err(x) + e*2^(a+b-m).
  40 // 4. Our complex arithmetic will be based on the formulas:
  41 //    (i)   2^n*(x+iy) = (2^n*x)+i(2^n*y)
  42 //    (ii)  -(x+iy) = (-x)+i(-y)
  43 //    (iii) (x+iy)+(u+iv) = (x+u)+i(y+v)
  44 //    (iv)  (x+iy)*(u+iv) = (x*u-y*v)+i(x*v+y*u)
  45 //    The notation err(z) means |z'-z|, as above, with |.| being the usual
  46 //    absolute value on complex numbers (_not_ the L^1 norm).
  47 // 5. From 3. and 4. we get for complex numbers x,y:
  48 //    (i)   err(2^n*x) = 2^n * err(x),
  49 //    (ii)  err(-x) = err(x),
  50 //    (iii) |x| <= 2^a, |y| <= 2^a, then
  51 //          err(x+y) <= err(x) + err(y) + e*2^(a+3/2-m),
  52 //    (iv)  |x| <= 2^a, |y| <= 2^b, then
  53 //          err(x*y) <= 2^a * err(y) + 2^b * err(x) + 3*e*2^(a+b+1/2-m).
  54 // 6. We start out with precomputed roots of unity:
  55 //          |exp(2 pi i/2^n)| <= 1,
  56 //          err(exp(2 pi i/2^n)) <= e*2^(1/2-m), (even err(..)=0 for n=0,1,2),
  57 //    and compute  exp(2 pi i * j/2^k)  according to the binary digits of j.
  58 //    This way, each root of unity will be a product of at most k precomputed
  59 //    roots. If (j mod 2^(k-2)) has n bits, then  exp(2 pi i * j/2^k)  will
  60 //    be computed using n factors, i.e. n-1 complex multiplications, and by
  61 //    5.iv. we'll have
  62 //          err(exp(2 pi i * j/2^k)) <= n*e*2^(1/2-m) + max(n-1,0)*3*e*2^(1/2-m)
  63 //                                    = max(4*n-3,0)*e*2^(1/2-m).
  64 //    Hence the maximum roots-of-unity error is (set n=k-2)
  65 //          err(w^j) <= (4*k-11)*e*2^(1/2-m),
  66 //    and the average roots-of-unity error is (set n=(k-2)/2)
  67 //          < 2*(k-2)*e*2^(1/2-m).
  68 // 7. Now we start the FFT.
  69 //    Before the first step, x_i are integral, |x_i| < 2^l and err(x_i) = 0.
  70 //    After the first butterfly, which replaces (x(i1),x(i2)) by
  71 //    (x(i1) + x(i2), x(i1) - x(i2)), we have |x_i| < 2^(l+1) and err(x_i) = 0.
  72 //    Then, for each of the remaining k-1 steps, a butterfly replaces
  73 //    (x(i1),x(i2)) by (x(i1) + w^exp*x(i2), x(i1) - w^exp*x(i2)). Thus,
  74 //    after n steps we have |x_i| < 2^(l+n) and err(x_i) <= E_i where
  75 //          E_0 = 0,
  76 //          E_1 = 0,
  77 //          E_(n+1) = (E_n + 2^(l+n)*(4*k-11)*e*2^(1/2-m) + 3*e*2^(l+n+1/2-m))
  78 //                    + E_n + e*2^(l+n+3/2-m)
  79 //                  = 2*E_n + (4*k-6)*e*2^(l+n+1/2-m)
  80 //    hence  E_n = (2^n-2)*(4*k-6)*e*2^(l+1/2-m).
  81 //    Setting n = k, we have proved that after the FFT ends, we have
  82 //          |x_i| < 2^(l+k) and err(x_i) <= (4*k-6)*e*2^(l+k+1/2-m).
  83 // 8. The same error analysis holds for the y_i and their FFT. After we
  84 //    multiply z_i := x_i * y_i, we have
  85 //          |z_i| < 2^(2*l+2*k) and err(z_i) <= (8*k-9)*e*2^(2*l+2*k+1/2-m).
  86 // 9. Then an inverse FFT on z_i is done, which is the same as an FFT
  87 //    followed by a permutation and a division by 2^k. After n steps of
  88 //    the FFT, we have |z_i| < 2^(2*l+2*k+n) and err(z_i) <= E_i where
  89 //          E_0 = (8*k-9)*e*2^(2*l+2*k+1/2-m),
  90 //          E_(n+1) = (E_n + 2^(2*l+2*k+n)*(4*k-11)*e*2^(1/2-m)
  91 //                         + 3*e*2^(2*l+2*k+n+1/2-m))
  92 //                    + E_n + e*2^(2*l+2*k+n+3/2-m)
  93 //                  = 2*E_n + (4*k-6)*e*2^(2*l+2*k+n+1/2-m)
  94 //    hence  E_n = 2^n*(8*k-9)*e*2^(2*l+2*k+1/2-m)
  95 //                 + (2^n-1)*(4*k-6)*e*2^(2*l+2*k+1/2-m).
  96 //    So, after the FFT, we have (set n=k) |z_i| < 2^(2*l+3*k) and
  97 //          err(z_i) <= (12*k-15)*e*2^(2*l+3*k+1/2-m).
  98 //    Permutation doesn't change the estimates. After division by 2^k, we get
  99 //    |z_i| < 2^(2*l+2*k) and
 100 //          err(z_i) <= (12*k-15)*e*2^(2*l+2*k+1/2-m).
 101 // 10. When converting the z_i back to integers, we know that z_i should be
 102 //     real, integral, and |z_i| < 2^(2*l+k). We can only guarantee that we
 103 //     can find the integral z_i from the floating-point computation if
 104 //          (12*k-15)*e*2^(2*l+2*k+1/2-m) < 1/2.
 105 // 11. Assuming e = 1/2 and m = 53 (typical values for IEEE double arithmetic),
 106 //     we get the constraint  2*l < m - 1/2 - 2*k - log_2(12*k-15).
 107 //          k = 2    l <= 22
 108 //          k = 3    l <= 21
 109 //          k = 4    l <= 19
 110 //          k = 5    l <= 18
 111 //          k = 6    l <= 17
 112 //          k = 7    l <= 16
 113 //          k = 8    l <= 15
 114 //          k = 9    l <= 13
 115 //          k = 10   l <= 12
 116 //          k = 11   l <= 11
 117 //          k = 12   l <= 10
 118 //          k = 13   l <= 9
 119 //          k = 14   l <= 8
 120 //          k = 15   l <= 7
 121 //          k = 16   l <= 6
 122 //          k = 17   l <= 5
 123 //          k = 18   l <= 4
 124 //          k = 19   l <= 3
 125 //          k = 20   l <= 2
 126 //     Assuming e = 1/2 and m = 64 ("long double" arithmetic on i387/i486/i586),
 127 //     we get the constraint  2*l < m - 1/2 - 2*k - log_2(12*k-15).
 128 //          k = 2    l <= 28
 129 //          k = 3    l <= 26
 130 //          k = 4    l <= 25
 131 //          k = 5    l <= 24
 132 //          k = 6    l <= 22
 133 //          k = 7    l <= 21
 134 //          k = 8    l <= 20
 135 //          k = 9    l <= 19
 136 //          k = 10   l <= 18
 137 //          k = 11   l <= 17
 138 //          k = 12   l <= 16
 139 //          k = 13   l <= 15
 140 //          k = 14   l <= 14
 141 //          k = 15   l <= 13
 142 //          k = 16   l <= 12
 143 //          k = 17   l <= 10
 144 //          k = 18   l <= 9
 145 //          k = 19   l <= 8
 146 //          k = 20   l <= 7
 147 //          k = 21   l <= 6
 148 //          k = 22   l <= 5
 149 //          k = 23   l <= 4
 150 //          k = 24   l <= 3
 151 //          k = 25   l <= 2
 152
 153
 154 #if !(intDsize==32)
 155 #error "complex fft implemented only for intDsize==32"
 156 #endif
 157
 158
 159 #include "cl_floatparam.h"
 160 #include "cl_io.h"
 161 #include "cl_abort.h"
 162
 163 #if defined(HAVE_LONGDOUBLE) && (long_double_mant_bits > double_mant_bits) && (defined(__i386__) || defined(__m68k__) || (defined(__sparc__) && 0))
 164 // Only these CPUs have fast "long double"s in hardware.
 165 // On SPARC, "long double"s are emulated in software and don't work.
 166 typedef long double fftc_real;
 167 #define fftc_real_mant_bits long_double_mant_bits
 168 #define fftc_real_rounds long_double_rounds
 169 #else
 170 typedef double fftc_real;
 171 #define fftc_real_mant_bits double_mant_bits
 172 #define fftc_real_rounds double_rounds
 173 #endif
 174
 175 typedef struct fftc_complex
 176   {
 177     fftc_real re;
 178     fftc_real im;
 179   }
 180   fftc_complex;
 181
 182 static const fftc_complex fftc_roots_of_1 [32+1] =
 183   // roots_of_1[n] is a (2^n)th root of unity in C.
 184   // Also roots_of_1[n-1] = roots_of_1[n]^2.
 185   // For simplicity we choose  roots_of_1[n] = exp(2 pi i/2^n).
 186   {
 187    #if (fftc_real_mant_bits == double_mant_bits)
 188     // These values have 64 bit precision.
 189     { 1.0,                    0.0 },
 190     { -1.0,                   0.0 },
 191     { 0.0,                    1.0 },
 192     { 0.7071067811865475244,  0.7071067811865475244 },
 193     { 0.9238795325112867561,  0.38268343236508977172 },
 194     { 0.9807852804032304491,  0.19509032201612826784 },
 195     { 0.99518472667219688623, 0.098017140329560601996 },
 196     { 0.9987954562051723927,  0.049067674327418014254 },
 197     { 0.9996988186962042201,  0.024541228522912288032 },
 198     { 0.99992470183914454094, 0.0122715382857199260795 },
 199     { 0.99998117528260114264, 0.0061358846491544753597 },
 200     { 0.9999952938095761715,  0.00306795676296597627 },
 201     { 0.99999882345170190993, 0.0015339801862847656123 },
 202     { 0.99999970586288221914, 7.6699031874270452695e-4 },
 203     { 0.99999992646571785114, 3.8349518757139558907e-4 },
 204     { 0.9999999816164292938,  1.9174759731070330744e-4 },
 205     { 0.9999999954041073129,  9.5873799095977345874e-5 },
 206     { 0.99999999885102682754, 4.793689960306688455e-5 },
 207     { 0.99999999971275670683, 2.3968449808418218729e-5 },
 208     { 0.9999999999281891767,  1.1984224905069706422e-5 },
 209     { 0.99999999998204729416, 5.9921124526424278428e-6 },
 210     { 0.99999999999551182357, 2.9960562263346607504e-6 },
 211     { 0.99999999999887795586, 1.4980281131690112288e-6 },
 212     { 0.999999999999719489,   7.4901405658471572114e-7 },
 213     { 0.99999999999992987223, 3.7450702829238412391e-7 },
 214     { 0.99999999999998246807, 1.8725351414619534487e-7 },
 215     { 0.999999999999995617,   9.36267570730980828e-8 },
 216     { 0.99999999999999890425, 4.6813378536549092695e-8 },
 217     { 0.9999999999999997261,  2.340668926827455276e-8 },
 218     { 0.99999999999999993153, 1.1703344634137277181e-8 },
 219     { 0.99999999999999998287, 5.8516723170686386908e-9 },
 220     { 0.9999999999999999957,  2.925836158534319358e-9 },
 221     { 0.9999999999999999989,  1.4629180792671596806e-9 }
 222    #else (fftc_real_mant_bits > double_mant_bits)
 223     // These values have 128 bit precision.
 224     { 1.0L,                                       0.0L },
 225     { -1.0L,                                      0.0L },
 226     { 0.0L,                                       1.0L },
 227     { 0.707106781186547524400844362104849039284L, 0.707106781186547524400844362104849039284L },
 228     { 0.923879532511286756128183189396788286823L, 0.38268343236508977172845998403039886676L },
 229     { 0.980785280403230449126182236134239036975L, 0.195090322016128267848284868477022240928L },
 230     { 0.995184726672196886244836953109479921574L, 0.098017140329560601994195563888641845861L },
 231     { 0.998795456205172392714771604759100694444L, 0.0490676743274180142549549769426826583147L },
 232     { 0.99969881869620422011576564966617219685L,  0.0245412285229122880317345294592829250654L },
 233     { 0.99992470183914454092164649119638322435L,  0.01227153828571992607940826195100321214037L },
 234     { 0.999981175282601142656990437728567716173L, 0.00613588464915447535964023459037258091705L },
 235     { 0.999995293809576171511580125700119899554L, 0.00306795676296597627014536549091984251894L },
 236     { 0.99999882345170190992902571017152601905L,  0.001533980186284765612303697150264079079954L },
 237     { 0.999999705862882219160228217738765677117L, 7.66990318742704526938568357948576643142e-4L },
 238     { 0.99999992646571785114473148070738785695L,  3.83495187571395589072461681181381263396e-4L },
 239     { 0.999999981616429293808346915402909714504L, 1.91747597310703307439909561989000933469e-4L },
 240     { 0.99999999540410731289097193313960614896L,  9.58737990959773458705172109764763511872e-5L },
 241     { 0.9999999988510268275626733077945541084L,   4.79368996030668845490039904946588727468e-5L },
 242     { 0.99999999971275670684941397221864177609L,  2.39684498084182187291865771650218200947e-5L },
 243     { 0.999999999928189176709775095883850490262L, 1.198422490506970642152156159698898480473e-5L },
 244     { 0.99999999998204729417728262414778410738L,  5.99211245264242784287971180889086172999e-6L },
 245     { 0.99999999999551182354431058417299732444L,  2.99605622633466075045481280835705981183e-6L },
 246     { 0.999999999998877955886077016551752536504L, 1.49802811316901122885427884615536112069e-6L },
 247     { 0.999999999999719488971519214794719584451L, 7.49014056584715721130498566730655637157e-7L },
 248     { 0.99999999999992987224287980123972873676L,  3.74507028292384123903169179084633177398e-7L },
 249     { 0.99999999999998246806071995015624773673L,  1.8725351414619534486882457659356361712e-7L },
 250     { 0.999999999999995617015179987529456656217L, 9.3626757073098082799067286680885620193e-8L },
 251     { 0.999999999999998904253794996881763834182L, 4.68133785365490926951155181385400969594e-8L },
 252     { 0.99999999999999972606344874922040343793L,  2.34066892682745527595054934190348440379e-8L },
 253     { 0.999999999999999931515862187305098514444L, 1.170334463413727718124621350323810379807e-8L },
 254     { 0.999999999999999982878965546826274482047L, 5.8516723170686386908097901008341396944e-9L },
 255     { 0.999999999999999995719741386706568611352L, 2.92583615853431935792823046906895590202e-9L },
 256     { 0.999999999999999998929935346676642152265L, 1.46291807926715968052953216186596371037e-9L }
 257    #endif
 258   };
 259
 260 // Define this for (cheap) consistency checks.
 261 //#define DEBUG_FFTC
 262
 263 // Define the algorithm of the backward FFT:
 264 // Either FORWARD (a normal FFT followed by a permutation)
 265 // or     RECIPROOT (an FFT with reciprocal root of unity)
 266 // or     CLEVER (an FFT with reciprocal root of unity but clever computation
 267 //                of the reciprocals).
 268 // Drawback of FORWARD: the permutation pass.
 269 // Drawback of RECIPROOT: need all the powers of the root, not only half of them.
 270 #define FORWARD   42
 271 #define RECIPROOT 43
 272 #define CLEVER    44
 273 #define FFTC_BACKWARD CLEVER
 274
 275 static fftc_real fftc_pow2_table[64] = // table of powers of 2
 276   {
 277                       1.0,
 278                       2.0,
 279                       4.0,
 280                       8.0,
 281                      16.0,
 282                      32.0,
 283                      64.0,
 284                     128.0,
 285                     256.0,
 286                     512.0,
 287                    1024.0,
 288                    2048.0,
 289                    4096.0,
 290                    8192.0,
 291                   16384.0,
 292                   32768.0,
 293                   65536.0,
 294                  131072.0,
 295                  262144.0,
 296                  524288.0,
 297                 1048576.0,
 298                 2097152.0,
 299                 4194304.0,
 300                 8388608.0,
 301                16777216.0,
 302                33554432.0,
 303                67108864.0,
 304               134217728.0,
 305               268435456.0,
 306               536870912.0,
 307              1073741824.0,
 308              2147483648.0,
 309              4294967296.0,
 310              8589934592.0,
 311             17179869184.0,
 312             34359738368.0,
 313             68719476736.0,
 314            137438953472.0,
 315            274877906944.0,
 316            549755813888.0,
 317           1099511627776.0,
 318           2199023255552.0,
 319           4398046511104.0,
 320           8796093022208.0,
 321          17592186044416.0,
 322          35184372088832.0,
 323          70368744177664.0,
 324         140737488355328.0,
 325         281474976710656.0,
 326         562949953421312.0,
 327        1125899906842624.0,
 328        2251799813685248.0,
 329        4503599627370496.0,
 330        9007199254740992.0,
 331       18014398509481984.0,
 332       36028797018963968.0,
 333       72057594037927936.0,
 334      144115188075855872.0,
 335      288230376151711744.0,
 336      576460752303423488.0,
 337     1152921504606846976.0,
 338     2305843009213693952.0,
 339     4611686018427387904.0,
 340     9223372036854775808.0
 341   };
 342
 343 // For a constant expression n (0 <= n < 128), returns 2^n of type fftc_real.
 344 #define fftc_pow2(n)  \
 345   (((n) & 64 ? (fftc_real)18446744073709551616.0 : (fftc_real)1.0)      \
 346    * ((n) & 32 ? (fftc_real)4294967296.0 : (fftc_real)1.0)              \
 347    * ((n) & 16 ? (fftc_real)65536.0 : (fftc_real)1.0)                   \
 348    * ((n) & 8 ? (fftc_real)256.0 : (fftc_real)1.0)                      \
 349    * ((n) & 4 ? (fftc_real)16.0 : (fftc_real)1.0)                       \
 350    * ((n) & 2 ? (fftc_real)4.0 : (fftc_real)1.0)                        \
 351    * ((n) & 1 ? (fftc_real)2.0 : (fftc_real)1.0)                        \
 352   )
 353
 354 // r := a + b
 355 static inline void add (const fftc_complex& a, const fftc_complex& b, fftc_complex& r)
 356 {
 357         r.re = a.re + b.re;
 358         r.im = a.im + b.im;
 359 }
 360
 361 // r := a - b
 362 static inline void sub (const fftc_complex& a, const fftc_complex& b, fftc_complex& r)
 363 {
 364         r.re = a.re - b.re;
 365         r.im = a.im - b.im;
 366 }
 367
 368 // r := a * b
 369 static inline void mul (fftc_real a, const fftc_complex& b, fftc_complex& r)
 370 {
 371         r.re = a * b.re;
 372         r.im = a * b.im;
 373 }
 374
 375 // r := a * b
 376 static inline void mul (const fftc_complex& a, const fftc_complex& b, fftc_complex& r)
 377 {
 378         var fftc_real r_re = a.re * b.re - a.im * b.im;
 379         var fftc_real r_im = a.re * b.im + a.im * b.re;
 380         r.re = r_re;
 381         r.im = r_im;
 382 }
 383
 384 #ifndef _BIT_REVERSE
 385 #define _BIT_REVERSE
 386 // Reverse an n-bit number x. n>0.
 387 static uintL bit_reverse (uintL n, uintL x)
 388 {
 389         var uintL y = 0;
 390         do {
 391                 y <<= 1;
 392                 y |= (x & 1);
 393                 x >>= 1;
 394         } while (!(--n == 0));
 395         return y;
 396 }
 397 #endif
 398
 399 // Compute a complex convolution using FFT: z[0..N-1] := x[0..N-1] * y[0..N-1].
 400 static void fftc_convolution (const uintL n, const uintL N, // N = 2^n
 401                               fftc_complex * x, // N numbers
 402                               fftc_complex * y, // N numbers
 403                               fftc_complex * z  // N numbers result
 404                              )
 405 {
 406         CL_ALLOCA_STACK;
 407         #if (FFTC_BACKWARD == RECIPROOT) || defined(DEBUG_FFTC)
 408         var fftc_complex* const w = cl_alloc_array(fftc_complex,N);
 409         #else
 410         var fftc_complex* const w = cl_alloc_array(fftc_complex,(N>>1)+1);
 411         #endif
 412         var uintL i;
 413         // Initialize w[i] to w^i, w a primitive N-th root of unity.
 414         w[0] = fftc_roots_of_1[0];
 415         #if (FFTC_BACKWARD == RECIPROOT) || defined(DEBUG_FFTC)
 416         {
 417                 var int j;
 418                 for (j = n-1; j>=0; j--) {
 419                         var fftc_complex r_j = fftc_roots_of_1[n-j];
 420                         w[1<<j] = r_j;
 421                         for (i = (2<<j); i < N; i += (2<<j))
 422                                 mul(w[i],r_j, w[i+(1<<j)]);
 423                 }
 424         }
 425         #else
 426         {
 427                 var int j;
 428                 for (j = n-1; j>=0; j--) {
 429                         var fftc_complex r_j = fftc_roots_of_1[n-j];
 430                         w[1<<j] = r_j;
 431                         for (i = (2<<j); i < N>>1; i += (2<<j))
 432                                 mul(w[i],r_j, w[i+(1<<j)]);
 433                 }
 434         }
 435         #endif
 436         #ifdef DEBUG_FFTC
 437         // Check that w is really a primitive N-th root of unity.
 438         {
 439                 var fftc_real epsilon;
 440                 epsilon = (fftc_real)(n>=3 ? 4*n-11 : 0);
 441                 epsilon = epsilon / fftc_pow2(fftc_real_mant_bits);
 442                 if (fftc_real_rounds == rounds_to_nearest)
 443                         epsilon = 0.5*epsilon;
 444                 epsilon = 1.414*epsilon;
 445                 // epsilon = (4*k-11)*e*2^(1/2-m).
 446                 var fftc_real part;
 447                 var fftc_complex& w_N = w[N>>1];
 448                 part = w_N.re - (fftc_real)(-1.0);
 449                 if (part > epsilon || part < -epsilon)
 450                         cl_abort();
 451                 part = w_N.im;
 452                 if (part > epsilon || part < -epsilon)
 453                         cl_abort();
 454         }
 455         {
 456                 var fftc_complex w_N;
 457                 mul(w[N-1],fftc_roots_of_1[n], w_N);
 458                 // Since there was one more multiplication,
 459                 // we have to replace n by (n+1) in the epsilon above.
 460                 var fftc_real epsilon;
 461                 epsilon = (fftc_real)(4*n-7);
 462                 epsilon = epsilon / fftc_pow2(fftc_real_mant_bits);
 463                 if (fftc_real_rounds == rounds_to_nearest)
 464                         epsilon = 0.5*epsilon;
 465                 epsilon = 1.414*epsilon;
 466                 // epsilon = (4*k-7)*e*2^(1/2-m).
 467                 var fftc_real part;
 468                 part = w_N.re - (fftc_real)1.0;
 469                 if (part > epsilon || part < -epsilon)
 470                         cl_abort();
 471                 part = w_N.im;
 472                 if (part > epsilon || part < -epsilon)
 473                         cl_abort();
 474         }
 475         #endif
 476         var bool squaring = (x == y);
 477         // Do an FFT of length N on x.
 478         {
 479                 var sintL l;
 480                 /* l = n-1 */ {
 481                         var const uintL tmax = N>>1; // tmax = 2^(n-1)
 482                         for (var uintL t = 0; t < tmax; t++) {
 483                                 var uintL i1 = t;
 484                                 var uintL i2 = i1 + tmax;
 485                                 // Butterfly: replace (x(i1),x(i2)) by
 486                                 // (x(i1) + x(i2), x(i1) - x(i2)).
 487                                 var fftc_complex tmp;
 488                                 tmp = x[i2];
 489                                 sub(x[i1],tmp, x[i2]);
 490                                 add(x[i1],tmp, x[i1]);
 491                         }
 492                 }
 493                 for (l = n-2; l>=0; l--) {
 494                         var const uintL smax = (uintL)1 << (n-1-l);
 495                         var const uintL tmax = (uintL)1 << l;
 496                         for (var uintL s = 0; s < smax; s++) {
 497                                 var uintL exp = bit_reverse(n-1-l,s) << l;
 498                                 for (var uintL t = 0; t < tmax; t++) {
 499                                         var uintL i1 = (s << (l+1)) + t;
 500                                         var uintL i2 = i1 + tmax;
 501                                         // Butterfly: replace (x(i1),x(i2)) by
 502                                         // (x(i1) + w^exp*x(i2), x(i1) - w^exp*x(i2)).
 503                                         var fftc_complex tmp;
 504                                         mul(x[i2],w[exp], tmp);
 505                                         sub(x[i1],tmp, x[i2]);
 506                                         add(x[i1],tmp, x[i1]);
 507                                 }
 508                         }
 509                 }
 510         }
 511         // Do an FFT of length N on y.
 512         if (!squaring) {
 513                 var sintL l;
 514                 /* l = n-1 */ {
 515                         var uintL const tmax = N>>1; // tmax = 2^(n-1)
 516                         for (var uintL t = 0; t < tmax; t++) {
 517                                 var uintL i1 = t;
 518                                 var uintL i2 = i1 + tmax;
 519                                 // Butterfly: replace (y(i1),y(i2)) by
 520                                 // (y(i1) + y(i2), y(i1) - y(i2)).
 521                                 var fftc_complex tmp;
 522                                 tmp = y[i2];
 523                                 sub(y[i1],tmp, y[i2]);
 524                                 add(y[i1],tmp, y[i1]);
 525                         }
 526                 }
 527                 for (l = n-2; l>=0; l--) {
 528                         var const uintL smax = (uintL)1 << (n-1-l);
 529                         var const uintL tmax = (uintL)1 << l;
 530                         for (var uintL s = 0; s < smax; s++) {
 531                                 var uintL exp = bit_reverse(n-1-l,s) << l;
 532                                 for (var uintL t = 0; t < tmax; t++) {
 533                                         var uintL i1 = (s << (l+1)) + t;
 534                                         var uintL i2 = i1 + tmax;
 535                                         // Butterfly: replace (y(i1),y(i2)) by
 536                                         // (y(i1) + w^exp*y(i2), y(i1) - w^exp*y(i2)).
 537                                         var fftc_complex tmp;
 538                                         mul(y[i2],w[exp], tmp);
 539                                         sub(y[i1],tmp, y[i2]);
 540                                         add(y[i1],tmp, y[i1]);
 541                                 }
 542                         }
 543                 }
 544         }
 545         // Multiply the transformed vectors into z.
 546         for (i = 0; i < N; i++)
 547                 mul(x[i],y[i], z[i]);
 548         // Undo an FFT of length N on z.
 549         {
 550                 var uintL l;
 551                 for (l = 0; l < n-1; l++) {
 552                         var const uintL smax = (uintL)1 << (n-1-l);
 553                         var const uintL tmax = (uintL)1 << l;
 554                         #if FFTC_BACKWARD != CLEVER
 555                         for (var uintL s = 0; s < smax; s++) {
 556                                 var uintL exp = bit_reverse(n-1-l,s) << l;
 557                                 #if FFTC_BACKWARD == RECIPROOT
 558                                 if (exp > 0)
 559                                         exp = N - exp; // negate exp (use w^-1 instead of w)
 560                                 #endif
 561                                 for (var uintL t = 0; t < tmax; t++) {
 562                                         var uintL i1 = (s << (l+1)) + t;
 563                                         var uintL i2 = i1 + tmax;
 564                                         // Inverse Butterfly: replace (z(i1),z(i2)) by
 565                                         // ((z(i1)+z(i2))/2, (z(i1)-z(i2))/(2*w^exp)).
 566                                         // Do the division by 2 later.
 567                                         var fftc_complex sum;
 568                                         var fftc_complex diff;
 569                                         add(z[i1],z[i2], sum);
 570                                         sub(z[i1],z[i2], diff);
 571                                         z[i1] = sum;
 572                                         mul(diff,w[exp], z[i2]);
 573                                 }
 574                         }
 575                         #else // FFTC_BACKWARD == CLEVER: clever handling of negative exponents
 576                         /* s = 0, exp = 0 */ {
 577                                 for (var uintL t = 0; t < tmax; t++) {
 578                                         var uintL i1 = t;
 579                                         var uintL i2 = i1 + tmax;
 580                                         // Inverse Butterfly: replace (z(i1),z(i2)) by
 581                                         // ((z(i1)+z(i2))/2, (z(i1)-z(i2))/(2*w^exp)),
 582                                         // with exp <-- 0.
 583                                         // Do the division by 2 later.
 584                                         var fftc_complex sum;
 585                                         var fftc_complex diff;
 586                                         add(z[i1],z[i2], sum);
 587                                         sub(z[i1],z[i2], diff);
 588                                         z[i1] = sum;
 589                                         z[i2] = diff;
 590                                 }
 591                         }
 592                         for (var uintL s = 1; s < smax; s++) {
 593                                 var uintL exp = bit_reverse(n-1-l,s) << l;
 594                                 exp = (N>>1) - exp; // negate exp (use w^-1 instead of w)
 595                                 for (var uintL t = 0; t < tmax; t++) {
 596                                         var uintL i1 = (s << (l+1)) + t;
 597                                         var uintL i2 = i1 + tmax;
 598                                         // Inverse Butterfly: replace (z(i1),z(i2)) by
 599                                         // ((z(i1)+z(i2))/2, (z(i1)-z(i2))/(2*w^exp)),
 600                                         // with exp <-- (N/2 - exp).
 601                                         // Do the division by 2 later.
 602                                         var fftc_complex sum;
 603                                         var fftc_complex diff;
 604                                         add(z[i1],z[i2], sum);
 605                                         sub(z[i2],z[i1], diff); // note that w^(N/2) = -1
 606                                         z[i1] = sum;
 607                                         mul(diff,w[exp], z[i2]);
 608                                 }
 609                         }
 610                         #endif
 611                 }
 612                 /* l = n-1 */ {
 613                         var const fftc_real pow2 = (fftc_real)1 / (fftc_real)N;
 614                         var const uintL tmax = N>>1; // tmax = 2^(n-1)
 615                         for (var uintL t = 0; t < tmax; t++) {
 616                                 var uintL i1 = t;
 617                                 var uintL i2 = i1 + tmax;
 618                                 // Inverse Butterfly: replace (z(i1),z(i2)) by
 619                                 // ((z(i1)+z(i2))/2, (z(i1)-z(i2))/2).
 620                                 // Do all the divisions by 2 now.
 621                                 var fftc_complex sum;
 622                                 var fftc_complex diff;
 623                                 add(z[i1],z[i2], sum);
 624                                 sub(z[i1],z[i2], diff);
 625                                 mul(pow2,sum, z[i1]);
 626                                 mul(pow2,diff, z[i2]);
 627                         }
 628                 }
 629         }
 630         #if FFTC_BACKWARD == FORWARD
 631         // Swap z[i] and z[N-i] for 0 < i < N/2.
 632         for (i = (N>>1)-1; i > 0; i--) {
 633                 var fftc_complex tmp = z[i];
 634                 z[i] = z[N-i];
 635                 z[N-i] = tmp;
 636         }
 637         #endif
 638 }
 639
 640 // For a given k >= 2, the maximum l is determined by
 641 // 2*l < m - 1/2 - 2*k - log_2(12*k-15) - (1 if e=1.0, 0 if e=0.5).
 642 // This is a decreasing function of k.
 643 #define max_l(k)  \
 644         (int)((fftc_real_mant_bits                      \
 645                - 2*(k)                                  \
 646                - ((k)<=2 ? 4 : (k)<=3 ? 5 : (k)<=5 ? 6 : (k)<=8 ? 7 : (k)<=16 ? 8 : (k)<=31 ? 9 : 10) \
 647                - (fftc_real_rounds == rounds_to_nearest ? 0 : 1)) \
 648               / 2)
 649 static int max_l_table[32+1] =
 650   {         0,         0,  max_l(2),  max_l(3),  max_l(4),  max_l(5),  max_l(6),
 651      max_l(7),  max_l(8),  max_l(9), max_l(10), max_l(11), max_l(12), max_l(13),
 652     max_l(14), max_l(15), max_l(16), max_l(17), max_l(18), max_l(19), max_l(20),
 653     max_l(21), max_l(22), max_l(23), max_l(24), max_l(25), max_l(26), max_l(27),
 654     max_l(28), max_l(29), max_l(30), max_l(31), max_l(32)
 655   };
 656
 657 // Split len uintD's below sourceptr into chunks of l bits, thus filling
 658 // N complex numbers at x.
 659 static void fill_factor (uintL N, fftc_complex* x, uintL l,
 660                          const uintD* sourceptr, uintL len)
 661 {
 662         var uintL i;
 663         if (max_l(2) > intDsize && l > intDsize) {
 664                 // l > intDsize
 665                 if (max_l(2) > 64 && l > 64) {
 666                         fprint(cl_stderr, "FFT problem: l > 64 not supported by pow2_table\n");
 667                         cl_abort();
 668                 }
 669                 var fftc_real carry = 0;
 670                 var sintL carrybits = 0; // number of bits in carry (>=0, <l)
 671                 i = 0;
 672                 while (len > 0) {
 673                         var uintD digit = lsprefnext(sourceptr);
 674                         if (carrybits+intDsize >= l) {
 675                                 x[i].re = carry + (fftc_real)(digit & bitm(l-carrybits)) * fftc_pow2_table[carrybits];
 676                                 x[i].im = (fftc_real)0;
 677                                 i++;
 678                                 carry = (l-carrybits == intDsize ? (fftc_real)0 : (fftc_real)(digit >> (l-carrybits)));
 679                                 carrybits = carrybits+intDsize-l;
 680                         } else {
 681                                 carry = carry + (fftc_real)digit * fftc_pow2_table[carrybits];
 682                                 carrybits = carrybits+intDsize;
 683                         }
 684                         len--;
 685                 }
 686                 if (carrybits > 0) {
 687                         x[i].re = carry;
 688                         x[i].im = (fftc_real)0;
 689                         i++;
 690                 }
 691                 if (i > N)
 692                         cl_abort();
 693         } else if (max_l(2) >= intDsize && l == intDsize) {
 694                 // l = intDsize
 695                 if (len > N)
 696                         cl_abort();
 697                 for (i = 0; i < len; i++) {
 698                         var uintD digit = lsprefnext(sourceptr);
 699                         x[i].re = (fftc_real)digit;
 700                         x[i].im = (fftc_real)0;
 701                 }
 702         } else {
 703                 // l < intDsize
 704                 var const uintD l_mask = bit(l)-1;
 705                 var uintD carry = 0;
 706                 var sintL carrybits = 0; // number of bits in carry (>=0, <intDsize)
 707                 for (i = 0; i < N; i++) {
 708                         if (carrybits >= l) {
 709                                 x[i].re = (fftc_real)(carry & l_mask);
 710                                 x[i].im = (fftc_real)0;
 711                                 carry >>= l;
 712                                 carrybits -= l;
 713                         } else {
 714                                 if (len == 0)
 715                                         break;
 716                                 len--;
 717                                 var uintD digit = lsprefnext(sourceptr);
 718                                 x[i].re = (fftc_real)((carry | (digit << carrybits)) & l_mask);
 719                                 x[i].im = (fftc_real)0;
 720                                 carry = digit >> (l-carrybits);
 721                                 carrybits = intDsize - (l-carrybits);
 722                         }
 723                 }
 724                 while (carrybits > 0) {
 725                         if (!(i < N))
 726                                 cl_abort();
 727                         x[i].re = (fftc_real)(carry & l_mask);
 728                         x[i].im = (fftc_real)0;
 729                         carry >>= l;
 730                         carrybits -= l;
 731                         i++;
 732                 }
 733                 if (len > 0)
 734                         cl_abort();
 735         }
 736         for ( ; i < N; i++) {
 737                 x[i].re = (fftc_real)0;
 738                 x[i].im = (fftc_real)0;
 739         }
 740 }
 741
 742 // Given a not too large floating point number, round it to the nearest integer.
 743 static inline fftc_real fftc_fround (fftc_real x)
 744 {
 745         return
 746           #if (fftc_real_rounds == rounds_to_nearest)
 747             (x + (fftc_pow2(fftc_real_mant_bits-1)+fftc_pow2(fftc_real_mant_bits-2)))
 748             - (fftc_pow2(fftc_real_mant_bits-1)+fftc_pow2(fftc_real_mant_bits-2));
 749           #elif (fftc_real_rounds == rounds_to_infinity)
 750             (fftc_pow2(fftc_real_mant_bits-1)+fftc_pow2(fftc_real_mant_bits-2))
 751             - ((fftc_pow2(fftc_real_mant_bits-1)+fftc_pow2(fftc_real_mant_bits-2))
 752                - (x + (fftc_real)0.5));
 753           #else // rounds_to_zero, rounds_to_minus_infinity
 754             ((x + (fftc_real)0.5)
 755              + (fftc_pow2(fftc_real_mant_bits-1)+fftc_pow2(fftc_real_mant_bits-2)))
 756             - (fftc_pow2(fftc_real_mant_bits-1)+fftc_pow2(fftc_real_mant_bits-2));
 757           #endif
 758 }
 759
 760 // Given a not too large floating point number, round it down.
 761 static inline fftc_real fftc_ffloor (fftc_real x)
 762 {
 763         #if (fftc_real_rounds == rounds_to_nearest)
 764           var fftc_real y =
 765             (x + (fftc_pow2(fftc_real_mant_bits-1)+fftc_pow2(fftc_real_mant_bits-2)))
 766             - (fftc_pow2(fftc_real_mant_bits-1)+fftc_pow2(fftc_real_mant_bits-2));
 767           if (y <= x)
 768                 return y;
 769           else
 770                 return y - (fftc_real)1.0;
 771         #elif (fftc_real_rounds == rounds_to_infinity)
 772           return
 773             (fftc_pow2(fftc_real_mant_bits-1)+fftc_pow2(fftc_real_mant_bits-2))
 774             - ((fftc_pow2(fftc_real_mant_bits-1)+fftc_pow2(fftc_real_mant_bits-2))
 775                - x);
 776         #else // rounds_to_zero, rounds_to_minus_infinity
 777           return
 778             (x + (fftc_pow2(fftc_real_mant_bits-1)+fftc_pow2(fftc_real_mant_bits-2)))
 779             - (fftc_pow2(fftc_real_mant_bits-1)+fftc_pow2(fftc_real_mant_bits-2));
 780         #endif
 781 }
 782
 783 // Combine the N complex numbers at z into uintD's below destptr.
 784 // The z[i] are known to be approximately integers >= 0, < N*2^(2*l).
 785 // Assumes room for floor(N*l/intDsize)+(1+ceiling((n+2*l)/intDsize)) uintD's
 786 // below destptr. Fills len digits and returns (destptr lspop len).
 787 static uintD* unfill_product (uintL n, uintL N, // N = 2^n
 788                               const fftc_complex * z, uintL l,
 789                               uintD* destptr)
 790 {
 791         var uintL i;
 792         if (n + 2*l <= intDsize) {
 793                 // 2-digit carry is sufficient, l < intDsize
 794                 var uintD carry0 = 0;
 795                 var uintD carry1 = 0;
 796                 var uintL shift = 0; // shift next digit before adding it to the carry, >=0, <intDsize
 797                 for (i = 0; i < N; i++) {
 798                         // Fetch z[i].re and round it to the nearest uintD.
 799                         var uintD digit = (uintD)(z[i].re + (fftc_real)0.5);
 800                         if (shift > 0) {
 801                                 carry1 += digit >> (intDsize-shift);
 802                                 digit = digit << shift;
 803                         }
 804                         if ((carry0 += digit) < digit)
 805                                 carry1 += 1;
 806                         shift += l;
 807                         if (shift >= intDsize) {
 808                                 lsprefnext(destptr) = carry0;
 809                                 carry0 = carry1;
 810                                 carry1 = 0;
 811                                 shift -= intDsize;
 812                         }
 813                 }
 814                 lsprefnext(destptr) = carry0;
 815                 lsprefnext(destptr) = carry1;
 816         } else if (n + 2*l <= 2*intDsize) {
 817                 // 3-digit carry is sufficient, l < intDsize
 818                 #if HAVE_DD
 819                 var uintDD carry0 = 0;
 820                 var uintD carry1 = 0;
 821                 var uintL shift = 0; // shift next digit before adding it to the carry, >=0, <intDsize
 822                 for (i = 0; i < N; i++) {
 823                         // Fetch z[i].re and round it to the nearest uintDD.
 824                         var uintDD digit = (uintDD)(z[i].re + (fftc_real)0.5);
 825                         if (shift > 0) {
 826                                 carry1 += (uintD)(digit >> (2*intDsize-shift));
 827                                 digit = digit << shift;
 828                         }
 829                         if ((carry0 += digit) < digit)
 830                                 carry1 += 1;
 831                         shift += l;
 832                         if (shift >= intDsize) {
 833                                 lsprefnext(destptr) = lowD(carry0);
 834                                 carry0 = highlowDD(carry1,highD(carry0));
 835                                 carry1 = 0;
 836                                 shift -= intDsize;
 837                         }
 838                 }
 839                 lsprefnext(destptr) = lowD(carry0);
 840                 lsprefnext(destptr) = highD(carry0);
 841                 lsprefnext(destptr) = carry1;
 842                 #else
 843                 var uintD carry0 = 0;
 844                 var uintD carry1 = 0;
 845                 var uintD carry2 = 0;
 846                 var uintL shift = 0; // shift next digit before adding it to the carry, >=0, <intDsize
 847                 for (i = 0; i < N; i++) {
 848                         // Fetch z[i].re and round it to the nearest uintDD.
 849                         var fftc_real zi = z[i].re + (fftc_real)0.5;
 850                         var const fftc_real pow2_intDsize = fftc_pow2(intDsize);
 851                         var const fftc_real invpow2_intDsize = (fftc_real)1.0/pow2_intDsize;
 852                         var uintD digit1 = (uintD)(zi * invpow2_intDsize);
 853                         var uintD digit0 = (uintD)(zi - (fftc_real)digit1 * pow2_intDsize);
 854                         if (shift > 0) {
 855                                 carry2 += digit1 >> (intDsize-shift);
 856                                 digit1 = (digit1 << shift) | (digit0 >> (intDsize-shift));
 857                                 digit0 = digit0 << shift;
 858                         }
 859                         if ((carry0 += digit0) < digit0)
 860                                 if ((carry1 += 1) == 0)
 861                                         carry2 += 1;
 862                         if ((carry1 += digit1) < digit1)
 863                                 carry2 += 1;
 864                         shift += l;
 865                         if (shift >= intDsize) {
 866                                 lsprefnext(destptr) = carry0;
 867                                 carry0 = carry1;
 868                                 carry1 = carry2;
 869                                 carry2 = 0;
 870                                 shift -= intDsize;
 871                         }
 872                 }
 873                 lsprefnext(destptr) = carry0;
 874                 lsprefnext(destptr) = carry1;
 875                 lsprefnext(destptr) = carry2;
 876                 #endif
 877         } else {
 878                 // 1-digit+1-float carry is sufficient
 879                 var uintD carry0 = 0;
 880                 var fftc_real carry1 = 0;
 881                 var uintL shift = 0; // shift next digit before adding it to the carry, >=0, <intDsize
 882                 for (i = 0; i < N; i++) {
 883                         // Fetch z[i].re and round it to the nearest integer.
 884                         var fftc_real digit = fftc_fround(z[i].re);
 885                         #ifdef DEBUG_FFTC
 886                         if (!(digit >= (fftc_real)0
 887                               && z[i].re > digit - (fftc_real)0.5
 888                               && z[i].re < digit + (fftc_real)0.5))
 889                                 cl_abort();
 890                         #endif
 891                         if (shift > 0)
 892                                 digit = digit * fftc_pow2_table[shift];
 893                         var fftc_real digit1 = fftc_ffloor(digit*((fftc_real)1.0/fftc_pow2(intDsize)));
 894                         var uintD digit0 = (uintD)(digit - digit1*fftc_pow2(intDsize));
 895                         carry1 += digit1;
 896                         if ((carry0 += digit0) < digit0)
 897                                 carry1 += (fftc_real)1.0;
 898                         shift += l;
 899                         while (shift >= intDsize) {
 900                                 lsprefnext(destptr) = carry0;
 901                                 var fftc_real tmp = fftc_ffloor(carry1*((fftc_real)1.0/fftc_pow2(intDsize)));
 902                                 carry0 = (uintD)(carry1 - tmp*fftc_pow2(intDsize));
 903                                 carry1 = tmp;
 904                                 shift -= intDsize;
 905                         }
 906                 }
 907                 if (carry0 > 0 || carry1 > (fftc_real)0.0) {
 908                         lsprefnext(destptr) = carry0;
 909                         while (carry1 > (fftc_real)0.0) {
 910                                 var fftc_real tmp = fftc_ffloor(carry1*((fftc_real)1.0/fftc_pow2(intDsize)));
 911                                 lsprefnext(destptr) = (uintD)(carry1 - tmp*fftc_pow2(intDsize));
 912                                 carry1 = tmp;
 913                         }
 914                 }
 915         }
 916         return destptr;
 917 }
 918
 919 static inline void mulu_fftcomplex_nocheck (const uintD* sourceptr1, uintC len1,
 920                                             const uintD* sourceptr2, uintC len2,
 921                                             uintD* destptr)
 922 // Es ist 2 <= len1 <= len2.
 923 {
 924         // We have to find parameters l and k such that
 925         // ceiling(len1*intDsize/l) + ceiling(len2*intDsize/l) - 1 <= 2^k,
 926         // and (12*k-15)*e*2^(2*l+2*k+1/2-m) < 1/2.
 927         // Try primarily to minimize k. Minimizing l buys you nothing.
 928         var uintL k;
 929         // Computing k: If len1 and len2 differ much, we'll split source2 -
 930         // hence for the moment just substitute len1 for len2.
 931         //
 932         // First approximation of k: A necessary condition for
 933         // 2*ceiling(len1*intDsize/l) - 1 <= 2^k
 934         // is  2*len1*intDsize/l_max - 1 <= 2^k.
 935         {
 936                 var const int l = max_l(2);
 937                 var uintL lhs = 2*ceiling((uintL)len1*intDsize,l) - 1; // >=1
 938                 if (lhs < 3)
 939                         k = 2;
 940                 else
 941                         integerlength32(lhs-1, k=); // k>=2
 942         }
 943         // Try whether this k is ok or whether we have to increase k.
 944         for ( ; ; k++) {
 945                 if (k >= sizeof(max_l_table)/sizeof(max_l_table[0])
 946                     || max_l_table[k] <= 0) {
 947                         fprint(cl_stderr, "FFT problem: numbers too big, floating point precision not sufficient\n");
 948                         cl_abort();
 949                 }
 950                 if (2*ceiling((uintL)len1*intDsize,max_l_table[k])-1 <= ((uintL)1 << k))
 951                         break;
 952         }
 953         // We could try to reduce l, keeping the same k. But why should we?
 954         // Calculate the number of pieces in which source2 will have to be
 955         // split. Each of the pieces must satisfy
 956         // ceiling(len1*intDsize/l) + ceiling(len2*intDsize/l) - 1 <= 2^k,
 957         var uintL len2p;
 958         // Try once with k, once with k+1. Compare them.
 959         {
 960                 var uintL remaining_k = ((uintL)1 << k) + 1 - ceiling((uintL)len1*intDsize,max_l_table[k]);
 961                 var uintL max_piecelen_k = floor(remaining_k*max_l_table[k],intDsize);
 962                 var uintL numpieces_k = ceiling(len2,max_piecelen_k);
 963                 var uintL remaining_k1 = ((uintL)1 << (k+1)) + 1 - ceiling((uintL)len1*intDsize,max_l_table[k+1]);
 964                 var uintL max_piecelen_k1 = floor(remaining_k1*max_l_table[k+1],intDsize);
 965                 var uintL numpieces_k1 = ceiling(len2,max_piecelen_k1);
 966                 if (numpieces_k <= 2*numpieces_k1) {
 967                         // keep k
 968                         len2p = max_piecelen_k;
 969                 } else {
 970                         // choose k+1
 971                         k = k+1;
 972                         len2p = max_piecelen_k1;
 973                 }
 974         }
 975         var const uintL l = max_l_table[k];
 976         var const uintL n = k;
 977         var const uintL N = (uintL)1 << n;
 978         CL_ALLOCA_STACK;
 979         var fftc_complex* const x = cl_alloc_array(fftc_complex,N);
 980         var fftc_complex* const y = cl_alloc_array(fftc_complex,N);
 981         #ifdef DEBUG_FFTC
 982         var fftc_complex* const z = cl_alloc_array(fftc_complex,N);
 983         #else
 984         var fftc_complex* const z = x; // put z in place of x - saves memory
 985         #endif
 986         var uintD* const tmpprod1 = cl_alloc_array(uintD,len1+1);
 987         var uintL tmpprod_len = floor(l<<n,intDsize)+6;
 988         var uintD* const tmpprod = cl_alloc_array(uintD,tmpprod_len);
 989         var uintL destlen = len1+len2;
 990         clear_loop_lsp(destptr,destlen);
 991         do {
 992                 if (len2p > len2)
 993                         len2p = len2;
 994                 if (len2p == 1) {
 995                         // cheap case
 996                         var uintD* tmpptr = arrayLSDptr(tmpprod1,len1+1);
 997                         mulu_loop_lsp(lspref(sourceptr2,0),sourceptr1,tmpptr,len1);
 998                         if (addto_loop_lsp(tmpptr,destptr,len1+1))
 999                                 if (inc_loop_lsp(destptr lspop (len1+1),destlen-(len1+1)))
1000                                         cl_abort();
1001                 } else {
1002                         var bool squaring = ((sourceptr1 == sourceptr2) && (len1 == len2p));
1003                         // Fill factor x.
1004                         fill_factor(N,x,l,sourceptr1,len1);
1005                         // Fill factor y.
1006                         if (!squaring)
1007                                 fill_factor(N,y,l,sourceptr2,len2p);
1008                         // Multiply.
1009                         if (!squaring)
1010                                 fftc_convolution(n,N, &x[0], &y[0], &z[0]);
1011                         else
1012                                 fftc_convolution(n,N, &x[0], &x[0], &z[0]);
1013                         #ifdef DEBUG_FFTC
1014                         // Check result.
1015                         {
1016                                 var fftc_real re_lo_limit = (fftc_real)(-0.5);
1017                                 var fftc_real re_hi_limit = (fftc_real)N * fftc_pow2_table[l] * fftc_pow2_table[l] + (fftc_real)0.5;
1018                                 var fftc_real im_lo_limit = (fftc_real)(-0.5);
1019                                 var fftc_real im_hi_limit = (fftc_real)0.5;
1020                                 for (var uintL i = 0; i < N; i++) {
1021                                         if (!(z[i].im > im_lo_limit
1022                                               && z[i].im < im_hi_limit))
1023                                                 cl_abort();
1024                                         if (!(z[i].re > re_lo_limit
1025                                               && z[i].re < re_hi_limit))
1026                                                 cl_abort();
1027                                 }
1028                         }
1029                         #endif
1030                         var uintD* tmpLSDptr = arrayLSDptr(tmpprod,tmpprod_len);
1031                         var uintD* tmpMSDptr = unfill_product(n,N,z,l,tmpLSDptr);
1032                         var uintL tmplen =
1033                           #if CL_DS_BIG_ENDIAN_P
1034                             tmpLSDptr - tmpMSDptr;
1035                           #else
1036                             tmpMSDptr - tmpLSDptr;
1037                           #endif
1038                         if (tmplen > tmpprod_len)
1039                           cl_abort();
1040                         // Add result to destptr[-destlen..-1]:
1041                         if (tmplen > destlen) {
1042                                 if (test_loop_msp(tmpMSDptr,tmplen-destlen))
1043                                         cl_abort();
1044                                 tmplen = destlen;
1045                         }
1046                         if (addto_loop_lsp(tmpLSDptr,destptr,tmplen))
1047                                 if (inc_loop_lsp(destptr lspop tmplen,destlen-tmplen))
1048                                         cl_abort();
1049                 }
1050                 // Decrement len2.
1051                 destptr = destptr lspop len2p;
1052                 destlen -= len2p;
1053                 sourceptr2 = sourceptr2 lspop len2p;
1054                 len2 -= len2p;
1055         } while (len2 > 0);
1056 }
1057
1058 #ifndef _CHECKSUM
1059 #define _CHECKSUM
1060
1061 // Compute a checksum: number mod (2^intDsize-1).
1062 static uintD compute_checksum (const uintD* sourceptr, uintC len)
1063 {
1064         var uintD tmp = ~(uintD)0; // -1-(sum mod 2^intDsize-1), always >0
1065         do {
1066                 var uintD digit = lsprefnext(sourceptr);
1067                 if (digit < tmp)
1068                         tmp -= digit; // subtract digit
1069                 else
1070                         tmp -= digit+1; // subtract digit-(2^intDsize-1)
1071         } while (--len > 0);
1072         return ~tmp;
1073 }
1074
1075 // Multiply two checksums modulo (2^intDsize-1).
1076 static inline uintD multiply_checksum (uintD checksum1, uintD checksum2)
1077 {
1078         var uintD checksum;
1079         var uintD cksum_hi;
1080         #if HAVE_DD
1081         var uintDD cksum = muluD(checksum1,checksum2);
1082         cksum_hi = highD(cksum); checksum = lowD(cksum);
1083         #else
1084         muluD(checksum1,checksum2, cksum_hi =, checksum =);
1085         #endif
1086         if ((checksum += cksum_hi) + 1 <= cksum_hi)
1087                 checksum += 1;
1088         return checksum;
1089 }
1090
1091 #endif // _CHECKSUM
1092
1093 static void mulu_fftcomplex (const uintD* sourceptr1, uintC len1,
1094                              const uintD* sourceptr2, uintC len2,
1095                              uintD* destptr)
1096 {
1097         // Compute checksums of the arguments and multiply them.
1098         var uintD checksum1 = compute_checksum(sourceptr1,len1);
1099         var uintD checksum2 = compute_checksum(sourceptr2,len2);
1100         var uintD checksum = multiply_checksum(checksum1,checksum2);
1101         mulu_fftcomplex_nocheck(sourceptr1,len1,sourceptr2,len2,destptr);
1102         if (!(checksum == compute_checksum(destptr,len1+len2))) {
1103                 fprint(cl_stderr, "FFT problem: checksum error\n");
1104                 cl_abort();
1105         }
1106 }