cl_DS_mul_fftm.h

   1 // Fast integer multiplication using FFT in a modular ring.
   2 // Bruno Haible 14.5.,16.5.1996
   3
   4 // FFT in the complex domain has the drawback that it needs careful round-off
   5 // error analysis. So here we choose another field of characteristic 0: Q_p.
   6 // Since Q_p contains exactly the (p-1)th roots of unity, we choose
   7 // p == 1 mod N and have the Nth roots of unity (N = 2^n) in Q_p and
   8 // even in Z_p. Actually, we compute in Z/(p^m Z).
   9
  10 // All operations the FFT algorithm needs is addition, subtraction,
  11 // multiplication, multiplication by the Nth root and unity and division
  12 // by N. Hence we can use the domain Z/(p^m Z) even if p is not a prime!
  13
  14 // We use the Schönhage-Strassen choice of the modulus: p = 2^R+1. This
  15 // has two big advantages: Multiplication and division by 2 (which is a
  16 // (2R)th root of unity) or a power of 2 is just a shift and an subtraction.
  17 // And multiplication mod p is just a normal multiplication, followed by
  18 // a subtraction.
  19 // In order to exploit the (2R)th root of unity for FFT, we choose R = 2^r,
  20 // and do an FFT of size M with M = 2^m and M | 2R.
  21
  22 // Say we want to compute the product of two integers with N1 and N2 bits,
  23 // respectively. We choose N >= N1+N2 and K, R, M with
  24 //      ceiling(N1/K)+ceiling(N2/K)-1 <= M  (i.e. roughly N <= K*M),
  25 //      2*K+ceiling(log2(M)) <= R,
  26 //      R = 2^r, M = 2^m, M | 2R.
  27 // We then split each of the factors in M K-bit chunks each, and do
  28 // an FFT mod p = 2^R+1. We then recover the convolution of the chunks
  29 // from the FFT product (the first inequality ensures that this is possible).
  30 // The second inequality ensures that we have no overflow, i.e. the
  31 // convolution result is valid in Z, not only in Z/pZ.
  32
  33 // The computation time (bit complexity) will be proportional to
  34 //    Mul(N) = O(M log(M) * O(2R)) + M * Mul(R+1).
  35 // Hence we try to choose R as small as possible.
  36 // Roughly, R >= 2*K, R >= M/2, hence R^2 >= K*M >= N.
  37
  38 // For example, when N1 = N2 = 1000000:
  39 // Choosing R = 1024, M = 2048, K = 506, ceiling(N1/K) = ceiling(N2/K) = 1977,
  40 // M >= 3953, doesn't work.
  41 // Choosing R = 2048, M = 4096, K = 1018, ceiling(N1/K) = ceiling(N2/K) = 983,
  42 // M >= 1965, works.
  43 // Actually, we will also want  intDsize | K,  so that splitting into chunks
  44 // and putting together the result can be done without shifts. So
  45 // choose R = 2048, M = 4096, K = 992, ceiling(N1/K) = ceiling(N2/K) = 1009.
  46 // We see that M = 2048 suffices.
  47
  48 // In contrast to Nussbaumer multiplication, here we can use the standard
  49 // Karatsuba algorithm for multiplication mod p = 2^R+1. We don't have to
  50 // recurse until N=1.
  51
  52
  53 // Define this for (cheap) consistency checks.
  54 //#define DEBUG_FFTM
  55
  56
  57 // Operations modulo p = 2^R+1, each chunk represented as chlen words
  58 // (chlen = floor(R/intDsize)+1).
  59
  60 static inline void assign (const uintL R, const uintL chlen,
  61                            const uintD* a, uintD* r)
  62 {
  63         unused R;
  64         copy_loop_lsp(a,r,chlen);
  65 }
  66
  67 // r := (a + b) mod p
  68 static void addm (const uintL R, const uintL chlen,
  69                   const uintD* a, const uintD* b, uintD* r)
  70 {
  71         unused R;
  72         // r := a+b.
  73         add_loop_lsp(a,b, r, chlen);
  74 #if 0
  75         if (lspref(r,chlen-1) < ((uintD)1 << (R % intDsize)))
  76                 return;
  77         if (lspref(r,chlen-1) == ((uintD)1 << (R % intDsize)))
  78                 if (!DS_test_loop(r lspop (chlen-1),chlen-1,r))
  79                         return;
  80         // r >= p, so subtract r := r-p.
  81         lspref(r,chlen-1) -= ((uintD)1 << (R % intDsize));
  82         dec_loop_lsp(r,chlen);
  83 #else
  84         if (lspref(r,chlen-1) < 1)
  85                 return;
  86         if (lspref(r,chlen-1) == 1)
  87                 if (!DS_test_loop(r lspop (chlen-1),chlen-1,r))
  88                         return;
  89         // r >= p, so subtract r := r-p.
  90         lspref(r,chlen-1) -= 1;
  91         dec_loop_lsp(r,chlen);
  92 #endif
  93 }
  94
  95 // r := (a - b) mod p
  96 static void subm (const uintL R, const uintL chlen,
  97                   const uintD* a, const uintD* b, uintD* r)
  98 {
  99         unused R;
 100         // r := a-b.
 101         sub_loop_lsp(a,b, r, chlen);
 102 #if 0
 103         if ((sintD)lspref(r,chlen-1) >= 0)
 104                 return;
 105         // r < 0, so add r := r+p.
 106         lspref(r,chlen-1) += ((uintD)1 << (R % intDsize));
 107         inc_loop_lsp(r,chlen);
 108 #else
 109         if ((sintD)lspref(r,chlen-1) >= 0)
 110                 return;
 111         // r < 0, so add r := r+p.
 112         lspref(r,chlen-1) += 1;
 113         inc_loop_lsp(r,chlen);
 114 #endif
 115 }
 116
 117 // r := (a << s) mod p (0 <= s < R).
 118 // Assume that a and r don't overlap.
 119 static void shiftleftm (const uintL R, const uintL chlen,
 120                         const uintD* a, uintL s, uintD* r)
 121 {
 122         // Write a = 2^(R-s)*b + c, then
 123         // a << s = 2^R*b + (c << s) = (c << s) - b.
 124 #if 0
 125         if (chlen == 1) {
 126                 // R < intDsize.
 127                 var uintD b = lspref(a,0) >> (R-s);
 128                 var uintD c = lspref(a,0) & (((uintD)1 << (R-s)) - 1);
 129                 c = c << s;
 130                 c -= b;
 131                 if ((sintD)c < 0)
 132                         c += ((uintD)1 << R) + 1;
 133                 lspref(r,0) = c;
 134                 return;
 135         }
 136 #endif
 137         // Here R >= intDsize, hence intDsize | R.
 138         if ((s % intDsize) == 0) {
 139                 var uintP lenb = s/intDsize;
 140                 var uintP lenc = (R-s)/intDsize;
 141                 // chlen = 1 + lenb + lenc.
 142                 lspref(r,lenb+lenc) = 0;
 143                 copy_loop_lsp(a,r lspop lenb,lenc);
 144                 copy_loop_lsp(a lspop lenc,r,lenb);
 145                 if ((lspref(a,lenb+lenc) > 0) || neg_loop_lsp(r,lenb)) // -b gives carry?
 146                         if (dec_loop_lsp(r lspop lenb,lenc))
 147                                 // add p = 2^R+1 to compensate with carry
 148                                 inc_loop_lsp(r,chlen);
 149         } else {
 150                 var uintP lenb = floor(s,intDsize);
 151                 var uintP lenc = floor(R-s,intDsize)+1;
 152                 // chlen = 1 + lenb + lenc.
 153                 s = s % intDsize;
 154                 lspref(r,lenb+lenc) = 0;
 155                 var uintD b0 = shiftleftcopy_loop_lsp(a,r lspop lenb,lenc,s);
 156                 var uintD bov;
 157                 if (lenb == 0)
 158                         bov = b0;
 159                 else {
 160                         bov = shiftleftcopy_loop_lsp(a lspop lenc,r,lenb,s);
 161                         lspref(r,0) |= b0;
 162                 }
 163                 bov |= lspref(a,lenb+lenc) << s;
 164                 if (neg_loop_lsp(r,lenb))
 165                         bov++;
 166                 if (lspref(r,lenb) >= bov)
 167                         lspref(r,lenb) -= bov;
 168                 else {
 169                         lspref(r,lenb) -= bov;
 170                         if (dec_loop_lsp(r lspop (lenb+1),lenc-1))
 171                                 // add p = 2^R+1 to compensate with carry
 172                                 inc_loop_lsp(r,chlen);
 173                 }
 174         }
 175 }
 176
 177 // r := (a * b) mod p
 178 static void mulm (const uintL R, const uintL chlen,
 179                   const uintD* a, const uintD* b, uintD* r)
 180 {
 181         unused R;
 182         // The leading digits are very likely to be 0.
 183         var uintP a_len = chlen;
 184         if (lspref(a,a_len-1) == 0)
 185                 do {
 186                         a_len--;
 187                 } while ((a_len > 0) && (lspref(a,a_len-1) == 0));
 188         if (a_len == 0) {
 189                 clear_loop_lsp(r,chlen);
 190                 return;
 191         }
 192         var uintP b_len = chlen;
 193         if (lspref(b,b_len-1) == 0)
 194                 do {
 195                         b_len--;
 196                 } while ((b_len > 0) && (lspref(b,b_len-1) == 0));
 197         if (b_len == 0) {
 198                 clear_loop_lsp(r,chlen);
 199                 return;
 200         }
 201         CL_SMALL_ALLOCA_STACK;
 202         var uintD* tmp = cl_small_alloc_array(uintD,2*chlen);
 203         cl_UDS_mul(a,a_len, b,b_len, arrayLSDptr(tmp,2*chlen));
 204         DS_clear_loop(arrayMSDptr(tmp,2*chlen),2*chlen-(a_len+b_len),arrayLSDptr(tmp,2*chlen) lspop (a_len+b_len));
 205         // To divide c (0 <= c < p^2) by p = 2^R+1,
 206         // we set q := floor(c/2^R) and r := c - q*p = (c mod 2^R) - q.
 207         // If this becomes negative, set r := r + p (at most twice).
 208         // (This works because  floor(c/p) <= q <= floor(c/p)+2.)
 209         // (Actually, here, 0 <= c <= (p-1)^2, hence
 210         // floor(c/p) <= q <= floor(c/p)+1, so we have
 211         // to set r := r + p at most once!)
 212 #if 0
 213         if (chlen == 1) {
 214                 // R < intDsize.
 215                 var uintD r0 = (arrayLSref(tmp,2,0) & (((uintD)1 << R) - 1))
 216                              - ((arrayLSref(tmp,2,1) << (intDsize-R)) | (arrayLSref(tmp,2,0) >> R));
 217                 if ((sintD)r0 < 0)
 218                         r0 += ((uintD)1 << R) + 1;
 219                 lspref(r,0) = r0;
 220                 return;
 221         }
 222 #endif
 223         // Here R >= intDsize, hence intDsize | R.
 224         // R/intDsize = chlen-1.
 225         // arrayLSref(tmp,2*chlen,2*chlen-1) = 0, arrayLSref(tmp,2*chlen,2*chlen-2) <= 1.
 226         lspref(r,chlen-1) = 0;
 227         if (sub_loop_lsp(arrayLSDptr(tmp,2*chlen),arrayLSDptr(tmp,2*chlen) lspop (chlen-1),r,chlen-1) || arrayLSref(tmp,2*chlen,2*chlen-2))
 228                 // add p = 2^R+1 to compensate with carry
 229                 inc_loop_lsp(r,chlen);
 230 }
 231
 232 // b := (a / 2) mod p
 233 static void shiftm (const uintL R, const uintL chlen,
 234                     const uintD* a, uintD* b)
 235 {
 236         unused R;
 237         shiftrightcopy_loop_msp(a lspop chlen,b lspop chlen,chlen,1,0);
 238         if (lspref(a,0) & 1) {
 239                 // ((a + p) >> 1) = (a >> 1) + (p>>1) + 1.
 240 #if 0
 241                 if (chlen == 1)
 242                         // R < intDsize.
 243                         lspref(b,0) |= ((uintD)1 << (R-1));
 244                 else
 245 #endif
 246                         // intDsize | R.
 247                         lspref(b,chlen-2) |= ((uintD)1 << (intDsize-1));
 248                 inc_loop_lsp(b,chlen);
 249         }
 250 }
 251
 252
 253 #ifndef _BIT_REVERSE
 254 #define _BIT_REVERSE
 255 // Reverse an n-bit number x. n>0.
 256 static uintL bit_reverse (uintL n, uintL x)
 257 {
 258         var uintL y = 0;
 259         do {
 260                 y <<= 1;
 261                 y |= (x & 1);
 262                 x >>= 1;
 263         } while (!(--n == 0));
 264         return y;
 265 }
 266 #endif
 267
 268 static void mulu_fftm (const uintL r, const uintL R, // R = 2^r
 269                        const uintL m, const uintL M, // M = 2^m
 270                        const uintL k, // K = intDsize*k
 271                        const uintD* sourceptr1, uintC len1,
 272                        const uintD* sourceptr2, uintC len2,
 273                        uintD* destptr)
 274 // Assume:
 275 //      ceiling(len1/k)+ceiling(len2/k)-1 <= M,
 276 //      2*K+m <= R,
 277 //      R = 2^r, M = 2^m, M | 2R.
 278 //      m > 0.
 279 {
 280         var const uintL chlen = floor(R,intDsize)+1; // chunk length (in words)
 281         CL_ALLOCA_STACK;
 282         var uintD* const arrX = cl_alloc_array(uintD,chlen<<m);
 283         var uintD* const arrY = cl_alloc_array(uintD,chlen<<m);
 284         #ifdef DEBUG_FFTM
 285         var uintD* const arrZ = cl_alloc_array(uintD,chlen<<m);
 286         #else
 287         var uintD* const arrZ = arrX; // put Z in place of X - saves memory
 288         #endif
 289         #define X(i) arrayLSDptr(&arrX[chlen*(i)],chlen)
 290         #define Y(i) arrayLSDptr(&arrY[chlen*(i)],chlen)
 291         #define Z(i) (arrayLSDptr(&arrZ[chlen*(i)],chlen))
 292         var uintD* tmp;
 293         var uintD* sum;
 294         var uintD* diff;
 295         num_stack_alloc(chlen,,tmp=);
 296         num_stack_alloc(chlen,,sum=);
 297         num_stack_alloc(chlen,,diff=);
 298         var bool squaring = ((sourceptr1 == sourceptr2) && (len1 == len2));
 299         var uintL i;
 300         // Initialize factors X(i) and Y(i).
 301         {
 302                 var const uintD* sptr = sourceptr1;
 303                 var uintL slen = len1;
 304                 for (i = 0; i < M; i++) {
 305                         var uintD* ptr = X(i);
 306                         if (slen >= k) {
 307                                 copy_loop_lsp(sptr,ptr,k);
 308                                 clear_loop_lsp(ptr lspop k,chlen-k);
 309                                 sptr = sptr lspop k;
 310                                 slen -= k;
 311                         } else {
 312                                 copy_loop_lsp(sptr,ptr,slen);
 313                                 clear_loop_lsp(ptr lspop slen,chlen-slen);
 314                                 i++;
 315                                 break;
 316                         }
 317                 }
 318                 // X(i) := ... := X(M-1) := 0
 319                 clear_loop_up(&arrX[chlen*i],chlen*(M-i));
 320         }
 321         if (!squaring) {
 322                 var const uintD* sptr = sourceptr2;
 323                 var uintL slen = len2;
 324                 for (i = 0; i < M; i++) {
 325                         var uintD* ptr = Y(i);
 326                         if (slen >= k) {
 327                                 copy_loop_lsp(sptr,ptr,k);
 328                                 clear_loop_lsp(ptr lspop k,chlen-k);
 329                                 sptr = sptr lspop k;
 330                                 slen -= k;
 331                         } else {
 332                                 copy_loop_lsp(sptr,ptr,slen);
 333                                 clear_loop_lsp(ptr lspop slen,chlen-slen);
 334                                 i++;
 335                                 break;
 336                         }
 337                 }
 338                 // Y(i) := ... := Y(M-1) := 0
 339                 clear_loop_up(&arrY[chlen*i],chlen*(M-i));
 340         }
 341         // Do an FFT of length M on X. w = 2^(2R/M) = 2^(2^(r+1-m)).
 342         {
 343                 var sintL l;
 344                 /* l = m-1 */ {
 345                         var const uintL tmax = M>>1; // tmax = 2^(m-1)
 346                         for (var uintL t = 0; t < tmax; t++) {
 347                                 var uintL i1 = t;
 348                                 var uintL i2 = i1 + tmax;
 349                                 // Butterfly: replace (X(i1),X(i2)) by
 350                                 // (X(i1) + X(i2), X(i1) - X(i2)).
 351                                 assign(R,chlen, X(i2), tmp);
 352                                 subm(R,chlen, X(i1),tmp, X(i2));
 353                                 addm(R,chlen, X(i1),tmp, X(i1));
 354                         }
 355                 }
 356                 for (l = m-2; l>=0; l--) {
 357                         var const uintL smax = (uintL)1 << (m-1-l);
 358                         var const uintL tmax = (uintL)1 << l;
 359                         for (var uintL s = 0; s < smax; s++) {
 360                                 // w^exp = 2^(exp << (r+1-m)).
 361                                 var uintL exp = bit_reverse(m-1-l,s) << (r-(m-1-l));
 362                                 for (var uintL t = 0; t < tmax; t++) {
 363                                         var uintL i1 = (s << (l+1)) + t;
 364                                         var uintL i2 = i1 + tmax;
 365                                         // Butterfly: replace (X(i1),X(i2)) by
 366                                         // (X(i1) + w^exp*X(i2), X(i1) - w^exp*X(i2)).
 367                                         shiftleftm(R,chlen, X(i2),exp, tmp);
 368                                         subm(R,chlen, X(i1),tmp, X(i2));
 369                                         addm(R,chlen, X(i1),tmp, X(i1));
 370                                 }
 371                         }
 372                 }
 373         }
 374         // Do an FFT of length M on Y. w = 2^(2R/M) = 2^(2^(r+1-m)).
 375         if (!squaring) {
 376                 var sintL l;
 377                 /* l = m-1 */ {
 378                         var const uintL tmax = M>>1; // tmax = 2^(m-1)
 379                         for (var uintL t = 0; t < tmax; t++) {
 380                                 var uintL i1 = t;
 381                                 var uintL i2 = i1 + tmax;
 382                                 // Butterfly: replace (Y(i1),Y(i2)) by
 383                                 // (Y(i1) + Y(i2), Y(i1) - Y(i2)).
 384                                 assign(R,chlen, Y(i2), tmp);
 385                                 subm(R,chlen, Y(i1),tmp, Y(i2));
 386                                 addm(R,chlen, Y(i1),tmp, Y(i1));
 387                         }
 388                 }
 389                 for (l = m-2; l>=0; l--) {
 390                         var const uintL smax = (uintL)1 << (m-1-l);
 391                         var const uintL tmax = (uintL)1 << l;
 392                         for (var uintL s = 0; s < smax; s++) {
 393                                 // w^exp = 2^(exp << (r+1-m)).
 394                                 var uintL exp = bit_reverse(m-1-l,s) << (r-(m-1-l));
 395                                 for (var uintL t = 0; t < tmax; t++) {
 396                                         var uintL i1 = (s << (l+1)) + t;
 397                                         var uintL i2 = i1 + tmax;
 398                                         // Butterfly: replace (Y(i1),Y(i2)) by
 399                                         // (Y(i1) + w^exp*Y(i2), Y(i1) - w^exp*Y(i2)).
 400                                         shiftleftm(R,chlen, Y(i2),exp, tmp);
 401                                         subm(R,chlen, Y(i1),tmp, Y(i2));
 402                                         addm(R,chlen, Y(i1),tmp, Y(i1));
 403                                 }
 404                         }
 405                 }
 406         }
 407         // Multiply the transformed vectors into Z.
 408         if (!squaring) {
 409                 for (i = 0; i < M; i++)
 410                         mulm(R,chlen, X(i),Y(i),Z(i));
 411         } else {
 412                 for (i = 0; i < M; i++)
 413                         mulm(R,chlen, X(i),X(i),Z(i));
 414         }
 415         // Undo an FFT of length M on Z. w = 2^(2R/M) = 2^(2^(r+1-m)).
 416         {
 417                 var uintL l;
 418                 for (l = 0; l < m-1; l++) {
 419                         var const uintL smax = (uintL)1 << (m-1-l);
 420                         var const uintL tmax = (uintL)1 << l;
 421                         /* s = 0, exp = 0 */ {
 422                                 for (var uintL t = 0; t < tmax; t++) {
 423                                         var uintL i1 = t;
 424                                         var uintL i2 = i1 + tmax;
 425                                         // Inverse Butterfly: replace (Z(i1),Z(i2)) by
 426                                         // ((Z(i1)+Z(i2))/2, (Z(i1)-Z(i2))/(2*w^exp)),
 427                                         // with exp <-- 0.
 428                                         addm(R,chlen, Z(i1),Z(i2), sum);
 429                                         subm(R,chlen, Z(i1),Z(i2), diff);
 430                                         shiftm(R,chlen, sum, Z(i1));
 431                                         shiftm(R,chlen, diff, Z(i2));
 432                                 }
 433                         }
 434                         for (var uintL s = 1; s < smax; s++) {
 435                                 // w^exp = 2^(exp << (r+1-m)).
 436                                 var uintL exp = bit_reverse(m-1-l,s) << (r-(m-1-l));
 437                                 exp = R - exp; // negate exp (use w^-1 instead of w)
 438                                 for (var uintL t = 0; t < tmax; t++) {
 439                                         var uintL i1 = (s << (l+1)) + t;
 440                                         var uintL i2 = i1 + tmax;
 441                                         // Inverse Butterfly: replace (Z(i1),Z(i2)) by
 442                                         // ((Z(i1)+Z(i2))/2, (Z(i1)-Z(i2))/(2*w^exp)),
 443                                         // with exp <-- (M/2 - exp).
 444                                         addm(R,chlen, Z(i1),Z(i2), sum);
 445                                         subm(R,chlen, Z(i2),Z(i1), diff); // note that w^(M/2) = 2^R = -1
 446                                         shiftm(R,chlen, sum, Z(i1));
 447                                         shiftleftm(R,chlen, diff,exp-1, Z(i2));
 448                                 }
 449                         }
 450                 }
 451                 /* l = m-1 */ {
 452                         var const uintL tmax = M>>1; // tmax = 2^(m-1)
 453                         for (var uintL t = 0; t < tmax; t++) {
 454                                 var uintL i1 = t;
 455                                 var uintL i2 = i1 + tmax;
 456                                 // Inverse Butterfly: replace (Z(i1),Z(i2)) by
 457                                 // ((Z(i1)+Z(i2))/2, (Z(i1)-Z(i2))/2).
 458                                 addm(R,chlen, Z(i1),Z(i2), sum);
 459                                 subm(R,chlen, Z(i1),Z(i2), diff);
 460                                 shiftm(R,chlen, sum, Z(i1));
 461                                 shiftm(R,chlen, diff, Z(i2));
 462                         }
 463                 }
 464         }
 465         var uintC zchlen = 2*k + ceiling(m,intDsize);
 466         #ifdef DEBUG_FFTM
 467         // Check that every Z(i) has at most 2*K+m bits.
 468         {
 469                 var uintC zerodigits = chlen - zchlen;
 470                 for (i = 0; i < M; i++)
 471                         if (DS_test_loop(Z(i) lspop chlen,zerodigits,Z(i) lspop zchlen))
 472                                 cl_abort();
 473         }
 474         #endif
 475         // Put together result.
 476         var uintC destlen = len1+len2;
 477         clear_loop_lsp(destptr,destlen);
 478         for (i = 0; i < M; i++, destptr = destptr lspop k, destlen -= k) {
 479                 if (zchlen <= destlen) {
 480                         if (addto_loop_lsp(Z(i),destptr,zchlen))
 481                                 if (inc_loop_lsp(destptr lspop zchlen,destlen-zchlen))
 482                                         cl_abort();
 483                 } else {
 484                         #ifdef DEBUG_FFTM
 485                         if (DS_test_loop(Z(i) lspop zchlen,zchlen-destlen,Z(i) lspop destlen))
 486                                 cl_abort();
 487                         #endif
 488                         if (addto_loop_lsp(Z(i),destptr,destlen))
 489                                 cl_abort();
 490                 }
 491                 if (destlen <= k) {
 492                         i++;
 493                         break;
 494                 }
 495         }
 496         #ifdef DEBUG_FFTM
 497         // Check that Z(i)..Z(M-1) are all zero.
 498         if (test_loop_up(&arrZ[chlen*i],chlen*(M-i)))
 499                 cl_abort();
 500         #endif
 501         #undef diff
 502         #undef sum
 503         #undef tmp
 504         #undef Z
 505         #undef Y
 506         #undef X
 507 }
 508
 509 // The running time of mulu_fftm() is roughly
 510 //      O(M log(M) * O(2R)) + M * R^(1+c),  where c = log3/log2 - 1 = 0.585...
 511 // Try to minimize this given the constraints
 512 //      ceiling(len1/k)+ceiling(len2/k)-1 <= M,
 513 //      K = intDsize*k, 2*K+m <= R,
 514 //      R = 2^r, M = 2^m, M | 2R.
 515 //      m > 0.
 516 // Necessary conditions:
 517 //      len1+len2 <= k*(M+1),  intDsize*(len1+len2) <= K*(M+1) <= (R-1)/2 * (2*R+1) < R^2.
 518 //      2*intDsize+1 <= R, log2_intDsize+1 < r.
 519 // So we start with len1 <= len2,
 520 //    r := max(log2_intDsize+2,ceiling(ceiling(log2(intDsize*2*len1))/2)), R := 2^r.
 521 //    try
 522 //      kmax := floor((R-(r+1))/(2*intDsize)), Kmax := intDsize*kmax,
 523 //      m := max(1,ceiling(log2(2*ceiling(len1/kmax)-1))), M := 2^m,
 524 //      if m > r+1 retry with r <- r+1.
 525 //    [Now we are sure that we can at least multiply len1 and len1 digits using these
 526 //     values of r and m, symbolically (r,m) OKFOR (len1,len1).]
 527 //    [Normally, we will have m=r+1 or m=r.]
 528 //    For (len1,len2), we might want to split the second integer into pieces.
 529 //    If (r,m) OKFOR (len1,len2)
 530 //      If (r-1,m) OKFOR (len1,ceiling(len2/2))
 531 //        then use (r-1,m) and two pieces
 532 //        else use (r,m) and one piece
 533 //    else
 534 //      q1 := number of pieces len2 needs to be splitted into to be OKFOR (r,m),
 535 //      If m<r+1
 536 //        q2 := number of pieces len2 needs to be splitted into to be OKFOR (r,m+1),
 537 //        If 2*q2 <= q1
 538 //          then use (r,m+1) and q2 pieces
 539 //          else use (r,m) and q1 pieces
 540 //      else m=r+1
 541 //        q2 := number of pieces len2 needs to be splitted into to be OKFOR (r+1,m),
 542 //        If 3*q2 <= q1
 543 //          then use (r+1,m) and q2 pieces
 544 //          else use (r,m) and q1 pieces
 545
 546 // Because we always choose r >= log2_intDsize+2, R >= 4*intDsize, so chlen >= 5.
 547 // To avoid infinite recursion, mulu_fft_modm() must only be called with len1 > 5.
 548
 549 static bool okfor (uintL r, uintL m, uintC len1, uintC len2)
 550 {
 551         var uintL R = (uintL)1 << r;
 552         var uintL M = (uintL)1 << m;
 553         var uintL k = floor(R-m,2*intDsize);
 554         return (ceiling(len1,k)+ceiling(len2,k) <= M+1);
 555 }
 556
 557 static uintL numpieces (uintL r, uintL m, uintC len1, uintC len2)
 558 {
 559         var uintL R = (uintL)1 << r;
 560         var uintL M = (uintL)1 << m;
 561         var uintL k = floor(R-m,2*intDsize);
 562         var uintL piecelen2 = (M+1-ceiling(len1,k))*k;
 563         #ifdef DEBUG_FFTM
 564         if ((sintL)piecelen2 <= 0)
 565                 cl_abort();
 566         #endif
 567         return ceiling(len2,piecelen2);
 568 }
 569
 570 static void mulu_fft_modm (const uintD* sourceptr1, uintC len1,
 571                            const uintD* sourceptr2, uintC len2,
 572                            uintD* destptr)
 573   // Called only with 6 <= len1 <= len2.
 574   {
 575         var uint32 n;
 576         integerlength32(len1-1, n=); // 2^(n-1) < len1 <= 2^n
 577         var uintL r;
 578         var uintL m;
 579         r = ceiling(log2_intDsize+1+n,2);
 580         if (r < log2_intDsize+2)
 581                 r = log2_intDsize+2;
 582         retry: {
 583                 var uintL k = floor(((uintL)1 << r) - (r+1), 2*intDsize);
 584                 var uintL M = 2*ceiling(len1,k)-1;
 585                 integerlength32(M, m=);
 586                 if (m == 0)
 587                         m = 1;
 588                 if (m > r+1) {
 589                         r++;
 590                         goto retry;
 591                 }
 592         }
 593         #ifdef DEBUG_FFTM
 594         if (!(m > 0 && m <= r+1 && okfor(r,m,len1,len1)))
 595                 cl_abort();
 596         #endif
 597         if (okfor(r,m,len1,len2)) {
 598                 if ((m <= r) && (r > log2_intDsize+2) && okfor(r-1,m,len1,ceiling(len2,2)))
 599                         if (!(sourceptr1 == sourceptr2 && len1 == len2)) // when squaring, keep one piece
 600                                 r--;
 601         } else {
 602                 var uintL q1 = numpieces(r,m,len1,len2);
 603                 if (m <= r) {
 604                         var uintL q2 = numpieces(r,m+1,len1,len2);
 605                         if (2*q2 <= q1)
 606                                 m++;
 607                 } else {
 608                         var uintL q2 = numpieces(r+1,m,len1,len2);
 609                         if (3*q2 <= q1)
 610                                 r++;
 611                 }
 612         }
 613         var uintL R = (uintL)1 << r;
 614         var uintL M = (uintL)1 << m;
 615         var uintL k = floor(R-m,2*intDsize);
 616         var uintL piecelen2 = (M+1-ceiling(len1,k))*k;
 617         if (piecelen2 >= len2) {
 618                 // One piece only.
 619                 mulu_fftm(r,R, m,M, k, sourceptr1,len1, sourceptr2,len2, destptr);
 620                 return;
 621         }
 622         CL_ALLOCA_STACK;
 623         var uintD* tmpptr;
 624         num_stack_alloc(len1+piecelen2,,tmpptr=);
 625         var uintL destlen = len1+len2;
 626         clear_loop_lsp(destptr,destlen);
 627         do {
 628                 var uintL len2p; // length of a piece of source2
 629                 len2p = piecelen2;
 630                 if (len2p > len2)
 631                         len2p = len2;
 632                 // len2p = min(piecelen2,len2).
 633                 var uintL destlenp = len1 + len2p;
 634                 // destlenp = min(len1+piecelen2,destlen).
 635                 // Use tmpptr[-destlenp..-1].
 636                 if (len2p == 1) {
 637                         // cheap case
 638                         mulu_loop_lsp(lspref(sourceptr2,0),sourceptr1,tmpptr,len1);
 639                 } else if (2*len2p < piecelen2) {
 640                         // semi-cheap case
 641                         cl_UDS_mul(sourceptr1,len1, sourceptr2,len2p, tmpptr);
 642                 } else {
 643                         mulu_fftm(r,R, m,M, k, sourceptr1,len1, sourceptr2,len2p, tmpptr);
 644                 }
 645                 if (addto_loop_lsp(tmpptr,destptr,destlenp))
 646                         if (inc_loop_lsp(destptr lspop destlenp,destlen-destlenp))
 647                                 cl_abort();
 648                 // Decrement len2.
 649                 destptr = destptr lspop len2p;
 650                 destlen -= len2p;
 651                 sourceptr2 = sourceptr2 lspop len2p;
 652                 len2 -= len2p;
 653         } while (len2 > 0);
 654 }