src/base/digitseq/cl_DS_mul_nuss.h

   1 // Fast integer multiplication using Nussbaumer's FFT based algorithm.
   2 // [Donald Ervin Knuth: The Art of Computer Programming, Vol. II:
   3 //  Seminumerical Algorithms, second edition.
   4 //  Section 4.6.4, exercise 59, p. 503, 652-654.]
   5 // [Henri Jean Nussbaumer, IEEE Trans. ASSP-28 (1980), 205-215.]
   6 // Bruno Haible 4.-5.5.1996
   7
   8 // This algorithm has the benefit of working on entire words, not single bits,
   9 // and involving no non-integer numbers. (The root of unity is chosen in
  10 // an appropriate polynomial ring.)
  11
  12 // If at the beginning all words x_i, y_i are >= 0 and < M, then
  13 // the intermediate X_{i,j}, Y_{i,j} are < M * N in absolute value
  14 // (where N = number of words), hence the |Z_{i,j}| < M^2 * N^2.
  15 // We therefore reserve 2 32-bit words for every X_{i,j} and 4 32-bit words
  16 // for every Z_{i,j}.
  17
  18 #if !(intDsize==32)
  19 #error "nussbaumer implemented only for intDsize==32"
  20 #endif
  21
  22 // Define this if you want the external loops instead of inline operations.
  23 //#define NUSS_IN_EXTERNAL_LOOPS
  24 #define NUSS_OUT_EXTERNAL_LOOPS
  25
  26 // Define this if you want inline operations which access the stack directly.
  27 // This looks like better code, but is in effect 3% slower. No idea why.
  28 //#define NUSS_ASM_DIRECT
  29
  30 // Define this for (cheap) consistency checks.
  31 //#define DEBUG_NUSS
  32
  33 // Define this for extensive consistency checks.
  34 //#define DEBUG_NUSS_OPERATIONS
  35
  36 #if (intDsize==32)
  37
  38 //typedef struct { sint32 iw1; uint32 iw0; } nuss_inword;
  39 //typedef struct { uint32 iw0; sint32 iw1; } nuss_inword;
  40 typedef struct { uintD _iw[2]; } nuss_inword;
  41 #if CL_DS_BIG_ENDIAN_P
  42   #define iw1 _iw[0]
  43   #define iw0 _iw[1]
  44 #else
  45   #define iw0 _iw[0]
  46   #define iw1 _iw[1]
  47 #endif
  48
  49 //typedef struct { sint32 ow3; uint32 ow2; uint32 ow1; uint32 ow0; } nuss_outword;
  50 //typedef struct { uint32 ow0; uint32 ow1; uint32 ow2; sint32 ow3; } nuss_outword;
  51 typedef struct { uintD _ow[4]; } nuss_outword;
  52 #if CL_DS_BIG_ENDIAN_P
  53   #define ow3 _ow[0]
  54   #define ow2 _ow[1]
  55   #define ow1 _ow[2]
  56   #define ow0 _ow[3]
  57 #else
  58   #define ow0 _ow[0]
  59   #define ow1 _ow[1]
  60   #define ow2 _ow[2]
  61   #define ow3 _ow[3]
  62 #endif
  63
  64 // r := a + b
  65 static inline void add (const nuss_inword& a, const nuss_inword& b, nuss_inword& r)
  66 {
  67 #if defined(__GNUC__) && defined(__i386__)
  68         var uintD dummy;
  69   #ifdef NUSS_ASM_DIRECT
  70         __asm__ __volatile__ (
  71                 "movl %1,%0" "\n\t"
  72                 "addl %2,%0" "\n\t"
  73                 "movl %0,%3"
  74                 : "=&q" (dummy)
  75                 : "m" (a.iw0), "m" (b.iw0), "m" (r.iw0)
  76                 : "cc"
  77                 );
  78         __asm__ __volatile__ (
  79                 "movl %1,%0" "\n\t"
  80                 "adcl %2,%0" "\n\t"
  81                 "movl %0,%3"
  82                 : "=&q" (dummy)
  83                 : "m" (a.iw1), "m" (b.iw1), "m" (r.iw1)
  84                 : "cc"
  85                 );
  86   #else
  87     #if CL_DS_BIG_ENDIAN_P
  88         __asm__ __volatile__ (
  89                 "movl 4(%1),%0" "\n\t"
  90                 "addl 4(%2),%0" "\n\t"
  91                 "movl %0,4(%3)" "\n\t"
  92                 "movl (%1),%0"  "\n\t"
  93                 "adcl (%2),%0"  "\n\t"
  94                 "movl %0,(%3)"
  95                 : "=&q" (dummy)
  96                 : "r" (&a), "r" (&b), "r" (&r)
  97                 : "cc"
  98                 );
  99     #else
 100         __asm__ __volatile__ (
 101                 "movl (%1),%0"  "\n\t"
 102                 "addl (%2),%0"  "\n\t"
 103                 "movl %0,(%3)"  "\n\t"
 104                 "movl 4(%1),%0" "\n\t"
 105                 "adcl 4(%2),%0" "\n\t"
 106                 "movl %0,4(%3)"
 107                 : "=&q" (dummy)
 108                 : "r" (&a), "r" (&b), "r" (&r)
 109                 : "cc"
 110                 );
 111     #endif
 112   #endif
 113 #elif defined(NUSS_IN_EXTERNAL_LOOPS)
 114         add_loop_lsp(arrayLSDptr(a._iw,2),arrayLSDptr(b._iw,2),arrayLSDptr(r._iw,2),2);
 115 #else
 116         var uint32 tmp;
 117
 118         tmp = a.iw0 + b.iw0;
 119         if (tmp >= a.iw0) {
 120                 // no carry
 121                 r.iw0 = tmp;
 122                 r.iw1 = a.iw1 + b.iw1;
 123         } else {
 124                 // carry
 125                 r.iw0 = tmp;
 126                 r.iw1 = a.iw1 + b.iw1 + 1;
 127         }
 128 #endif
 129 }
 130
 131 // r := a - b
 132 static inline void sub (const nuss_inword& a, const nuss_inword& b, nuss_inword& r)
 133 {
 134 #if defined(__GNUC__) && defined(__i386__)
 135         var uintD dummy;
 136   #ifdef NUSS_ASM_DIRECT
 137         __asm__ __volatile__ (
 138                 "movl %1,%0" "\n\t"
 139                 "subl %2,%0" "\n\t"
 140                 "movl %0,%3"
 141                 : "=&q" (dummy)
 142                 : "m" (a.iw0), "m" (b.iw0), "m" (r.iw0)
 143                 : "cc"
 144                 );
 145         __asm__ __volatile__ (
 146                 "movl %1,%0" "\n\t"
 147                 "sbbl %2,%0" "\n\t"
 148                 "movl %0,%3"
 149                 : "=&q" (dummy)
 150                 : "m" (a.iw1), "m" (b.iw1), "m" (r.iw1)
 151                 : "cc"
 152                 );
 153   #else
 154     #if CL_DS_BIG_ENDIAN_P
 155         __asm__ __volatile__ (
 156                 "movl 4(%1),%0" "\n\t"
 157                 "subl 4(%2),%0" "\n\t"
 158                 "movl %0,4(%3)" "\n\t"
 159                 "movl (%1),%0"  "\n\t"
 160                 "sbbl (%2),%0"  "\n\t"
 161                 "movl %0,(%3)"
 162                 : "=&q" (dummy)
 163                 : "r" (&a), "r" (&b), "r" (&r)
 164                 : "cc"
 165                 );
 166     #else
 167         __asm__ __volatile__ (
 168                 "movl (%1),%0"  "\n\t"
 169                 "subl (%2),%0"  "\n\t"
 170                 "movl %0,(%3)"  "\n\t"
 171                 "movl 4(%1),%0" "\n\t"
 172                 "sbbl 4(%2),%0" "\n\t"
 173                 "movl %0,4(%3)"
 174                 : "=&q" (dummy)
 175                 : "r" (&a), "r" (&b), "r" (&r)
 176                 : "cc"
 177                 );
 178     #endif
 179   #endif
 180 #elif defined(NUSS_IN_EXTERNAL_LOOPS)
 181         sub_loop_lsp(arrayLSDptr(a._iw,2),arrayLSDptr(b._iw,2),arrayLSDptr(r._iw,2),2);
 182 #else
 183         var uint32 tmp;
 184
 185         tmp = a.iw0 - b.iw0;
 186         if (tmp <= a.iw0) {
 187                 // no carry
 188                 r.iw0 = tmp;
 189                 r.iw1 = a.iw1 - b.iw1;
 190         } else {
 191                 // carry
 192                 r.iw0 = tmp;
 193                 r.iw1 = a.iw1 - b.iw1 - 1;
 194         }
 195 #endif
 196 }
 197
 198 // r := a * b
 199 static void mul (const nuss_inword& a, const nuss_inword& b, nuss_outword& r)
 200 {
 201 #ifdef NUSS_IN_EXTERNAL_LOOPS
 202         mulu_2loop(arrayLSDptr(a._iw,2),2, arrayLSDptr(b._iw,2),2, arrayLSDptr(r._ow,4));
 203         if ((sintD)mspref(arrayMSDptr(a._iw,2),0) < 0)
 204                 subfrom_loop_lsp(arrayLSDptr(b._iw,2),arrayLSDptr(r._ow,4) lspop 2,2);
 205         if ((sintD)mspref(arrayMSDptr(b._iw,2),0) < 0)
 206                 subfrom_loop_lsp(arrayLSDptr(a._iw,2),arrayLSDptr(r._ow,4) lspop 2,2);
 207 #else
 208         if (a.iw1 == 0) {
 209                 // a small positive
 210                 if (b.iw1 == 0) {
 211                         // a, b small positive
 212                         mulu32(a.iw0, b.iw0, r.ow1 =, r.ow0 =);
 213                         r.ow3 = 0; r.ow2 = 0;
 214                         return;
 215                 }
 216                 else if (b.iw1 == -(uint32)1 && b.iw0 != 0) {
 217                         // b small negative
 218                         var uint32 hi, lo;
 219                         mulu32(a.iw0, -b.iw0, hi=, lo=);
 220                         r.ow0 = -lo;
 221                         if (lo) {
 222                                 r.ow1 = ~hi;
 223                         } else if (hi) {
 224                                 r.ow1 = -hi;
 225                         } else /* a.iw0 == 0 */ {
 226                                 r.ow3 = 0; r.ow2 = 0; r.ow1 = 0;
 227                                 return;
 228                         }
 229                         r.ow3 = -(uint32)1; r.ow2 = -(uint32)1;
 230                         return;
 231                 }
 232                 var uint32 hi1, lo1, hi0;
 233                 mulu32(a.iw0, b.iw0, hi0 =, r.ow0 =);
 234                 mulu32(a.iw0, b.iw1, hi1 =, lo1 =);
 235                 if ((lo1 += hi0) < hi0)
 236                         hi1++;
 237                 // hi1|lo1|r.ow0 = a.iw0 * b(unsigned).
 238                 r.ow1 = lo1;
 239                 if ((sint32)b.iw1 >= 0) {
 240                         r.ow2 = hi1;
 241                         r.ow3 = 0;
 242                 } else {
 243                         // b was negative -> subtract a * 2^64
 244                         if (a.iw0) {
 245                                 r.ow2 = hi1 - a.iw0;
 246                                 r.ow3 = -(uint32)1;
 247                         } else /* a.iw0 == 0 */ {
 248                                 r.ow3 = 0; r.ow2 = 0;
 249                         }
 250                 }
 251                 return;
 252         }
 253         else if (a.iw1 == -(uint32)1 && a.iw0 != 0) {
 254                 // a small negative
 255                 if (b.iw1 == 0) {
 256                         // b small positive
 257                         var uint32 hi, lo;
 258                         mulu32(-a.iw0, b.iw0, hi=, lo=);
 259                         r.ow0 = -lo;
 260                         if (lo) {
 261                                 r.ow1 = ~hi;
 262                         } else if (hi) {
 263                                 r.ow1 = -hi;
 264                         } else /* b.iw0 == 0 */ {
 265                                 r.ow3 = 0; r.ow2 = 0; r.ow1 = 0;
 266                                 return;
 267                         }
 268                         r.ow3 = -(uint32)1; r.ow2 = -(uint32)1;
 269                         return;
 270                 }
 271                 else if (b.iw1 == -(uint32)1 && b.iw0 != 0) {
 272                         // a, b small negative
 273                         mulu32(-a.iw0, -b.iw0, r.ow1 =, r.ow0 =);
 274                         r.ow3 = 0; r.ow2 = 0;
 275                         return;
 276                 }
 277                 var uint32 hi1, lo1, hi0, lo0;
 278                 mulu32(-a.iw0, b.iw0, hi0 =, lo0 =);
 279                 mulu32(-a.iw0, b.iw1, hi1 =, lo1 =);
 280                 if ((lo1 += hi0) < hi0)
 281                         hi1++;
 282                 // hi1|lo1|lo0 = -a * b(unsigned).
 283                 if (lo0) {
 284                         lo0 = -lo0;
 285                         lo1 = ~lo1;
 286                         hi1 = ~hi1;
 287                 } else if (lo1) {
 288                         lo1 = -lo1;
 289                         hi1 = ~hi1;
 290                 } else
 291                         hi1 = -hi1;
 292                 // hi1|lo1|lo0 = a * b(unsigned).
 293                 r.ow0 = lo0;
 294                 r.ow1 = lo1;
 295                 if ((sint32)b.iw1 >= 0) {
 296                         r.ow2 = hi1;
 297                         r.ow3 = -(uint32)1;
 298                 } else {
 299                         // b was negative -> subtract a * 2^64
 300                         r.ow2 = hi1 - a.iw0;
 301                         r.ow3 = 0;
 302                 }
 303                 return;
 304         }
 305         else if (b.iw1 == 0) {
 306                 // b small positive
 307                 var uint32 hi1, lo1, hi0;
 308                 mulu32(b.iw0, a.iw0, hi0 =, r.ow0 =);
 309                 mulu32(b.iw0, a.iw1, hi1 =, lo1 =);
 310                 if ((lo1 += hi0) < hi0)
 311                         hi1++;
 312                 // hi1|lo1|r.ow0 = a(unsigned) * b.iw0.
 313                 r.ow1 = lo1;
 314                 if ((sint32)a.iw1 >= 0) {
 315                         r.ow2 = hi1;
 316                         r.ow3 = 0;
 317                 } else {
 318                         // a was negative -> subtract b * 2^64
 319                         if (b.iw0) {
 320                                 r.ow2 = hi1 - b.iw0;
 321                                 r.ow3 = -(uint32)1;
 322                         } else /* b.iw0 == 0 */ {
 323                                 r.ow3 = 0; r.ow2 = 0;
 324                         }
 325                 }
 326                 return;
 327         }
 328         else if (b.iw1 == -(uint32)1 && b.iw0 != 0) {
 329                 // b small negative
 330                 var uint32 hi1, lo1, hi0, lo0;
 331                 mulu32(-b.iw0, a.iw0, hi0 =, lo0 =);
 332                 mulu32(-b.iw0, a.iw1, hi1 =, lo1 =);
 333                 if ((lo1 += hi0) < hi0)
 334                         hi1++;
 335                 // hi1|lo1|lo0 = a(unsigned) * -b.
 336                 if (lo0) {
 337                         lo0 = -lo0;
 338                         lo1 = ~lo1;
 339                         hi1 = ~hi1;
 340                 } else if (lo1) {
 341                         lo1 = -lo1;
 342                         hi1 = ~hi1;
 343                 } else
 344                         hi1 = -hi1;
 345                 // hi1|lo1|lo0 = a(unsigned) * b.
 346                 r.ow0 = lo0;
 347                 r.ow1 = lo1;
 348                 if ((sint32)a.iw1 >= 0) {
 349                         r.ow2 = hi1;
 350                         r.ow3 = -(uint32)1;
 351                 } else {
 352                         // a was negative -> subtract b * 2^64
 353                         r.ow2 = hi1 - b.iw0;
 354                         r.ow3 = 0;
 355                 }
 356                 return;
 357         }
 358         // This is the main and most frequent case (65% to 80%).
 359         var uint32 w3, w2, w1, hi, lo;
 360         mulu32(a.iw0, b.iw0, w1=, r.ow0=);
 361         mulu32(a.iw1, b.iw1, w3=, w2=);
 362         mulu32(a.iw0, b.iw1, hi=, lo=);
 363         if ((w1 += lo) < lo)
 364                 hi++;
 365         if ((w2 += hi) < hi)
 366                 w3++;
 367         mulu32(a.iw1, b.iw0, hi=, lo=);
 368         if ((w1 += lo) < lo)
 369                 hi++;
 370         if ((w2 += hi) < hi)
 371                 w3++;
 372         // w3|w2|w1|r.ow0 = a(unsigned) * b(unsigned).
 373         r.ow1 = w1;
 374         if ((sint32)a.iw1 < 0) {
 375                 // a was negative -> subtract b * 2^64
 376                 if (w2 >= b.iw0) {
 377                         w2 -= b.iw0;
 378                         w3 -= b.iw1;
 379                 } else {
 380                         // carry
 381                         w2 -= b.iw0;
 382                         w3 = w3 - b.iw1 - 1;
 383                 }
 384         }
 385         if ((sint32)b.iw1 < 0) {
 386                 // b was negative -> subtract a * 2^64
 387                 if (w2 >= a.iw0) {
 388                         w2 -= a.iw0;
 389                         w3 -= a.iw1;
 390                 } else {
 391                         // carry
 392                         w2 -= a.iw0;
 393                         w3 = w3 - a.iw1 - 1;
 394                 }
 395         }
 396         r.ow2 = w2;
 397         r.ow3 = w3;
 398         return;
 399 #endif
 400 }
 401 #ifdef DEBUG_NUSS_OPERATIONS
 402 static void mul_doublecheck (const nuss_inword& a, const nuss_inword& b, nuss_outword& r)
 403 {
 404         nuss_outword or;
 405         mulu_2loop(arrayLSDptr(a._iw,2),2, arrayLSDptr(b._iw,2),2, arrayLSDptr(or._ow,4));
 406         if ((sintD)mspref(arrayMSDptr(a._iw,2),0) < 0)
 407                 subfrom_loop_lsp(arrayLSDptr(b._iw,2),arrayLSDptr(or._ow,4) lspop 2,2);
 408         if ((sintD)mspref(arrayMSDptr(b._iw,2),0) < 0)
 409                 subfrom_loop_lsp(arrayLSDptr(a._iw,2),arrayLSDptr(or._ow,4) lspop 2,2);
 410         mul(a,b, r);
 411         if (compare_loop_msp(arrayMSDptr(r._ow,4),arrayMSDptr(or._ow,4),4))
 412                 cl_abort();
 413 }
 414 #define mul mul_doublecheck
 415 #endif
 416
 417 // r := 0
 418 static inline void zero (nuss_outword& r)
 419 {
 420         r.ow0 = 0;
 421         r.ow1 = 0;
 422         r.ow2 = 0;
 423         r.ow3 = 0;
 424 }
 425
 426 // r := a + b
 427 static inline void add (const nuss_outword& a, const nuss_outword& b, nuss_outword& r)
 428 {
 429 #if defined(__GNUC__) && defined(__i386__)
 430         var uintD dummy;
 431   #ifdef NUSS_ASM_DIRECT
 432         __asm__ __volatile__ (
 433                 "movl %1,%0" "\n\t"
 434                 "addl %2,%0" "\n\t"
 435                 "movl %0,%3"
 436                 : "=&q" (dummy)
 437                 : "m" (a.ow0), "m" (b.ow0), "m" (r.ow0)
 438                 : "cc"
 439                 );
 440         __asm__ __volatile__ (
 441                 "movl %1,%0" "\n\t"
 442                 "adcl %2,%0" "\n\t"
 443                 "movl %0,%3"
 444                 : "=&q" (dummy)
 445                 : "m" (a.ow1), "m" (b.ow1), "m" (r.ow1)
 446                 : "cc"
 447                 );
 448         __asm__ __volatile__ (
 449                 "movl %1,%0" "\n\t"
 450                 "adcl %2,%0" "\n\t"
 451                 "movl %0,%3"
 452                 : "=&q" (dummy)
 453                 : "m" (a.ow2), "m" (b.ow2), "m" (r.ow2)
 454                 : "cc"
 455                 );
 456         __asm__ __volatile__ (
 457                 "movl %1,%0" "\n\t"
 458                 "adcl %2,%0" "\n\t"
 459                 "movl %0,%3"
 460                 : "=&q" (dummy)
 461                 : "m" (a.ow3), "m" (b.ow3), "m" (r.ow3)
 462                 : "cc"
 463                 );
 464   #else
 465     #if CL_DS_BIG_ENDIAN_P
 466         __asm__ __volatile__ (
 467                 "movl 12(%1),%0" "\n\t"
 468                 "addl 12(%2),%0" "\n\t"
 469                 "movl %0,12(%3)" "\n\t"
 470                 "movl 8(%1),%0"  "\n\t"
 471                 "adcl 8(%2),%0"  "\n\t"
 472                 "movl %0,8(%3)"  "\n\t"
 473                 "movl 4(%1),%0"  "\n\t"
 474                 "adcl 4(%2),%0"  "\n\t"
 475                 "movl %0,4(%3)"  "\n\t"
 476                 "movl (%1),%0"   "\n\t"
 477                 "adcl (%2),%0"   "\n\t"
 478                 "movl %0,(%3)"
 479                 : "=&q" (dummy)
 480                 : "r" (&a), "r" (&b), "r" (&r)
 481                 : "cc"
 482                 );
 483     #else
 484         __asm__ __volatile__ (
 485                 "movl (%1),%0"   "\n\t"
 486                 "addl (%2),%0"   "\n\t"
 487                 "movl %0,(%3)"   "\n\t"
 488                 "movl 4(%1),%0"  "\n\t"
 489                 "adcl 4(%2),%0"  "\n\t"
 490                 "movl %0,4(%3)"  "\n\t"
 491                 "movl 8(%1),%0"  "\n\t"
 492                 "adcl 8(%2),%0"  "\n\t"
 493                 "movl %0,8(%3)"  "\n\t"
 494                 "movl 12(%1),%0" "\n\t"
 495                 "adcl 12(%2),%0" "\n\t"
 496                 "movl %0,12(%3)"
 497                 : "=&q" (dummy)
 498                 : "r" (&a), "r" (&b), "r" (&r)
 499                 : "cc"
 500                 );
 501     #endif
 502   #endif
 503 #elif defined(NUSS_OUT_EXTERNAL_LOOPS)
 504         add_loop_lsp(arrayLSDptr(a._ow,4),arrayLSDptr(b._ow,4),arrayLSDptr(r._ow,4),4);
 505 #else
 506         var uint32 tmp;
 507
 508         tmp = a.ow0 + b.ow0;
 509         if (tmp >= a.ow0) {
 510                 // no carry
 511                 r.ow0 = tmp;
 512                 tmp = a.ow1 + b.ow1;
 513                 if (tmp >= a.ow1) goto no_carry_1; else goto carry_1;
 514         } else {
 515                 // carry
 516                 r.ow0 = tmp;
 517                 tmp = a.ow1 + b.ow1 + 1;
 518                 if (tmp > a.ow1) goto no_carry_1; else goto carry_1;
 519         }
 520         if (1) {
 521                 no_carry_1: // no carry
 522                 r.ow1 = tmp;
 523                 tmp = a.ow2 + b.ow2;
 524                 if (tmp >= a.ow2) goto no_carry_2; else goto carry_2;
 525         } else {
 526                 carry_1: // carry
 527                 r.ow1 = tmp;
 528                 tmp = a.ow2 + b.ow2 + 1;
 529                 if (tmp > a.ow2) goto no_carry_2; else goto carry_2;
 530         }
 531         if (1) {
 532                 no_carry_2: // no carry
 533                 r.ow2 = tmp;
 534                 tmp = a.ow3 + b.ow3;
 535         } else {
 536                 carry_2: // carry
 537                 r.ow2 = tmp;
 538                 tmp = a.ow3 + b.ow3 + 1;
 539         }
 540         r.ow3 = tmp;
 541 #endif
 542 }
 543
 544 // r := a - b
 545 static inline void sub (const nuss_outword& a, const nuss_outword& b, nuss_outword& r)
 546 {
 547 #if defined(__GNUC__) && defined(__i386__)
 548         var uintD dummy;
 549   #ifdef NUSS_ASM_DIRECT
 550         __asm__ __volatile__ (
 551                 "movl %1,%0" "\n\t"
 552                 "subl %2,%0" "\n\t"
 553                 "movl %0,%3"
 554                 : "=&q" (dummy)
 555                 : "m" (a.ow0), "m" (b.ow0), "m" (r.ow0)
 556                 : "cc"
 557                 );
 558         __asm__ __volatile__ (
 559                 "movl %1,%0" "\n\t"
 560                 "sbbl %2,%0" "\n\t"
 561                 "movl %0,%3"
 562                 : "=&q" (dummy)
 563                 : "m" (a.ow1), "m" (b.ow1), "m" (r.ow1)
 564                 : "cc"
 565                 );
 566         __asm__ __volatile__ (
 567                 "movl %1,%0" "\n\t"
 568                 "sbbl %2,%0" "\n\t"
 569                 "movl %0,%3"
 570                 : "=&q" (dummy)
 571                 : "m" (a.ow2), "m" (b.ow2), "m" (r.ow2)
 572                 : "cc"
 573                 );
 574         __asm__ __volatile__ (
 575                 "movl %1,%0" "\n\t"
 576                 "sbbl %2,%0" "\n\t"
 577                 "movl %0,%3"
 578                 : "=&q" (dummy)
 579                 : "m" (a.ow3), "m" (b.ow3), "m" (r.ow3)
 580                 : "cc"
 581                 );
 582   #else
 583     #if CL_DS_BIG_ENDIAN_P
 584         __asm__ __volatile__ (
 585                 "movl 12(%1),%0" "\n\t"
 586                 "subl 12(%2),%0" "\n\t"
 587                 "movl %0,12(%3)" "\n\t"
 588                 "movl 8(%1),%0"  "\n\t"
 589                 "sbbl 8(%2),%0"  "\n\t"
 590                 "movl %0,8(%3)"  "\n\t"
 591                 "movl 4(%1),%0"  "\n\t"
 592                 "sbbl 4(%2),%0"  "\n\t"
 593                 "movl %0,4(%3)"  "\n\t"
 594                 "movl (%1),%0"   "\n\t"
 595                 "sbbl (%2),%0"   "\n\t"
 596                 "movl %0,(%3)"
 597                 : "=&q" (dummy)
 598                 : "r" (&a), "r" (&b), "r" (&r)
 599                 : "cc"
 600                 );
 601     #else
 602         __asm__ __volatile__ (
 603                 "movl (%1),%0"   "\n\t"
 604                 "subl (%2),%0"   "\n\t"
 605                 "movl %0,(%3)"   "\n\t"
 606                 "movl 4(%1),%0"  "\n\t"
 607                 "sbbl 4(%2),%0"  "\n\t"
 608                 "movl %0,4(%3)"  "\n\t"
 609                 "movl 8(%1),%0"  "\n\t"
 610                 "sbbl 8(%2),%0"  "\n\t"
 611                 "movl %0,8(%3)"  "\n\t"
 612                 "movl 12(%1),%0" "\n\t"
 613                 "sbbl 12(%2),%0" "\n\t"
 614                 "movl %0,12(%3)"
 615                 : "=&q" (dummy)
 616                 : "r" (&a), "r" (&b), "r" (&r)
 617                 : "cc"
 618                 );
 619     #endif
 620   #endif
 621 #elif defined(NUSS_OUT_EXTERNAL_LOOPS)
 622         sub_loop_lsp(arrayLSDptr(a._ow,4),arrayLSDptr(b._ow,4),arrayLSDptr(r._ow,4),4);
 623 #else
 624         var uint32 tmp;
 625
 626         tmp = a.ow0 - b.ow0;
 627         if (tmp <= a.ow0) {
 628                 // no carry
 629                 r.ow0 = tmp;
 630                 tmp = a.ow1 - b.ow1;
 631                 if (tmp <= a.ow1) goto no_carry_1; else goto carry_1;
 632         } else {
 633                 // carry
 634                 r.ow0 = tmp;
 635                 tmp = a.ow1 - b.ow1 - 1;
 636                 if (tmp < a.ow1) goto no_carry_1; else goto carry_1;
 637         }
 638         if (1) {
 639                 no_carry_1: // no carry
 640                 r.ow1 = tmp;
 641                 tmp = a.ow2 - b.ow2;
 642                 if (tmp <= a.ow2) goto no_carry_2; else goto carry_2;
 643         } else {
 644                 carry_1: // carry
 645                 r.ow1 = tmp;
 646                 tmp = a.ow2 - b.ow2 - 1;
 647                 if (tmp < a.ow2) goto no_carry_2; else goto carry_2;
 648         }
 649         if (1) {
 650                 no_carry_2: // no carry
 651                 r.ow2 = tmp;
 652                 tmp = a.ow3 - b.ow3;
 653         } else {
 654                 carry_2: // carry
 655                 r.ow2 = tmp;
 656                 tmp = a.ow3 - b.ow3 - 1;
 657         }
 658         r.ow3 = tmp;
 659 #endif
 660 }
 661
 662 // b := a >> 1
 663 static inline void shift (const nuss_outword& a, nuss_outword& b)
 664 {
 665 #if defined(__GNUC__) && defined(__i386__) && !defined(DEBUG_NUSS)
 666         var uintD dummy;
 667   #ifdef NUSS_ASM_DIRECT
 668         __asm__ __volatile__ (
 669                 "movl %1,%0" "\n\t"
 670                 "sarl $1,%0" "\n\t"
 671                 "movl %0,%2"
 672                 : "=&q" (dummy)
 673                 : "m" (a.ow3), "m" (b.ow3)
 674                 : "cc"
 675                 );
 676         __asm__ __volatile__ (
 677                 "movl %1,%0" "\n\t"
 678                 "rcrl $1,%0" "\n\t"
 679                 "movl %0,%2"
 680                 : "=&q" (dummy)
 681                 : "m" (a.ow2), "m" (b.ow2)
 682                 : "cc"
 683                 );
 684         __asm__ __volatile__ (
 685                 "movl %1,%0" "\n\t"
 686                 "rcrl $1,%0" "\n\t"
 687                 "movl %0,%2"
 688                 : "=&q" (dummy)
 689                 : "m" (a.ow1), "m" (b.ow1)
 690                 : "cc"
 691                 );
 692         __asm__ __volatile__ (
 693                 "movl %1,%0" "\n\t"
 694                 "rcrl $1,%0" "\n\t"
 695                 "movl %0,%2"
 696                 : "=&q" (dummy)
 697                 : "m" (a.ow0), "m" (b.ow0)
 698                 : "cc"
 699                 );
 700   #else
 701     #if CL_DS_BIG_ENDIAN_P
 702         __asm__ __volatile__ (
 703                 "movl (%1),%0"   "\n\t"
 704                 "sarl $1,%0"     "\n\t"
 705                 "movl %0,(%2)"   "\n\t"
 706                 "movl 4(%1),%0"  "\n\t"
 707                 "rcrl $1,%0"     "\n\t"
 708                 "movl %0,4(%2)"  "\n\t"
 709                 "movl 8(%1),%0"  "\n\t"
 710                 "rcrl $1,%0"     "\n\t"
 711                 "movl %0,8(%2)"  "\n\t"
 712                 "movl 12(%1),%0" "\n\t"
 713                 "rcrl $1,%0"     "\n\t"
 714                 "movl %0,12(%2)"
 715                 : "=&q" (dummy)
 716                 : "r" (&a), "r" (&b)
 717                 : "cc"
 718                 );
 719     #else
 720         __asm__ __volatile__ (
 721                 "movl 12(%1),%0" "\n\t"
 722                 "sarl $1,%0"     "\n\t"
 723                 "movl %0,12(%2)" "\n\t"
 724                 "movl 8(%1),%0"  "\n\t"
 725                 "rcrl $1,%0"     "\n\t"
 726                 "movl %0,8(%2)"  "\n\t"
 727                 "movl 4(%1),%0"  "\n\t"
 728                 "rcrl $1,%0"     "\n\t"
 729                 "movl %0,4(%2)"  "\n\t"
 730                 "movl (%1),%0"   "\n\t"
 731                 "rcrl $1,%0"     "\n\t"
 732                 "movl %0,(%2)"
 733                 : "=&q" (dummy)
 734                 : "r" (&a), "r" (&b)
 735                 : "cc"
 736                 );
 737     #endif
 738   #endif
 739 #elif defined(NUSS_OUT_EXTERNAL_LOOPS)
 740         #ifdef DEBUG_NUSS
 741         if (shiftrightcopy_loop_msp(arrayMSDptr(a._ow,4),arrayMSDptr(b._ow,4),4,1,mspref(arrayMSDptr(a._ow,4),0)>>31))
 742                 cl_abort();
 743         #else
 744         shiftrightcopy_loop_msp(arrayMSDptr(a._ow,4),arrayMSDptr(b._ow,4),4,1,mspref(arrayMSDptr(a._ow,4),0)>>31);
 745         #endif
 746 #else
 747         var uint32 tmp, carry;
 748
 749         tmp = a.ow3;
 750         b.ow3 = (sint32)tmp >> 1;
 751         carry = tmp << 31;
 752         tmp = a.ow2;
 753         b.ow2 = (tmp >> 1) | carry;
 754         carry = tmp << 31;
 755         tmp = a.ow1;
 756         b.ow1 = (tmp >> 1) | carry;
 757         carry = tmp << 31;
 758         tmp = a.ow0;
 759         b.ow0 = (tmp >> 1) | carry;
 760         #ifdef DEBUG_NUSS
 761         carry = tmp << 31;
 762         if (carry)
 763                 cl_abort();
 764         #endif
 765 #endif
 766 }
 767
 768 #endif // (intDsize==32)
 769
 770 #if (intDsize==64)
 771
 772 //typedef struct { sint64 iw1; uint64 iw0; } nuss_inword;
 773 //typedef struct { uint64 iw0; sint64 iw1; } nuss_inword;
 774 typedef struct { uintD _iw[2]; } nuss_inword;
 775 #if CL_DS_BIG_ENDIAN_P
 776   #define iw1 _iw[0]
 777   #define iw0 _iw[1]
 778 #else
 779   #define iw0 _iw[0]
 780   #define iw1 _iw[1]
 781 #endif
 782
 783 //typedef struct { sint64 ow2; uint64 ow1; uint64 ow0; } nuss_outword;
 784 //typedef struct { uint64 ow0; uint64 ow1; sint64 ow2; } nuss_outword;
 785 typedef struct { uintD _ow[3]; } nuss_outword;
 786 #if CL_DS_BIG_ENDIAN_P
 787   #define ow2 _ow[0]
 788   #define ow1 _ow[1]
 789   #define ow0 _ow[2]
 790 #else
 791   #define ow0 _ow[0]
 792   #define ow1 _ow[1]
 793   #define ow2 _ow[2]
 794 #endif
 795
 796 // r := a + b
 797 static inline void add (const nuss_inword& a, const nuss_inword& b, nuss_inword& r)
 798 {
 799 #ifdef NUSS_IN_EXTERNAL_LOOPS
 800         add_loop_lsp(arrayLSDptr(a._iw,2),arrayLSDptr(b._iw,2),arrayLSDptr(r._iw,2),2);
 801 #else
 802         var uint64 tmp;
 803
 804         tmp = a.iw0 + b.iw0;
 805         if (tmp >= a.iw0) {
 806                 // no carry
 807                 r.iw0 = tmp;
 808                 r.iw1 = a.iw1 + b.iw1;
 809         } else {
 810                 // carry
 811                 r.iw0 = tmp;
 812                 r.iw1 = a.iw1 + b.iw1 + 1;
 813         }
 814 #endif
 815 }
 816
 817 // r := a - b
 818 static inline void sub (const nuss_inword& a, const nuss_inword& b, nuss_inword& r)
 819 {
 820 #ifdef NUSS_IN_EXTERNAL_LOOPS
 821         sub_loop_lsp(arrayLSDptr(a._iw,2),arrayLSDptr(b._iw,2),arrayLSDptr(r._iw,2),2);
 822 #else
 823         var uint64 tmp;
 824
 825         tmp = a.iw0 - b.iw0;
 826         if (tmp <= a.iw0) {
 827                 // no carry
 828                 r.iw0 = tmp;
 829                 r.iw1 = a.iw1 - b.iw1;
 830         } else {
 831                 // carry
 832                 r.iw0 = tmp;
 833                 r.iw1 = a.iw1 - b.iw1 - 1;
 834         }
 835 #endif
 836 }
 837
 838 // r := 0
 839 static inline void zero (nuss_outword& r)
 840 {
 841         r.ow0 = 0;
 842         r.ow1 = 0;
 843         r.ow2 = 0;
 844 }
 845
 846 // r := a + b
 847 static inline void add (const nuss_outword& a, const nuss_outword& b, nuss_outword& r)
 848 {
 849 #ifdef NUSS_OUT_EXTERNAL_LOOPS
 850         add_loop_lsp(arrayLSDptr(a._ow,3),arrayLSDptr(b._ow,3),arrayLSDptr(r._ow,3),3);
 851 #else
 852         var uint64 tmp;
 853
 854         tmp = a.ow0 + b.ow0;
 855         if (tmp >= a.ow0) {
 856                 // no carry
 857                 r.ow0 = tmp;
 858                 tmp = a.ow1 + b.ow1;
 859                 if (tmp >= a.ow1) goto no_carry_1; else goto carry_1;
 860         } else {
 861                 // carry
 862                 r.ow0 = tmp;
 863                 tmp = a.ow1 + b.ow1 + 1;
 864                 if (tmp > a.ow1) goto no_carry_1; else goto carry_1;
 865         }
 866         if (1) {
 867                 no_carry_1: // no carry
 868                 r.ow1 = tmp;
 869                 tmp = a.ow2 + b.ow2;
 870         } else {
 871                 carry_1: // carry
 872                 r.ow1 = tmp;
 873                 tmp = a.ow2 + b.ow2 + 1;
 874         }
 875         r.ow2 = tmp;
 876 #endif
 877 }
 878
 879 // r := a - b
 880 static inline void sub (const nuss_outword& a, const nuss_outword& b, nuss_outword& r)
 881 {
 882 #ifdef NUSS_OUT_EXTERNAL_LOOPS
 883         sub_loop_lsp(arrayLSDptr(a._ow,3),arrayLSDptr(b._ow,3),arrayLSDptr(r._ow,3),3);
 884 #else
 885         var uint64 tmp;
 886
 887         tmp = a.ow0 - b.ow0;
 888         if (tmp <= a.ow0) {
 889                 // no carry
 890                 r.ow0 = tmp;
 891                 tmp = a.ow1 - b.ow1;
 892                 if (tmp <= a.ow1) goto no_carry_1; else goto carry_1;
 893         } else {
 894                 // carry
 895                 r.ow0 = tmp;
 896                 tmp = a.ow1 - b.ow1 - 1;
 897                 if (tmp < a.ow1) goto no_carry_1; else goto carry_1;
 898         }
 899         if (1) {
 900                 no_carry_1: // no carry
 901                 r.ow1 = tmp;
 902                 tmp = a.ow2 - b.ow2;
 903         } else {
 904                 carry_1: // carry
 905                 r.ow1 = tmp;
 906                 tmp = a.ow2 - b.ow2 - 1;
 907         }
 908         r.ow2 = tmp;
 909 #endif
 910 }
 911
 912 // b := a >> 1
 913 static inline void shift (const nuss_outword& a, nuss_outword& b)
 914 {
 915 #ifdef NUSS_OUT_EXTERNAL_LOOPS
 916         #ifdef DEBUG_NUSS
 917         if (shiftrightcopy_loop_msp(arrayMSDptr(a._ow,3),arrayMSDptr(b._ow,3),3,1,mspref(arrayMSDptr(a._ow,3),0)>>63))
 918                 cl_abort();
 919         #else
 920         shiftrightcopy_loop_msp(arrayMSDptr(a._ow,3),arrayMSDptr(b._ow,3),3,1,mspref(arrayMSDptr(a._ow,3),0)>>63);
 921         #endif
 922 #else
 923         var uint64 tmp, carry;
 924
 925         tmp = a.ow2;
 926         b.ow2 = (sint64)tmp >> 1;
 927         carry = tmp << 63;
 928         tmp = a.ow1;
 929         b.ow1 = (tmp >> 1) | carry;
 930         carry = tmp << 63;
 931         tmp = a.ow0;
 932         b.ow0 = (tmp >> 1) | carry;
 933         #ifdef DEBUG_NUSS
 934         carry = tmp << 63;
 935         if (carry)
 936                 cl_abort();
 937         #endif
 938 #endif
 939 }
 940
 941 #endif // (intDsize==64)
 942
 943 // This is a recursive implementation.
 944 // TODO: Write a non-recursive one.
 945
 946 #ifndef _BIT_REVERSE
 947 #define _BIT_REVERSE
 948 // Reverse an n-bit number x. n>0.
 949 static uintL bit_reverse (uintL n, uintL x)
 950 {
 951         var uintL y = 0;
 952         do {
 953                 y <<= 1;
 954                 y |= (x & 1);
 955                 x >>= 1;
 956         } while (!(--n == 0));
 957         return y;
 958 }
 959 #endif
 960
 961 // Threshold for recursion base in mulu_nuss_negacyclic().
 962 // Time of a multiplication with len1=len2=10000 on Linux i486:
 963 //                     normal    asm-optimized
 964 //   threshold1 = 1:   40.1 sec  25.5 sec
 965 //   threshold1 = 2:   28.6 sec  18.3 sec
 966 //   threshold1 = 3:   25.6 sec  16.6 sec
 967 //   threshold1 = 4:   25.7 sec  17.6 sec
 968 //   threshold1 = 5:   26.1 sec  18.0 sec
 969 const uintL cl_nuss_threshold1 = 3;
 970
 971 // Threshold for recursion base in mulu_nuss_cyclic().
 972 const uintL cl_nuss_threshold2 = 1;
 973
 974 // Computes z[k] := sum(i+j==k mod N, x[i]*y[j]*(-1)^((i+j-k)/N))
 975 // for all k=0..N-1.
 976 static void mulu_nuss_negacyclic (const uintL n, const uintL N, // N = 2^n
 977                                   const nuss_inword * x, // N words
 978                                   const nuss_inword * y, // N words
 979                                   nuss_outword * z       // N words result
 980                                  )
 981 {
 982         #if 0 // always n > 0
 983         if (n == 0) {
 984                 // z[0] := x0 y0
 985                 mul(x[0],y[0], z[0]);
 986                 return;
 987         }
 988         #endif
 989         if (n <= cl_nuss_threshold1) {
 990                 if (n == 1) {
 991                         // z[0] := x0 (y0 + y1) - (x0 + x1) y1
 992                         // z[1] := x0 (y0 + y1) + (x1 - x0) y0
 993                         var nuss_inword x_sum;
 994                         var nuss_inword y_sum;
 995                         var nuss_outword first, second;
 996                         add(x[0],x[1], x_sum);
 997                         add(y[0],y[1], y_sum);
 998                         mul(x[0],y_sum, first);
 999                         mul(x_sum,y[1], second); sub(first,second, z[0]);
1000                         sub(x[1],x[0], x_sum);
1001                         mul(x_sum,y[0], second); add(first,second, z[1]);
1002                         return;
1003                 }
1004                 // 1 < n <= cl_nuss_threshold1.
1005                 #if 0 // straightforward, but slow
1006                 var uintL k;
1007                 for (k = 0; k < N; k++) {
1008                         var uintL i;
1009                         var nuss_outword accu;
1010                         mul(x[0],y[k], accu);
1011                         for (i = 1; i <= k; i++) {
1012                                 var nuss_outword temp;
1013                                 mul(x[i],y[k-i], temp);
1014                                 add(accu,temp, accu);
1015                         }
1016                         for (i = k+1; i < N; i++) {
1017                                 var nuss_outword temp;
1018                                 mul(x[i],y[N-i+k], temp);
1019                                 sub(accu,temp, accu);
1020                         }
1021                         z[k] = accu;
1022                 }
1023                 #else
1024                 var const uintL M = (uintL)1 << (n-1); // M = N/2
1025                 var uintL i, j, k;
1026                 for (k = 0; k < N; k++)
1027                         zero(z[k]);
1028                 for (i = 0; i < M; i++) {
1029                         var uintL iM = i+M;
1030                         for (j = 0; j < M-i; j++) {
1031                                 var uintL jM = j+M;
1032                                 // z[i+j]   += x[i] (y[j] + y[j+M]) - (x[i] + x[i+M]) y[j+M]
1033                                 // z[i+j+M] += x[i] (y[j] + y[j+M]) + (x[i+M] - x[i]) y[j]
1034                                 var nuss_inword x_sum;
1035                                 var nuss_inword y_sum;
1036                                 var nuss_outword first, second, temp;
1037                                 add(x[i],x[iM], x_sum);
1038                                 add(y[j],y[jM], y_sum);
1039                                 mul(x[i],y_sum, first);
1040                                 mul(x_sum,y[jM], second); sub(first,second, temp); add(z[i+j],temp, z[i+j]);
1041                                 sub(x[iM],x[i], x_sum);
1042                                 mul(x_sum,y[j], second); add(first,second, temp); add(z[i+j+M],temp, z[i+j+M]);
1043                         }
1044                         for (j = M-i; j < M; j++) {
1045                                 var uintL jM = j+M;
1046                                 // z[i+j]   += x[i] (y[j] + y[j+M]) - (x[i] + x[i+M]) y[j+M]
1047                                 // z[i+j-M] -= x[i] (y[j] + y[j+M]) + (x[i+M] - x[i]) y[j]
1048                                 var nuss_inword x_sum;
1049                                 var nuss_inword y_sum;
1050                                 var nuss_outword first, second, temp;
1051                                 add(x[i],x[iM], x_sum);
1052                                 add(y[j],y[jM], y_sum);
1053                                 mul(x[i],y_sum, first);
1054                                 mul(x_sum,y[jM], second); sub(first,second, temp); add(z[i+j],temp, z[i+j]);
1055                                 sub(x[iM],x[i], x_sum);
1056                                 mul(x_sum,y[j], second); add(first,second, temp); sub(z[i+j-M],temp, z[i+j-M]);
1057                         }
1058                 }
1059                 #endif
1060                 return;
1061         }
1062         // Recursive FFT.
1063         var const uintL m = n >> 1; // floor(n/2)
1064         var const uintL r = n - m;  // ceiling(n/2)
1065         var const uintL M = (uintL)1 << m; // M = 2^m
1066         var const uintL R = (uintL)1 << r; // R = 2^r
1067         CL_ALLOCA_STACK;
1068         var nuss_inword* const auX = cl_alloc_array(nuss_inword,2*N);
1069         var nuss_inword* const auY = cl_alloc_array(nuss_inword,2*N);
1070         var nuss_outword* const auZ = cl_alloc_array(nuss_outword,2*N);
1071         #define X(i,j) auX[((i)<<r)+(j)] /* 0 <= i < 2*M, 0 <= j < R */
1072         #define Y(i,j) auY[((i)<<r)+(j)] /* 0 <= i < 2*M, 0 <= j < R */
1073         #define Z(i,j) auZ[((i)<<r)+(j)] /* 0 <= i < 2*M, 0 <= j < R */
1074         var nuss_inword* const tmp1 = cl_alloc_array(nuss_inword,R);
1075         var nuss_inword* const tmp2 = cl_alloc_array(nuss_inword,R);
1076         var nuss_outword* const tmpZ = cl_alloc_array(nuss_outword,R);
1077         var bool squaring = (x == y);
1078         var uintL i, j;
1079         // Initialize polynomials X(i) and Y(i).
1080         for (i = 0; i < M; i++) {
1081                 {
1082                         for (j = 0; j < R; j++)
1083                                 X(i,j) = x[(j<<m) + i];
1084                 }
1085                 if (!squaring) {
1086                         for (j = 0; j < R; j++)
1087                                 Y(i,j) = y[(j<<m) + i];
1088                 }
1089         }
1090         // For i = M..2*M-1, the polynomials are implicitly 0.
1091         // Do an FFT of length 2*M on X.
1092         {
1093                 var sintL l;
1094                 // Level l = m:
1095                 for (i = 0; i < M; i++)
1096                         for (j = 0; j < R; j++)
1097                                 X(i+M,j) = X(i,j);
1098                 // Level l = m-1..0:
1099                 for (l = m-1; l>=0; l--) {
1100                         var const uintL smax = (uintL)1 << (m-l);
1101                         var const uintL tmax = (uintL)1 << l;
1102                         for (var uintL s = 0; s < smax; s++) {
1103                                 var uintL exp = bit_reverse(m-l,s) << (l + r-m);
1104                                 for (var uintL t = 0; t < tmax; t++) {
1105                                         var uintL i1 = (s << (l+1)) + t;
1106                                         var uintL i2 = i1 + tmax;
1107                                         // Butterfly: replace (X(i1),X(i2)) by
1108                                         // (X(i1) + w^exp*X(i2), X(i1) - w^exp*X(i2)).
1109                                         for (j = 0; j < exp; j++) {
1110                                                 // note that w^R = -1
1111                                                 sub(X(i1,j),X(i2,j-exp+R), tmp1[j]);
1112                                                 add(X(i1,j),X(i2,j-exp+R), tmp2[j]);
1113                                         }
1114                                         for (j = exp; j < R; j++) {
1115                                                 add(X(i1,j),X(i2,j-exp), tmp1[j]);
1116                                                 sub(X(i1,j),X(i2,j-exp), tmp2[j]);
1117                                         }
1118                                         for (j = 0; j < R; j++) {
1119                                                 X(i1,j) = tmp1[j];
1120                                                 X(i2,j) = tmp2[j];
1121                                         }
1122                                 }
1123                         }
1124                 }
1125         }
1126         // Do an FFT of length 2*M on Y.
1127         if (!squaring) {
1128                 var sintL l;
1129                 // Level l = m:
1130                 for (i = 0; i < M; i++)
1131                         for (j = 0; j < R; j++)
1132                                 Y(i+M,j) = Y(i,j);
1133                 // Level l = m-1..0:
1134                 for (l = m-1; l>=0; l--) {
1135                         var const uintL smax = (uintL)1 << (m-l);
1136                         var const uintL tmax = (uintL)1 << l;
1137                         for (var uintL s = 0; s < smax; s++) {
1138                                 var uintL exp = bit_reverse(m-l,s) << (l + r-m);
1139                                 for (var uintL t = 0; t < tmax; t++) {
1140                                         var uintL i1 = (s << (l+1)) + t;
1141                                         var uintL i2 = i1 + tmax;
1142                                         // Butterfly: replace (Y(i1),Y(i2)) by
1143                                         // (Y(i1) + w^exp*Y(i2), Y(i1) - w^exp*Y(i2)).
1144                                         for (j = 0; j < exp; j++) {
1145                                                 // note that w^R = -1
1146                                                 sub(Y(i1,j),Y(i2,j-exp+R), tmp1[j]);
1147                                                 add(Y(i1,j),Y(i2,j-exp+R), tmp2[j]);
1148                                         }
1149                                         for (j = exp; j < R; j++) {
1150                                                 add(Y(i1,j),Y(i2,j-exp), tmp1[j]);
1151                                                 sub(Y(i1,j),Y(i2,j-exp), tmp2[j]);
1152                                         }
1153                                         for (j = 0; j < R; j++) {
1154                                                 Y(i1,j) = tmp1[j];
1155                                                 Y(i2,j) = tmp2[j];
1156                                         }
1157                                 }
1158                         }
1159                 }
1160         }
1161         // Recursively compute the negacyclic product X(i)*Y(i) for all i.
1162         if (!squaring) {
1163                 for (i = 0; i < 2*M; i++)
1164                         mulu_nuss_negacyclic(r,R, &X(i,0), &Y(i,0), &Z(i,0));
1165         } else {
1166                 for (i = 0; i < 2*M; i++)
1167                         mulu_nuss_negacyclic(r,R, &X(i,0), &X(i,0), &Z(i,0));
1168         }
1169         // Undo an FFT of length 2*M on Z.
1170         {
1171                 var uintL l;
1172                 // Level l = 0..m-1:
1173                 for (l = 0; l < m; l++) {
1174                         var const uintL smax = (uintL)1 << (m-l);
1175                         var const uintL tmax = (uintL)1 << l;
1176                         for (var uintL s = 0; s < smax; s++) {
1177                                 var uintL exp = bit_reverse(m-l,s) << (l + r-m);
1178                                 for (var uintL t = 0; t < tmax; t++) {
1179                                         var uintL i1 = (s << (l+1)) + t;
1180                                         var uintL i2 = i1 + tmax;
1181                                         // Inverse Butterfly: replace (Z(i1),Z(i2)) by
1182                                         // ((Z(i1)+Z(i2))/2, (Z(i1)-Z(i2))/(2*w^exp)).
1183                                         for (j = 0; j < exp; j++)
1184                                                 // note that w^R = -1
1185                                                 sub(Z(i2,j),Z(i1,j), tmpZ[j-exp+R]);
1186                                         for (j = exp; j < R; j++)
1187                                                 sub(Z(i1,j),Z(i2,j), tmpZ[j-exp]);
1188                                         for (j = 0; j < R; j++) {
1189                                                 var nuss_outword sum;
1190                                                 add(Z(i1,j),Z(i2,j), sum);
1191                                                 shift(sum, Z(i1,j));
1192                                                 shift(tmpZ[j], Z(i2,j));
1193                                         }
1194                                 }
1195                         }
1196                 }
1197                 // Level l=m:
1198                 for (i = 0; i < M; i++) {
1199                         var uintL i1 = i;
1200                         var uintL i2 = i1 + M;
1201                         // Inverse Butterfly: replace (Z(i1),Z(i2)) by
1202                         // ((Z(i1)+Z(i2))/2, (Z(i1)-Z(i2))/2).
1203                         for (j = 0; j < R; j++) {
1204                                 var nuss_outword sum;
1205                                 var nuss_outword diff;
1206                                 add(Z(i1,j),Z(i2,j), sum);
1207                                 sub(Z(i1,j),Z(i2,j), diff);
1208                                 shift(sum, Z(i1,j));
1209                                 shift(diff, Z(i2,j));
1210                         }
1211                 }
1212         }
1213         // Reduce to length M.
1214         for (i = 0; i < M; i++) {
1215                 sub(Z(i,0),Z(i+M,R-1), z[i]);
1216                 for (j = 1; j < R; j++)
1217                         add(Z(i,j),Z(i+M,j-1), z[(j<<m)+i]);
1218         }
1219         #undef Z
1220         #undef Y
1221         #undef X
1222 }
1223
1224 // Computes z[k] := sum(i+j==k mod N, x[i]*y[j])
1225 // for all k=0..N-1.
1226 static void mulu_nuss_cyclic (const uintL n, const uintL N, // N = 2^n
1227                               nuss_inword * x, // N words, modified!
1228                               nuss_inword * y, // N words, modified!
1229                               nuss_outword * z // N words result
1230                              )
1231 {
1232         unused N;
1233         #if 0 // always n > 0
1234         if (n == 0) {
1235                 // z[0] := x0 y0
1236                 mul(x[0],y[0], z[0]);
1237                 return;
1238         }
1239         #endif
1240         if (n == 1) {
1241                 // z[0] := ((x0 + x1) (y0 + y1) + (x0 - x1) (y0 - y1)) / 2
1242                 // z[1] := ((x0 + x1) (y0 + y1) - (x0 - x1) (y0 - y1)) / 2
1243                 var nuss_inword x_sum;
1244                 var nuss_inword y_sum;
1245                 var nuss_inword x_diff;
1246                 var nuss_inword y_diff;
1247                 var nuss_outword first, second;
1248                 add(x[0],x[1], x_sum);
1249                 add(y[0],y[1], y_sum);
1250                 sub(x[0],x[1], x_diff);
1251                 sub(y[0],y[1], y_diff);
1252                 mul(x_sum,y_sum, first);
1253                 mul(x_diff,y_diff, second);
1254                 add(first,second, z[0]); shift(z[0], z[0]);
1255                 sub(first,second, z[1]); shift(z[1], z[1]);
1256                 return;
1257         }
1258         #if 0 // useless code because cl_nuss_threshold2 == 1
1259         if (n <= cl_nuss_threshold2) {
1260                 #if 0 // straightforward, but slow
1261                 var uintL k;
1262                 for (k = 0; k < N; k++) {
1263                         var uintL i;
1264                         var nuss_outword accu;
1265                         mul(x[0],y[k], accu);
1266                         for (i = 1; i <= k; i++) {
1267                                 var nuss_outword temp;
1268                                 mul(x[i],y[k-i], temp);
1269                                 add(accu,temp, accu);
1270                         }
1271                         for (i = k+1; i < N; i++) {
1272                                 var nuss_outword temp;
1273                                 mul(x[i],y[N-i+k], temp);
1274                                 add(accu,temp, accu);
1275                         }
1276                         z[k] = accu;
1277                 }
1278                 #else
1279                 var const uintL M = (uintL)1 << (n-1); // M = N/2
1280                 var uintL i, j, k;
1281                 for (k = 0; k < N; k++)
1282                         zero(z[k]);
1283                 for (i = 0; i < M; i++) {
1284                         var uintL iM = i+M;
1285                         for (j = 0; j < M; j++) {
1286                                 var uintL jM = j+M;
1287                                 // z[i+j]   += ((x[i] + x[i+M]) (y[j] + y[j+M]) + (x[i] - x[i+M]) (y[j] - y[j+M])) / 2
1288                                 // z[i+j+M] += ((x[i] + x[i+M]) (y[j] + y[j+M]) - (x[i] - x[i+M]) (y[j] - y[j+M])) / 2
1289                                 var nuss_inword x_sum;
1290                                 var nuss_inword y_sum;
1291                                 var nuss_inword x_diff;
1292                                 var nuss_inword y_diff;
1293                                 var nuss_outword first, second, temp;
1294                                 add(x[i],x[iM], x_sum);
1295                                 add(y[j],y[jM], y_sum);
1296                                 sub(x[i],x[iM], x_diff);
1297                                 sub(y[j],y[jM], y_diff);
1298                                 mul(x_sum,y_sum, first);
1299                                 mul(x_diff,y_diff, second);
1300                                 add(first,second, temp); add(z[i+j],temp, z[i+j]);
1301                                 var uintL ijM = (i+j+M) & (N-1);
1302                                 sub(first,second, temp); add(z[ijM],temp, z[ijM]);
1303                         }
1304                 }
1305                 for (k = 0; k < N; k++)
1306                         shift(z[k], z[k]);
1307                 #endif
1308                 return;
1309         }
1310         #endif
1311         var const uintL m = n-1;
1312         var const uintL M = (uintL)1 << m; // M = 2^m = N/2
1313         var uintL i;
1314         // Chinese remainder theorem: u^N-1 = (u^M-1)*(u^M+1)
1315         for (i = 0; i < M; i++) {
1316                 // Butterfly: replace (x(i),x(i+M))
1317                 // by (x(i)+x(i+M),x(i)-x(i+M)).
1318                 var nuss_inword tmp;
1319                 sub(x[i],x[i+M], tmp);
1320                 add(x[i],x[i+M], x[i]);
1321                 x[i+M] = tmp;
1322         }
1323         if (!(x == y)) // squaring?
1324         for (i = 0; i < M; i++) {
1325                 // Butterfly: replace (y(i),y(i+M))
1326                 // by (y(i)+y(i+M),y(i)-y(i+M)).
1327                 var nuss_inword tmp;
1328                 sub(y[i],y[i+M], tmp);
1329                 add(y[i],y[i+M], y[i]);
1330                 y[i+M] = tmp;
1331         }
1332         // Recurse.
1333         mulu_nuss_cyclic(m,M, &x[0], &y[0], &z[0]);
1334         mulu_nuss_negacyclic(m,M, &x[M], &y[M], &z[M]);
1335         for (i = 0; i < M; i++) {
1336                 // Inverse Butterfly: replace (z(i),z(i+M))
1337                 // by ((z(i)+z(i+M))/2,(z(i)-z(i+M))/2).
1338                 var nuss_outword sum;
1339                 var nuss_outword diff;
1340                 add(z[i],z[i+M], sum);
1341                 sub(z[i],z[i+M], diff);
1342                 shift(sum, z[i]);
1343                 shift(diff, z[i+M]);
1344         }
1345 }
1346
1347 static void mulu_nussbaumer (const uintD* sourceptr1, uintC len1,
1348                              const uintD* sourceptr2, uintC len2,
1349                              uintD* destptr)
1350 // Es ist 2 <= len1 <= len2.
1351 {
1352         // Methode:
1353         // source1 ist ein Stück der Länge N1, source2 ein oder mehrere Stücke
1354         // der Länge N2, mit N1+N2 <= N, wobei N Zweierpotenz ist.
1355         // sum(i=0..N-1, x_i b^i) * sum(i=0..N-1, y_i b^i) wird errechnet,
1356         // indem man die beiden Polynome
1357         // sum(i=0..N-1, x_i T^i), sum(i=0..N-1, y_i T^i)
1358         // multipliziert, und zwar durch Fourier-Transformation (s.o.).
1359         var uint32 n;
1360         integerlength32(len1-1, n=); // 2^(n-1) < len1 <= 2^n
1361         var uintL len = (uintL)1 << n; // kleinste Zweierpotenz >= len1
1362         // Wählt man N = len, so hat man ceiling(len2/(len-len1+1)) * FFT(len).
1363         // Wählt man N = 2*len, so hat man ceiling(len2/(2*len-len1+1)) * FFT(2*len).
1364         // Wir wählen das billigere von beiden:
1365         // Bei ceiling(len2/(len-len1+1)) <= 2 * ceiling(len2/(2*len-len1+1))
1366         // nimmt man N = len, bei ....... > ........ dagegen N = 2*len.
1367         // (Wahl von N = 4*len oder mehr bringt nur in Extremfällen etwas.)
1368         if (len2 > 2 * (len-len1+1) * (len2 <= (2*len-len1+1) ? 1 : ceiling(len2,(2*len-len1+1)))) {
1369                 n = n+1;
1370                 len = len << 1;
1371         }
1372         var const uintL N = len; // N = 2^n
1373         CL_ALLOCA_STACK;
1374         var nuss_inword* const x = cl_alloc_array(nuss_inword,N);
1375         var nuss_inword* const y = cl_alloc_array(nuss_inword,N);
1376         var nuss_outword* const z = cl_alloc_array(nuss_outword,N);
1377         var uintD* const tmpprod = cl_alloc_array(uintD,len1+1);
1378         var uintP i;
1379         var uintL destlen = len1+len2;
1380         clear_loop_lsp(destptr,destlen);
1381         do {
1382                 var uintL len2p; // length of a piece of source2
1383                 len2p = N - len1 + 1;
1384                 if (len2p > len2)
1385                         len2p = len2;
1386                 // len2p = min(N-len1+1,len2).
1387                 if (len2p == 1) {
1388                         // cheap case
1389                         var uintD* tmpptr = arrayLSDptr(tmpprod,len1+1);
1390                         mulu_loop_lsp(lspref(sourceptr2,0),sourceptr1,tmpptr,len1);
1391                         if (addto_loop_lsp(tmpptr,destptr,len1+1))
1392                                 if (inc_loop_lsp(destptr lspop (len1+1),destlen-(len1+1)))
1393                                         cl_abort();
1394                 } else {
1395                         var uintL destlenp = len1 + len2p - 1;
1396                         // destlenp = min(N,destlen-1).
1397                         var bool squaring = ((sourceptr1 == sourceptr2) && (len1 == len2p));
1398                         // Fill factor x.
1399                         {
1400                                 for (i = 0; i < len1; i++) {
1401                                         x[i].iw0 = lspref(sourceptr1,i);
1402                                         x[i].iw1 = 0;
1403                                 }
1404                                 for (i = len1; i < N; i++) {
1405                                         x[i].iw0 = 0;
1406                                         x[i].iw1 = 0;
1407                                 }
1408                         }
1409                         // Fill factor y.
1410                         if (!squaring) {
1411                                 for (i = 0; i < len2p; i++) {
1412                                         y[i].iw0 = lspref(sourceptr2,i);
1413                                         y[i].iw1 = 0;
1414                                 }
1415                                 for (i = len2p; i < N; i++) {
1416                                         y[i].iw0 = 0;
1417                                         y[i].iw1 = 0;
1418                                 }
1419                         }
1420                         // Multiply.
1421                         if (!squaring)
1422                                 mulu_nuss_cyclic(n,N, &x[0], &y[0], &z[0]);
1423                         else
1424                                 mulu_nuss_cyclic(n,N, &x[0], &x[0], &z[0]);
1425                         #ifdef DEBUG_NUSS
1426                         // Check result.
1427                         for (i = 0; i < N; i++)
1428                                 if (!(z[i].ow3 == 0))
1429                                         cl_abort();
1430                         #endif
1431                         // Add result to destptr[-destlen..-1]:
1432                         {
1433                                 var uintD* ptr = destptr;
1434                                 // ac2|ac1|ac0 are an accumulator.
1435                                 var uint32 ac0 = 0;
1436                                 var uint32 ac1 = 0;
1437                                 var uint32 ac2 = 0;
1438                                 var uint32 tmp;
1439                                 for (i = 0; i < destlenp; i++) {
1440                                         // Add z[i] to the accumulator.
1441                                         tmp = z[i].ow0;
1442                                         if ((ac0 += tmp) < tmp) {
1443                                                 if (++ac1 == 0)
1444                                                         ++ac2;
1445                                         }
1446                                         tmp = z[i].ow1;
1447                                         if ((ac1 += tmp) < tmp)
1448                                                 ++ac2;
1449                                         tmp = z[i].ow2;
1450                                         ac2 += tmp;
1451                                         // Add the accumulator's least significant word to destptr:
1452                                         tmp = lspref(ptr,0);
1453                                         if ((ac0 += tmp) < tmp) {
1454                                                 if (++ac1 == 0)
1455                                                         ++ac2;
1456                                         }
1457                                         lspref(ptr,0) = ac0;
1458                                         lsshrink(ptr);
1459                                         ac0 = ac1;
1460                                         ac1 = ac2;
1461                                         ac2 = 0;
1462                                 }
1463                                 // ac2 = 0.
1464                                 if (ac1 > 0) {
1465                                         if (!((i += 2) <= destlen))
1466                                                 cl_abort();
1467                                         tmp = lspref(ptr,0);
1468                                         if ((ac0 += tmp) < tmp)
1469                                                 ++ac1;
1470                                         lspref(ptr,0) = ac0;
1471                                         lsshrink(ptr);
1472                                         tmp = lspref(ptr,0);
1473                                         ac1 += tmp;
1474                                         lspref(ptr,0) = ac1;
1475                                         lsshrink(ptr);
1476                                         if (ac1 < tmp)
1477                                                 if (inc_loop_lsp(ptr,destlen-i))
1478                                                         cl_abort();
1479                                 } else if (ac0 > 0) {
1480                                         if (!((i += 1) <= destlen))
1481                                                 cl_abort();
1482                                         tmp = lspref(ptr,0);
1483                                         ac0 += tmp;
1484                                         lspref(ptr,0) = ac0;
1485                                         lsshrink(ptr);
1486                                         if (ac0 < tmp)
1487                                                 if (inc_loop_lsp(ptr,destlen-i))
1488                                                         cl_abort();
1489                                 }
1490                         }
1491                         #ifdef DEBUG_NUSS
1492                         // If destlenp < N, check that the remaining z[i] are 0.
1493                         for (i = destlenp; i < N; i++)
1494                                 if (z[i].ow2 > 0 || z[i].ow1 > 0 || z[i].ow0 > 0)
1495                                         cl_abort();
1496                         #endif
1497                 }
1498                 // Decrement len2.
1499                 destptr = destptr lspop len2p;
1500                 destlen -= len2p;
1501                 sourceptr2 = sourceptr2 lspop len2p;
1502                 len2 -= len2p;
1503         } while (len2 > 0);
1504 }
1505
1506 #undef iw0
1507 #undef iw1
1508 #undef ow0
1509 #undef ow1
1510 #undef ow2
1511 #undef ow3