1 // Fast integer multiplication using Nussbaumer's FFT based algorithm.
2 // [Donald Ervin Knuth: The Art of Computer Programming, Vol. II:
3 // Seminumerical Algorithms, second edition.
4 // Section 4.6.4, exercise 59, p. 503, 652-654.]
5 // [Henri Jean Nussbaumer, IEEE Trans. ASSP-28 (1980), 205-215.]
6 // Bruno Haible 4.-5.5.1996
8 // This algorithm has the benefit of working on entire words, not single bits,
9 // and involving no non-integer numbers. (The root of unity is chosen in
10 // an appropriate polynomial ring.)
12 // If at the beginning all words x_i, y_i are >= 0 and < M, then
13 // the intermediate X_{i,j}, Y_{i,j} are < M * N in absolute value
14 // (where N = number of words), hence the |Z_{i,j}| < M^2 * N^2.
15 // We therefore reserve 2 32-bit words for every X_{i,j} and 4 32-bit words
19 #error "nussbaumer implemented only for intDsize==32"
22 // Define this if you want the external loops instead of inline operations.
23 //#define NUSS_IN_EXTERNAL_LOOPS
24 #define NUSS_OUT_EXTERNAL_LOOPS
26 // Define this if you want inline operations which access the stack directly.
27 // This looks like better code, but is in effect 3% slower. No idea why.
28 //#define NUSS_ASM_DIRECT
30 // Define this for (cheap) consistency checks.
33 // Define this for extensive consistency checks.
34 //#define DEBUG_NUSS_OPERATIONS
38 //typedef struct { sint32 iw1; uint32 iw0; } nuss_inword;
39 //typedef struct { uint32 iw0; sint32 iw1; } nuss_inword;
40 typedef struct { uintD _iw[2]; } nuss_inword;
41 #if CL_DS_BIG_ENDIAN_P
49 //typedef struct { sint32 ow3; uint32 ow2; uint32 ow1; uint32 ow0; } nuss_outword;
50 //typedef struct { uint32 ow0; uint32 ow1; uint32 ow2; sint32 ow3; } nuss_outword;
51 typedef struct { uintD _ow[4]; } nuss_outword;
52 #if CL_DS_BIG_ENDIAN_P
65 static inline void add (const nuss_inword& a, const nuss_inword& b, nuss_inword& r)
67 #if defined(__GNUC__) && defined(__i386__)
69 #ifdef NUSS_ASM_DIRECT
70 __asm__ __volatile__ (
75 : "m" (a.iw0), "m" (b.iw0), "m" (r.iw0)
78 __asm__ __volatile__ (
83 : "m" (a.iw1), "m" (b.iw1), "m" (r.iw1)
87 #if CL_DS_BIG_ENDIAN_P
88 __asm__ __volatile__ (
89 "movl 4(%1),%0" "\n\t"
90 "addl 4(%2),%0" "\n\t"
91 "movl %0,4(%3)" "\n\t"
96 : "r" (&a), "r" (&b), "r" (&r)
100 __asm__ __volatile__ (
101 "movl (%1),%0" "\n\t"
102 "addl (%2),%0" "\n\t"
103 "movl %0,(%3)" "\n\t"
104 "movl 4(%1),%0" "\n\t"
105 "adcl 4(%2),%0" "\n\t"
108 : "r" (&a), "r" (&b), "r" (&r)
113 #elif defined(NUSS_IN_EXTERNAL_LOOPS)
114 add_loop_lsp(arrayLSDptr(a._iw,2),arrayLSDptr(b._iw,2),arrayLSDptr(r._iw,2),2);
122 r.iw1 = a.iw1 + b.iw1;
126 r.iw1 = a.iw1 + b.iw1 + 1;
132 static inline void sub (const nuss_inword& a, const nuss_inword& b, nuss_inword& r)
134 #if defined(__GNUC__) && defined(__i386__)
136 #ifdef NUSS_ASM_DIRECT
137 __asm__ __volatile__ (
142 : "m" (a.iw0), "m" (b.iw0), "m" (r.iw0)
145 __asm__ __volatile__ (
150 : "m" (a.iw1), "m" (b.iw1), "m" (r.iw1)
154 #if CL_DS_BIG_ENDIAN_P
155 __asm__ __volatile__ (
156 "movl 4(%1),%0" "\n\t"
157 "subl 4(%2),%0" "\n\t"
158 "movl %0,4(%3)" "\n\t"
159 "movl (%1),%0" "\n\t"
160 "sbbl (%2),%0" "\n\t"
163 : "r" (&a), "r" (&b), "r" (&r)
167 __asm__ __volatile__ (
168 "movl (%1),%0" "\n\t"
169 "subl (%2),%0" "\n\t"
170 "movl %0,(%3)" "\n\t"
171 "movl 4(%1),%0" "\n\t"
172 "sbbl 4(%2),%0" "\n\t"
175 : "r" (&a), "r" (&b), "r" (&r)
180 #elif defined(NUSS_IN_EXTERNAL_LOOPS)
181 sub_loop_lsp(arrayLSDptr(a._iw,2),arrayLSDptr(b._iw,2),arrayLSDptr(r._iw,2),2);
189 r.iw1 = a.iw1 - b.iw1;
193 r.iw1 = a.iw1 - b.iw1 - 1;
199 static void mul (const nuss_inword& a, const nuss_inword& b, nuss_outword& r)
201 #ifdef NUSS_IN_EXTERNAL_LOOPS
202 mulu_2loop(arrayLSDptr(a._iw,2),2, arrayLSDptr(b._iw,2),2, arrayLSDptr(r._ow,4));
203 if ((sintD)mspref(arrayMSDptr(a._iw,2),0) < 0)
204 subfrom_loop_lsp(arrayLSDptr(b._iw,2),arrayLSDptr(r._ow,4) lspop 2,2);
205 if ((sintD)mspref(arrayMSDptr(b._iw,2),0) < 0)
206 subfrom_loop_lsp(arrayLSDptr(a._iw,2),arrayLSDptr(r._ow,4) lspop 2,2);
211 // a, b small positive
212 mulu32(a.iw0, b.iw0, r.ow1 =, r.ow0 =);
213 r.ow3 = 0; r.ow2 = 0;
216 else if (b.iw1 == -(uint32)1 && b.iw0 != 0) {
219 mulu32(a.iw0, -b.iw0, hi=, lo=);
225 } else /* a.iw0 == 0 */ {
226 r.ow3 = 0; r.ow2 = 0; r.ow1 = 0;
229 r.ow3 = -(uint32)1; r.ow2 = -(uint32)1;
232 var uint32 hi1, lo1, hi0;
233 mulu32(a.iw0, b.iw0, hi0 =, r.ow0 =);
234 mulu32(a.iw0, b.iw1, hi1 =, lo1 =);
235 if ((lo1 += hi0) < hi0)
237 // hi1|lo1|r.ow0 = a.iw0 * b(unsigned).
239 if ((sint32)b.iw1 >= 0) {
243 // b was negative -> subtract a * 2^64
247 } else /* a.iw0 == 0 */ {
248 r.ow3 = 0; r.ow2 = 0;
253 else if (a.iw1 == -(uint32)1 && a.iw0 != 0) {
258 mulu32(-a.iw0, b.iw0, hi=, lo=);
264 } else /* b.iw0 == 0 */ {
265 r.ow3 = 0; r.ow2 = 0; r.ow1 = 0;
268 r.ow3 = -(uint32)1; r.ow2 = -(uint32)1;
271 else if (b.iw1 == -(uint32)1 && b.iw0 != 0) {
272 // a, b small negative
273 mulu32(-a.iw0, -b.iw0, r.ow1 =, r.ow0 =);
274 r.ow3 = 0; r.ow2 = 0;
277 var uint32 hi1, lo1, hi0, lo0;
278 mulu32(-a.iw0, b.iw0, hi0 =, lo0 =);
279 mulu32(-a.iw0, b.iw1, hi1 =, lo1 =);
280 if ((lo1 += hi0) < hi0)
282 // hi1|lo1|lo0 = -a * b(unsigned).
292 // hi1|lo1|lo0 = a * b(unsigned).
295 if ((sint32)b.iw1 >= 0) {
299 // b was negative -> subtract a * 2^64
305 else if (b.iw1 == 0) {
307 var uint32 hi1, lo1, hi0;
308 mulu32(b.iw0, a.iw0, hi0 =, r.ow0 =);
309 mulu32(b.iw0, a.iw1, hi1 =, lo1 =);
310 if ((lo1 += hi0) < hi0)
312 // hi1|lo1|r.ow0 = a(unsigned) * b.iw0.
314 if ((sint32)a.iw1 >= 0) {
318 // a was negative -> subtract b * 2^64
322 } else /* b.iw0 == 0 */ {
323 r.ow3 = 0; r.ow2 = 0;
328 else if (b.iw1 == -(uint32)1 && b.iw0 != 0) {
330 var uint32 hi1, lo1, hi0, lo0;
331 mulu32(-b.iw0, a.iw0, hi0 =, lo0 =);
332 mulu32(-b.iw0, a.iw1, hi1 =, lo1 =);
333 if ((lo1 += hi0) < hi0)
335 // hi1|lo1|lo0 = a(unsigned) * -b.
345 // hi1|lo1|lo0 = a(unsigned) * b.
348 if ((sint32)a.iw1 >= 0) {
352 // a was negative -> subtract b * 2^64
358 // This is the main and most frequent case (65% to 80%).
359 var uint32 w3, w2, w1, hi, lo;
360 mulu32(a.iw0, b.iw0, w1=, r.ow0=);
361 mulu32(a.iw1, b.iw1, w3=, w2=);
362 mulu32(a.iw0, b.iw1, hi=, lo=);
367 mulu32(a.iw1, b.iw0, hi=, lo=);
372 // w3|w2|w1|r.ow0 = a(unsigned) * b(unsigned).
374 if ((sint32)a.iw1 < 0) {
375 // a was negative -> subtract b * 2^64
385 if ((sint32)b.iw1 < 0) {
386 // b was negative -> subtract a * 2^64
401 #ifdef DEBUG_NUSS_OPERATIONS
402 static void mul_doublecheck (const nuss_inword& a, const nuss_inword& b, nuss_outword& r)
405 mulu_2loop(arrayLSDptr(a._iw,2),2, arrayLSDptr(b._iw,2),2, arrayLSDptr(or._ow,4));
406 if ((sintD)mspref(arrayMSDptr(a._iw,2),0) < 0)
407 subfrom_loop_lsp(arrayLSDptr(b._iw,2),arrayLSDptr(or._ow,4) lspop 2,2);
408 if ((sintD)mspref(arrayMSDptr(b._iw,2),0) < 0)
409 subfrom_loop_lsp(arrayLSDptr(a._iw,2),arrayLSDptr(or._ow,4) lspop 2,2);
411 if (compare_loop_msp(arrayMSDptr(r._ow,4),arrayMSDptr(or._ow,4),4))
414 #define mul mul_doublecheck
418 static inline void zero (nuss_outword& r)
427 static inline void add (const nuss_outword& a, const nuss_outword& b, nuss_outword& r)
429 #if defined(__GNUC__) && defined(__i386__)
431 #ifdef NUSS_ASM_DIRECT
432 __asm__ __volatile__ (
437 : "m" (a.ow0), "m" (b.ow0), "m" (r.ow0)
440 __asm__ __volatile__ (
445 : "m" (a.ow1), "m" (b.ow1), "m" (r.ow1)
448 __asm__ __volatile__ (
453 : "m" (a.ow2), "m" (b.ow2), "m" (r.ow2)
456 __asm__ __volatile__ (
461 : "m" (a.ow3), "m" (b.ow3), "m" (r.ow3)
465 #if CL_DS_BIG_ENDIAN_P
466 __asm__ __volatile__ (
467 "movl 12(%1),%0" "\n\t"
468 "addl 12(%2),%0" "\n\t"
469 "movl %0,12(%3)" "\n\t"
470 "movl 8(%1),%0" "\n\t"
471 "adcl 8(%2),%0" "\n\t"
472 "movl %0,8(%3)" "\n\t"
473 "movl 4(%1),%0" "\n\t"
474 "adcl 4(%2),%0" "\n\t"
475 "movl %0,4(%3)" "\n\t"
476 "movl (%1),%0" "\n\t"
477 "adcl (%2),%0" "\n\t"
480 : "r" (&a), "r" (&b), "r" (&r)
484 __asm__ __volatile__ (
485 "movl (%1),%0" "\n\t"
486 "addl (%2),%0" "\n\t"
487 "movl %0,(%3)" "\n\t"
488 "movl 4(%1),%0" "\n\t"
489 "adcl 4(%2),%0" "\n\t"
490 "movl %0,4(%3)" "\n\t"
491 "movl 8(%1),%0" "\n\t"
492 "adcl 8(%2),%0" "\n\t"
493 "movl %0,8(%3)" "\n\t"
494 "movl 12(%1),%0" "\n\t"
495 "adcl 12(%2),%0" "\n\t"
498 : "r" (&a), "r" (&b), "r" (&r)
503 #elif defined(NUSS_OUT_EXTERNAL_LOOPS)
504 add_loop_lsp(arrayLSDptr(a._ow,4),arrayLSDptr(b._ow,4),arrayLSDptr(r._ow,4),4);
513 if (tmp >= a.ow1) goto no_carry_1; else goto carry_1;
517 tmp = a.ow1 + b.ow1 + 1;
518 if (tmp > a.ow1) goto no_carry_1; else goto carry_1;
521 no_carry_1: // no carry
524 if (tmp >= a.ow2) goto no_carry_2; else goto carry_2;
528 tmp = a.ow2 + b.ow2 + 1;
529 if (tmp > a.ow2) goto no_carry_2; else goto carry_2;
532 no_carry_2: // no carry
538 tmp = a.ow3 + b.ow3 + 1;
545 static inline void sub (const nuss_outword& a, const nuss_outword& b, nuss_outword& r)
547 #if defined(__GNUC__) && defined(__i386__)
549 #ifdef NUSS_ASM_DIRECT
550 __asm__ __volatile__ (
555 : "m" (a.ow0), "m" (b.ow0), "m" (r.ow0)
558 __asm__ __volatile__ (
563 : "m" (a.ow1), "m" (b.ow1), "m" (r.ow1)
566 __asm__ __volatile__ (
571 : "m" (a.ow2), "m" (b.ow2), "m" (r.ow2)
574 __asm__ __volatile__ (
579 : "m" (a.ow3), "m" (b.ow3), "m" (r.ow3)
583 #if CL_DS_BIG_ENDIAN_P
584 __asm__ __volatile__ (
585 "movl 12(%1),%0" "\n\t"
586 "subl 12(%2),%0" "\n\t"
587 "movl %0,12(%3)" "\n\t"
588 "movl 8(%1),%0" "\n\t"
589 "sbbl 8(%2),%0" "\n\t"
590 "movl %0,8(%3)" "\n\t"
591 "movl 4(%1),%0" "\n\t"
592 "sbbl 4(%2),%0" "\n\t"
593 "movl %0,4(%3)" "\n\t"
594 "movl (%1),%0" "\n\t"
595 "sbbl (%2),%0" "\n\t"
598 : "r" (&a), "r" (&b), "r" (&r)
602 __asm__ __volatile__ (
603 "movl (%1),%0" "\n\t"
604 "subl (%2),%0" "\n\t"
605 "movl %0,(%3)" "\n\t"
606 "movl 4(%1),%0" "\n\t"
607 "sbbl 4(%2),%0" "\n\t"
608 "movl %0,4(%3)" "\n\t"
609 "movl 8(%1),%0" "\n\t"
610 "sbbl 8(%2),%0" "\n\t"
611 "movl %0,8(%3)" "\n\t"
612 "movl 12(%1),%0" "\n\t"
613 "sbbl 12(%2),%0" "\n\t"
616 : "r" (&a), "r" (&b), "r" (&r)
621 #elif defined(NUSS_OUT_EXTERNAL_LOOPS)
622 sub_loop_lsp(arrayLSDptr(a._ow,4),arrayLSDptr(b._ow,4),arrayLSDptr(r._ow,4),4);
631 if (tmp <= a.ow1) goto no_carry_1; else goto carry_1;
635 tmp = a.ow1 - b.ow1 - 1;
636 if (tmp < a.ow1) goto no_carry_1; else goto carry_1;
639 no_carry_1: // no carry
642 if (tmp <= a.ow2) goto no_carry_2; else goto carry_2;
646 tmp = a.ow2 - b.ow2 - 1;
647 if (tmp < a.ow2) goto no_carry_2; else goto carry_2;
650 no_carry_2: // no carry
656 tmp = a.ow3 - b.ow3 - 1;
663 static inline void shift (const nuss_outword& a, nuss_outword& b)
665 #if defined(__GNUC__) && defined(__i386__) && !defined(DEBUG_NUSS)
667 #ifdef NUSS_ASM_DIRECT
668 __asm__ __volatile__ (
673 : "m" (a.ow3), "m" (b.ow3)
676 __asm__ __volatile__ (
681 : "m" (a.ow2), "m" (b.ow2)
684 __asm__ __volatile__ (
689 : "m" (a.ow1), "m" (b.ow1)
692 __asm__ __volatile__ (
697 : "m" (a.ow0), "m" (b.ow0)
701 #if CL_DS_BIG_ENDIAN_P
702 __asm__ __volatile__ (
703 "movl (%1),%0" "\n\t"
705 "movl %0,(%2)" "\n\t"
706 "movl 4(%1),%0" "\n\t"
708 "movl %0,4(%2)" "\n\t"
709 "movl 8(%1),%0" "\n\t"
711 "movl %0,8(%2)" "\n\t"
712 "movl 12(%1),%0" "\n\t"
720 __asm__ __volatile__ (
721 "movl 12(%1),%0" "\n\t"
723 "movl %0,12(%2)" "\n\t"
724 "movl 8(%1),%0" "\n\t"
726 "movl %0,8(%2)" "\n\t"
727 "movl 4(%1),%0" "\n\t"
729 "movl %0,4(%2)" "\n\t"
730 "movl (%1),%0" "\n\t"
739 #elif defined(NUSS_OUT_EXTERNAL_LOOPS)
741 if (shiftrightcopy_loop_msp(arrayMSDptr(a._ow,4),arrayMSDptr(b._ow,4),4,1,mspref(arrayMSDptr(a._ow,4),0)>>31))
744 shiftrightcopy_loop_msp(arrayMSDptr(a._ow,4),arrayMSDptr(b._ow,4),4,1,mspref(arrayMSDptr(a._ow,4),0)>>31);
747 var uint32 tmp, carry;
750 b.ow3 = (sint32)tmp >> 1;
753 b.ow2 = (tmp >> 1) | carry;
756 b.ow1 = (tmp >> 1) | carry;
759 b.ow0 = (tmp >> 1) | carry;
768 #endif // (intDsize==32)
772 //typedef struct { sint64 iw1; uint64 iw0; } nuss_inword;
773 //typedef struct { uint64 iw0; sint64 iw1; } nuss_inword;
774 typedef struct { uintD _iw[2]; } nuss_inword;
775 #if CL_DS_BIG_ENDIAN_P
783 //typedef struct { sint64 ow2; uint64 ow1; uint64 ow0; } nuss_outword;
784 //typedef struct { uint64 ow0; uint64 ow1; sint64 ow2; } nuss_outword;
785 typedef struct { uintD _ow[3]; } nuss_outword;
786 #if CL_DS_BIG_ENDIAN_P
797 static inline void add (const nuss_inword& a, const nuss_inword& b, nuss_inword& r)
799 #ifdef NUSS_IN_EXTERNAL_LOOPS
800 add_loop_lsp(arrayLSDptr(a._iw,2),arrayLSDptr(b._iw,2),arrayLSDptr(r._iw,2),2);
808 r.iw1 = a.iw1 + b.iw1;
812 r.iw1 = a.iw1 + b.iw1 + 1;
818 static inline void sub (const nuss_inword& a, const nuss_inword& b, nuss_inword& r)
820 #ifdef NUSS_IN_EXTERNAL_LOOPS
821 sub_loop_lsp(arrayLSDptr(a._iw,2),arrayLSDptr(b._iw,2),arrayLSDptr(r._iw,2),2);
829 r.iw1 = a.iw1 - b.iw1;
833 r.iw1 = a.iw1 - b.iw1 - 1;
839 static inline void zero (nuss_outword& r)
847 static inline void add (const nuss_outword& a, const nuss_outword& b, nuss_outword& r)
849 #ifdef NUSS_OUT_EXTERNAL_LOOPS
850 add_loop_lsp(arrayLSDptr(a._ow,3),arrayLSDptr(b._ow,3),arrayLSDptr(r._ow,3),3);
859 if (tmp >= a.ow1) goto no_carry_1; else goto carry_1;
863 tmp = a.ow1 + b.ow1 + 1;
864 if (tmp > a.ow1) goto no_carry_1; else goto carry_1;
867 no_carry_1: // no carry
873 tmp = a.ow2 + b.ow2 + 1;
880 static inline void sub (const nuss_outword& a, const nuss_outword& b, nuss_outword& r)
882 #ifdef NUSS_OUT_EXTERNAL_LOOPS
883 sub_loop_lsp(arrayLSDptr(a._ow,3),arrayLSDptr(b._ow,3),arrayLSDptr(r._ow,3),3);
892 if (tmp <= a.ow1) goto no_carry_1; else goto carry_1;
896 tmp = a.ow1 - b.ow1 - 1;
897 if (tmp < a.ow1) goto no_carry_1; else goto carry_1;
900 no_carry_1: // no carry
906 tmp = a.ow2 - b.ow2 - 1;
913 static inline void shift (const nuss_outword& a, nuss_outword& b)
915 #ifdef NUSS_OUT_EXTERNAL_LOOPS
917 if (shiftrightcopy_loop_msp(arrayMSDptr(a._ow,3),arrayMSDptr(b._ow,3),3,1,mspref(arrayMSDptr(a._ow,3),0)>>63))
920 shiftrightcopy_loop_msp(arrayMSDptr(a._ow,3),arrayMSDptr(b._ow,3),3,1,mspref(arrayMSDptr(a._ow,3),0)>>63);
923 var uint64 tmp, carry;
926 b.ow2 = (sint64)tmp >> 1;
929 b.ow1 = (tmp >> 1) | carry;
932 b.ow0 = (tmp >> 1) | carry;
941 #endif // (intDsize==64)
943 // This is a recursive implementation.
944 // TODO: Write a non-recursive one.
948 // Reverse an n-bit number x. n>0.
949 static uintL bit_reverse (uintL n, uintL x)
956 } while (!(--n == 0));
961 // Threshold for recursion base in mulu_nuss_negacyclic().
962 // Time of a multiplication with len1=len2=10000 on Linux i486:
963 // normal asm-optimized
964 // threshold1 = 1: 40.1 sec 25.5 sec
965 // threshold1 = 2: 28.6 sec 18.3 sec
966 // threshold1 = 3: 25.6 sec 16.6 sec
967 // threshold1 = 4: 25.7 sec 17.6 sec
968 // threshold1 = 5: 26.1 sec 18.0 sec
969 const uintL cl_nuss_threshold1 = 3;
971 // Threshold for recursion base in mulu_nuss_cyclic().
972 const uintL cl_nuss_threshold2 = 1;
974 // Computes z[k] := sum(i+j==k mod N, x[i]*y[j]*(-1)^((i+j-k)/N))
976 static void mulu_nuss_negacyclic (const uintL n, const uintL N, // N = 2^n
977 const nuss_inword * x, // N words
978 const nuss_inword * y, // N words
979 nuss_outword * z // N words result
982 #if 0 // always n > 0
985 mul(x[0],y[0], z[0]);
989 if (n <= cl_nuss_threshold1) {
991 // z[0] := x0 (y0 + y1) - (x0 + x1) y1
992 // z[1] := x0 (y0 + y1) + (x1 - x0) y0
993 var nuss_inword x_sum;
994 var nuss_inword y_sum;
995 var nuss_outword first, second;
996 add(x[0],x[1], x_sum);
997 add(y[0],y[1], y_sum);
998 mul(x[0],y_sum, first);
999 mul(x_sum,y[1], second); sub(first,second, z[0]);
1000 sub(x[1],x[0], x_sum);
1001 mul(x_sum,y[0], second); add(first,second, z[1]);
1004 // 1 < n <= cl_nuss_threshold1.
1005 #if 0 // straightforward, but slow
1007 for (k = 0; k < N; k++) {
1009 var nuss_outword accu;
1010 mul(x[0],y[k], accu);
1011 for (i = 1; i <= k; i++) {
1012 var nuss_outword temp;
1013 mul(x[i],y[k-i], temp);
1014 add(accu,temp, accu);
1016 for (i = k+1; i < N; i++) {
1017 var nuss_outword temp;
1018 mul(x[i],y[N-i+k], temp);
1019 sub(accu,temp, accu);
1024 var const uintL M = (uintL)1 << (n-1); // M = N/2
1026 for (k = 0; k < N; k++)
1028 for (i = 0; i < M; i++) {
1030 for (j = 0; j < M-i; j++) {
1032 // z[i+j] += x[i] (y[j] + y[j+M]) - (x[i] + x[i+M]) y[j+M]
1033 // z[i+j+M] += x[i] (y[j] + y[j+M]) + (x[i+M] - x[i]) y[j]
1034 var nuss_inword x_sum;
1035 var nuss_inword y_sum;
1036 var nuss_outword first, second, temp;
1037 add(x[i],x[iM], x_sum);
1038 add(y[j],y[jM], y_sum);
1039 mul(x[i],y_sum, first);
1040 mul(x_sum,y[jM], second); sub(first,second, temp); add(z[i+j],temp, z[i+j]);
1041 sub(x[iM],x[i], x_sum);
1042 mul(x_sum,y[j], second); add(first,second, temp); add(z[i+j+M],temp, z[i+j+M]);
1044 for (j = M-i; j < M; j++) {
1046 // z[i+j] += x[i] (y[j] + y[j+M]) - (x[i] + x[i+M]) y[j+M]
1047 // z[i+j-M] -= x[i] (y[j] + y[j+M]) + (x[i+M] - x[i]) y[j]
1048 var nuss_inword x_sum;
1049 var nuss_inword y_sum;
1050 var nuss_outword first, second, temp;
1051 add(x[i],x[iM], x_sum);
1052 add(y[j],y[jM], y_sum);
1053 mul(x[i],y_sum, first);
1054 mul(x_sum,y[jM], second); sub(first,second, temp); add(z[i+j],temp, z[i+j]);
1055 sub(x[iM],x[i], x_sum);
1056 mul(x_sum,y[j], second); add(first,second, temp); sub(z[i+j-M],temp, z[i+j-M]);
1063 var const uintL m = n >> 1; // floor(n/2)
1064 var const uintL r = n - m; // ceiling(n/2)
1065 var const uintL M = (uintL)1 << m; // M = 2^m
1066 var const uintL R = (uintL)1 << r; // R = 2^r
1068 var nuss_inword* const auX = cl_alloc_array(nuss_inword,2*N);
1069 var nuss_inword* const auY = cl_alloc_array(nuss_inword,2*N);
1070 var nuss_outword* const auZ = cl_alloc_array(nuss_outword,2*N);
1071 #define X(i,j) auX[((i)<<r)+(j)] /* 0 <= i < 2*M, 0 <= j < R */
1072 #define Y(i,j) auY[((i)<<r)+(j)] /* 0 <= i < 2*M, 0 <= j < R */
1073 #define Z(i,j) auZ[((i)<<r)+(j)] /* 0 <= i < 2*M, 0 <= j < R */
1074 var nuss_inword* const tmp1 = cl_alloc_array(nuss_inword,R);
1075 var nuss_inword* const tmp2 = cl_alloc_array(nuss_inword,R);
1076 var nuss_outword* const tmpZ = cl_alloc_array(nuss_outword,R);
1077 var bool squaring = (x == y);
1079 // Initialize polynomials X(i) and Y(i).
1080 for (i = 0; i < M; i++) {
1082 for (j = 0; j < R; j++)
1083 X(i,j) = x[(j<<m) + i];
1086 for (j = 0; j < R; j++)
1087 Y(i,j) = y[(j<<m) + i];
1090 // For i = M..2*M-1, the polynomials are implicitly 0.
1091 // Do an FFT of length 2*M on X.
1095 for (i = 0; i < M; i++)
1096 for (j = 0; j < R; j++)
1098 // Level l = m-1..0:
1099 for (l = m-1; l>=0; l--) {
1100 var const uintL smax = (uintL)1 << (m-l);
1101 var const uintL tmax = (uintL)1 << l;
1102 for (var uintL s = 0; s < smax; s++) {
1103 var uintL exp = bit_reverse(m-l,s) << (l + r-m);
1104 for (var uintL t = 0; t < tmax; t++) {
1105 var uintL i1 = (s << (l+1)) + t;
1106 var uintL i2 = i1 + tmax;
1107 // Butterfly: replace (X(i1),X(i2)) by
1108 // (X(i1) + w^exp*X(i2), X(i1) - w^exp*X(i2)).
1109 for (j = 0; j < exp; j++) {
1110 // note that w^R = -1
1111 sub(X(i1,j),X(i2,j-exp+R), tmp1[j]);
1112 add(X(i1,j),X(i2,j-exp+R), tmp2[j]);
1114 for (j = exp; j < R; j++) {
1115 add(X(i1,j),X(i2,j-exp), tmp1[j]);
1116 sub(X(i1,j),X(i2,j-exp), tmp2[j]);
1118 for (j = 0; j < R; j++) {
1126 // Do an FFT of length 2*M on Y.
1130 for (i = 0; i < M; i++)
1131 for (j = 0; j < R; j++)
1133 // Level l = m-1..0:
1134 for (l = m-1; l>=0; l--) {
1135 var const uintL smax = (uintL)1 << (m-l);
1136 var const uintL tmax = (uintL)1 << l;
1137 for (var uintL s = 0; s < smax; s++) {
1138 var uintL exp = bit_reverse(m-l,s) << (l + r-m);
1139 for (var uintL t = 0; t < tmax; t++) {
1140 var uintL i1 = (s << (l+1)) + t;
1141 var uintL i2 = i1 + tmax;
1142 // Butterfly: replace (Y(i1),Y(i2)) by
1143 // (Y(i1) + w^exp*Y(i2), Y(i1) - w^exp*Y(i2)).
1144 for (j = 0; j < exp; j++) {
1145 // note that w^R = -1
1146 sub(Y(i1,j),Y(i2,j-exp+R), tmp1[j]);
1147 add(Y(i1,j),Y(i2,j-exp+R), tmp2[j]);
1149 for (j = exp; j < R; j++) {
1150 add(Y(i1,j),Y(i2,j-exp), tmp1[j]);
1151 sub(Y(i1,j),Y(i2,j-exp), tmp2[j]);
1153 for (j = 0; j < R; j++) {
1161 // Recursively compute the negacyclic product X(i)*Y(i) for all i.
1163 for (i = 0; i < 2*M; i++)
1164 mulu_nuss_negacyclic(r,R, &X(i,0), &Y(i,0), &Z(i,0));
1166 for (i = 0; i < 2*M; i++)
1167 mulu_nuss_negacyclic(r,R, &X(i,0), &X(i,0), &Z(i,0));
1169 // Undo an FFT of length 2*M on Z.
1172 // Level l = 0..m-1:
1173 for (l = 0; l < m; l++) {
1174 var const uintL smax = (uintL)1 << (m-l);
1175 var const uintL tmax = (uintL)1 << l;
1176 for (var uintL s = 0; s < smax; s++) {
1177 var uintL exp = bit_reverse(m-l,s) << (l + r-m);
1178 for (var uintL t = 0; t < tmax; t++) {
1179 var uintL i1 = (s << (l+1)) + t;
1180 var uintL i2 = i1 + tmax;
1181 // Inverse Butterfly: replace (Z(i1),Z(i2)) by
1182 // ((Z(i1)+Z(i2))/2, (Z(i1)-Z(i2))/(2*w^exp)).
1183 for (j = 0; j < exp; j++)
1184 // note that w^R = -1
1185 sub(Z(i2,j),Z(i1,j), tmpZ[j-exp+R]);
1186 for (j = exp; j < R; j++)
1187 sub(Z(i1,j),Z(i2,j), tmpZ[j-exp]);
1188 for (j = 0; j < R; j++) {
1189 var nuss_outword sum;
1190 add(Z(i1,j),Z(i2,j), sum);
1191 shift(sum, Z(i1,j));
1192 shift(tmpZ[j], Z(i2,j));
1198 for (i = 0; i < M; i++) {
1200 var uintL i2 = i1 + M;
1201 // Inverse Butterfly: replace (Z(i1),Z(i2)) by
1202 // ((Z(i1)+Z(i2))/2, (Z(i1)-Z(i2))/2).
1203 for (j = 0; j < R; j++) {
1204 var nuss_outword sum;
1205 var nuss_outword diff;
1206 add(Z(i1,j),Z(i2,j), sum);
1207 sub(Z(i1,j),Z(i2,j), diff);
1208 shift(sum, Z(i1,j));
1209 shift(diff, Z(i2,j));
1213 // Reduce to length M.
1214 for (i = 0; i < M; i++) {
1215 sub(Z(i,0),Z(i+M,R-1), z[i]);
1216 for (j = 1; j < R; j++)
1217 add(Z(i,j),Z(i+M,j-1), z[(j<<m)+i]);
1224 // Computes z[k] := sum(i+j==k mod N, x[i]*y[j])
1225 // for all k=0..N-1.
1226 static void mulu_nuss_cyclic (const uintL n, const uintL N, // N = 2^n
1227 nuss_inword * x, // N words, modified!
1228 nuss_inword * y, // N words, modified!
1229 nuss_outword * z // N words result
1233 #if 0 // always n > 0
1236 mul(x[0],y[0], z[0]);
1241 // z[0] := ((x0 + x1) (y0 + y1) + (x0 - x1) (y0 - y1)) / 2
1242 // z[1] := ((x0 + x1) (y0 + y1) - (x0 - x1) (y0 - y1)) / 2
1243 var nuss_inword x_sum;
1244 var nuss_inword y_sum;
1245 var nuss_inword x_diff;
1246 var nuss_inword y_diff;
1247 var nuss_outword first, second;
1248 add(x[0],x[1], x_sum);
1249 add(y[0],y[1], y_sum);
1250 sub(x[0],x[1], x_diff);
1251 sub(y[0],y[1], y_diff);
1252 mul(x_sum,y_sum, first);
1253 mul(x_diff,y_diff, second);
1254 add(first,second, z[0]); shift(z[0], z[0]);
1255 sub(first,second, z[1]); shift(z[1], z[1]);
1258 #if 0 // useless code because cl_nuss_threshold2 == 1
1259 if (n <= cl_nuss_threshold2) {
1260 #if 0 // straightforward, but slow
1262 for (k = 0; k < N; k++) {
1264 var nuss_outword accu;
1265 mul(x[0],y[k], accu);
1266 for (i = 1; i <= k; i++) {
1267 var nuss_outword temp;
1268 mul(x[i],y[k-i], temp);
1269 add(accu,temp, accu);
1271 for (i = k+1; i < N; i++) {
1272 var nuss_outword temp;
1273 mul(x[i],y[N-i+k], temp);
1274 add(accu,temp, accu);
1279 var const uintL M = (uintL)1 << (n-1); // M = N/2
1281 for (k = 0; k < N; k++)
1283 for (i = 0; i < M; i++) {
1285 for (j = 0; j < M; j++) {
1287 // z[i+j] += ((x[i] + x[i+M]) (y[j] + y[j+M]) + (x[i] - x[i+M]) (y[j] - y[j+M])) / 2
1288 // z[i+j+M] += ((x[i] + x[i+M]) (y[j] + y[j+M]) - (x[i] - x[i+M]) (y[j] - y[j+M])) / 2
1289 var nuss_inword x_sum;
1290 var nuss_inword y_sum;
1291 var nuss_inword x_diff;
1292 var nuss_inword y_diff;
1293 var nuss_outword first, second, temp;
1294 add(x[i],x[iM], x_sum);
1295 add(y[j],y[jM], y_sum);
1296 sub(x[i],x[iM], x_diff);
1297 sub(y[j],y[jM], y_diff);
1298 mul(x_sum,y_sum, first);
1299 mul(x_diff,y_diff, second);
1300 add(first,second, temp); add(z[i+j],temp, z[i+j]);
1301 var uintL ijM = (i+j+M) & (N-1);
1302 sub(first,second, temp); add(z[ijM],temp, z[ijM]);
1305 for (k = 0; k < N; k++)
1311 var const uintL m = n-1;
1312 var const uintL M = (uintL)1 << m; // M = 2^m = N/2
1314 // Chinese remainder theorem: u^N-1 = (u^M-1)*(u^M+1)
1315 for (i = 0; i < M; i++) {
1316 // Butterfly: replace (x(i),x(i+M))
1317 // by (x(i)+x(i+M),x(i)-x(i+M)).
1318 var nuss_inword tmp;
1319 sub(x[i],x[i+M], tmp);
1320 add(x[i],x[i+M], x[i]);
1323 if (!(x == y)) // squaring?
1324 for (i = 0; i < M; i++) {
1325 // Butterfly: replace (y(i),y(i+M))
1326 // by (y(i)+y(i+M),y(i)-y(i+M)).
1327 var nuss_inword tmp;
1328 sub(y[i],y[i+M], tmp);
1329 add(y[i],y[i+M], y[i]);
1333 mulu_nuss_cyclic(m,M, &x[0], &y[0], &z[0]);
1334 mulu_nuss_negacyclic(m,M, &x[M], &y[M], &z[M]);
1335 for (i = 0; i < M; i++) {
1336 // Inverse Butterfly: replace (z(i),z(i+M))
1337 // by ((z(i)+z(i+M))/2,(z(i)-z(i+M))/2).
1338 var nuss_outword sum;
1339 var nuss_outword diff;
1340 add(z[i],z[i+M], sum);
1341 sub(z[i],z[i+M], diff);
1343 shift(diff, z[i+M]);
1347 static void mulu_nussbaumer (const uintD* sourceptr1, uintC len1,
1348 const uintD* sourceptr2, uintC len2,
1350 // Es ist 2 <= len1 <= len2.
1353 // source1 ist ein Stück der Länge N1, source2 ein oder mehrere Stücke
1354 // der Länge N2, mit N1+N2 <= N, wobei N Zweierpotenz ist.
1355 // sum(i=0..N-1, x_i b^i) * sum(i=0..N-1, y_i b^i) wird errechnet,
1356 // indem man die beiden Polynome
1357 // sum(i=0..N-1, x_i T^i), sum(i=0..N-1, y_i T^i)
1358 // multipliziert, und zwar durch Fourier-Transformation (s.o.).
1360 integerlength32(len1-1, n=); // 2^(n-1) < len1 <= 2^n
1361 var uintL len = (uintL)1 << n; // kleinste Zweierpotenz >= len1
1362 // Wählt man N = len, so hat man ceiling(len2/(len-len1+1)) * FFT(len).
1363 // Wählt man N = 2*len, so hat man ceiling(len2/(2*len-len1+1)) * FFT(2*len).
1364 // Wir wählen das billigere von beiden:
1365 // Bei ceiling(len2/(len-len1+1)) <= 2 * ceiling(len2/(2*len-len1+1))
1366 // nimmt man N = len, bei ....... > ........ dagegen N = 2*len.
1367 // (Wahl von N = 4*len oder mehr bringt nur in Extremfällen etwas.)
1368 if (len2 > 2 * (len-len1+1) * (len2 <= (2*len-len1+1) ? 1 : ceiling(len2,(2*len-len1+1)))) {
1372 var const uintL N = len; // N = 2^n
1374 var nuss_inword* const x = cl_alloc_array(nuss_inword,N);
1375 var nuss_inword* const y = cl_alloc_array(nuss_inword,N);
1376 var nuss_outword* const z = cl_alloc_array(nuss_outword,N);
1377 var uintD* const tmpprod = cl_alloc_array(uintD,len1+1);
1379 var uintL destlen = len1+len2;
1380 clear_loop_lsp(destptr,destlen);
1382 var uintL len2p; // length of a piece of source2
1383 len2p = N - len1 + 1;
1386 // len2p = min(N-len1+1,len2).
1389 var uintD* tmpptr = arrayLSDptr(tmpprod,len1+1);
1390 mulu_loop_lsp(lspref(sourceptr2,0),sourceptr1,tmpptr,len1);
1391 if (addto_loop_lsp(tmpptr,destptr,len1+1))
1392 if (inc_loop_lsp(destptr lspop (len1+1),destlen-(len1+1)))
1395 var uintL destlenp = len1 + len2p - 1;
1396 // destlenp = min(N,destlen-1).
1397 var bool squaring = ((sourceptr1 == sourceptr2) && (len1 == len2p));
1400 for (i = 0; i < len1; i++) {
1401 x[i].iw0 = lspref(sourceptr1,i);
1404 for (i = len1; i < N; i++) {
1411 for (i = 0; i < len2p; i++) {
1412 y[i].iw0 = lspref(sourceptr2,i);
1415 for (i = len2p; i < N; i++) {
1422 mulu_nuss_cyclic(n,N, &x[0], &y[0], &z[0]);
1424 mulu_nuss_cyclic(n,N, &x[0], &x[0], &z[0]);
1427 for (i = 0; i < N; i++)
1428 if (!(z[i].ow3 == 0))
1431 // Add result to destptr[-destlen..-1]:
1433 var uintD* ptr = destptr;
1434 // ac2|ac1|ac0 are an accumulator.
1439 for (i = 0; i < destlenp; i++) {
1440 // Add z[i] to the accumulator.
1442 if ((ac0 += tmp) < tmp) {
1447 if ((ac1 += tmp) < tmp)
1451 // Add the accumulator's least significant word to destptr:
1452 tmp = lspref(ptr,0);
1453 if ((ac0 += tmp) < tmp) {
1457 lspref(ptr,0) = ac0;
1465 if (!((i += 2) <= destlen))
1467 tmp = lspref(ptr,0);
1468 if ((ac0 += tmp) < tmp)
1470 lspref(ptr,0) = ac0;
1472 tmp = lspref(ptr,0);
1474 lspref(ptr,0) = ac1;
1477 if (inc_loop_lsp(ptr,destlen-i))
1479 } else if (ac0 > 0) {
1480 if (!((i += 1) <= destlen))
1482 tmp = lspref(ptr,0);
1484 lspref(ptr,0) = ac0;
1487 if (inc_loop_lsp(ptr,destlen-i))
1492 // If destlenp < N, check that the remaining z[i] are 0.
1493 for (i = destlenp; i < N; i++)
1494 if (z[i].ow2 > 0 || z[i].ow1 > 0 || z[i].ow0 > 0)
1499 destptr = destptr lspop len2p;
1501 sourceptr2 = sourceptr2 lspop len2p;