]> www.ginac.de Git - cln.git/blob - cl_DS_mul_nuss.h
33b06a1004c8f5e2d42c3f53023a3311b5b0bc60
[cln.git] / cl_DS_mul_nuss.h
1 // Fast integer multiplication using Nussbaumer's FFT based algorithm.
2 // [Donald Ervin Knuth: The Art of Computer Programming, Vol. II:
3 //  Seminumerical Algorithms, second edition.
4 //  Section 4.6.4, exercise 59, p. 503, 652-654.]
5 // [Henri Jean Nussbaumer, IEEE Trans. ASSP-28 (1980), 205-215.]
6 // Bruno Haible 4.-5.5.1996
7
8 // This algorithm has the benefit of working on entire words, not single bits,
9 // and involving no non-integer numbers. (The root of unity is chosen in
10 // an appropriate polynomial ring.)
11
12 // If at the beginning all words x_i, y_i are >= 0 and < M, then
13 // the intermediate X_{i,j}, Y_{i,j} are < M * N in absolute value
14 // (where N = number of words), hence the |Z_{i,j}| < M^2 * N^2.
15 // We therefore reserve 2 32-bit words for every X_{i,j} and 4 32-bit words
16 // for every Z_{i,j}.
17
18 #if !(intDsize==32)
19 #error "nussbaumer implemented only for intDsize==32"
20 #endif
21
22 // Define this if you want the external loops instead of inline operations.
23 //#define NUSS_IN_EXTERNAL_LOOPS
24 #define NUSS_OUT_EXTERNAL_LOOPS
25
26 // Define this if you want inline operations which access the stack directly.
27 // This looks like better code, but is in effect 3% slower. No idea why.
28 //#define NUSS_ASM_DIRECT
29
30 // Define this for (cheap) consistency checks.
31 //#define DEBUG_NUSS
32
33 // Define this for extensive consistency checks.
34 //#define DEBUG_NUSS_OPERATIONS
35
36 #if (intDsize==32)
37
38 //typedef struct { sint32 iw1; uint32 iw0; } nuss_inword;
39 //typedef struct { uint32 iw0; sint32 iw1; } nuss_inword;
40 typedef struct { uintD _iw[2]; } nuss_inword;
41 #if CL_DS_BIG_ENDIAN_P
42   #define iw1 _iw[0]
43   #define iw0 _iw[1]
44 #else
45   #define iw0 _iw[0]
46   #define iw1 _iw[1]
47 #endif
48
49 //typedef struct { sint32 ow3; uint32 ow2; uint32 ow1; uint32 ow0; } nuss_outword;
50 //typedef struct { uint32 ow0; uint32 ow1; uint32 ow2; sint32 ow3; } nuss_outword;
51 typedef struct { uintD _ow[4]; } nuss_outword;
52 #if CL_DS_BIG_ENDIAN_P
53   #define ow3 _ow[0]
54   #define ow2 _ow[1]
55   #define ow1 _ow[2]
56   #define ow0 _ow[3]
57 #else
58   #define ow0 _ow[0]
59   #define ow1 _ow[1]
60   #define ow2 _ow[2]
61   #define ow3 _ow[3]
62 #endif
63
64 // r := a + b
65 static inline void add (const nuss_inword& a, const nuss_inword& b, nuss_inword& r)
66 {
67 #if defined(__GNUC__) && defined(__i386__)
68         var uintD dummy;
69   #ifdef NUSS_ASM_DIRECT
70         __asm__ __volatile__ (
71                 "movl %1,%0" "\n\t"
72                 "addl %2,%0" "\n\t"
73                 "movl %0,%3"
74                 : "=&q" (dummy)
75                 : "m" (a.iw0), "m" (b.iw0), "m" (r.iw0)
76                 : "cc"
77                 );
78         __asm__ __volatile__ (
79                 "movl %1,%0" "\n\t"
80                 "adcl %2,%0" "\n\t"
81                 "movl %0,%3"
82                 : "=&q" (dummy)
83                 : "m" (a.iw1), "m" (b.iw1), "m" (r.iw1)
84                 : "cc"
85                 );
86   #else
87     #if CL_DS_BIG_ENDIAN_P
88         __asm__ __volatile__ (
89                 "movl 4(%1),%0" "\n\t"
90                 "addl 4(%2),%0" "\n\t"
91                 "movl %0,4(%3)" "\n\t"
92                 "movl (%1),%0"  "\n\t"
93                 "adcl (%2),%0"  "\n\t"
94                 "movl %0,(%3)"
95                 : "=&q" (dummy)
96                 : "r" (&a), "r" (&b), "r" (&r)
97                 : "cc"
98                 );
99     #else
100         __asm__ __volatile__ (
101                 "movl (%1),%0"  "\n\t"
102                 "addl (%2),%0"  "\n\t"
103                 "movl %0,(%3)"  "\n\t"
104                 "movl 4(%1),%0" "\n\t"
105                 "adcl 4(%2),%0" "\n\t"
106                 "movl %0,4(%3)"
107                 : "=&q" (dummy)
108                 : "r" (&a), "r" (&b), "r" (&r)
109                 : "cc"
110                 );
111     #endif
112   #endif
113 #elif defined(NUSS_IN_EXTERNAL_LOOPS)
114         add_loop_lsp(arrayLSDptr(a._iw,2),arrayLSDptr(b._iw,2),arrayLSDptr(r._iw,2),2);
115 #else
116         var uint32 tmp;
117
118         tmp = a.iw0 + b.iw0;
119         if (tmp >= a.iw0) {
120                 // no carry
121                 r.iw0 = tmp;
122                 r.iw1 = a.iw1 + b.iw1;
123         } else {
124                 // carry
125                 r.iw0 = tmp;
126                 r.iw1 = a.iw1 + b.iw1 + 1;
127         }
128 #endif
129 }
130
131 // r := a - b
132 static inline void sub (const nuss_inword& a, const nuss_inword& b, nuss_inword& r)
133 {
134 #if defined(__GNUC__) && defined(__i386__)
135         var uintD dummy;
136   #ifdef NUSS_ASM_DIRECT
137         __asm__ __volatile__ (
138                 "movl %1,%0" "\n\t"
139                 "subl %2,%0" "\n\t"
140                 "movl %0,%3"
141                 : "=&q" (dummy)
142                 : "m" (a.iw0), "m" (b.iw0), "m" (r.iw0)
143                 : "cc"
144                 );
145         __asm__ __volatile__ (
146                 "movl %1,%0" "\n\t"
147                 "sbbl %2,%0" "\n\t"
148                 "movl %0,%3"
149                 : "=&q" (dummy)
150                 : "m" (a.iw1), "m" (b.iw1), "m" (r.iw1)
151                 : "cc"
152                 );
153   #else
154     #if CL_DS_BIG_ENDIAN_P
155         __asm__ __volatile__ (
156                 "movl 4(%1),%0" "\n\t"
157                 "subl 4(%2),%0" "\n\t"
158                 "movl %0,4(%3)" "\n\t"
159                 "movl (%1),%0"  "\n\t"
160                 "sbbl (%2),%0"  "\n\t"
161                 "movl %0,(%3)"
162                 : "=&q" (dummy)
163                 : "r" (&a), "r" (&b), "r" (&r)
164                 : "cc"
165                 );
166     #else
167         __asm__ __volatile__ (
168                 "movl (%1),%0"  "\n\t"
169                 "subl (%2),%0"  "\n\t"
170                 "movl %0,(%3)"  "\n\t"
171                 "movl 4(%1),%0" "\n\t"
172                 "sbbl 4(%2),%0" "\n\t"
173                 "movl %0,4(%3)"
174                 : "=&q" (dummy)
175                 : "r" (&a), "r" (&b), "r" (&r)
176                 : "cc"
177                 );
178     #endif
179   #endif
180 #elif defined(NUSS_IN_EXTERNAL_LOOPS)
181         sub_loop_lsp(arrayLSDptr(a._iw,2),arrayLSDptr(b._iw,2),arrayLSDptr(r._iw,2),2);
182 #else
183         var uint32 tmp;
184
185         tmp = a.iw0 - b.iw0;
186         if (tmp <= a.iw0) {
187                 // no carry
188                 r.iw0 = tmp;
189                 r.iw1 = a.iw1 - b.iw1;
190         } else {
191                 // carry
192                 r.iw0 = tmp;
193                 r.iw1 = a.iw1 - b.iw1 - 1;
194         }
195 #endif
196 }
197
198 // r := a * b
199 static void mul (const nuss_inword& a, const nuss_inword& b, nuss_outword& r)
200 {
201 #ifdef NUSS_IN_EXTERNAL_LOOPS
202         mulu_2loop(arrayLSDptr(a._iw,2),2, arrayLSDptr(b._iw,2),2, arrayLSDptr(r._ow,4));
203         if ((sintD)mspref(arrayMSDptr(a._iw,2),0) < 0)
204                 subfrom_loop_lsp(arrayLSDptr(b._iw,2),arrayLSDptr(r._ow,4) lspop 2,2);
205         if ((sintD)mspref(arrayMSDptr(b._iw,2),0) < 0)
206                 subfrom_loop_lsp(arrayLSDptr(a._iw,2),arrayLSDptr(r._ow,4) lspop 2,2);
207 #else
208         if (a.iw1 == 0) {
209                 // a small positive
210                 if (b.iw1 == 0) {
211                         // a, b small positive
212                         mulu32(a.iw0, b.iw0, r.ow1 =, r.ow0 =);
213                         r.ow3 = 0; r.ow2 = 0;
214                         return;
215                 }
216                 else if (b.iw1 == -(uint32)1 && b.iw0 != 0) {
217                         // b small negative
218                         var uint32 hi, lo;
219                         mulu32(a.iw0, -b.iw0, hi=, lo=);
220                         r.ow0 = -lo;
221                         if (lo) {
222                                 r.ow1 = ~hi;
223                         } else if (hi) {
224                                 r.ow1 = -hi;
225                         } else /* a.iw0 == 0 */ {
226                                 r.ow3 = 0; r.ow2 = 0; r.ow1 = 0;
227                                 return;
228                         }
229                         r.ow3 = -(uint32)1; r.ow2 = -(uint32)1;
230                         return;
231                 }
232                 var uint32 hi1, lo1, hi0;
233                 mulu32(a.iw0, b.iw0, hi0 =, r.ow0 =);
234                 mulu32(a.iw0, b.iw1, hi1 =, lo1 =);
235                 if ((lo1 += hi0) < hi0)
236                         hi1++;
237                 // hi1|lo1|r.ow0 = a.iw0 * b(unsigned).
238                 r.ow1 = lo1;
239                 if ((sint32)b.iw1 >= 0) {
240                         r.ow2 = hi1;
241                         r.ow3 = 0;
242                 } else {
243                         // b was negative -> subtract a * 2^64
244                         if (a.iw0) {
245                                 r.ow2 = hi1 - a.iw0;
246                                 r.ow3 = -(uint32)1;
247                         } else /* a.iw0 == 0 */ {
248                                 r.ow3 = 0; r.ow2 = 0;
249                         }
250                 }
251                 return;
252         }
253         else if (a.iw1 == -(uint32)1 && a.iw0 != 0) {
254                 // a small negative
255                 if (b.iw1 == 0) {
256                         // b small positive
257                         var uint32 hi, lo;
258                         mulu32(-a.iw0, b.iw0, hi=, lo=);
259                         r.ow0 = -lo;
260                         if (lo) {
261                                 r.ow1 = ~hi;
262                         } else if (hi) {
263                                 r.ow1 = -hi;
264                         } else /* b.iw0 == 0 */ {
265                                 r.ow3 = 0; r.ow2 = 0; r.ow1 = 0;
266                                 return;
267                         }
268                         r.ow3 = -(uint32)1; r.ow2 = -(uint32)1;
269                         return;
270                 }
271                 else if (b.iw1 == -(uint32)1 && b.iw0 != 0) {
272                         // a, b small negative
273                         mulu32(-a.iw0, -b.iw0, r.ow1 =, r.ow0 =);
274                         r.ow3 = 0; r.ow2 = 0;
275                         return;
276                 }
277                 var uint32 hi1, lo1, hi0, lo0;
278                 mulu32(-a.iw0, b.iw0, hi0 =, lo0 =);
279                 mulu32(-a.iw0, b.iw1, hi1 =, lo1 =);
280                 if ((lo1 += hi0) < hi0)
281                         hi1++;
282                 // hi1|lo1|lo0 = -a * b(unsigned).
283                 if (lo0) {
284                         lo0 = -lo0;
285                         lo1 = ~lo1;
286                         hi1 = ~hi1;
287                 } else if (lo1) {
288                         lo1 = -lo1;
289                         hi1 = ~hi1;
290                 } else
291                         hi1 = -hi1;
292                 // hi1|lo1|lo0 = a * b(unsigned).
293                 r.ow0 = lo0;
294                 r.ow1 = lo1;
295                 if ((sint32)b.iw1 >= 0) {
296                         r.ow2 = hi1;
297                         r.ow3 = -(uint32)1;
298                 } else {
299                         // b was negative -> subtract a * 2^64
300                         r.ow2 = hi1 - a.iw0;
301                         r.ow3 = 0;
302                 }
303                 return;
304         }
305         else if (b.iw1 == 0) {
306                 // b small positive
307                 var uint32 hi1, lo1, hi0;
308                 mulu32(b.iw0, a.iw0, hi0 =, r.ow0 =);
309                 mulu32(b.iw0, a.iw1, hi1 =, lo1 =);
310                 if ((lo1 += hi0) < hi0)
311                         hi1++;
312                 // hi1|lo1|r.ow0 = a(unsigned) * b.iw0.
313                 r.ow1 = lo1;
314                 if ((sint32)a.iw1 >= 0) {
315                         r.ow2 = hi1;
316                         r.ow3 = 0;
317                 } else {
318                         // a was negative -> subtract b * 2^64
319                         if (b.iw0) {
320                                 r.ow2 = hi1 - b.iw0;
321                                 r.ow3 = -(uint32)1;
322                         } else /* b.iw0 == 0 */ {
323                                 r.ow3 = 0; r.ow2 = 0;
324                         }
325                 }
326                 return;
327         }
328         else if (b.iw1 == -(uint32)1 && b.iw0 != 0) {
329                 // b small negative
330                 var uint32 hi1, lo1, hi0, lo0;
331                 mulu32(-b.iw0, a.iw0, hi0 =, lo0 =);
332                 mulu32(-b.iw0, a.iw1, hi1 =, lo1 =);
333                 if ((lo1 += hi0) < hi0)
334                         hi1++;
335                 // hi1|lo1|lo0 = a(unsigned) * -b.
336                 if (lo0) {
337                         lo0 = -lo0;
338                         lo1 = ~lo1;
339                         hi1 = ~hi1;
340                 } else if (lo1) {
341                         lo1 = -lo1;
342                         hi1 = ~hi1;
343                 } else
344                         hi1 = -hi1;
345                 // hi1|lo1|lo0 = a(unsigned) * b.
346                 r.ow0 = lo0;
347                 r.ow1 = lo1;
348                 if ((sint32)a.iw1 >= 0) {
349                         r.ow2 = hi1;
350                         r.ow3 = -(uint32)1;
351                 } else {
352                         // a was negative -> subtract b * 2^64
353                         r.ow2 = hi1 - b.iw0;
354                         r.ow3 = 0;
355                 }
356                 return;
357         }
358         // This is the main and most frequent case (65% to 80%).
359         var uint32 w3, w2, w1, hi, lo;
360         mulu32(a.iw0, b.iw0, w1=, r.ow0=);
361         mulu32(a.iw1, b.iw1, w3=, w2=);
362         mulu32(a.iw0, b.iw1, hi=, lo=);
363         if ((w1 += lo) < lo)
364                 hi++;
365         if ((w2 += hi) < hi)
366                 w3++;
367         mulu32(a.iw1, b.iw0, hi=, lo=);
368         if ((w1 += lo) < lo)
369                 hi++;
370         if ((w2 += hi) < hi)
371                 w3++;
372         // w3|w2|w1|r.ow0 = a(unsigned) * b(unsigned).
373         r.ow1 = w1;
374         if ((sint32)a.iw1 < 0) {
375                 // a was negative -> subtract b * 2^64
376                 if (w2 >= b.iw0) {
377                         w2 -= b.iw0;
378                         w3 -= b.iw1;
379                 } else {
380                         // carry
381                         w2 -= b.iw0;
382                         w3 = w3 - b.iw1 - 1;
383                 }
384         }
385         if ((sint32)b.iw1 < 0) {
386                 // b was negative -> subtract a * 2^64
387                 if (w2 >= a.iw0) {
388                         w2 -= a.iw0;
389                         w3 -= a.iw1;
390                 } else {
391                         // carry
392                         w2 -= a.iw0;
393                         w3 = w3 - a.iw1 - 1;
394                 }
395         }
396         r.ow2 = w2;
397         r.ow3 = w3;
398         return;
399 #endif
400 }
401 #ifdef DEBUG_NUSS_OPERATIONS
402 static void mul_doublecheck (const nuss_inword& a, const nuss_inword& b, nuss_outword& r)
403 {
404         nuss_outword or;
405         mulu_2loop(arrayLSDptr(a._iw,2),2, arrayLSDptr(b._iw,2),2, arrayLSDptr(or._ow,4));
406         if ((sintD)mspref(arrayMSDptr(a._iw,2),0) < 0)
407                 subfrom_loop_lsp(arrayLSDptr(b._iw,2),arrayLSDptr(or._ow,4) lspop 2,2);
408         if ((sintD)mspref(arrayMSDptr(b._iw,2),0) < 0)
409                 subfrom_loop_lsp(arrayLSDptr(a._iw,2),arrayLSDptr(or._ow,4) lspop 2,2);
410         mul(a,b, r);
411         if (compare_loop_msp(arrayMSDptr(r._ow,4),arrayMSDptr(or._ow,4),4))
412                 cl_abort();
413 }
414 #define mul mul_doublecheck
415 #endif
416
417 // r := 0
418 static inline void zero (nuss_outword& r)
419 {
420         r.ow0 = 0;
421         r.ow1 = 0;
422         r.ow2 = 0;
423         r.ow3 = 0;
424 }
425
426 // r := a + b
427 static inline void add (const nuss_outword& a, const nuss_outword& b, nuss_outword& r)
428 {
429 #if defined(__GNUC__) && defined(__i386__)
430         var uintD dummy;
431   #ifdef NUSS_ASM_DIRECT
432         __asm__ __volatile__ (
433                 "movl %1,%0" "\n\t"
434                 "addl %2,%0" "\n\t"
435                 "movl %0,%3"
436                 : "=&q" (dummy)
437                 : "m" (a.ow0), "m" (b.ow0), "m" (r.ow0)
438                 : "cc"
439                 );
440         __asm__ __volatile__ (
441                 "movl %1,%0" "\n\t"
442                 "adcl %2,%0" "\n\t"
443                 "movl %0,%3"
444                 : "=&q" (dummy)
445                 : "m" (a.ow1), "m" (b.ow1), "m" (r.ow1)
446                 : "cc"
447                 );
448         __asm__ __volatile__ (
449                 "movl %1,%0" "\n\t"
450                 "adcl %2,%0" "\n\t"
451                 "movl %0,%3"
452                 : "=&q" (dummy)
453                 : "m" (a.ow2), "m" (b.ow2), "m" (r.ow2)
454                 : "cc"
455                 );
456         __asm__ __volatile__ (
457                 "movl %1,%0" "\n\t"
458                 "adcl %2,%0" "\n\t"
459                 "movl %0,%3"
460                 : "=&q" (dummy)
461                 : "m" (a.ow3), "m" (b.ow3), "m" (r.ow3)
462                 : "cc"
463                 );
464   #else
465     #if CL_DS_BIG_ENDIAN_P
466         __asm__ __volatile__ (
467                 "movl 12(%1),%0" "\n\t"
468                 "addl 12(%2),%0" "\n\t"
469                 "movl %0,12(%3)" "\n\t"
470                 "movl 8(%1),%0"  "\n\t"
471                 "adcl 8(%2),%0"  "\n\t"
472                 "movl %0,8(%3)"  "\n\t"
473                 "movl 4(%1),%0"  "\n\t"
474                 "adcl 4(%2),%0"  "\n\t"
475                 "movl %0,4(%3)"  "\n\t"
476                 "movl (%1),%0"   "\n\t"
477                 "adcl (%2),%0"   "\n\t"
478                 "movl %0,(%3)"
479                 : "=&q" (dummy)
480                 : "r" (&a), "r" (&b), "r" (&r)
481                 : "cc"
482                 );
483     #else
484         __asm__ __volatile__ (
485                 "movl (%1),%0"   "\n\t"
486                 "addl (%2),%0"   "\n\t"
487                 "movl %0,(%3)"   "\n\t"
488                 "movl 4(%1),%0"  "\n\t"
489                 "adcl 4(%2),%0"  "\n\t"
490                 "movl %0,4(%3)"  "\n\t"
491                 "movl 8(%1),%0"  "\n\t"
492                 "adcl 8(%2),%0"  "\n\t"
493                 "movl %0,8(%3)"  "\n\t"
494                 "movl 12(%1),%0" "\n\t"
495                 "adcl 12(%2),%0" "\n\t"
496                 "movl %0,12(%3)"
497                 : "=&q" (dummy)
498                 : "r" (&a), "r" (&b), "r" (&r)
499                 : "cc"
500                 );
501     #endif
502   #endif
503 #elif defined(NUSS_OUT_EXTERNAL_LOOPS)
504         add_loop_lsp(arrayLSDptr(a._ow,4),arrayLSDptr(b._ow,4),arrayLSDptr(r._ow,4),4);
505 #else
506         var uint32 tmp;
507
508         tmp = a.ow0 + b.ow0;
509         if (tmp >= a.ow0) {
510                 // no carry
511                 r.ow0 = tmp;
512                 tmp = a.ow1 + b.ow1;
513                 if (tmp >= a.ow1) goto no_carry_1; else goto carry_1;
514         } else {
515                 // carry
516                 r.ow0 = tmp;
517                 tmp = a.ow1 + b.ow1 + 1;
518                 if (tmp > a.ow1) goto no_carry_1; else goto carry_1;
519         }
520         if (1) {
521                 no_carry_1: // no carry
522                 r.ow1 = tmp;
523                 tmp = a.ow2 + b.ow2;
524                 if (tmp >= a.ow2) goto no_carry_2; else goto carry_2;
525         } else {
526                 carry_1: // carry
527                 r.ow1 = tmp;
528                 tmp = a.ow2 + b.ow2 + 1;
529                 if (tmp > a.ow2) goto no_carry_2; else goto carry_2;
530         }
531         if (1) {
532                 no_carry_2: // no carry
533                 r.ow2 = tmp;
534                 tmp = a.ow3 + b.ow3;
535         } else {
536                 carry_2: // carry
537                 r.ow2 = tmp;
538                 tmp = a.ow3 + b.ow3 + 1;
539         }
540         r.ow3 = tmp;
541 #endif
542 }
543
544 // r := a - b
545 static inline void sub (const nuss_outword& a, const nuss_outword& b, nuss_outword& r)
546 {
547 #if defined(__GNUC__) && defined(__i386__)
548         var uintD dummy;
549   #ifdef NUSS_ASM_DIRECT
550         __asm__ __volatile__ (
551                 "movl %1,%0" "\n\t"
552                 "subl %2,%0" "\n\t"
553                 "movl %0,%3"
554                 : "=&q" (dummy)
555                 : "m" (a.ow0), "m" (b.ow0), "m" (r.ow0)
556                 : "cc"
557                 );
558         __asm__ __volatile__ (
559                 "movl %1,%0" "\n\t"
560                 "sbbl %2,%0" "\n\t"
561                 "movl %0,%3"
562                 : "=&q" (dummy)
563                 : "m" (a.ow1), "m" (b.ow1), "m" (r.ow1)
564                 : "cc"
565                 );
566         __asm__ __volatile__ (
567                 "movl %1,%0" "\n\t"
568                 "sbbl %2,%0" "\n\t"
569                 "movl %0,%3"
570                 : "=&q" (dummy)
571                 : "m" (a.ow2), "m" (b.ow2), "m" (r.ow2)
572                 : "cc"
573                 );
574         __asm__ __volatile__ (
575                 "movl %1,%0" "\n\t"
576                 "sbbl %2,%0" "\n\t"
577                 "movl %0,%3"
578                 : "=&q" (dummy)
579                 : "m" (a.ow3), "m" (b.ow3), "m" (r.ow3)
580                 : "cc"
581                 );
582   #else
583     #if CL_DS_BIG_ENDIAN_P
584         __asm__ __volatile__ (
585                 "movl 12(%1),%0" "\n\t"
586                 "subl 12(%2),%0" "\n\t"
587                 "movl %0,12(%3)" "\n\t"
588                 "movl 8(%1),%0"  "\n\t"
589                 "sbbl 8(%2),%0"  "\n\t"
590                 "movl %0,8(%3)"  "\n\t"
591                 "movl 4(%1),%0"  "\n\t"
592                 "sbbl 4(%2),%0"  "\n\t"
593                 "movl %0,4(%3)"  "\n\t"
594                 "movl (%1),%0"   "\n\t"
595                 "sbbl (%2),%0"   "\n\t"
596                 "movl %0,(%3)"
597                 : "=&q" (dummy)
598                 : "r" (&a), "r" (&b), "r" (&r)
599                 : "cc"
600                 );
601     #else
602         __asm__ __volatile__ (
603                 "movl (%1),%0"   "\n\t"
604                 "subl (%2),%0"   "\n\t"
605                 "movl %0,(%3)"   "\n\t"
606                 "movl 4(%1),%0"  "\n\t"
607                 "sbbl 4(%2),%0"  "\n\t"
608                 "movl %0,4(%3)"  "\n\t"
609                 "movl 8(%1),%0"  "\n\t"
610                 "sbbl 8(%2),%0"  "\n\t"
611                 "movl %0,8(%3)"  "\n\t"
612                 "movl 12(%1),%0" "\n\t"
613                 "sbbl 12(%2),%0" "\n\t"
614                 "movl %0,12(%3)"
615                 : "=&q" (dummy)
616                 : "r" (&a), "r" (&b), "r" (&r)
617                 : "cc"
618                 );
619     #endif
620   #endif
621 #elif defined(NUSS_OUT_EXTERNAL_LOOPS)
622         sub_loop_lsp(arrayLSDptr(a._ow,4),arrayLSDptr(b._ow,4),arrayLSDptr(r._ow,4),4);
623 #else
624         var uint32 tmp;
625
626         tmp = a.ow0 - b.ow0;
627         if (tmp <= a.ow0) {
628                 // no carry
629                 r.ow0 = tmp;
630                 tmp = a.ow1 - b.ow1;
631                 if (tmp <= a.ow1) goto no_carry_1; else goto carry_1;
632         } else {
633                 // carry
634                 r.ow0 = tmp;
635                 tmp = a.ow1 - b.ow1 - 1;
636                 if (tmp < a.ow1) goto no_carry_1; else goto carry_1;
637         }
638         if (1) {
639                 no_carry_1: // no carry
640                 r.ow1 = tmp;
641                 tmp = a.ow2 - b.ow2;
642                 if (tmp <= a.ow2) goto no_carry_2; else goto carry_2;
643         } else {
644                 carry_1: // carry
645                 r.ow1 = tmp;
646                 tmp = a.ow2 - b.ow2 - 1;
647                 if (tmp < a.ow2) goto no_carry_2; else goto carry_2;
648         }
649         if (1) {
650                 no_carry_2: // no carry
651                 r.ow2 = tmp;
652                 tmp = a.ow3 - b.ow3;
653         } else {
654                 carry_2: // carry
655                 r.ow2 = tmp;
656                 tmp = a.ow3 - b.ow3 - 1;
657         }
658         r.ow3 = tmp;
659 #endif
660 }
661
662 // b := a >> 1
663 static inline void shift (const nuss_outword& a, nuss_outword& b)
664 {
665 #if defined(__GNUC__) && defined(__i386__) && !defined(DEBUG_NUSS)
666         var uintD dummy;
667   #ifdef NUSS_ASM_DIRECT
668         __asm__ __volatile__ (
669                 "movl %1,%0" "\n\t"
670                 "sarl $1,%0" "\n\t"
671                 "movl %0,%2"
672                 : "=&q" (dummy)
673                 : "m" (a.ow3), "m" (b.ow3)
674                 : "cc"
675                 );
676         __asm__ __volatile__ (
677                 "movl %1,%0" "\n\t"
678                 "rcrl $1,%0" "\n\t"
679                 "movl %0,%2"
680                 : "=&q" (dummy)
681                 : "m" (a.ow2), "m" (b.ow2)
682                 : "cc"
683                 );
684         __asm__ __volatile__ (
685                 "movl %1,%0" "\n\t"
686                 "rcrl $1,%0" "\n\t"
687                 "movl %0,%2"
688                 : "=&q" (dummy)
689                 : "m" (a.ow1), "m" (b.ow1)
690                 : "cc"
691                 );
692         __asm__ __volatile__ (
693                 "movl %1,%0" "\n\t"
694                 "rcrl $1,%0" "\n\t"
695                 "movl %0,%2"
696                 : "=&q" (dummy)
697                 : "m" (a.ow0), "m" (b.ow0)
698                 : "cc"
699                 );
700   #else
701     #if CL_DS_BIG_ENDIAN_P
702         __asm__ __volatile__ (
703                 "movl (%1),%0"   "\n\t"
704                 "sarl $1,%0"     "\n\t"
705                 "movl %0,(%2)"   "\n\t"
706                 "movl 4(%1),%0"  "\n\t"
707                 "rcrl $1,%0"     "\n\t"
708                 "movl %0,4(%2)"  "\n\t"
709                 "movl 8(%1),%0"  "\n\t"
710                 "rcrl $1,%0"     "\n\t"
711                 "movl %0,8(%2)"  "\n\t"
712                 "movl 12(%1),%0" "\n\t"
713                 "rcrl $1,%0"     "\n\t"
714                 "movl %0,12(%2)"
715                 : "=&q" (dummy)
716                 : "r" (&a), "r" (&b)
717                 : "cc"
718                 );
719     #else
720         __asm__ __volatile__ (
721                 "movl 12(%1),%0" "\n\t"
722                 "sarl $1,%0"     "\n\t"
723                 "movl %0,12(%2)" "\n\t"
724                 "movl 8(%1),%0"  "\n\t"
725                 "rcrl $1,%0"     "\n\t"
726                 "movl %0,8(%2)"  "\n\t"
727                 "movl 4(%1),%0"  "\n\t"
728                 "rcrl $1,%0"     "\n\t"
729                 "movl %0,4(%2)"  "\n\t"
730                 "movl (%1),%0"   "\n\t"
731                 "rcrl $1,%0"     "\n\t"
732                 "movl %0,(%2)"
733                 : "=&q" (dummy)
734                 : "r" (&a), "r" (&b)
735                 : "cc"
736                 );
737     #endif
738   #endif
739 #elif defined(NUSS_OUT_EXTERNAL_LOOPS)
740         #ifdef DEBUG_NUSS
741         if (shiftrightcopy_loop_msp(arrayMSDptr(a._ow,4),arrayMSDptr(b._ow,4),4,1,mspref(arrayMSDptr(a._ow,4),0)>>31))
742                 cl_abort();
743         #else
744         shiftrightcopy_loop_msp(arrayMSDptr(a._ow,4),arrayMSDptr(b._ow,4),4,1,mspref(arrayMSDptr(a._ow,4),0)>>31);
745         #endif
746 #else
747         var uint32 tmp, carry;
748
749         tmp = a.ow3;
750         b.ow3 = (sint32)tmp >> 1;
751         carry = tmp << 31;
752         tmp = a.ow2;
753         b.ow2 = (tmp >> 1) | carry;
754         carry = tmp << 31;
755         tmp = a.ow1;
756         b.ow1 = (tmp >> 1) | carry;
757         carry = tmp << 31;
758         tmp = a.ow0;
759         b.ow0 = (tmp >> 1) | carry;
760         #ifdef DEBUG_NUSS
761         carry = tmp << 31;
762         if (carry)
763                 cl_abort();
764         #endif
765 #endif
766 }
767
768 #endif // (intDsize==32)
769
770 #if (intDsize==64)
771
772 //typedef struct { sint64 iw1; uint64 iw0; } nuss_inword;
773 //typedef struct { uint64 iw0; sint64 iw1; } nuss_inword;
774 typedef struct { uintD _iw[2]; } nuss_inword;
775 #if CL_DS_BIG_ENDIAN_P
776   #define iw1 _iw[0]
777   #define iw0 _iw[1]
778 #else
779   #define iw0 _iw[0]
780   #define iw1 _iw[1]
781 #endif
782
783 //typedef struct { sint64 ow2; uint64 ow1; uint64 ow0; } nuss_outword;
784 //typedef struct { uint64 ow0; uint64 ow1; sint64 ow2; } nuss_outword;
785 typedef struct { uintD _ow[3]; } nuss_outword;
786 #if CL_DS_BIG_ENDIAN_P
787   #define ow2 _ow[0]
788   #define ow1 _ow[1]
789   #define ow0 _ow[2]
790 #else
791   #define ow0 _ow[0]
792   #define ow1 _ow[1]
793   #define ow2 _ow[2]
794 #endif
795
796 // r := a + b
797 static inline void add (const nuss_inword& a, const nuss_inword& b, nuss_inword& r)
798 {
799 #ifdef NUSS_IN_EXTERNAL_LOOPS
800         add_loop_lsp(arrayLSDptr(a._iw,2),arrayLSDptr(b._iw,2),arrayLSDptr(r._iw,2),2);
801 #else
802         var uint64 tmp;
803
804         tmp = a.iw0 + b.iw0;
805         if (tmp >= a.iw0) {
806                 // no carry
807                 r.iw0 = tmp;
808                 r.iw1 = a.iw1 + b.iw1;
809         } else {
810                 // carry
811                 r.iw0 = tmp;
812                 r.iw1 = a.iw1 + b.iw1 + 1;
813         }
814 #endif
815 }
816
817 // r := a - b
818 static inline void sub (const nuss_inword& a, const nuss_inword& b, nuss_inword& r)
819 {
820 #ifdef NUSS_IN_EXTERNAL_LOOPS
821         sub_loop_lsp(arrayLSDptr(a._iw,2),arrayLSDptr(b._iw,2),arrayLSDptr(r._iw,2),2);
822 #else
823         var uint64 tmp;
824
825         tmp = a.iw0 - b.iw0;
826         if (tmp <= a.iw0) {
827                 // no carry
828                 r.iw0 = tmp;
829                 r.iw1 = a.iw1 - b.iw1;
830         } else {
831                 // carry
832                 r.iw0 = tmp;
833                 r.iw1 = a.iw1 - b.iw1 - 1;
834         }
835 #endif
836 }
837
838 // r := 0
839 static inline void zero (nuss_outword& r)
840 {
841         r.ow0 = 0;
842         r.ow1 = 0;
843         r.ow2 = 0;
844 }
845
846 // r := a + b
847 static inline void add (const nuss_outword& a, const nuss_outword& b, nuss_outword& r)
848 {
849 #ifdef NUSS_OUT_EXTERNAL_LOOPS
850         add_loop_lsp(arrayLSDptr(a._ow,3),arrayLSDptr(b._ow,3),arrayLSDptr(r._ow,3),3);
851 #else
852         var uint64 tmp;
853
854         tmp = a.ow0 + b.ow0;
855         if (tmp >= a.ow0) {
856                 // no carry
857                 r.ow0 = tmp;
858                 tmp = a.ow1 + b.ow1;
859                 if (tmp >= a.ow1) goto no_carry_1; else goto carry_1;
860         } else {
861                 // carry
862                 r.ow0 = tmp;
863                 tmp = a.ow1 + b.ow1 + 1;
864                 if (tmp > a.ow1) goto no_carry_1; else goto carry_1;
865         }
866         if (1) {
867                 no_carry_1: // no carry
868                 r.ow1 = tmp;
869                 tmp = a.ow2 + b.ow2;
870         } else {
871                 carry_1: // carry
872                 r.ow1 = tmp;
873                 tmp = a.ow2 + b.ow2 + 1;
874         }
875         r.ow2 = tmp;
876 #endif
877 }
878
879 // r := a - b
880 static inline void sub (const nuss_outword& a, const nuss_outword& b, nuss_outword& r)
881 {
882 #ifdef NUSS_OUT_EXTERNAL_LOOPS
883         sub_loop_lsp(arrayLSDptr(a._ow,3),arrayLSDptr(b._ow,3),arrayLSDptr(r._ow,3),3);
884 #else
885         var uint64 tmp;
886
887         tmp = a.ow0 - b.ow0;
888         if (tmp <= a.ow0) {
889                 // no carry
890                 r.ow0 = tmp;
891                 tmp = a.ow1 - b.ow1;
892                 if (tmp <= a.ow1) goto no_carry_1; else goto carry_1;
893         } else {
894                 // carry
895                 r.ow0 = tmp;
896                 tmp = a.ow1 - b.ow1 - 1;
897                 if (tmp < a.ow1) goto no_carry_1; else goto carry_1;
898         }
899         if (1) {
900                 no_carry_1: // no carry
901                 r.ow1 = tmp;
902                 tmp = a.ow2 - b.ow2;
903         } else {
904                 carry_1: // carry
905                 r.ow1 = tmp;
906                 tmp = a.ow2 - b.ow2 - 1;
907         }
908         r.ow2 = tmp;
909 #endif
910 }
911
912 // b := a >> 1
913 static inline void shift (const nuss_outword& a, nuss_outword& b)
914 {
915 #ifdef NUSS_OUT_EXTERNAL_LOOPS
916         #ifdef DEBUG_NUSS
917         if (shiftrightcopy_loop_msp(arrayMSDptr(a._ow,3),arrayMSDptr(b._ow,3),3,1,mspref(arrayMSDptr(a._ow,3),0)>>63))
918                 cl_abort();
919         #else
920         shiftrightcopy_loop_msp(arrayMSDptr(a._ow,3),arrayMSDptr(b._ow,3),3,1,mspref(arrayMSDptr(a._ow,3),0)>>63);
921         #endif
922 #else
923         var uint64 tmp, carry;
924
925         tmp = a.ow2;
926         b.ow2 = (sint64)tmp >> 1;
927         carry = tmp << 63;
928         tmp = a.ow1;
929         b.ow1 = (tmp >> 1) | carry;
930         carry = tmp << 63;
931         tmp = a.ow0;
932         b.ow0 = (tmp >> 1) | carry;
933         #ifdef DEBUG_NUSS
934         carry = tmp << 63;
935         if (carry)
936                 cl_abort();
937         #endif
938 #endif
939 }
940
941 #endif // (intDsize==64)
942
943 // This is a recursive implementation.
944 // TODO: Write a non-recursive one.
945
946 #ifndef _BIT_REVERSE
947 #define _BIT_REVERSE
948 // Reverse an n-bit number x. n>0.
949 static uintL bit_reverse (uintL n, uintL x)
950 {
951         var uintL y = 0;
952         do {
953                 y <<= 1;
954                 y |= (x & 1);
955                 x >>= 1;
956         } while (!(--n == 0));
957         return y;
958 }
959 #endif
960
961 // Threshold for recursion base in mulu_nuss_negacyclic().
    // mulu_nuss_negacyclic() multiplies directly, without recursing, whenever
    // its argument n is <= cl_nuss_threshold1.
962 // Time of a multiplication with len1=len2=10000 on Linux i486:
963 //                     normal    asm-optimized
964 //   threshold1 = 1:   40.1 sec  25.5 sec
965 //   threshold1 = 2:   28.6 sec  18.3 sec
966 //   threshold1 = 3:   25.6 sec  16.6 sec
967 //   threshold1 = 4:   25.7 sec  17.6 sec
968 //   threshold1 = 5:   26.1 sec  18.0 sec
969 const uintL cl_nuss_threshold1 = 3;
970
971 // Threshold for recursion base in mulu_nuss_cyclic().
    // The generic base-case code that would consult this constant is disabled
    // with #if 0 in mulu_nuss_cyclic(): with the value 1, the n == 1 special
    // case there is the only base case.
972 const uintL cl_nuss_threshold2 = 1;
973
974 // Computes z[k] := sum(i+j==k mod N, x[i]*y[j]*(-1)^((i+j-k)/N))
975 // for all k=0..N-1.
    // In other words: the product of the polynomials x and y where terms
    // whose index reaches N or beyond wrap around with the opposite sign.
    // Parameters:
    //   n, N  size of the problem, N = 2^n. n > 0 is assumed (the n == 0
    //         branch below is disabled).
    //   x, y  the two factors, N words each. Passing the same pointer for
    //         both is detected (x == y) and treated as squaring; then all
    //         work on y is skipped.
    //   z     receives the N result words.
    // For n <= cl_nuss_threshold1 the product is formed directly. Otherwise
    // the input is rearranged into polynomials X(i), Y(i) of R coefficients
    // each (M = 2^floor(n/2), R = 2^ceiling(n/2)), transformed with an FFT of
    // length 2*M, multiplied pairwise by recursive calls of size (r,R),
    // transformed back and recombined into z.
976 static void mulu_nuss_negacyclic (const uintL n, const uintL N, // N = 2^n
977                                   const nuss_inword * x, // N words
978                                   const nuss_inword * y, // N words
979                                   nuss_outword * z       // N words result
980                                  )
981 {
982         #if 0 // always n > 0
983         if (n == 0) {
984                 // z[0] := x0 y0
985                 mul(x[0],y[0], z[0]);
986                 return;
987         }
988         #endif
989         if (n <= cl_nuss_threshold1) {
990                 if (n == 1) {
991                         // z[0] := x0 (y0 + y1) - (x0 + x1) y1
992                         // z[1] := x0 (y0 + y1) + (x1 - x0) y0
                            // Expanded: z[0] = x0 y0 - x1 y1, z[1] = x0 y1 + x1 y0.
                            // The product x0 (y0 + y1) is shared, so only three
                            // mul() calls are needed.
993                         var nuss_inword x_sum;
994                         var nuss_inword y_sum;
995                         var nuss_outword first, second;
996                         add(x[0],x[1], x_sum);
997                         add(y[0],y[1], y_sum);
998                         mul(x[0],y_sum, first);
999                         mul(x_sum,y[1], second); sub(first,second, z[0]);
1000                         sub(x[1],x[0], x_sum);
1001                         mul(x_sum,y[0], second); add(first,second, z[1]);
1002                         return;
1003                 }
1004                 // 1 < n <= cl_nuss_threshold1.
1005                 #if 0 // straightforward, but slow
1006                 var uintL k;
1007                 for (k = 0; k < N; k++) {
1008                         var uintL i;
1009                         var nuss_outword accu;
1010                         mul(x[0],y[k], accu);
1011                         for (i = 1; i <= k; i++) {
1012                                 var nuss_outword temp;
1013                                 mul(x[i],y[k-i], temp);
1014                                 add(accu,temp, accu);
1015                         }
1016                         for (i = k+1; i < N; i++) {
1017                                 var nuss_outword temp;
1018                                 mul(x[i],y[N-i+k], temp);
1019                                 sub(accu,temp, accu);
1020                         }
1021                         z[k] = accu;
1022                 }
1023                 #else
                     // Same three-multiplication scheme as for n == 1, applied to
                     // every pair (i,j) of indices from the lower halves and
                     // accumulated into z.
1024                 var const uintL M = (uintL)1 << (n-1); // M = N/2
1025                 var uintL i, j, k;
1026                 for (k = 0; k < N; k++)
1027                         zero(z[k]);
1028                 for (i = 0; i < M; i++) {
1029                         var uintL iM = i+M;
                             // Pairs with i+j < M: both target indices i+j and
                             // i+j+M are below N.
1030                         for (j = 0; j < M-i; j++) {
1031                                 var uintL jM = j+M;
1032                                 // z[i+j]   += x[i] (y[j] + y[j+M]) - (x[i] + x[i+M]) y[j+M]
1033                                 // z[i+j+M] += x[i] (y[j] + y[j+M]) + (x[i+M] - x[i]) y[j]
1034                                 var nuss_inword x_sum;
1035                                 var nuss_inword y_sum;
1036                                 var nuss_outword first, second, temp;
1037                                 add(x[i],x[iM], x_sum);
1038                                 add(y[j],y[jM], y_sum);
1039                                 mul(x[i],y_sum, first);
1040                                 mul(x_sum,y[jM], second); sub(first,second, temp); add(z[i+j],temp, z[i+j]);
1041                                 sub(x[iM],x[i], x_sum);
1042                                 mul(x_sum,y[j], second); add(first,second, temp); add(z[i+j+M],temp, z[i+j+M]);
1043                         }
                             // Pairs with i+j >= M: the index i+j+M would be >= N,
                             // so that term wraps to z[i+j-M] and is subtracted.
1044                         for (j = M-i; j < M; j++) {
1045                                 var uintL jM = j+M;
1046                                 // z[i+j]   += x[i] (y[j] + y[j+M]) - (x[i] + x[i+M]) y[j+M]
1047                                 // z[i+j-M] -= x[i] (y[j] + y[j+M]) + (x[i+M] - x[i]) y[j]
1048                                 var nuss_inword x_sum;
1049                                 var nuss_inword y_sum;
1050                                 var nuss_outword first, second, temp;
1051                                 add(x[i],x[iM], x_sum);
1052                                 add(y[j],y[jM], y_sum);
1053                                 mul(x[i],y_sum, first);
1054                                 mul(x_sum,y[jM], second); sub(first,second, temp); add(z[i+j],temp, z[i+j]);
1055                                 sub(x[iM],x[i], x_sum);
1056                                 mul(x_sum,y[j], second); add(first,second, temp); sub(z[i+j-M],temp, z[i+j-M]);
1057                         }
1058                 }
1059                 #endif
1060                 return;
1061         }
1062         // Recursive FFT.
1063         var const uintL m = n >> 1; // floor(n/2)
1064         var const uintL r = n - m;  // ceiling(n/2)
1065         var const uintL M = (uintL)1 << m; // M = 2^m
1066         var const uintL R = (uintL)1 << r; // R = 2^r
             // Scratch storage: 2*N entries each for the transformed X, Y and Z
             // (2*M polynomials of R coefficients), plus three buffers of R
             // entries used inside the butterflies.
1067         CL_ALLOCA_STACK;
1068         var nuss_inword* const auX = cl_alloc_array(nuss_inword,2*N);
1069         var nuss_inword* const auY = cl_alloc_array(nuss_inword,2*N);
1070         var nuss_outword* const auZ = cl_alloc_array(nuss_outword,2*N);
1071         #define X(i,j) auX[((i)<<r)+(j)] /* 0 <= i < 2*M, 0 <= j < R */
1072         #define Y(i,j) auY[((i)<<r)+(j)] /* 0 <= i < 2*M, 0 <= j < R */
1073         #define Z(i,j) auZ[((i)<<r)+(j)] /* 0 <= i < 2*M, 0 <= j < R */
1074         var nuss_inword* const tmp1 = cl_alloc_array(nuss_inword,R);
1075         var nuss_inword* const tmp2 = cl_alloc_array(nuss_inword,R);
1076         var nuss_outword* const tmpZ = cl_alloc_array(nuss_outword,R);
             // Identical pointers mean the caller wants a square; Y is then
             // never filled in or transformed, and X is used in its place.
1077         var bool squaring = (x == y);
1078         var uintL i, j;
1079         // Initialize polynomials X(i) and Y(i).
             // X(i) collects every M-th input word, starting at index i:
             // coefficient j of X(i) is x[j*M + i].
1080         for (i = 0; i < M; i++) {
1081                 {
1082                         for (j = 0; j < R; j++)
1083                                 X(i,j) = x[(j<<m) + i];
1084                 }
1085                 if (!squaring) {
1086                         for (j = 0; j < R; j++)
1087                                 Y(i,j) = y[(j<<m) + i];
1088                 }
1089         }
1090         // For i = M..2*M-1, the polynomials are implicitly 0.
1091         // Do an FFT of length 2*M on X.
1092         {
                     // l is signed so that the countdown loop below can end at l < 0.
1093                 var sintL l;
1094                 // Level l = m:
                     // The upper half is implicitly 0, so this level only copies
                     // X(i) into X(i+M).
1095                 for (i = 0; i < M; i++)
1096                         for (j = 0; j < R; j++)
1097                                 X(i+M,j) = X(i,j);
1098                 // Level l = m-1..0:
1099                 for (l = m-1; l>=0; l--) {
1100                         var const uintL smax = (uintL)1 << (m-l);
1101                         var const uintL tmax = (uintL)1 << l;
1102                         for (var uintL s = 0; s < smax; s++) {
1103                                 var uintL exp = bit_reverse(m-l,s) << (l + r-m);
1104                                 for (var uintL t = 0; t < tmax; t++) {
1105                                         var uintL i1 = (s << (l+1)) + t;
1106                                         var uintL i2 = i1 + tmax;
1107                                         // Butterfly: replace (X(i1),X(i2)) by
1108                                         // (X(i1) + w^exp*X(i2), X(i1) - w^exp*X(i2)).
                                             // Multiplying by w^exp is done by reading X(i2) at
                                             // an index shifted by exp. Coefficients that wrap
                                             // around (j < exp) change sign, hence sub/add are
                                             // swapped in the first loop.
1109                                         for (j = 0; j < exp; j++) {
1110                                                 // note that w^R = -1
1111                                                 sub(X(i1,j),X(i2,j-exp+R), tmp1[j]);
1112                                                 add(X(i1,j),X(i2,j-exp+R), tmp2[j]);
1113                                         }
1114                                         for (j = exp; j < R; j++) {
1115                                                 add(X(i1,j),X(i2,j-exp), tmp1[j]);
1116                                                 sub(X(i1,j),X(i2,j-exp), tmp2[j]);
1117                                         }
                                             // Results were staged in tmp1/tmp2 so that no
                                             // entry of X(i2) is overwritten before the loops
                                             // above have read it at its shifted index.
1118                                         for (j = 0; j < R; j++) {
1119                                                 X(i1,j) = tmp1[j];
1120                                                 X(i2,j) = tmp2[j];
1121                                         }
1122                                 }
1123                         }
1124                 }
1125         }
1126         // Do an FFT of length 2*M on Y.
             // Same steps as for X above; skipped entirely when squaring.
1127         if (!squaring) {
1128                 var sintL l;
1129                 // Level l = m:
1130                 for (i = 0; i < M; i++)
1131                         for (j = 0; j < R; j++)
1132                                 Y(i+M,j) = Y(i,j);
1133                 // Level l = m-1..0:
1134                 for (l = m-1; l>=0; l--) {
1135                         var const uintL smax = (uintL)1 << (m-l);
1136                         var const uintL tmax = (uintL)1 << l;
1137                         for (var uintL s = 0; s < smax; s++) {
1138                                 var uintL exp = bit_reverse(m-l,s) << (l + r-m);
1139                                 for (var uintL t = 0; t < tmax; t++) {
1140                                         var uintL i1 = (s << (l+1)) + t;
1141                                         var uintL i2 = i1 + tmax;
1142                                         // Butterfly: replace (Y(i1),Y(i2)) by
1143                                         // (Y(i1) + w^exp*Y(i2), Y(i1) - w^exp*Y(i2)).
1144                                         for (j = 0; j < exp; j++) {
1145                                                 // note that w^R = -1
1146                                                 sub(Y(i1,j),Y(i2,j-exp+R), tmp1[j]);
1147                                                 add(Y(i1,j),Y(i2,j-exp+R), tmp2[j]);
1148                                         }
1149                                         for (j = exp; j < R; j++) {
1150                                                 add(Y(i1,j),Y(i2,j-exp), tmp1[j]);
1151                                                 sub(Y(i1,j),Y(i2,j-exp), tmp2[j]);
1152                                         }
1153                                         for (j = 0; j < R; j++) {
1154                                                 Y(i1,j) = tmp1[j];
1155                                                 Y(i2,j) = tmp2[j];
1156                                         }
1157                                 }
1158                         }
1159                 }
1160         }
1161         // Recursively compute the negacyclic product X(i)*Y(i) for all i.
             // When squaring, X(i) is passed for both factors, so the recursive
             // call again sees x == y.
1162         if (!squaring) {
1163                 for (i = 0; i < 2*M; i++)
1164                         mulu_nuss_negacyclic(r,R, &X(i,0), &Y(i,0), &Z(i,0));
1165         } else {
1166                 for (i = 0; i < 2*M; i++)
1167                         mulu_nuss_negacyclic(r,R, &X(i,0), &X(i,0), &Z(i,0));
1168         }
1169         // Undo an FFT of length 2*M on Z.
             // The levels are visited in the opposite order of the forward FFT.
1170         {
1171                 var uintL l;
1172                 // Level l = 0..m-1:
1173                 for (l = 0; l < m; l++) {
1174                         var const uintL smax = (uintL)1 << (m-l);
1175                         var const uintL tmax = (uintL)1 << l;
1176                         for (var uintL s = 0; s < smax; s++) {
1177                                 var uintL exp = bit_reverse(m-l,s) << (l + r-m);
1178                                 for (var uintL t = 0; t < tmax; t++) {
1179                                         var uintL i1 = (s << (l+1)) + t;
1180                                         var uintL i2 = i1 + tmax;
1181                                         // Inverse Butterfly: replace (Z(i1),Z(i2)) by
1182                                         // ((Z(i1)+Z(i2))/2, (Z(i1)-Z(i2))/(2*w^exp)).
                                             // tmpZ receives Z(i1)-Z(i2) with coefficient j moved
                                             // to index j-exp; for j < exp the index wraps to
                                             // j-exp+R and the sign is flipped.
1183                                         for (j = 0; j < exp; j++)
1184                                                 // note that w^R = -1
1185                                                 sub(Z(i2,j),Z(i1,j), tmpZ[j-exp+R]);
1186                                         for (j = exp; j < R; j++)
1187                                                 sub(Z(i1,j),Z(i2,j), tmpZ[j-exp]);
                                             // The divisions by 2 are done with shift().
1188                                         for (j = 0; j < R; j++) {
1189                                                 var nuss_outword sum;
1190                                                 add(Z(i1,j),Z(i2,j), sum);
1191                                                 shift(sum, Z(i1,j));
1192                                                 shift(tmpZ[j], Z(i2,j));
1193                                         }
1194                                 }
1195                         }
1196                 }
1197                 // Level l=m:
1198                 for (i = 0; i < M; i++) {
1199                         var uintL i1 = i;
1200                         var uintL i2 = i1 + M;
1201                         // Inverse Butterfly: replace (Z(i1),Z(i2)) by
1202                         // ((Z(i1)+Z(i2))/2, (Z(i1)-Z(i2))/2).
1203                         for (j = 0; j < R; j++) {
1204                                 var nuss_outword sum;
1205                                 var nuss_outword diff;
1206                                 add(Z(i1,j),Z(i2,j), sum);
1207                                 sub(Z(i1,j),Z(i2,j), diff);
1208                                 shift(sum, Z(i1,j));
1209                                 shift(diff, Z(i2,j));
1210                         }
1211                 }
1212         }
1213         // Reduce to length M.
             // Output word j*M + i combines coefficient j of Z(i) with coefficient
             // j-1 of Z(i+M). For j = 0 the latter wraps around to R-1 and is
             // subtracted instead of added.
1214         for (i = 0; i < M; i++) {
1215                 sub(Z(i,0),Z(i+M,R-1), z[i]);
1216                 for (j = 1; j < R; j++)
1217                         add(Z(i,j),Z(i+M,j-1), z[(j<<m)+i]);
1218         }
1219         #undef Z
1220         #undef Y
1221         #undef X
1222 }
1223
1224 // Computes z[k] := sum(i+j==k mod N, x[i]*y[j])
1225 // for all k=0..N-1.
     // In other words: the product of the polynomials x and y where terms
     // whose index reaches N or beyond wrap around to index-N.
     // Parameters:
     //   n, N  size of the problem, N = 2^n. n > 0 is assumed (the n == 0
     //         branch below is disabled). N itself is not used by the
     //         enabled code.
     //   x, y  the two factors, N words each. Both arrays are overwritten
     //         with intermediate values. Passing the same pointer for both
     //         is detected (x == y) and treated as squaring.
     //   z     receives the N result words.
     // For n > 1 the problem is split into a cyclic product of size N/2 on
     // the lower halves and a negacyclic product of size N/2 on the upper
     // halves, which are then recombined.
1226 static void mulu_nuss_cyclic (const uintL n, const uintL N, // N = 2^n
1227                               nuss_inword * x, // N words, modified!
1228                               nuss_inword * y, // N words, modified!
1229                               nuss_outword * z // N words result
1230                              )
1231 {
1232         unused N;
1233         #if 0 // always n > 0
1234         if (n == 0) {
1235                 // z[0] := x0 y0
1236                 mul(x[0],y[0], z[0]);
1237                 return;
1238         }
1239         #endif
1240         if (n == 1) {
1241                 // z[0] := ((x0 + x1) (y0 + y1) + (x0 - x1) (y0 - y1)) / 2
1242                 // z[1] := ((x0 + x1) (y0 + y1) - (x0 - x1) (y0 - y1)) / 2
                     // Expanded: z[0] = x0 y0 + x1 y1, z[1] = x0 y1 + x1 y0, using
                     // two mul() calls. The division by 2 is done with shift().
1243                 var nuss_inword x_sum;
1244                 var nuss_inword y_sum;
1245                 var nuss_inword x_diff;
1246                 var nuss_inword y_diff;
1247                 var nuss_outword first, second;
1248                 add(x[0],x[1], x_sum);
1249                 add(y[0],y[1], y_sum);
1250                 sub(x[0],x[1], x_diff);
1251                 sub(y[0],y[1], y_diff);
1252                 mul(x_sum,y_sum, first);
1253                 mul(x_diff,y_diff, second);
1254                 add(first,second, z[0]); shift(z[0], z[0]);
1255                 sub(first,second, z[1]); shift(z[1], z[1]);
1256                 return;
1257         }
1258         #if 0 // useless code because cl_nuss_threshold2 == 1
1259         if (n <= cl_nuss_threshold2) {
1260                 #if 0 // straightforward, but slow
1261                 var uintL k;
1262                 for (k = 0; k < N; k++) {
1263                         var uintL i;
1264                         var nuss_outword accu;
1265                         mul(x[0],y[k], accu);
1266                         for (i = 1; i <= k; i++) {
1267                                 var nuss_outword temp;
1268                                 mul(x[i],y[k-i], temp);
1269                                 add(accu,temp, accu);
1270                         }
1271                         for (i = k+1; i < N; i++) {
1272                                 var nuss_outword temp;
1273                                 mul(x[i],y[N-i+k], temp);
1274                                 add(accu,temp, accu);
1275                         }
1276                         z[k] = accu;
1277                 }
1278                 #else
1279                 var const uintL M = (uintL)1 << (n-1); // M = N/2
1280                 var uintL i, j, k;
1281                 for (k = 0; k < N; k++)
1282                         zero(z[k]);
1283                 for (i = 0; i < M; i++) {
1284                         var uintL iM = i+M;
1285                         for (j = 0; j < M; j++) {
1286                                 var uintL jM = j+M;
1287                                 // z[i+j]   += ((x[i] + x[i+M]) (y[j] + y[j+M]) + (x[i] - x[i+M]) (y[j] - y[j+M])) / 2
1288                                 // z[i+j+M] += ((x[i] + x[i+M]) (y[j] + y[j+M]) - (x[i] - x[i+M]) (y[j] - y[j+M])) / 2
1289                                 var nuss_inword x_sum;
1290                                 var nuss_inword y_sum;
1291                                 var nuss_inword x_diff;
1292                                 var nuss_inword y_diff;
1293                                 var nuss_outword first, second, temp;
1294                                 add(x[i],x[iM], x_sum);
1295                                 add(y[j],y[jM], y_sum);
1296                                 sub(x[i],x[iM], x_diff);
1297                                 sub(y[j],y[jM], y_diff);
1298                                 mul(x_sum,y_sum, first);
1299                                 mul(x_diff,y_diff, second);
1300                                 add(first,second, temp); add(z[i+j],temp, z[i+j]);
1301                                 var uintL ijM = (i+j+M) & (N-1);
1302                                 sub(first,second, temp); add(z[ijM],temp, z[ijM]);
1303                         }
1304                 }
1305                 for (k = 0; k < N; k++)
1306                         shift(z[k], z[k]);
1307                 #endif
1308                 return;
1309         }
1310         #endif
1311         var const uintL m = n-1;
1312         var const uintL M = (uintL)1 << m; // M = 2^m = N/2
1313         var uintL i;
1314         // Chinese remainder theorem: u^N-1 = (u^M-1)*(u^M+1)
             // After the butterflies, the lower half of each array holds the
             // sums (input of the cyclic sub-problem) and the upper half holds
             // the differences (input of the negacyclic sub-problem).
1315         for (i = 0; i < M; i++) {
1316                 // Butterfly: replace (x(i),x(i+M))
1317                 // by (x(i)+x(i+M),x(i)-x(i+M)).
                     // The difference goes through tmp because x[i] is overwritten
                     // by the sum before x[i+M] can be stored.
1318                 var nuss_inword tmp;
1319                 sub(x[i],x[i+M], tmp);
1320                 add(x[i],x[i+M], x[i]);
1321                 x[i+M] = tmp;
1322         }
             // The following 'if' guards the whole 'for' loop after it. When
             // x == y the loop above has already transformed the shared array,
             // so it must not be transformed a second time.
1323         if (!(x == y)) // squaring?
1324         for (i = 0; i < M; i++) {
1325                 // Butterfly: replace (y(i),y(i+M))
1326                 // by (y(i)+y(i+M),y(i)-y(i+M)).
1327                 var nuss_inword tmp;
1328                 sub(y[i],y[i+M], tmp);
1329                 add(y[i],y[i+M], y[i]);
1330                 y[i+M] = tmp;
1331         }
1332         // Recurse.
             // Lower halves: cyclic product of size M. Upper halves: negacyclic
             // product of size M. If x == y, the sub-array pointers passed here
             // are equal as well, so both callees again see a squaring.
1333         mulu_nuss_cyclic(m,M, &x[0], &y[0], &z[0]);
1334         mulu_nuss_negacyclic(m,M, &x[M], &y[M], &z[M]);
1335         for (i = 0; i < M; i++) {
1336                 // Inverse Butterfly: replace (z(i),z(i+M))
1337                 // by ((z(i)+z(i+M))/2,(z(i)-z(i+M))/2).
                     // Both results are computed into temporaries first, because
                     // each of them needs the old z[i] and the old z[i+M].
1338                 var nuss_outword sum;
1339                 var nuss_outword diff;
1340                 add(z[i],z[i+M], sum);
1341                 sub(z[i],z[i+M], diff);
1342                 shift(sum, z[i]);
1343                 shift(diff, z[i+M]);
1344         }
1345 }
1346
     // Multiplies the len1 digits at sourceptr1 by the len2 digits at sourceptr2.
     // The len1+len2 digits at destptr are cleared first; then the partial
     // products of source1 with successive pieces of source2 are added in.
     // Digits are addressed through the lspref/lspop/lsshrink macros.
     // If sourceptr1 == sourceptr2 and the piece of source2 is as long as
     // source1, the product is computed as a square.
     // cl_abort() is called if a carry would run past the end of the result.
1347 static void mulu_nussbaumer (const uintD* sourceptr1, uintC len1,
1348                              const uintD* sourceptr2, uintC len2,
1349                              uintD* destptr)
1350 // Requires 2 <= len1 <= len2.
1351 {
1352         // Method:
1353         // source1 is one piece of length N1, source2 is one or more pieces
1354         // of length N2, with N1+N2 <= N, where N is a power of two.
1355         // sum(i=0..N-1, x_i b^i) * sum(i=0..N-1, y_i b^i) is computed
1356         // by multiplying the two polynomials
1357         // sum(i=0..N-1, x_i T^i), sum(i=0..N-1, y_i T^i)
1358         // using the Fourier transform (see above).
1359         var uint32 n;
1360         integerlength32(len1-1, n=); // 2^(n-1) < len1 <= 2^n
1361         var uintL len = (uintL)1 << n; // smallest power of two >= len1
1362         // Choosing N = len costs ceiling(len2/(len-len1+1)) * FFT(len).
1363         // Choosing N = 2*len costs ceiling(len2/(2*len-len1+1)) * FFT(2*len).
1364         // We pick the cheaper of the two:
1365         // If ceiling(len2/(len-len1+1)) <= 2 * ceiling(len2/(2*len-len1+1))
1366         // we take N = len, if ....... > ........ we take N = 2*len instead.
1367         // (Choosing N = 4*len or more only pays off in extreme cases.)
1368         if (len2 > 2 * (len-len1+1) * (len2 <= (2*len-len1+1) ? 1 : ceiling(len2,(2*len-len1+1)))) {
1369                 n = n+1;
1370                 len = len << 1;
1371         }
1372         var const uintL N = len; // N = 2^n
             // Work arrays: the two factors and the result of one cyclic product
             // of size N, plus a buffer of len1+1 digits for the single-digit
             // case below.
1373         CL_ALLOCA_STACK;
1374         var nuss_inword* const x = cl_alloc_array(nuss_inword,N);
1375         var nuss_inword* const y = cl_alloc_array(nuss_inword,N);
1376         var nuss_outword* const z = cl_alloc_array(nuss_outword,N);
1377         var uintD* const tmpprod = cl_alloc_array(uintD,len1+1);
1378         var uintP i;
1379         var uintL destlen = len1+len2;
1380         clear_loop_lsp(destptr,destlen);
             // Each pass handles the next len2p digits of source2. At the end of
             // a pass destptr, destlen, sourceptr2 and len2 are advanced by len2p.
1381         do {
1382                 var uintL len2p; // length of a piece of source2
1383                 len2p = N - len1 + 1;
1384                 if (len2p > len2)
1385                         len2p = len2;
1386                 // len2p = min(N-len1+1,len2).
1387                 if (len2p == 1) {
1388                         // cheap case
                             // A single digit of source2: multiply source1 by it
                             // into tmpprod (len1+1 digits), add that to the
                             // result and propagate a carry, if any.
1389                         var uintD* tmpptr = arrayLSDptr(tmpprod,len1+1);
1390                         mulu_loop_lsp(lspref(sourceptr2,0),sourceptr1,tmpptr,len1);
1391                         if (addto_loop_lsp(tmpptr,destptr,len1+1))
1392                                 if (inc_loop_lsp(destptr lspop (len1+1),destlen-(len1+1)))
1393                                         cl_abort();
1394                 } else {
1395                         var uintL destlenp = len1 + len2p - 1;
1396                         // destlenp = min(N,destlen-1).
1397                         var bool squaring = ((sourceptr1 == sourceptr2) && (len1 == len2p));
1398                         // Fill factor x.
                             // One digit per word in iw0, iw1 = 0, padded with zero
                             // words up to N. x is refilled on every pass because
                             // mulu_nuss_cyclic() modifies its input arrays.
1399                         {
1400                                 for (i = 0; i < len1; i++) {
1401                                         x[i].iw0 = lspref(sourceptr1,i);
1402                                         x[i].iw1 = 0;
1403                                 }
1404                                 for (i = len1; i < N; i++) {
1405                                         x[i].iw0 = 0;
1406                                         x[i].iw1 = 0;
1407                                 }
1408                         }
1409                         // Fill factor y.
1410                         if (!squaring) {
1411                                 for (i = 0; i < len2p; i++) {
1412                                         y[i].iw0 = lspref(sourceptr2,i);
1413                                         y[i].iw1 = 0;
1414                                 }
1415                                 for (i = len2p; i < N; i++) {
1416                                         y[i].iw0 = 0;
1417                                         y[i].iw1 = 0;
1418                                 }
1419                         }
1420                         // Multiply.
                             // When squaring, x is passed for both factors so that
                             // mulu_nuss_cyclic() recognizes the square (x == y).
1421                         if (!squaring)
1422                                 mulu_nuss_cyclic(n,N, &x[0], &y[0], &z[0]);
1423                         else
1424                                 mulu_nuss_cyclic(n,N, &x[0], &x[0], &z[0]);
1425                         #ifdef DEBUG_NUSS
1426                         // Check result.
1427                         for (i = 0; i < N; i++)
1428                                 if (!(z[i].ow3 == 0))
1429                                         cl_abort();
1430                         #endif
1431                         // Add result to destptr[-destlen..-1]:
                             // z[i] is a multi-word coefficient that belongs at
                             // digit position i, so neighbouring coefficients
                             // overlap. They are summed in a three-word accumulator
                             // that emits one digit per step and is then shifted
                             // down by one word.
1432                         {
1433                                 var uintD* ptr = destptr;
1434                                 // ac2|ac1|ac0 are an accumulator.
1435                                 var uint32 ac0 = 0;
1436                                 var uint32 ac1 = 0;
1437                                 var uint32 ac2 = 0;
1438                                 var uint32 tmp;
1439                                 for (i = 0; i < destlenp; i++) {
1440                                         // Add z[i] to the accumulator.
                                             // (x += tmp) < tmp detects unsigned overflow
                                             // of the addition.
1441                                         tmp = z[i].ow0;
1442                                         if ((ac0 += tmp) < tmp) {
1443                                                 if (++ac1 == 0)
1444                                                         ++ac2;
1445                                         }
1446                                         tmp = z[i].ow1;
1447                                         if ((ac1 += tmp) < tmp)
1448                                                 ++ac2;
1449                                         tmp = z[i].ow2;
1450                                         ac2 += tmp;
1451                                         // Add the accumulator's least significant word to destptr:
1452                                         tmp = lspref(ptr,0);
1453                                         if ((ac0 += tmp) < tmp) {
1454                                                 if (++ac1 == 0)
1455                                                         ++ac2;
1456                                         }
1457                                         lspref(ptr,0) = ac0;
1458                                         lsshrink(ptr);
                                             // Shift the accumulator down by one word.
1459                                         ac0 = ac1;
1460                                         ac1 = ac2;
1461                                         ac2 = 0;
1462                                 }
1463                                 // ac2 = 0.
                                     // Flush what is left in ac1|ac0 into the next one or
                                     // two result digits. i is advanced accordingly and
                                     // checked against destlen before anything is written.
1464                                 if (ac1 > 0) {
1465                                         if (!((i += 2) <= destlen))
1466                                                 cl_abort();
1467                                         tmp = lspref(ptr,0);
1468                                         if ((ac0 += tmp) < tmp)
1469                                                 ++ac1;
1470                                         lspref(ptr,0) = ac0;
1471                                         lsshrink(ptr);
1472                                         tmp = lspref(ptr,0);
1473                                         ac1 += tmp;
1474                                         lspref(ptr,0) = ac1;
1475                                         lsshrink(ptr);
1476                                         if (ac1 < tmp)
1477                                                 if (inc_loop_lsp(ptr,destlen-i))
1478                                                         cl_abort();
1479                                 } else if (ac0 > 0) {
1480                                         if (!((i += 1) <= destlen))
1481                                                 cl_abort();
1482                                         tmp = lspref(ptr,0);
1483                                         ac0 += tmp;
1484                                         lspref(ptr,0) = ac0;
1485                                         lsshrink(ptr);
1486                                         if (ac0 < tmp)
1487                                                 if (inc_loop_lsp(ptr,destlen-i))
1488                                                         cl_abort();
1489                                 }
1490                         }
1491                         #ifdef DEBUG_NUSS
1492                         // If destlenp < N, check that the remaining z[i] are 0.
1493                         for (i = destlenp; i < N; i++)
1494                                 if (z[i].ow2 > 0 || z[i].ow1 > 0 || z[i].ow0 > 0)
1495                                         cl_abort();
1496                         #endif
1497                 }
1498                 // Decrement len2.
                     // Advance past the piece of source2 just handled; the result
                     // window moves by the same number of digits.
1499                 destptr = destptr lspop len2p;
1500                 destlen -= len2p;
1501                 sourceptr2 = sourceptr2 lspop len2p;
1502                 len2 -= len2p;
1503         } while (len2 > 0);
1504 }
1505
// Remove the word accessor macros (iw0/iw1 map to elements of
// nuss_inword::_iw; the ow* names are the nuss_outword counterparts) so that
// these short names are not left defined for code that follows.
1506 #undef iw0
1507 #undef iw1
1508 #undef ow0
1509 #undef ow1
1510 #undef ow2
1511 #undef ow3