src/base/digitseq/cl_DS_mul.cc

   1 // cl_UDS_mul().
   2
   3 // General includes.
   4 #include "base/cl_sysdep.h"
   5
   6 // Specification.
   7 #include "base/digitseq/cl_DS.h"
   8
   9
  10 // Implementation.
  11
  12 #include "base/cl_low.h"
  13 #include "cln/malloc.h"
  14 #include "cln/exception.h"
  15
  16 namespace cln {
  17
  18 // Multiplikations-Doppelschleife:
  19 // Multipliziert zwei UDS und legt das Ergebnis in einer dritten UDS ab.
  20 // cl_UDS_mul(sourceptr1,len1,sourceptr2,len2,destptr);
  21 // multipliziert die UDS  sourceptr1[-len1..-1]  (len1>0)
  22 //           mit der UDS  sourceptr2[-len1..-1]  (len2>0)
  23 // und legt das Ergebnis in der UDS  destptr[-len..-1]  (len=len1+len2) ab.
  24 // Unterhalb von destptr werden len Digits Platz benötigt.
  25   void cl_UDS_mul (const uintD* sourceptr1, uintC len1,
  26                    const uintD* sourceptr2, uintC len2,
  27                    uintD* destptr);
  28 // Spezialfall sourceptr1 == sourceptr2 && len1 == len2.
  29   void cl_UDS_mul_square (const uintD* sourceptr, uintC len,
  30                           uintD* destptr);
  31
  32 // Multiplikation nach Schulmethode:
  33   static inline void mulu_2loop (const uintD* sourceptr1, uintC len1,
  34                                  const uintD* sourceptr2, uintC len2,
  35                                  uintD* destptr)
  36   { // Es ist 2 <= len1 <= len2.
  37     // Erster Schleifendurchlauf:
  38     mulu_loop_lsp(lsprefnext(sourceptr1),sourceptr2,destptr,len2);
  39     lsshrink(destptr);
  40     var uintD* destptr2 = destptr lspop len2;
  41     // äußere Schleife läuft über source1 :
  42     dotimespC(len1,len1-1,
  43       { // innere Schleife läuft über source2 :
  44         var uintD carry =
  45           muluadd_loop_lsp(lsprefnext(sourceptr1),sourceptr2,destptr,len2);
  46         lsprefnext(destptr2) = carry; // UDS um das Carry-Digit verlängern
  47         lsshrink(destptr);
  48       });
  49   }
  50   static inline void mulu_2loop_square (const uintD* sourceptr, uintC len,
  51                                         uintD* destptr)
  52   { // Es ist 2 <= len.
  53     // Gemischte Produkte:
  54     #if 0
  55     // 2*(  x[1] * x[0..0] * b^1
  56     //    + x[2] * x[1..0] * b^2
  57     //    + ...
  58     //    + x[n-1] * x[n-2..0]*b^(n-1))
  59     { var const uintD* sourceptr1 = sourceptr lspop 1;
  60       var uintD* destptr1 = destptr;
  61       lsprefnext(destptr1) = 0;
  62       var uintD* destptr2 = destptr1;
  63       var uintC count;
  64       for (count = 1; count < len; count++)
  65         { // sourceptr1 = sourceptr lspop count, destptr1 = destptr lspop count,
  66           // destptr2 = destptr lspop (2*count-1).
  67           lsprefnext(destptr2) = 0;
  68           var uintD carry =
  69             muluadd_loop_lsp(lsprefnext(sourceptr1),sourceptr,destptr1,count);
  70           lsprefnext(destptr2) = carry;
  71           destptr1 = destptr1 lspop 1;
  72         }
  73       { var uintD carry = shift1left_loop_lsp(destptr lspop 1,2*len-2);
  74         lspref(destptr2,0) = (carry==0 ? 0 : 1);
  75     } }
  76     #else
  77     // 2*(  x[n-1..1] * x[0] * b^1
  78     //    + x[n-1..2] * x[1] * b^3
  79     //    + ...
  80     //    + x[n-1..n-1] * x[n-2] * b^(2*n-3))
  81     { var const uintD* sourceptr1 = sourceptr;
  82       var uintD* destptr2 = destptr;
  83       lsprefnext(destptr2) = 0;
  84       var uintC count = len-1;
  85       { var uintD digit = lsprefnext(sourceptr1);
  86         mulu_loop_lsp(digit,sourceptr1,destptr2,count);
  87       }
  88       var uintD* destptr1 = destptr lspop (len+1);
  89       while (--count > 0)
  90         { destptr2 = destptr2 lspop 2;
  91           var uintD digit = lsprefnext(sourceptr1);
  92           var uintD carry = muluadd_loop_lsp(digit,sourceptr1,destptr2,count);
  93           lsprefnext(destptr1) = carry;
  94         }
  95       { var uintD carry = shift1left_loop_lsp(destptr lspop 1,2*len-2);
  96         lspref(destptr1,0) = (carry==0 ? 0 : 1);
  97     } }
  98     #endif
  99     // Quadrate:
 100     len = 2*len;
 101     do { len -= 2;
 102          var uintD digit = lsprefnext(sourceptr);
 103          #if HAVE_DD
 104          var uintDD prod = muluD(digit,digit);
 105          var uintDD accu = highlowDD(lspref(destptr,1),lspref(destptr,0));
 106          accu += prod;
 107          lspref(destptr,0) = lowD(accu); lspref(destptr,1) = highD(accu);
 108          destptr = destptr lspop 2;
 109          if (accu < prod) { inc_loop_lsp(destptr,len); }
 110          #else
 111          var uintD hi;
 112          var uintD lo;
 113          muluD(digit,digit, hi=,lo=);
 114          var uintD tmp;
 115          tmp = lspref(destptr,0) + lo; lspref(destptr,0) = tmp;
 116          if (tmp < lo) hi++;
 117          tmp = lspref(destptr,1) + hi; lspref(destptr,1) = tmp;
 118          destptr = destptr lspop 2;
 119          if (tmp < hi) { inc_loop_lsp(destptr,len); }
 120          #endif
 121        } while (len > 0);
 122   }
 123
 124 // Karatsuba-multiplication: O(n^(log 3 / log 2))
 125   static void mulu_karatsuba_square (const uintD* sourceptr, uintC len,
 126                                      uintD* destptr);
 127 #include "base/digitseq/cl_DS_mul_kara.h"
 128   // karatsuba_threshold = length, from which on Karatsuba-multiplication is a
 129   // gain and will be preferred.  The break-even point is determined from
 130   // timings.  The test is (progn (time (! 5000)) nil), which does many small
 131   // and some very large multiplications.  The measured runtimes are:
 132   // OS: Linux 2.2, intDsize==32,        OS: TRU64/4.0, intDsize==64,
 133   // Machine: P-III/450MHz               Machine: EV5/300MHz:
 134   // threshold  time in 0.01 sec.        time in 0.01 sec.
 135   //      5          3.55                     2.29
 136   //     10          2.01                     1.71
 137   //     15          1.61                     1.61
 138   //     20          1.51                     1.60  <-
 139   //     25          1.45                     1.63
 140   //     30          1.39                     1.66
 141   //     35          1.39  <-                 1.67
 142   //     40          1.39                     1.71
 143   //     45          1.40                     1.75
 144   //     50          1.41                     1.78
 145   //     55          1.41                     1.79
 146   //     60          1.44                     1.84
 147   //     65          1.44                     1.85
 148   //     70          1.43                     1.85
 149   //     75          1.45                     1.89
 150   //     80          1.47                     1.91
 151   //     90          1.51                     1.96
 152   //    100          1.53                     1.97
 153   //    150          1.62                     2.13
 154   //    250          1.75                     2.19
 155   //    500          1.87                     2.17
 156   //   1000          1.87                     2.18
 157   //   2000          1.88                     2.17
 158   // The optimum appears to be between 20 and 40.  But since that optimum
 159   // depends on the ratio time(uintD-mul)/time(uintD-add) and the measured
 160   // times are more sensitive to a shift towards lower thresholds we are
 161   // careful and choose a value at the upper end:
 162 #if CL_USE_GMP
 163   const unsigned int cl_karatsuba_threshold = 35;
 164 #else
 165   const unsigned int cl_karatsuba_threshold = 16;
 166   // (In CLN version <= 1.0.3 cl_karatsuba_threshold was always 16)
 167 #endif
 168
 169 #if 0 // Doesn't seem to be worth the effort
 170
 171 // FFT-Multiplikation nach Nussbaumer: O(n log n log log n)
 172 #include "base/digitseq/cl_DS_mul_nuss.h"
 173   // nuss_threshold = Länge, ab der die Nussbaumer-Multiplikation bevorzugt
 174   // wird. Der Break-Even-Point bestimmt sich aus Zeitmessungen.
 175   // Multiplikation zweier N-Wort-Zahlen unter Linux mit einem 80486:
 176   //    N     kara   nuss  nuss-asm  (time in sec.)
 177   //   1000    0.36   1.05   0.70
 178   //   5000    4.69  10.0    6.71
 179   //  25000   61.6   62.7   40.2
 180   //  32500   91.8   62.7   40.3
 181   //  35000  102.7  124.7   80.4
 182   //  50000  185    132     85.2
 183   int cl_nuss_threshold = 1000000;
 184
 185 // FFT-Multiplikation in Z/pZ: O(n log n log log n)
 186 #include "base/digitseq/cl_DS_mul_fftp.h"
 187   // fftp_threshold = Länge, ab der die FFT-Multiplikation mod p bevorzugt
 188   // wird. Der Break-Even-Point bestimmt sich aus Zeitmessungen.
 189   // Multiplikation zweier N-Wort-Zahlen unter Linux mit einem 80486:
 190   //    N     kara   fftp  (time in sec.)
 191   //   1000    0.36   1.57
 192   //   5000    4.66  14.86
 193   //  25000   61.1   75.0
 194   //  32500   90.8   75.5
 195   //  35000  101.6  150.1
 196   //  50000  183    160
 197   int cl_fftp_threshold = 1000000;
 198
 199 // FFT-Multiplikation in Z/pZ: O(n log n log log n)
 200 // für drei verschiedene Primzahlen p1,p2,p3 < 2^32.
 201 #include "base/digitseq/cl_DS_mul_fftp3.h"
 202   // fftp3_threshold = Länge, ab der die FFT-Multiplikation mod p_i bevorzugt
 203   // wird. Der Break-Even-Point bestimmt sich aus Zeitmessungen.
 204   // Multiplikation zweier N-Wort-Zahlen unter Linux mit einem 80486:
 205   //    N     kara   fftp3  fftp  (time in sec.)
 206   //   1000    0.36   0.59   1.57
 207   //   5000    4.66   5.44  14.89
 208   //  10000   13.98  11.91  32.43
 209   //  25000   61.1   27.4   75.4
 210   //  32500   90.5   28.1   75.5
 211   //  35000  101.4   54.8  150.4
 212   //  50000  183     58.9  161.6
 213   int cl_fftp3_threshold = 1000000;
 214
 215 // FFT-Multiplikation in Z/pZ: O(n log n log log n)
 216 // für drei verschiedene Primzahlen p1,p2,p3 < 2^32,
 217 // mit Montgomery-Multiplikation.
 218 #include "base/digitseq/cl_DS_mul_fftp3m.h"
 219   // fftp3_threshold = Länge, ab der die FFT-Multiplikation mod p_i bevorzugt
 220   // wird. Der Break-Even-Point bestimmt sich aus Zeitmessungen.
 221   // Multiplikation zweier N-Wort-Zahlen unter
 222   // Linux mit einem 80486, 33 MHz, mit Benutzung der GMP-Low-Level-Funktionen:
 223   //    N     kara    fftm  fftp3m  fftp3  fftp  (time in sec.)
 224   //   1000    0.35   0.49   0.54   0.59   1.58
 225   //   2500    1.48   0.97   2.34   2.52   6.99
 226   //   5000    4.43   2.19   5.08   5.48  15.16
 227   //  10000   13.33   4.68  10.93  11.82  32.94
 228   //  25000   58.5   12.0   25.3   27.4   77.0
 229   //  32500   86.0   25.0   26.1   28.0   77.3
 230   //  35000   96.5   25.0   50.8   54.9  152.8
 231   //  50000  176     25.2   54.2   58.5  163.4
 232   // und auf einer SPARC 20 mit 75 MHz, ohne GMP-Low-Level-Funktionen:
 233   //    N     kara    fftm  fftp3m  fftp3  fftp  (time in sec.)
 234   //   1000    0.076  0.096  0.113  0.233  0.415
 235   //   2500    0.32   0.21   0.48   1.03   1.82
 236   //   5000    0.97   0.51   1.03   2.22   3.96
 237   //  10000    2.99   1.03   2.23   4.72   8.59
 238   //  25000   13.22   2.73   4.99  10.78  19.73
 239   //  32500   19.3    5.7    5.2   10.9   19.7
 240   //  35000   21.5    5.9   10.0   21.7   39.4
 241   //  50000   39.5    6.0   11.3   23.1   42.7
 242   int cl_fftp3m_threshold = 1000000;
 243
 244 #endif
 245
 246 // FFT-Multiplikation in Z/pZ: O(n^1.29)
 247 #include "base/digitseq/cl_DS_mul_fftm.h"
 248   // fftm_threshold = length, from which on FFT multiplication mod m is a gain
 249   // and will be preferred.  The break-even point is determined from timings.
 250   // The times to multiply two N-limb numbers are:
 251   // OS: Linux 2.2, intDsize==32,        OS: TRU64/4.0, intDsize==64,
 252   // Machine: P-III/450MHz               Machine: EV5/300MHz:
 253   //    N     kara   fftm  (time in sec.)    kara   fftm
 254   //   1000    0.005  0.016                   0.018  0.028
 255   //   1500    0.009  0.012                   0.032  0.028
 256   //   2000    0.015  0.025                   0.053  0.052  <-
 257   //   2500    0.022  0.026                   0.067  0.052
 258   //   3000    0.029  0.027  <-               0.093  0.053
 259   //   3500    0.035  0.037                   0.12   0.031
 260   //   4000    0.045  0.028                   0.16   0.12
 261   //   5000    0.064  0.050                   0.20   0.11
 262   //   7000    0.110  0.051                   0.37   0.20
 263   //  10000    0.19   0.11                    0.61   0.26
 264   //  20000    0.59   0.23                    1.85   0.55
 265   //  30000    1.10   0.25                    3.79   0.56
 266   //  50000    2.52   1.76                    8.15   1.37
 267   //  70000    4.41   2.30                   14.09   2.94
 268   // 100000    7.55   1.53                   24.48   2.96
 269   // More playing around with timings reveals that there are some values where
 270   // FFT multiplication is somewhat slower than Karatsuba, both for len1==len2
 271   // and also if len1<len2.
 272   // Here are the timigs from CLN version <= 1.0.3:
 273   // //  Linux mit einem 80486:               Solaris, Sparc 10/20:
 274   // //    N     kara   fftm  (time in sec.)    kara   fftm
 275   // //   1000    0.36   0.54                    0.08   0.10
 276   // //   5000    4.66   2.48                    1.01   0.51
 277   // //  25000   61.1   13.22                   13.23   2.73
 278   // //  32500   91.0   27.5                    20.0    5.8
 279   // //  35000  102.1   27.5                    21.5    5.6
 280   // //  50000  183     27.6                    40.7    5.6
 281   // // Multiplikation zweier N-Wort-Zahlen unter
 282   // //  Linux mit einem 80486:               Solaris, Sparc 10/20:
 283   // //    N     kara   fftm  (time in sec.)    kara   fftm
 284   // //   1000    0.36   0.54                    0.08   0.10
 285   // //   1260    0.52   0.50                    0.11   0.10
 286   // //   1590    0.79   0.51                    0.16   0.10
 287   // //   2000    1.09   1.07                    0.23   0.21
 288   // //   2520    1.57   1.08                    0.33   0.21
 289   // //   3180    2.32   1.08                    0.50   0.21
 290   // //   4000    3.29   2.22                    0.70   0.41
 291   // //   5040    4.74   2.44                    0.99   0.50
 292   // //    N1    N2    kara   fftm  (time in sec.)    kara   fftm
 293   // //   1250  1250    0.51   0.50                    0.11   0.10
 294   // //   1250  1580    0.70   0.50                    0.15   0.10
 295   // //   1250  2000    0.89   0.51                    0.18   0.10
 296   // //   1250  2250    0.99   0.51                    0.21   0.10
 297   // //   1250  2500    1.08   1.03     <---           0.22   0.21
 298   // //   1250  2800    1.20   1.07                    0.26   0.21
 299   // //   1250  3100    1.35   1.07                    0.28   0.21
 300   // // Es gibt also noch Werte von (len1,len2) mit 1250 <= len1 <= len2, bei
 301   // // denen "kara" schneller ist als "fftm", aber nicht viele und dort auch
 302   // // nur um 5%. Darum wählen wir ab hier die FFT-Multiplikation.
 303   // // 140000: 4.15s  12.53  23.7
 304   // // 14000:  4.16s
 305   // // 11000:  4.16s
 306   // // 9000:   1.47s
 307   // // 7000:   1.48s
 308   // // 1400:   1.42s   2.80   6.5
 309 #if CL_USE_GMP
 310   const unsigned int cl_fftm_threshold = 2500;
 311   // must be >= 6 (else infinite recursion)
 312 #else
 313   // Use the old default value from CLN version <= 1.0.3 as a crude estimate.
 314   const unsigned int cl_fftm_threshold = 1250;
 315 #endif
 316   // This is the threshold for multiplication of equally sized factors.
 317   // When the lengths differ much, the threshold varies:
 318   //                OS: Linux 2.2, intDsize==32,  OS: TRU64/4.0, intDsize==64,
 319   //                Machine: P-III/450MHz         Machine: EV5/300MHz:
 320   // len2 =  3000   len1 >= 2600                  len1 >= 800
 321   // len2 =  4000   len1 >= 1500                  len1 >= 700
 322   // len2 =  5000   len1 >= 1100                  len1 >= 600
 323   // len2 =  6000   len1 >= 1300                  len1 >= 700
 324   // len2 =  7000   len1 >= 1100                  len1 >= 600
 325   // len2 =  8000   len1 >= 900                   len1 >= 500
 326   // len2 =  9000   len1 >= 1300                  len1 >= 600
 327   // len2 = 10000   len1 >= 1100                  len1 >= 500
 328   // len2 = 11000   len1 >= 1000                  len1 >= 500
 329   // len2 = 12000   len1 >= 900                   len1 >= 700
 330   // len2 = 13000   len1 >= 900                   len1 >= 500
 331   // len2 = 14000   len1 >= 900                   len1 >= 600
 332   // Here are the timigs from CLN version <= 1.0.3:
 333   // //   len2 = 3000   len1 >= 800
 334   // //   len2 = 3500   len1 >= 700
 335   // //   len2 = 4000   len1 >= 580
 336   // //   len2 = 4500   len1 >= 430
 337   // //   len2 = 5000   len1 >= 370
 338   // //   len2 = 5500   len1 >= 320
 339   // //   len2 = 6000   len1 >= 500
 340   // //   len2 = 7000   len1 >= 370
 341   // //   len2 = 8000   len1 >= 330
 342   // //   len2 = 9000   len1 >= 420
 343   // //   len2 =10000   len1 >= 370
 344   // //   len2 =11000   len1 >= 330
 345   // //   len2 =12000   len1 >= 330
 346   // //   len2 =13000   len1 >= 350
 347   // Let's choose the following condition:
 348 #if CL_USE_GMP
 349   const unsigned int cl_fftm_threshold1 = 600;
 350 #else
 351   // Use the old default values from CLN version <= 1.0.3 as a crude estimate.
 352   const unsigned int cl_fftm_threshold1 = 330;
 353 #endif
 354   const unsigned int cl_fftm_threshold2 = 2*cl_fftm_threshold;
 355   //   len1 > cl_fftm_threshold1 && len2 > cl_fftm_threshold2
 356   //   && len1 >= cl_fftm_threshold1 + cl_fftm_threshold/(len2-cl_fftm_threshold1)*(cl_fftm_threshold-cl_fftm_threshold1).
 357   static inline bool cl_fftm_suitable (uintC len1, uintC len2)
 358     { if (len1 >= cl_fftm_threshold)
 359         return true;
 360       if (len1 > cl_fftm_threshold1)
 361         if (len2 > cl_fftm_threshold2)
 362           { const unsigned int prod_threshold = cl_fftm_threshold*(cl_fftm_threshold-cl_fftm_threshold1);
 363             if (len1-cl_fftm_threshold1 >= prod_threshold)
 364               return true;
 365             if (len2-cl_fftm_threshold1 >= prod_threshold)
 366               return true;
 367             var uint32 hi;
 368             var uint32 lo;
 369             mulu32(len1-cl_fftm_threshold1,len2-cl_fftm_threshold1, hi=,lo=);
 370             if (hi > 0 || lo >= prod_threshold)
 371               return true;
 372           }
 373       return false;
 374     }
 375
 376 #if 0 // Doesn't seem to be worth the effort
 377
 378 // FFT-Multiplikation über den komplexen Zahlen.
 379 #include "base/digitseq/cl_DS_mul_fftc.h"
 380   // Multiplikation zweier N-Wort-Zahlen unter
 381   //  Linux mit einem i486 33 MHz
 382   //    N     kara/fftm  fftc   fftclong
 383   //   1000      0.35     1.52     0.94
 384   //   2500      0.98     7.6/8.4  4.7
 385   //   5000      2.2     18.2     10.2
 386   //  10000      4.7     34       22
 387   // Multiplikation zweier N-Wort-Zahlen unter
 388   //  Linux mit einem i586 90/100 MHz
 389   //    N     kara/fftm  fftc   fftclong
 390   //   1000      0.03     0.20   0.16
 391   //   2500      0.16     1.6    0.92
 392   //   5000      0.3      2.6    2.2
 393   //  10000      0.7      7.1    4.8
 394   //  25000      1.6    (50MB)  20.7(22MB)
 395   // Multiplikation zweier N-Wort-Zahlen unter
 396   //  Solaris, Sparc 20, 75 MHz
 397   //    N     kara/fftm  fftc
 398   //   1000      0.07     0.14
 399   //   2500      0.21     0.76
 400   //   5000      0.44     1.75
 401   //  10000      0.88     4.95
 402   //  25000      2.3     (15MB)
 403
 404 // FFT-Multiplikation über den komplexen Zahlen, Symmetrie ausnutzend.
 405 #include "base/digitseq/cl_DS_mul_fftcs.h"
 406   // Multiplikation zweier N-Wort-Zahlen unter
 407   //  Linux mit einem i486 33 MHz
 408   //    N     kara/fftm  fftcs  fftcslong
 409   //   1000      0.34     0.71     0.43
 410   //   2500      0.98     3.4      2.1
 411   //   5000      2.2      8.0      4.7
 412   //  10000      4.7     16.1     10.4
 413   // Multiplikation zweier N-Wort-Zahlen unter
 414   //  Solaris, Sparc 20, 75 MHz
 415   //    N     kara/fftm  fftcs
 416   //    300      0.010    0.012
 417   //    400      0.018    0.027
 418   //    500      0.023    0.027
 419   //    600      0.031    0.027
 420   //    700      0.031    0.027
 421   //    800      0.051    0.058
 422   //    900      0.064    0.059
 423   //   1000      0.069    0.059
 424   //   1250      0.088    0.13
 425   //   1500      0.088    0.13
 426   //   1750      0.088    0.13
 427   //   2000      0.19     0.13
 428   //   2500      0.19     0.29
 429   //   3000      0.19     0.33
 430   //   3500      0.20     0.31
 431   //   4000      0.37     0.70
 432   //   4500      0.38     0.70
 433   //   5000      0.43     0.69
 434   //   6000      0.43     0.69
 435   //   7000      0.43     1.62
 436   //   8000      0.88     1.60
 437   //   9000      0.88     1.6
 438   //  10000      0.90     1.55
 439   //  12000      0.89     4.7
 440   //  14000      0.90     5.2
 441   //  16000      1.43     5.2
 442
 443 #endif
 444
 445 #if 0 // Keine gute Fehlerabschätzung
 446
 447 // FFT-Multiplikation über den komplexen Zahlen, mit reellen Zahlen rechnend.
 448 #include "base/digitseq/cl_DS_mul_fftr.h"
 449   // Multiplikation zweier N-Wort-Zahlen unter
 450   //  Linux mit einem i486 33 MHz
 451   //    N     kara/fftm  fftr   fftrlong
 452   //   1000      0.34     0.64     0.40
 453   //   2500      0.98     3.5      2.0
 454   //   5000      2.2      7.2/7.7  4.6
 455   //  10000      4.7     16.6     10.0
 456
 457 #endif
 458
 459 //  int cl_mul_algo = 0;
 460   void cl_UDS_mul (const uintD* sourceptr1, uintC len1,
 461                    const uintD* sourceptr2, uintC len2,
 462                    uintD* destptr)
 463     { // len1<=len2 erzwingen:
 464       if (len1>len2)
 465         {{var const uintD* temp;
 466           temp = sourceptr1; sourceptr1 = sourceptr2; sourceptr2 = temp;
 467          }
 468          {var uintC temp;
 469           temp = len1; len1 = len2; len2 = temp;
 470         }}
 471       if (len1==1)
 472         // nur eine Einfachschleife
 473         { mulu_loop_lsp(lsprefnext(sourceptr1),sourceptr2,destptr,len2); }
 474       else
 475         {
 476 //          if (cl_mul_algo > 0)
 477 //            mulu_fftcs(sourceptr1,len1,sourceptr2,len2,destptr);
 478 //          else
 479 //          if (cl_mul_algo > 0)
 480 //            mulu_nussbaumer(sourceptr1,len1,sourceptr2,len2,destptr);
 481 //          else
 482           if (len1 < cl_karatsuba_threshold)
 483             // Multiplikation nach Schulmethode
 484             mulu_2loop(sourceptr1,len1,sourceptr2,len2,destptr);
 485           else // len1 groß
 486           if (!cl_fftm_suitable(len1,len2))
 487             // Karatsuba-Multiplikation
 488             // (ausgelagert, um die eigentliche Multiplikationsfunktion nicht
 489             // durch zu viele Registervariablen zu belasten):
 490             mulu_karatsuba(sourceptr1,len1,sourceptr2,len2,destptr);
 491           else
 492             //mulu_fft_modp(sourceptr1,len1,sourceptr2,len2,destptr);
 493             //mulu_nussbaumer(sourceptr1,len1,sourceptr2,len2,destptr);
 494             //mulu_fft_modp3(sourceptr1,len1,sourceptr2,len2,destptr);
 495             mulu_fft_modm(sourceptr1,len1,sourceptr2,len2,destptr);
 496           #ifdef DEBUG_MUL_XXX
 497           { // Check the correctness of an other multiplication algorithm:
 498             CL_ALLOCA_STACK;
 499             var uintD tmpprod_xxx = cl_alloc_array(uintD,len1+len2);
 500             mulu_xxx(sourceptr1,len1,sourceptr2,len2,arrayLSDptr(tmpprod_xxx,len1+len2));
 501             if (compare_loop_msp(destptr lspop (len1+len2),arrayMSDptr(tmpprod_xxx,len1+len2),len1+len2))
 502               throw runtime_exception();
 503           }
 504           #endif
 505         }
 506     }
 507
 508 // Special support for squaring.
 509 // Squaring takes approximately 69% of the time of a normal multiplication.
 510   #include "cl_DS_mul_kara_sqr.h" // defines mulu_karatsuba_square()
 511   void cl_UDS_mul_square (const uintD* sourceptr, uintC len,
 512                           uintD* destptr)
 513   { if (len==1)
 514       { var uintD digit = lspref(sourceptr,0);
 515         #if HAVE_DD
 516         var uintDD prod = muluD(digit,digit);
 517         lspref(destptr,0) = lowD(prod); lspref(destptr,1) = highD(prod);
 518         #else
 519         muluD(digit,digit, lspref(destptr,1)=,lspref(destptr,0)=);
 520         #endif
 521       }
 522     else
 523       { if (len < cl_karatsuba_threshold)
 524           mulu_2loop_square(sourceptr,len,destptr);
 525         else
 526           if (!(len >= cl_fftm_threshold))
 527             mulu_karatsuba_square(sourceptr,len,destptr);
 528           else
 529             mulu_fft_modm(sourceptr,len,sourceptr,len,destptr);
 530       }
 531   }
 532
 533 }  // namespace cln