src/base/digitseq/cl_DS_mul.cc

   1 // cl_UDS_mul().
   2
   3 // General includes.
   4 #include "cl_sysdep.h"
   5
   6 // Specification.
   7 #include "cl_DS.h"
   8
   9
  10 // Implementation.
  11
  12 #include "cl_low.h"
  13 #include "cl_malloc.h"
  14 #include "cl_abort.h"
  15
  16 // Multiplikations-Doppelschleife:
  17 // Multipliziert zwei UDS und legt das Ergebnis in einer dritten UDS ab.
  18 // cl_UDS_mul(sourceptr1,len1,sourceptr2,len2,destptr);
  19 // multipliziert die UDS  sourceptr1[-len1..-1]  (len1>0)
  20 //           mit der UDS  sourceptr2[-len1..-1]  (len2>0)
  21 // und legt das Ergebnis in der UDS  destptr[-len..-1]  (len=len1+len2) ab.
  22 // Unterhalb von destptr werden len Digits Platz benötigt.
  23   void cl_UDS_mul (const uintD* sourceptr1, uintC len1,
  24                    const uintD* sourceptr2, uintC len2,
  25                    uintD* destptr);
  26 // Spezialfall sourceptr1 == sourceptr2 && len1 == len2.
  27   void cl_UDS_mul_square (const uintD* sourceptr, uintC len,
  28                           uintD* destptr);
  29
  30 // Multiplikation nach Schulmethode:
  31   static inline void mulu_2loop (const uintD* sourceptr1, uintC len1,
  32                                  const uintD* sourceptr2, uintC len2,
  33                                  uintD* destptr)
  34   { // Es ist 2 <= len1 <= len2.
  35     // Erster Schleifendurchlauf:
  36     mulu_loop_lsp(lsprefnext(sourceptr1),sourceptr2,destptr,len2);
  37     lsshrink(destptr);
  38     var uintD* destptr2 = destptr lspop len2;
  39     // äußere Schleife läuft über source1 :
  40     dotimespC(len1,len1-1,
  41       { // innere Schleife läuft über source2 :
  42         var uintD carry =
  43           muluadd_loop_lsp(lsprefnext(sourceptr1),sourceptr2,destptr,len2);
  44         lsprefnext(destptr2) = carry; // UDS um das Carry-Digit verlängern
  45         lsshrink(destptr);
  46       });
  47   }
  48   static inline void mulu_2loop_square (const uintD* sourceptr, uintC len,
  49                                         uintD* destptr)
  50   { // Es ist 2 <= len.
  51     // Gemischte Produkte:
  52     #if 0
  53     // 2*(  x[1] * x[0..0] * b^1
  54     //    + x[2] * x[1..0] * b^2
  55     //    + ...
  56     //    + x[n-1] * x[n-2..0]*b^(n-1))
  57     { var const uintD* sourceptr1 = sourceptr lspop 1;
  58       var uintD* destptr1 = destptr;
  59       lsprefnext(destptr1) = 0;
  60       var uintD* destptr2 = destptr1;
  61       var uintC count;
  62       for (count = 1; count < len; count++)
  63         { // sourceptr1 = sourceptr lspop count, destptr1 = destptr lspop count,
  64           // destptr2 = destptr lspop (2*count-1).
  65           lsprefnext(destptr2) = 0;
  66           var uintD carry =
  67             muluadd_loop_lsp(lsprefnext(sourceptr1),sourceptr,destptr1,count);
  68           lsprefnext(destptr2) = carry;
  69           destptr1 = destptr1 lspop 1;
  70         }
  71       { var uintD carry = shift1left_loop_lsp(destptr lspop 1,2*len-2);
  72         lspref(destptr2,0) = (carry==0 ? 0 : 1);
  73     } }
  74     #else
  75     // 2*(  x[n-1..1] * x[0] * b^1
  76     //    + x[n-1..2] * x[1] * b^3
  77     //    + ...
  78     //    + x[n-1..n-1] * x[n-2] * b^(2*n-3))
  79     { var const uintD* sourceptr1 = sourceptr;
  80       var uintD* destptr2 = destptr;
  81       lsprefnext(destptr2) = 0;
  82       var uintC count = len-1;
  83       { var uintD digit = lsprefnext(sourceptr1);
  84         mulu_loop_lsp(digit,sourceptr1,destptr2,count);
  85       }
  86       var uintD* destptr1 = destptr lspop (len+1);
  87       while (--count > 0)
  88         { destptr2 = destptr2 lspop 2;
  89           var uintD digit = lsprefnext(sourceptr1);
  90           var uintD carry = muluadd_loop_lsp(digit,sourceptr1,destptr2,count);
  91           lsprefnext(destptr1) = carry;
  92         }
  93       { var uintD carry = shift1left_loop_lsp(destptr lspop 1,2*len-2);
  94         lspref(destptr1,0) = (carry==0 ? 0 : 1);
  95     } }
  96     #endif
  97     // Quadrate:
  98     len = 2*len;
  99     do { len -= 2;
 100          var uintD digit = lsprefnext(sourceptr);
 101          #if HAVE_DD
 102          var uintDD prod = muluD(digit,digit);
 103          var uintDD accu = highlowDD(lspref(destptr,1),lspref(destptr,0));
 104          accu += prod;
 105          lspref(destptr,0) = lowD(accu); lspref(destptr,1) = highD(accu);
 106          destptr = destptr lspop 2;
 107          if (accu < prod) { inc_loop_lsp(destptr,len); }
 108          #else
 109          var uintD hi;
 110          var uintD lo;
 111          muluD(digit,digit, hi=,lo=);
 112          var uintD tmp;
 113          tmp = lspref(destptr,0) + lo; lspref(destptr,0) = tmp;
 114          if (tmp < lo) hi++;
 115          tmp = lspref(destptr,1) + hi; lspref(destptr,1) = tmp;
 116          destptr = destptr lspop 2;
 117          if (tmp < hi) { inc_loop_lsp(destptr,len); }
 118          #endif
 119        } while (len > 0);
 120   }
 121
 122 // Karatsuba-Multiplikation: O(n^(log 3 / log 2))
 123   static void mulu_karatsuba (const uintD* sourceptr1, uintC len1,
 124                               const uintD* sourceptr2, uintC len2,
 125                               uintD* destptr);
 126   static void mulu_karatsuba_square (const uintD* sourceptr, uintC len,
 127                                      uintD* destptr);
 128 #include "cl_DS_mul_kara.h"
 129   // karatsuba_threshold = Länge, ab der die Karatsuba-Multiplikation bevorzugt
 130   // wird. Der Break-Even-Point bestimmt sich aus Zeitmessungen.
 131   // Als Test dient (progn (time (! 5000)) nil), das viele kleine und einige
 132   // ganz große Multiplikationen durchführt. Miß die Runtime.
 133   // Unter Linux mit einem 80486:      Auf einer Sparc 2:
 134   // threshold  time in 0.01 sec.
 135   //      5        125   127
 136   //      6        116   117
 137   //      7        107   110
 138   //      8        101   103
 139   //      9         99   102
 140   //     10         98   100
 141   //     11         97   100
 142   //     12         96    99
 143   //     13         97    99
 144   //     14         97   100
 145   //     15         97    99
 146   //     16         98   100
 147   //     17         98   100
 148   //     18         98   100
 149   //     19         98   101
 150   //     20         99   102
 151   //     25        103   105
 152   //     30        109   111
 153   //     40        115   118
 154   //     50        122   125
 155   //     70        132   134
 156   //    100        151   152
 157   //    150        164   167
 158   //    250        183   187
 159   //    500        203   205
 160   //   1000        203   205
 161   //             (clisp)(cln)
 162   // Das Optimum scheint bei karatsuba_threshold = 12 zu liegen.
 163   // Da das Optimum aber vom Verhältnis
 164   //         Zeit für uintD-Multiplikation / Zeit für uintD-Addition
 165   // abhängt und die gemessenen Zeiten auf eine Unterschreitung des Optimums
 166   // empfindlicher reagieren als auf eine Überschreitung des Optimums,
 167   // sind wir vorsichtig und wählen einen Wert etwas über dem Optimum:
 168   const unsigned int cl_karatsuba_threshold = 16;
 169
 170 #if 0 // Lohnt sich nicht
 171
 172 // FFT-Multiplikation nach Nussbaumer: O(n log n log log n)
 173 #include "cl_DS_mul_nuss.h"
 174   // nuss_threshold = Länge, ab der die Nussbaumer-Multiplikation bevorzugt
 175   // wird. Der Break-Even-Point bestimmt sich aus Zeitmessungen.
 176   // Multiplikation zweier N-Wort-Zahlen unter Linux mit einem 80486:
 177   //    N     kara   nuss  nuss-asm  (time in sec.)
 178   //   1000    0.36   1.05   0.70
 179   //   5000    4.69  10.0    6.71
 180   //  25000   61.6   62.7   40.2
 181   //  32500   91.8   62.7   40.3
 182   //  35000  102.7  124.7   80.4
 183   //  50000  185    132     85.2
 184   int cl_nuss_threshold = 1000000;
 185
 186 // FFT-Multiplikation in Z/pZ: O(n log n log log n)
 187 #include "cl_DS_mul_fftp.h"
 188   // fftp_threshold = Länge, ab der die FFT-Multiplikation mod p bevorzugt
 189   // wird. Der Break-Even-Point bestimmt sich aus Zeitmessungen.
 190   // Multiplikation zweier N-Wort-Zahlen unter Linux mit einem 80486:
 191   //    N     kara   fftp  (time in sec.)
 192   //   1000    0.36   1.57
 193   //   5000    4.66  14.86
 194   //  25000   61.1   75.0
 195   //  32500   90.8   75.5
 196   //  35000  101.6  150.1
 197   //  50000  183    160
 198   int cl_fftp_threshold = 1000000;
 199
 200 // FFT-Multiplikation in Z/pZ: O(n log n log log n)
 201 // für drei verschiedene Primzahlen p1,p2,p3 < 2^32.
 202 #include "cl_DS_mul_fftp3.h"
 203   // fftp3_threshold = Länge, ab der die FFT-Multiplikation mod p_i bevorzugt
 204   // wird. Der Break-Even-Point bestimmt sich aus Zeitmessungen.
 205   // Multiplikation zweier N-Wort-Zahlen unter Linux mit einem 80486:
 206   //    N     kara   fftp3  fftp  (time in sec.)
 207   //   1000    0.36   0.59   1.57
 208   //   5000    4.66   5.44  14.89
 209   //  10000   13.98  11.91  32.43
 210   //  25000   61.1   27.4   75.4
 211   //  32500   90.5   28.1   75.5
 212   //  35000  101.4   54.8  150.4
 213   //  50000  183     58.9  161.6
 214   int cl_fftp3_threshold = 1000000;
 215
 216 // FFT-Multiplikation in Z/pZ: O(n log n log log n)
 217 // für drei verschiedene Primzahlen p1,p2,p3 < 2^32,
 218 // mit Montgomery-Multiplikation.
 219 #include "cl_DS_mul_fftp3m.h"
 220   // fftp3_threshold = Länge, ab der die FFT-Multiplikation mod p_i bevorzugt
 221   // wird. Der Break-Even-Point bestimmt sich aus Zeitmessungen.
 222   // Multiplikation zweier N-Wort-Zahlen unter
 223   // Linux mit einem 80486, 33 MHz, mit Benutzung der GMP-Low-Level-Funktionen:
 224   //    N     kara    fftm  fftp3m  fftp3  fftp  (time in sec.)
 225   //   1000    0.35   0.49   0.54   0.59   1.58
 226   //   2500    1.48   0.97   2.34   2.52   6.99
 227   //   5000    4.43   2.19   5.08   5.48  15.16
 228   //  10000   13.33   4.68  10.93  11.82  32.94
 229   //  25000   58.5   12.0   25.3   27.4   77.0
 230   //  32500   86.0   25.0   26.1   28.0   77.3
 231   //  35000   96.5   25.0   50.8   54.9  152.8
 232   //  50000  176     25.2   54.2   58.5  163.4
 233   // und auf einer SPARC 20 mit 75 MHz, ohne GMP-Low-Level-Funktionen:
 234   //    N     kara    fftm  fftp3m  fftp3  fftp  (time in sec.)
 235   //   1000    0.076  0.096  0.113  0.233  0.415
 236   //   2500    0.32   0.21   0.48   1.03   1.82
 237   //   5000    0.97   0.51   1.03   2.22   3.96
 238   //  10000    2.99   1.03   2.23   4.72   8.59
 239   //  25000   13.22   2.73   4.99  10.78  19.73
 240   //  32500   19.3    5.7    5.2   10.9   19.7
 241   //  35000   21.5    5.9   10.0   21.7   39.4
 242   //  50000   39.5    6.0   11.3   23.1   42.7
 243   int cl_fftp3m_threshold = 1000000;
 244
 245 #endif
 246
 247 // FFT-Multiplikation in Z/pZ: O(n^1.29)
 248 #include "cl_DS_mul_fftm.h"
 249   // fftm_threshold = Länge, ab der die FFT-Multiplikation mod m bevorzugt
 250   // wird. Der Break-Even-Point bestimmt sich aus Zeitmessungen.
 251   // Multiplikation zweier N-Wort-Zahlen unter
 252   //  Linux mit einem 80486:               Solaris, Sparc 10/20:
 253   //    N     kara   fftm  (time in sec.)    kara   fftm
 254   //   1000    0.36   0.54                    0.08   0.10
 255   //   5000    4.66   2.48                    1.01   0.51
 256   //  25000   61.1   13.22                   13.23   2.73
 257   //  32500   91.0   27.5                    20.0    5.8
 258   //  35000  102.1   27.5                    21.5    5.6
 259   //  50000  183     27.6                    40.7    5.6
 260   // Multiplikation zweier N-Wort-Zahlen unter
 261   //  Linux mit einem 80486:               Solaris, Sparc 10/20:
 262   //    N     kara   fftm  (time in sec.)    kara   fftm
 263   //   1000    0.36   0.54                    0.08   0.10
 264   //   1260    0.52   0.50                    0.11   0.10
 265   //   1590    0.79   0.51                    0.16   0.10
 266   //   2000    1.09   1.07                    0.23   0.21
 267   //   2520    1.57   1.08                    0.33   0.21
 268   //   3180    2.32   1.08                    0.50   0.21
 269   //   4000    3.29   2.22                    0.70   0.41
 270   //   5040    4.74   2.44                    0.99   0.50
 271   //    N1    N2    kara   fftm  (time in sec.)    kara   fftm
 272   //   1250  1250    0.51   0.50                    0.11   0.10
 273   //   1250  1580    0.70   0.50                    0.15   0.10
 274   //   1250  2000    0.89   0.51                    0.18   0.10
 275   //   1250  2250    0.99   0.51                    0.21   0.10
 276   //   1250  2500    1.08   1.03     <---           0.22   0.21
 277   //   1250  2800    1.20   1.07                    0.26   0.21
 278   //   1250  3100    1.35   1.07                    0.28   0.21
 279   // Es gibt also noch Werte von (len1,len2) mit 1250 <= len1 <= len2, bei
 280   // denen "kara" schneller ist als "fftm", aber nicht viele und dort auch
 281   // nur um 5%. Darum wählen wir ab hier die FFT-Multiplikation.
 282   const unsigned int cl_fftm_threshold = 1250; // muß stets >= 6 sein (sonst Endlosrekursion!)
 283   // This is the threshold for multiplication of equally sized factors.
 284   // When the lengths differ much, the threshold varies:
 285   //   len2 = 3000   len1 >= 800
 286   //   len2 = 3500   len1 >= 700
 287   //   len2 = 4000   len1 >= 580
 288   //   len2 = 4500   len1 >= 430
 289   //   len2 = 5000   len1 >= 370
 290   //   len2 = 5500   len1 >= 320
 291   //   len2 = 6000   len1 >= 500
 292   //   len2 = 7000   len1 >= 370
 293   //   len2 = 8000   len1 >= 330
 294   //   len2 = 9000   len1 >= 420
 295   //   len2 =10000   len1 >= 370
 296   //   len2 =11000   len1 >= 330
 297   //   len2 =12000   len1 >= 330
 298   //   len2 =13000   len1 >= 350
 299   // Let's choose the following condition:
 300   const unsigned int cl_fftm_threshold1 = 330;
 301   const unsigned int cl_fftm_threshold2 = 2*cl_fftm_threshold;
 302   //   len1 > cl_fftm_threshold1 && len2 > cl_fftm_threshold2
 303   //   && len1 >= cl_fftm_threshold1 + cl_fftm_threshold/(len2-cl_fftm_threshold1)*(cl_fftm_threshold-cl_fftm_threshold1).
 304   static inline cl_boolean cl_fftm_suitable (uintL len1, uintL len2)
 305     { if (len1 >= cl_fftm_threshold)
 306         return cl_true;
 307       if (len1 > cl_fftm_threshold1)
 308         if (len2 > cl_fftm_threshold2)
 309           { var uint32 hi;
 310             var uint32 lo;
 311             mulu32(len1-cl_fftm_threshold1,len2-cl_fftm_threshold1, hi=,lo=);
 312             if (hi > 0 || lo >= cl_fftm_threshold*(cl_fftm_threshold-cl_fftm_threshold1))
 313               return cl_true;
 314           }
 315       return cl_false;
 316     }
 317
 318 #if 0 // Lohnt sich nicht
 319
 320 // FFT-Multiplikation über den komplexen Zahlen.
 321 #include "cl_DS_mul_fftc.h"
 322   // Multiplikation zweier N-Wort-Zahlen unter
 323   //  Linux mit einem i486 33 MHz
 324   //    N     kara/fftm  fftc   fftclong
 325   //   1000      0.35     1.52     0.94
 326   //   2500      0.98     7.6/8.4  4.7
 327   //   5000      2.2     18.2     10.2
 328   //  10000      4.7     34       22
 329   // Multiplikation zweier N-Wort-Zahlen unter
 330   //  Linux mit einem i586 90/100 MHz
 331   //    N     kara/fftm  fftc   fftclong
 332   //   1000      0.03     0.20   0.16
 333   //   2500      0.16     1.6    0.92
 334   //   5000      0.3      2.6    2.2
 335   //  10000      0.7      7.1    4.8
 336   //  25000      1.6    (50MB)  20.7(22MB)
 337   // Multiplikation zweier N-Wort-Zahlen unter
 338   //  Solaris, Sparc 20, 75 MHz
 339   //    N     kara/fftm  fftc
 340   //   1000      0.07     0.14
 341   //   2500      0.21     0.76
 342   //   5000      0.44     1.75
 343   //  10000      0.88     4.95
 344   //  25000      2.3     (15MB)
 345
 346 // FFT-Multiplikation über den komplexen Zahlen, Symmetrie ausnutzend.
 347 #include "cl_DS_mul_fftcs.h"
 348   // Multiplikation zweier N-Wort-Zahlen unter
 349   //  Linux mit einem i486 33 MHz
 350   //    N     kara/fftm  fftcs  fftcslong
 351   //   1000      0.34     0.71     0.43
 352   //   2500      0.98     3.4      2.1
 353   //   5000      2.2      8.0      4.7
 354   //  10000      4.7     16.1     10.4
 355   // Multiplikation zweier N-Wort-Zahlen unter
 356   //  Solaris, Sparc 20, 75 MHz
 357   //    N     kara/fftm  fftcs
 358   //    300      0.010    0.012
 359   //    400      0.018    0.027
 360   //    500      0.023    0.027
 361   //    600      0.031    0.027
 362   //    700      0.031    0.027
 363   //    800      0.051    0.058
 364   //    900      0.064    0.059
 365   //   1000      0.069    0.059
 366   //   1250      0.088    0.13
 367   //   1500      0.088    0.13
 368   //   1750      0.088    0.13
 369   //   2000      0.19     0.13
 370   //   2500      0.19     0.29
 371   //   3000      0.19     0.33
 372   //   3500      0.20     0.31
 373   //   4000      0.37     0.70
 374   //   4500      0.38     0.70
 375   //   5000      0.43     0.69
 376   //   6000      0.43     0.69
 377   //   7000      0.43     1.62
 378   //   8000      0.88     1.60
 379   //   9000      0.88     1.6
 380   //  10000      0.90     1.55
 381   //  12000      0.89     4.7
 382   //  14000      0.90     5.2
 383   //  16000      1.43     5.2
 384
 385 #endif
 386
 387 #if 0 // Keine gute Fehlerabschätzung
 388
 389 // FFT-Multiplikation über den komplexen Zahlen, mit reellen Zahlen rechnend.
 390 #include "cl_DS_mul_fftr.h"
 391   // Multiplikation zweier N-Wort-Zahlen unter
 392   //  Linux mit einem i486 33 MHz
 393   //    N     kara/fftm  fftr   fftrlong
 394   //   1000      0.34     0.64     0.40
 395   //   2500      0.98     3.5      2.0
 396   //   5000      2.2      7.2/7.7  4.6
 397   //  10000      4.7     16.6     10.0
 398
 399 #endif
 400
 401 //  int cl_mul_algo = 0;
 402   void cl_UDS_mul (const uintD* sourceptr1, uintC len1,
 403                    const uintD* sourceptr2, uintC len2,
 404                    uintD* destptr)
 405     { // len1<=len2 erzwingen:
 406       if (len1>len2)
 407         {{var const uintD* temp;
 408           temp = sourceptr1; sourceptr1 = sourceptr2; sourceptr2 = temp;
 409          }
 410          {var uintC temp;
 411           temp = len1; len1 = len2; len2 = temp;
 412         }}
 413       if (len1==1)
 414         // nur eine Einfachschleife
 415         { mulu_loop_lsp(lsprefnext(sourceptr1),sourceptr2,destptr,len2); }
 416       else
 417         {
 418 //          if (cl_mul_algo > 0)
 419 //            mulu_fftcs(sourceptr1,len1,sourceptr2,len2,destptr);
 420 //          else
 421 //          if (cl_mul_algo > 0)
 422 //            mulu_nussbaumer(sourceptr1,len1,sourceptr2,len2,destptr);
 423 //          else
 424           if (len1 < cl_karatsuba_threshold)
 425             // Multiplikation nach Schulmethode
 426             mulu_2loop(sourceptr1,len1,sourceptr2,len2,destptr);
 427           else // len1 groß
 428           if (!cl_fftm_suitable(len1,len2))
 429             // Karatsuba-Multiplikation
 430             // (ausgelagert, um die eigentliche Multiplikationsfunktion nicht
 431             // durch zu viele Registervariablen zu belasten):
 432             mulu_karatsuba(sourceptr1,len1,sourceptr2,len2,destptr);
 433           else
 434             //mulu_fft_modp(sourceptr1,len1,sourceptr2,len2,destptr);
 435             //mulu_nussbaumer(sourceptr1,len1,sourceptr2,len2,destptr);
 436             //mulu_fft_modp3(sourceptr1,len1,sourceptr2,len2,destptr);
 437             mulu_fft_modm(sourceptr1,len1,sourceptr2,len2,destptr);
 438           #ifdef DEBUG_MUL_XXX
 439           { // Check the correctness of an other multiplication algorithm:
 440             CL_ALLOCA_STACK;
 441             var uintD tmpprod_xxx = cl_alloc_array(uintD,len1+len2);
 442             mulu_xxx(sourceptr1,len1,sourceptr2,len2,arrayLSDptr(tmpprod_xxx,len1+len2));
 443             if (compare_loop_msp(destptr lspop (len1+len2),arrayMSDptr(tmpprod_xxx,len1+len2),len1+len2))
 444               cl_abort();
 445           }
 446           #endif
 447         }
 448     }
 449
 450 // Special support for squaring.
 451 // Squaring takes approximately 69% of the time of a normal multiplication.
 452   #include "cl_DS_mul_kara_sqr.h" // defines mulu_karatsuba_square()
 453   void cl_UDS_mul_square (const uintD* sourceptr, uintC len,
 454                           uintD* destptr)
 455   { if (len==1)
 456       { var uintD digit = lspref(sourceptr,0);
 457         #if HAVE_DD
 458         var uintDD prod = muluD(digit,digit);
 459         lspref(destptr,0) = lowD(prod); lspref(destptr,1) = highD(prod);
 460         #else
 461         muluD(digit,digit, lspref(destptr,1)=,lspref(destptr,0)=);
 462         #endif
 463       }
 464     else
 465       { if (len < cl_karatsuba_threshold)
 466           mulu_2loop_square(sourceptr,len,destptr);
 467         else
 468           if (!(len >= cl_fftm_threshold))
 469             mulu_karatsuba_square(sourceptr,len,destptr);
 470           else
 471             mulu_fft_modm(sourceptr,len,sourceptr,len,destptr);
 472       }
 473   }