src/base/digitseq/cl_2DS_div.cc

   1 // div2adic().
   2
   3 // General includes.
   4 #include "cl_sysdep.h"
   5
   6 // Specification.
   7 #include "cl_2DS.h"
   8
   9
  10 // Implementation.
  11
  12 #include "cl_2D.h"
  13 #include "cl_DS.h"
  14 #include "cln/exception.h"
  15
  16 namespace cln {
  17
  18 // Time for dividing a n word number by a n word number, this is the common
  19 // case and therefore the important one:
  20 // OS: Linux 2.2, intDsize==32,        OS: TRU64/4.0, intDsize==64,
  21 // Machine: P-III/450MHz               Machine: EV5/300MHz:
  22 //      n   standard  Newton             standard  Newton
  23 //      30   0.00002   0.00006            0.00004   0.00020
  24 //     100   0.00009   0.00045            0.00033   0.0015
  25 //     300   0.00069   0.0028             0.0028    0.0085
  26 //    1000   0.018     0.019              0.031     0.065
  27 //    2000   0.028     0.057              0.12      0.20
  28 //    3000   0.078     0.11  <-(~4500)    0.28      0.23  <-(~2700)
  29 //   10000   1.09      0.48               3.14      1.13
  30 //   30000  10.1       1.21              29.7       2.70
  31 // Time for dividing a 2*n word number by a n word number:
  32 // OS: Linux 2.2, intDsize==32,        OS: TRU64/4.0, intDsize==64,
  33 // Machine: P-III/450MHz               Machine: EV5/300MHz:
  34 //      n   standard  Newton             standard  Newton
  35 //      30   0.00004   0.00019            0.00013   0.00067
  36 //     100   0.00032   0.0014             0.0013    0.0046
  37 //     300   0.0027    0.0084             0.011     0.025
  38 //    1000   0.029     0.057              0.12      0.20
  39 //    2000   0.16      0.18  <-(~2400)    0.50      0.46  <-(~1800)
  40 //    3000   0.38      0.22               1.1       0.50
  41 //   10000   4.5       1.05              13.0       2.48
  42 //   30000  51.7       2.67             120.0       6.31
  43 //          Newton faster for:         Newton faster for:
  44 // 1.0*N / N  3300<N<3800, 4400<N        2700<N<3600, N<3800
  45 // 1.1*N / N  3100<N<3700, 4100<N        2450<N<3300, N<3450
  46 // 1.2*N / N  2850<N<3200, 3700<N        2250<N
  47 // 1.3*N / N  2650<N<3000, 3450<N        2050<N
  48 // 1.4*N / N  3400<N                     1850<N
  49 // 1.5*N / N  3100<N                     1750<N
  50 // 1.6*N / N  2850<N                     1650<N
  51 // 1.7*N / N  2650<N                     1600<N
  52 // 1.8*N / N  2550<N                     1500<N
  53 // 1.9*N / N  2450<N                     1400<N
  54 // 2.0*N / N  2400<N                     1350<N
  55 //
  56 // Break-even-point. When in doubt, prefer to choose the standard algorithm.
  57 #if CL_USE_GMP
  58   static inline cl_boolean cl_recip_suitable (uintC m, uintC n) // n <= m
  59     { if (n < 2000)
  60         return cl_false;
  61       else // when n >= 4400/(m/n)^2, i.e. (m/66)^2 > n
  62         { var uintC mq = floor(m,66);
  63           if ((mq >= bit(intCsize/2)) || (mq*mq > n))
  64             return cl_true;
  65           else
  66             return cl_false;
  67         }
  68     }
  69 #else
  70 // Use the old default values from CLN version <= 1.0.3 as a crude estimate.
  71 // They came from timings on a i486 33 MHz running Linux:
  72 // Divide N digits by N digits          Divide 2*N digits by N digits
  73 //     N   standard  Newton                 N   standard  Newton
  74 //     10    0.00015 0.00054                10    0.00023 0.00054
  75 //     25    0.00065 0.00256                25    0.00116 0.00256
  76 //     50    0.0024  0.0083                 50    0.0044  0.0082
  77 //    100    0.0089  0.027                 100    0.0172  0.027
  78 //    250    0.054   0.130                 250    0.107   0.130
  79 //    500    0.22    0.42                  500    0.425   0.42  <-(~500)
  80 //   1000    0.86    1.30                 1000    1.72    1.30
  81 //   2500    5.6     4.1  <-(~2070)       2500   11.0     4.1
  82 //   5000   22.3     9.4                  5000   44.7     9.3
  83 //  10000   91.2    20.6                 10000  182      20.5
  84 //
  85 // 1.0*N / N : Newton for N >= 2070 or 1790 >= N >= 1460
  86 // 1.1*N / N : Newton for N >= 1880 or 1790 >= N >= 1320
  87 // 1.2*N / N : Newton for N >= 1250
  88 // 1.3*N / N : Newton for N >= 1010
  89 // 1.4*N / N : Newton for N >=  940
  90 // 1.5*N / N : Newton for N >=  750
  91 // 1.6*N / N : Newton for N >=  625
  92 // 1.7*N / N : Newton for N >=  550
  93 // 1.8*N / N : Newton for N >=  500
  94 // 1.9*N / N : Newton for N >=  500
  95 // 2.0*N / N : Newton for N >=  500
  96   static inline cl_boolean cl_recip_suitable (uintC m, uintC n) // n <= m
  97     { if (n < 500)
  98         return cl_false;
  99       else // when n >= 2100/(m/n)^2, i.e. (m/46)^2 > n
 100         { var uintC mq = floor(m,46);
 101           if ((mq >= bit(intCsize/2)) || (mq*mq > n))
 102             return cl_true;
 103           else
 104             return cl_false;
 105         }
 106     }
 107 #endif
 108
 109 void div2adic (uintC a_len, const uintD* a_LSDptr, uintC b_len, const uintD* b_LSDptr, uintD* dest_LSDptr)
 110 {
 111   var uintC lendiff = a_len - b_len;
 112   if (cl_recip_suitable(a_len,b_len))
 113     { // Division using reciprocal (Newton-Hensel algorithm).
 114       CL_ALLOCA_STACK;
 115       // Bestimme Kehrwert c von b mod 2^(intDsize*b_len).
 116       var uintD* c_LSDptr;
 117       num_stack_alloc(b_len,,c_LSDptr=);
 118       recip2adic(b_len,b_LSDptr,c_LSDptr);
 119       // Bestimme q := a * c mod 2^(intDsize*b_len).
 120       var uintD* q_LSDptr;
 121       num_stack_alloc(2*b_len,,q_LSDptr=);
 122       cl_UDS_mul(a_LSDptr,b_len,c_LSDptr,b_len,q_LSDptr);
 123       // Zur Bestimmung des Restes wieder mit b multiplizieren:
 124       var uintD* p_LSDptr;
 125       num_stack_alloc(2*b_len,,p_LSDptr=);
 126       cl_UDS_mul(q_LSDptr,b_len,b_LSDptr,b_len,p_LSDptr);
 127       // Überprüfen, daß p == a mod 2^(intDsize*b_len):
 128       if (compare_loop_msp(a_LSDptr lspop b_len,p_LSDptr lspop b_len,b_len))
 129         throw runtime_exception();
 130       // Quotient q und "Rest" (a-b*q)/2^(intDsize*b_len) ablegen:
 131       copy_loop_lsp(q_LSDptr,dest_LSDptr,b_len);
 132       if (lendiff <= b_len)
 133         { sub_loop_lsp(a_LSDptr lspop b_len,p_LSDptr lspop b_len,dest_LSDptr lspop b_len,lendiff); }
 134         else
 135         { var uintD carry = sub_loop_lsp(a_LSDptr lspop b_len,p_LSDptr lspop b_len,dest_LSDptr lspop b_len,b_len);
 136           copy_loop_lsp(a_LSDptr lspop 2*b_len,dest_LSDptr lspop 2*b_len,lendiff-b_len);
 137           if (carry) { dec_loop_lsp(dest_LSDptr lspop 2*b_len,lendiff-b_len); }
 138         }
 139     }
 140     else
 141     { // Standard division.
 142       var uintD b0inv = div2adic(1,lspref(b_LSDptr,0)); // b'
 143       copy_loop_lsp(a_LSDptr,dest_LSDptr,a_len); // d := a
 144       do { var uintD digit = lspref(dest_LSDptr,0); // nächstes d[j]
 145            digit = mul2adic(b0inv,digit);
 146            // digit = nächstes c[j]
 147            if (a_len <= b_len)
 148              { mulusub_loop_lsp(digit,b_LSDptr,dest_LSDptr,a_len); } // d := d - b * c[j] * beta^j
 149              else
 150              // a_len > b_len, b wird als durch Nullen fortgesetzt gedacht.
 151              { var uintD carry = mulusub_loop_lsp(digit,b_LSDptr,dest_LSDptr,b_len);
 152                if (lspref(dest_LSDptr,b_len) >= carry)
 153                  { lspref(dest_LSDptr,b_len) -= carry; }
 154                else
 155                  { lspref(dest_LSDptr,b_len) -= carry;
 156                    dec_loop_lsp(dest_LSDptr lspop (b_len+1),a_len-(b_len+1));
 157              }   }
 158            // Nun ist lspref(dest_LSDptr,0) = 0.
 159            lspref(dest_LSDptr,0) = digit; // c[j] ablegen
 160            lsshrink(dest_LSDptr); a_len--; // nächstes j
 161          }
 162          until (a_len==lendiff);
 163     }
 164 }
 165 // Bit complexity (N = max(a_len,b_len)): O(M(N)).
 166
 167 }  // namespace cln