src/base/digitseq/cl_DS_sqrt.cc

   1 // cl_UDS_sqrt().
   2
   3 // General includes.
   4 #include "cl_sysdep.h"
   5
   6 // Specification.
   7 #include "cl_DS.h"
   8
   9
  10 // Implementation.
  11
  12 #include "cl_low.h"
  13 #include "cln/abort.h"
  14
  15 namespace cln {
  16
  17 // We observe the following timings:
  18 // Time for square root of a_len = 2*N by b_len = N digits,
  19 // OS: Linux 2.2, intDsize==32,        OS: TRU64/4.0, intDsize==64,
  20 // Machine: P-III/450MHz               Machine: EV5/300MHz:
  21 //      N   standard  Newton           standard  Newton
  22 //      30   0.00002   0.00009          0.00011   0.00027
  23 //     100   0.00012   0.00052          0.00057   0.0017
  24 //     300   0.00087   0.0031           0.0037    0.0091
  25 //    1000   0.0089    0.020            0.037     0.069
  26 //    3000   0.087     0.11  <-(~3200)  0.30      0.28  <- (~2750)
  27 //   10000   1.27      0.55             3.5       1.3
  28 //   30000  12.7       1.35            31.1       3.4
  29 // Newton faster for 3200<N            Newton faster for 2750<N
  30 // When in doubt, prefer to choose the standard algorithm.
  31   static inline cl_boolean cl_recipsqrt_suitable (uintL n) {
  32     // True iff a square root yielding n digits has reached the Newton crossover.
  33 #if CL_USE_GMP
  34     const uintL newton_min_digits = 3200; // crossover from the timings above
  35 #else
  36     // No GMP: reuse the defaults of CLN version <= 1.0.3 as a crude estimate.
  37     // Square root of a_len = 2*N by b_len = N digits, i486 33 MHz, Linux:
  38     //      N   standard  Newton
  39     //      10    0.00022 0.00132
  40     //      25    0.00082 0.0047
  41     //      50    0.0026  0.0130
  42     //     100    0.0095  0.038
  43     //     250    0.057   0.154
  44     //     500    0.22    0.46
  45     //    1000    0.90    1.39
  46     //    2500    6.0     4.6
  47     //    5000   24.1    10.7
  48     //   10000   98      23.2
  49     //   -----> Newton faster for 1570 <= N <= 1790 and for N >= 2100.
  50     const uintL newton_min_digits = 2100;
  51 #endif
  52     return (cl_boolean)(n >= newton_min_digits); }
  53
  54 // Computes the square root of an unsigned digit sequence a
  55 // (more precisely: the floor of the square root of a).
  56 // squarep = cl_UDS_sqrt(a_MSDptr,a_len,a_LSDptr, &b);
  57 // > a_MSDptr/a_len/a_LSDptr: a UDS
  58 // < NUDS b: floor of the square root of a
  59 // < squarep: cl_true if a = b^2, cl_false if b^2 < a < (b+1)^2.
  60 // Method:
  61 // First normalize A. A=0 --> B=0, done.
  62 // Choose n such that beta^(2n-2) <= A < beta^(2n).
  63 // Choose s (0<=s<intDsize) such that beta^(2n)/4 <= A*2^(2s) < beta^(2n).
  64 // Set A:=A*2^(2s), copying A in the process. Find B=floor(sqrt(A)).
  65 // Make room for B=[0,b[n-1],...,b[0]] (with room for one zero digit in front,
  66 // because 2*B rather than B will be stored there).
  67 // In the places [a[2n-1],...,a[2n-2j]] the difference
  68 // [a[2n-1],...,a[2n-2j]] - [b[n-1],...,b[n-j]] ^ 2 is stored.
  69 // Determine b[n-1] = floor(sqrt(a[2n-1]*beta+a[2n-2])) with Heron/Newton:
  70 //   {x:=beta as the preceding initial value, then:}
  71 //   x := floor((beta+a[2n-1])/2)
  72 //   repeat: d:=floor((a[2n-1]*beta+a[2n-2])/x).
  73 //           If d<beta (no overflow) and d<x,
  74 //             set x:=floor((x+d)/2), and iterate again.
  75 //   b[n-1]:=x. Store it in B shifted left by one bit.
  76 // {Because a[2n-1]>=beta/4, we have b[n-1]>=beta/2.}
  77 // Decrease [a[2n-1],a[2n-2]] by b[n-1]^2.
  78 // For j=1,...,n:
  79 //   {Here [b[n-1],...,b[n-j]] = floor(sqrt(old [a[2n-1],...,a[2n-2j]])),
  80 //     and [a[2n-1],...,a[2n-2j]] now holds the remainder
  81 //     [a[2n-1],...,a[2n-2j]] - [b[n-1],...,b[n-j]]^2; it is >=0
  82 //     and <= 2 * [b[n-1],...,b[n-j]], hence occupies at most j digits and 1 bit.
  83 //     Therefore only [a[2n-j],...,a[2n-2j]] matter.}
  84 //   For j<n: Determine the next digit:
  85 //     b* := min(beta-1,floor([a[2n-j],...,a[2n-2j-1]]/(2*[b[n-1],...,b[n-j]]))).
  86 //     and [a[2n-j],...,a[2n-2j-1]] :=
  87 //         [a[2n-j],...,a[2n-2j-1]] - b* * 2 * [b[n-1],...,b[n-j]] (>= 0).
  88 //     In detail:
  89 //       b* := min(beta-1,floor([a[2n-j],a[2n-j-1],a[2n-j-2]]/(2*b[n-1]))),
  90 //       decrease [a[2n-j],...,a[2n-2j-1]] as indicated.
  91 //       As long as the difference is <0, set b* := b* - 1 and
  92 //         increase [a[2n-j],...,a[2n-2j-1]] by 2 * [b[n-1],...,b[n-j]].
  93 //     Decrease [a[2n-j],...,a[2n-2j-2]] by b* ^ 2.
  94 //     If a negative carry occurs in doing so,
  95 //       then set b* := b* - 1,
  96 //          set b[n-j-1] := b* (shifted left by 1 bit in memory),
  97 //          increase [a[2n-j],...,a[2n-2j-2]] by 2*[b[n-1],...,b[n-j-1]]+1.
  98 //       Otherwise set b[n-j-1] := b* (shifted left by 1 bit in memory).
  99 //     Next j.
 100 //   For j=n:
 101 //     If [a[n],...,a[0]] = [0,...,0], the root is exact, otherwise not.
 102 //     The result is [b[n-1],...,b[0]] * 2^(-s), so shift
 103 //       [b[n],...,b[0]] right by s+1 bits in memory.
 104 //     The result is a NUDS of length n.
 105 cl_boolean cl_UDS_sqrt (const uintD* a_MSDptr, uintC a_len, const uintD* a_LSDptr, DS* b_)
 106 {
           // Purpose: integer square root of the unsigned digit sequence
           // a_MSDptr/a_len/a_LSDptr.
           // The root is stored through b_ (MSDptr, len and LSDptr are written);
           // b_->MSDptr is also read on entry as the place where the result goes.
           // NOTE(review): presumably the caller must provide enough room there
           // (the standard path addresses n+1 digits relative to it) - confirm.
           // Returns cl_true if the remainder is zero (a is a perfect square),
           // cl_false otherwise.
           // Two paths: when cl_recipsqrt_suitable(n) holds, a reciprocal square
           // root approximation followed by a correction step; otherwise the
           // root is built one digit per iteration of the main loop.
 107       // Normalize A (strip leading zero digits):
 108       while ((a_len>0) && (mspref(a_MSDptr,0)==0)) { msshrink(a_MSDptr); a_len--; }
 109       if (a_len==0) // A=0 -> B := NUDS 0
 110         { b_->LSDptr = b_->MSDptr; b_->len = 0; return cl_true; }
 111       CL_ALLOCA_STACK;
 112       // Determine n and s:
 113       var uintC n = ceiling(a_len,2); // a_len = 2n or 2n-1, n>0.
 114       var uintL s;
 115       { var uintD msd = mspref(a_MSDptr,0); // a[2n] or a[2n-1], respectively
 116         #if 0
 117         s = 0;
 118         while /* ((msd & (bit(intDsize-1)|bit(intDsize-2))) ==0) */
 119               (((sintD)msd >= 0) && ((sintD)(msd<<1) >= 0))
 120           { msd = msd<<2; s++; }
 121         #else
 122         integerlengthD(msd, s = intDsize - ); s = s>>1;
 123         #endif
 124       }
 125       // So far s is only determined modulo intDsize/2.
 126       // Copy A, shifted left by 2s bits:
 127       var uintD* new_a_MSDptr;
 128       { var uintD* new_a_LSDptr;
 129         num_stack_alloc(2*(uintL)n,new_a_MSDptr=,new_a_LSDptr=); // allocate room for 2n digits
 130        {var uintL shiftcount = 2*s;
 131         if (!((a_len & bit(0)) ==0)) // a_len odd?
 132           { s += intDsize/2; lsprefnext(new_a_LSDptr) = 0; } // yes -> insert a zero digit
 133         if (shiftcount==0)
 134           { copy_loop_lsp(a_LSDptr,new_a_LSDptr,a_len); }
 135           else
 136           { shiftleftcopy_loop_lsp(a_LSDptr,new_a_LSDptr,a_len,shiftcount); }
 137       }}
 138       #define a_MSDptr  new_a_MSDptr
           // From here on the name a_MSDptr refers to the shifted working copy.
           // NOTE(review): this macro is not #undef'd anywhere in the visible source.
 139       // Now A = a_MSDptr/2n/..
 140       if (cl_recipsqrt_suitable(n))
 141         { // Compute C := 1/sqrt(A) and then D := A*C approximately.
 142           // Correct D if necessary; this yields B.
               // (The cl_abort() calls in this branch fire when a correction step
               // does not behave as the bounds stated in the comments predict -
               // presumably these are internal consistency checks.)
 143           var uintD* c_MSDptr;
 144           var uintD* c_LSDptr;
 145           var uintD* d_MSDptr;
 146           var uintD* d_LSDptr;
 147           var uintD* d2_MSDptr;
 148           num_stack_alloc(n+2, c_MSDptr=,c_LSDptr=);
 149           num_stack_alloc(2*n+3, d_MSDptr=,d_LSDptr=);
 150           num_stack_alloc(2*n, d2_MSDptr=,);
 151           // 1/4 <= a < 1.
 152           cl_UDS_recipsqrt(a_MSDptr,2*n,c_MSDptr,n);
 153           // 1 <= c <= 2, | 1/sqrt(a) - c | < 1/2*beta^-n.
 154           cl_UDS_mul(a_MSDptr mspop (n+1),n+1,c_LSDptr,n+2,d_LSDptr);
 155           // 1/4 <= d < 2, | sqrt(a) - d | < beta^-n.
 156           if (mspref(d_MSDptr,0) > 0)
 157             { dec_loop_lsp(d_MSDptr mspop (n+1),n+1);
 158               if (mspref(d_MSDptr,0) > 0) cl_abort();
 159             }
 160           // D is our guess for B. Square to see how much we have to correct.
 161           cl_UDS_mul_square(d_MSDptr mspop (1+n),n,d2_MSDptr mspop 2*n);
 162           // Store D.
 163           b_->LSDptr = copy_loop_msp(d_MSDptr mspop 1,b_->MSDptr,n);
 164           b_->len = n;
 165           // Store 2*D in place of D.
 166           if (shift1left_loop_lsp(d_MSDptr mspop (1+n),n))
 167             mspref(d_MSDptr,0) = 1;
 168           // Compare D^2 against A.
 169           if (subfrom_loop_lsp(d2_MSDptr mspop 2*n,a_MSDptr mspop 2*n,2*n))
 170             // guessed too high, decrement D
 171             { dec_loop_lsp(b_->LSDptr,n);
 172               dec_loop_lsp(d_MSDptr mspop (1+n),1+n); // store 2*D+1
 173               if (!addto_loop_lsp(d_MSDptr mspop (1+n),a_MSDptr mspop 2*n,1+n))
 174                 cl_abort();
 175               if (!inc_loop_lsp(a_MSDptr mspop (n-1),n-1))
 176                 cl_abort();
 177             }
 178           else if (test_loop_msp(a_MSDptr,n-1))
 179             // guessed way too low
 180             cl_abort();
 181           else if (compare_loop_msp(a_MSDptr mspop (n-1),d_MSDptr,1+n) > 0)
 182             // guessed too low, increment D
 183             { inc_loop_lsp(b_->LSDptr,n);
 184               mspref(d_MSDptr,n) |= bit(0); // store 2*D-1
 185               subfrom_loop_lsp(d_MSDptr mspop (1+n),a_MSDptr mspop 2*n,1+n);
 186               inc_loop_lsp(d_MSDptr mspop (1+n),1+n); // store 2*D
 187               if (compare_loop_msp(a_MSDptr mspop (n-1),d_MSDptr,1+n) > 0)
 188                 cl_abort();
 189             }
 190           else
 191             // guessed ok
 192             {}
 193           // Shift b right by s bits:
 194           if (s > 0)
 195             shiftright_loop_msp(b_->MSDptr,n,s);
 196           // Test whether all of a[n],...,a[0] are 0:
 197           if (test_loop_msp(a_MSDptr mspop (n-1),n+1))
 198             return cl_false;
 199           else
 200             return cl_true; // yes -> the root is exact
 201         }
 202       // Allocate room for B:
 203       { var uintD* b_MSDptr = b_->MSDptr mspop -1; // from here on, room for n+1 digits
 204         var uintD b_msd;
 205         // B = [0,b[n-1],...,b[0]] = b_MSDptr/n+1/..
 206         // Determine b[n-1]:
 207         { var uintD a_msd = mspref(a_MSDptr,0); // a[2n-1]
 208           var uintD a_2msd = mspref(a_MSDptr,1); // a[2n-2]
 209           #if HAVE_DD
 210           var uintDD a_msdd = highlowDD(a_msd,a_2msd); // a[2n-1]*beta+a[2n-2]
 211           #endif
 212           // Initial value: x := floor((beta + a[2n-1])/2)
 213           var uintD x = floor(a_msd,2) | bit(intDsize-1);
 214           loop // Heron iteration loop
 215             { var uintD d;
 216               // Divide: d := floor((a[2n-1]*beta+a[2n-2])/x) :
 217               if (a_msd>=x) break; // overflow -> d>=beta -> done
 218               #if HAVE_DD
 219                 divuD(a_msdd,x, d=,);
 220               #else
 221                 divuD(a_msd,a_2msd,x, d=,);
 222               #endif
 223               if (d >= x) break; // d>=x -> done
 224               // Next iteration: x := floor((x+d)/2)
 225               // (Since the sequence of the x is known to be monotonically
 226               // decreasing and ends at b[n-1] >= beta/2, x must become >= beta/2,
 227               // i.e. x+d>=beta.)
 228               #if HAVE_DD
 229                 x = (uintD)(floor((uintDD)x + (uintDD)d, 2));
 230               #else
 231                 x = floor((uintD)(x+d),2) | bit(intDsize-1);
 232               #endif
 233             }
 234           // x = b[n-1] is now fully computed.
 235           b_msd = x;
 236           // Square it and subtract from [a[2n-1],a[2n-2]]:
 237           #if HAVE_DD
 238             a_msdd -= muluD(x,x);
 239             mspref(a_MSDptr,0) = highD(a_msdd); mspref(a_MSDptr,1) = lowD(a_msdd);
 240           #else
 241             {var uintD x2hi;
 242              var uintD x2lo;
 243              muluD(x,x, x2hi=,x2lo=);
 244              mspref(a_MSDptr,0) = a_msd - x2hi;
 245              if (a_2msd < x2lo) { mspref(a_MSDptr,0) -= 1; }
 246              mspref(a_MSDptr,1) = a_2msd - x2lo;
 247             }
 248           #endif
 249           mspref(b_MSDptr,0) = 1; mspref(b_MSDptr,1) = x<<1; // store b[n-1]
 250         }
 251        {var uintC j = 0;
 252         var uintD* a_mptr = a_MSDptr mspop 0;
 253         var uintD* a_lptr = a_MSDptr mspop 2;
 254         var uintD* b_ptr = b_MSDptr mspop 2;
 255         // Main loop of the square root
 256         until (++j == n) // j=1,...,n
 257           { // b_MSDptr = pointer to b[n], b_ptr = pointer past b[n-j].
 258             // a_mptr = pointer to a[2n-j], a_lptr = pointer past a[2n-2j].
 259             // Determine b* :
 260             var uintD b_stern;
 261             { var uintD a_1d = mspref(a_mptr,0); // a[2n-j], =0 or =1
 262               var uintD a_2d = mspref(a_mptr,1); // a[2n-j-1]
 263               var uintD a_3d = mspref(a_mptr,2); // a[2n-j-2]
 264               // Divide a[2n-j]*beta^2+a[2n-j-1]*beta+a[2n-j-2] by 2,
 265               // then divide by b_msd = b[n-1]:
 266               #if HAVE_DD
 267                 var uintDD a_123dd = highlowDD(a_2d,a_3d);
 268                 a_123dd = a_123dd>>1; if (!(a_1d==0)) { a_123dd |= bit(2*intDsize-1); }
 269                 if (highD(a_123dd) >= b_msd)
 270                   { b_stern = bitm(intDsize)-1; } // on overflow: beta-1
 271                   else
 272                   { divuD(a_123dd,b_msd, b_stern=,); }
 273               #else
 274                 a_3d = a_3d>>1; if (!((a_2d & bit(0)) ==0)) { a_3d |= bit(intDsize-1); }
 275                 a_2d = a_2d>>1; if (!(a_1d==0)) { a_2d |= bit(intDsize-1); }
 276                 if (a_2d >= b_msd)
 277                   { b_stern = bitm(intDsize)-1; } // on overflow: beta-1
 278                   else
 279                   { divuD(a_2d,a_3d,b_msd, b_stern=,); }
 280               #endif
 281             }
 282             // b_stern = b* as first estimate.
 283             a_lptr = a_lptr mspop 1; // pointer past a[2n-2j-1]
 284             // Subtraction [a[2n-j],...,a[2n-2j-1]] -= b* * [b[n],b[n-1],...,b[n-j]] :
 285             { var uintD carry = mulusub_loop_lsp(b_stern,b_ptr,a_lptr,j+1);
 286               if (mspref(a_mptr,0) >= carry)
 287                 { mspref(a_mptr,0) -= carry; }
 288                 else
 289                 { mspref(a_mptr,0) -= carry; // a[2n-j] becomes <0
 290                   // negative carry -> correct b* downwards:
 291                   loop
 292                     { b_stern = b_stern-1; // b* := b* - 1
 293                       // increase [a[2n-j],...,a[2n-2j-1]] by [b[n],...,b[n-j]]:
 294                       if (!(( addto_loop_lsp(b_ptr,a_lptr,j+1) ==0)))
 295                         if ((mspref(a_mptr,0) += 1) ==0) // carry into a[2n-j]
 296                           break; // makes a[2n-j] >=0 again -> subtraction result >=0
 297             }   }   }
 298             // b_stern = b* as second estimate.
 299             a_mptr = a_mptr mspop 1; // pointer to a[2n-j-1]
 300             a_lptr = a_lptr mspop 1; // pointer past a[2n-2j-2]
 301             // Subtract b* ^ 2 from [a[2n-j],...,a[2n-2j-2]]:
 302             #if HAVE_DD
 303             { var uintDD b_stern_2 = muluD(b_stern,b_stern);
 304               var uintDD a_12dd = highlowDD(lspref(a_lptr,1),lspref(a_lptr,0)); // a[2n-2j-1]*beta+a[2n-2j-2]
 305               var uintDD a_12dd_new = a_12dd - b_stern_2;
 306               lspref(a_lptr,1) = highD(a_12dd_new); lspref(a_lptr,0) = lowD(a_12dd_new);
 307               if (a_12dd >= b_stern_2) goto b_stern_ok;
 308             }
 309             #else
 310             { var uintD b_stern_2_hi;
 311               var uintD b_stern_2_lo;
 312               muluD(b_stern,b_stern, b_stern_2_hi=,b_stern_2_lo=);
 313              {var uintD a_1d = lspref(a_lptr,1); // a[2n-2j-1]
 314               var uintD a_2d = lspref(a_lptr,0); // a[2n-2j-2]
 315               var uintD a_1d_new = a_1d - b_stern_2_hi;
 316               var uintD a_2d_new = a_2d - b_stern_2_lo;
 317               if (a_2d < b_stern_2_lo) { a_1d_new -= 1; }
 318               lspref(a_lptr,1) = a_1d_new; lspref(a_lptr,0) = a_2d_new;
 319               if ((a_1d > b_stern_2_hi)
 320                   || ((a_1d == b_stern_2_hi) && (a_2d >= b_stern_2_lo))
 321                  )
 322                 goto b_stern_ok;
 323             }}
 324             #endif
                 // The else branch below is entered only through "goto b_stern_ok"
                 // (assuming TRUE is a nonzero constant).
 325             if (TRUE)
 326               { // still have to decrement [a[2n-j],...,a[2n-2j]] by 1:
 327                 if ( dec_loop_lsp(a_lptr lspop 2,j+1) ==0) goto b_stern_ok;
 328                 // Subtracting b*^2 produced a negative carry
 329                 b_stern = b_stern-1; // b* := b* - 1
 330                 // increase [a[2n-j-1],...,a[2n-2j-2]] by [b[n],...,b[n-j],0] + 2 * b* + 1
 331                 if ((sintD)b_stern < 0) { mspref(b_ptr,-1) |= bit(0); } // store the highest bit of b* in b[n-j]
 332                 mspref(b_ptr,0) = (uintD)(b_stern<<1)+1; // store the low bits of b* and a 1 as b[n-j-1]
 333                 addto_loop_lsp(b_ptr mspop 1,a_lptr,j+2);
 334                 // (a[2n-j] is no longer needed.)
 335                 mspref(b_ptr,0) -= 1; // store the low bits of b* in b[n-j-1]
 336                 b_ptr = b_ptr mspop 1;
 337               }
 338               else
 339               b_stern_ok:
 340               { // store b* as b[n-j-1]:
 341                 if ((sintD)b_stern < 0) { mspref(b_ptr,-1) |= bit(0); } // store the highest bit of b* in b[n-j]
 342                 mspref(b_ptr,0) = (uintD)(b_stern<<1); // store the low bits of b* as b[n-j-1]
 343                 b_ptr = b_ptr mspop 1;
 344               }
 345           }
 346         // b_MSDptr = pointer to b[n], b_ptr = pointer past b[0].
 347         // a_mptr = pointer to a[n].
 348         // Shift [b[n],...,b[0]] right by s+1 bits:
 349         if (s == intDsize-1)
 350           { lsshrink(b_ptr); }
 351           else
 352           { shiftright_loop_msp(b_MSDptr,n+1,s+1); msshrink(b_MSDptr); }
 353         // b = b_MSDptr/n/b_ptr is complete, a NUDS.
 354         b_->MSDptr = b_MSDptr; b_->len = n; b_->LSDptr = b_ptr;
 355         // Test whether all of a[n],...,a[0] are 0:
 356         if (test_loop_msp(a_mptr,n+1))
 357           { return cl_false; }
 358           else
 359           { return cl_true; } // yes -> the root is exact
 360       }}
 361 }
 362 // Bit complexity (N := a_len): O(M(N)).
 363
 364 }  // namespace cln