cl_DS_recip.cc

   1 // cl_UDS_recip().
   2
   3 // General includes.
   4 #include "cl_sysdep.h"
   5
   6 // Specification.
   7 #include "cl_DS.h"
   8
   9
  10 // Implementation.
  11
  12 #include "cln/abort.h"
  13
  14 namespace cln {
  15
  16 // Compute the reciprocal value of a digit sequence.
  17 // Input: UDS a_MSDptr/a_len/.. of length a_len,
  18 //        with  1/2*beta^a_len <= a < beta^a_len.
  19 // Output: UDS b_MSDptr/b_len+2/.. of length b_len+1 (b_len>1), plus 1 more bit
  20 //        in the last limb, such that
  21 //        beta^b_len <= b <= 2*beta^b_len  and
  22 //        | beta^(a_len+b_len)/a - b | < 1.
  23 // If a_len > b_len, only the most significant b_len limbs + 3 bits of a
  24 // are used.
  25   extern void cl_UDS_recip (const uintD* a_MSDptr, uintC a_len,
  26                             uintD* b_MSDptr, uintC b_len);
  27 // Method:
  28 // Using Newton/Heron iteration.
  29 // Write x = a/beta^a_len and y = b/beta^b_len.
  30 // So we start out with 1/2 <= x < 1 and search an y with 1 <= y <= 2
  31 // and  | 1/x - y | < beta^(-b_len).
  32 // For n = 1,2,...,b_len we compute approximations y with 1 <= yn <= 2
  33 // and  | 1/x - yn | < beta^(-n). The first n limbs of x, plus the
  34 // next 3 bits (of the (n+1)st limb) enter the computation of yn. Apart
  35 // from that, yn remains valid for any x which shares the same n+1
  36 // most significant limbs.
  37 // Step n = 1:
  38 //   Write  x = x1/beta + x2/beta^2 + xr with 0 <= xr < 1/(8*beta).
  39 //   Divide (beta^2-beta*x1-x2) by x1, gives  beta^2-x1*beta-x2 = q*x1+r.
  40 //   If this division overflows, i.e. q >= beta, then x1 = beta/2, x2 = 0,
  41 //   and we just return y1 = 2.
  42 //   Else set qd := ceiling(max(q*x2/beta - r, 0) / (x1+1)) and return
  43 //   y1 = (beta+q-qd)/beta.
  44 //   Rationale: Obviously 0 <= qd <= q and 0 <= qd <= 2. We have
  45 //     beta^2 - beta*(beta+q-qd)*x
  46 //     <= beta^2 - (beta+q-qd)*(x1 + x2/beta)
  47 //     = beta^2 - beta*x1 - x2 - q*(x1 + x2/beta) + qd*(x1 + x2/beta)
  48 //     = q*x1 + r - q*(x1 + x2/beta) + qd*(x1 + x2/beta)
  49 //     = r - q*x2/beta + qd*(x1 + x2/beta)
  50 //     if qd=0: <= r <= x1-1 < x1
  51 //     if qd>0: < r - q*x2/beta + qd*(x1+1) <= x1
  52 //     hence always < x1 <= beta*x, hence
  53 //     1 - x*y1 <= x/beta, hence 1/x - y1 <= 1/beta.
  54 //     And on the other hand
  55 //     beta^2 - beta*(beta+q-qd)*x
  56 //     = beta^2 - (beta+q-qd)*(x1 + x2/beta) - beta*(beta+q-qd)*xr
  57 //     where the third term is
  58 //       <= 2*beta^2*xr < beta/4 <= x1/2 <= beta*x/2.
  59 //     Hence
  60 //     beta^2 - beta*(beta+q-qd)*x >
  61 //     > beta^2 - (beta+q-qd)*(x1 + x2/beta) - beta*x/2
  62 //     = r - q*x2/beta + qd*(x1 + x2/beta) - beta*x/2
  63 //     >= - qd - beta*x/2 > - beta*x, hence
  64 //     1 - x*y1 >= -x/beta, hence 1/x - y1 >= -1/beta.
  65 // Step n -> m with n < m <= 2*n:
  66 //   Write x = xm + xr with 0 <= xr < 1/(8*beta^m).
  67 //   Set ym' = 2*yn - xm*yn*yn,
  68 //   ym = ym' rounded up to be a multiple of 1/(2*beta^m).
  69 //   Rationale:
  70 //     1/x - ym <= 1/x - ym' = 1/x - 2*yn + (x-xr)*yn*yn
  71 //     <= 1/x - 2*yn + x*yn*yn = x * (1/x - yn)^2 < x*beta^(-2n)
  72 //     < beta^(-2n) <= beta^(-m), and
  73 //     1/x - ym' = 1/x - 2*yn + (x-xr)*yn*yn
  74 //     > 1/x - 2*yn + x*yn*yn - 1/(2*beta^m)
  75 //     = x * (1/x - yn)^2 - 1/(2*beta^m) >= - 1/(2*beta^m), hence
  76 //     1/x - ym > 1/x - ym' - 1/(2*beta^m) >= -1/beta^m.
  77 //   Since it is needed to compute ym as a multiple of 1/(2*beta^m),
  78 //   not only as a multiple of 1/beta^m, we compute with zn = 2*yn.
  79 //   The iteration now reads  zm = round_up(2*zn - xm*zn*zn/2).
  80 // Choice of n:
  81 //   So that the computation is minimal, e.g. in the case b_len=10:
  82 //   1 -> 2 -> 3 -> 5 -> 10 and not 1 -> 2 -> 4 -> 8 -> 10.
  83   void cl_UDS_recip (const uintD* a_MSDptr, uintC a_len,
  84                      uintD* b_MSDptr, uintC b_len)
  85     {
  86       var uintC y_len = b_len+1;
  87       var uintC x_len = (a_len <= b_len ? a_len+1 : y_len);
  88       var uintD* x_MSDptr;
  89       var uintD* y_MSDptr;
  90       var uintD* y2_MSDptr;
  91       var uintD* y3_MSDptr;
  92       CL_ALLOCA_STACK;
  93       num_stack_alloc(x_len,x_MSDptr=,);
  94       num_stack_alloc(y_len,y_MSDptr=,);
  95       num_stack_alloc(2*y_len,y2_MSDptr=,);
  96       num_stack_alloc(x_len+2*y_len,y3_MSDptr=,);
  97       // Prepare x/2 at x_MSDptr by shifting a right by 1 bit.
  98       if (a_len <= b_len)
  99         { mspref(x_MSDptr,a_len) =
 100             shiftrightcopy_loop_msp(a_MSDptr,x_MSDptr,a_len,1,0);
 101         }
 102         else
 103         { mspref(x_MSDptr,b_len) =
 104             shiftrightcopy_loop_msp(a_MSDptr,x_MSDptr,b_len,1,0)
 105             | ((mspref(a_MSDptr,b_len) & -bit(intDsize-3)) >> 1);
 106         }
 107       // Step n = 1.
 108       { var uintD x1 = mspref(a_MSDptr,0);
 109         var uintD x2 = (a_len > 1 ? (mspref(a_MSDptr,1) & -bit(intDsize-3)) : 0);
 110         if ((x1 == (uintD)bit(intDsize-1)) && (x2 == 0))
 111           { mspref(y_MSDptr,0) = 4; mspref(y_MSDptr,1) = 0; }
 112           else
 113           { var uintD q;
 114             var uintD r;
 115             var uintD chi;
 116             var uintD clo;
 117             #if HAVE_DD
 118               divuD((uintDD)(-highlowDD(x1,x2)),x1, q=,r=);
 119               var uintDD c = muluD(q,x2);
 120               chi = highD(c); clo = lowD(c);
 121             #else
 122               divuD((uintD)(-x1 - (x2>0 ? 1 : 0)),(uintD)(-x2),x1, q=,r=);
 123               muluD(q,x2,chi=,clo=);
 124             #endif
 125             if (clo > 0)
 126               chi++;
 127             // qd := ceiling(max(chi-r,0)/(x1+1))
 128             if (chi > r)
 129               { chi -= r;
 130                 if (chi > x1)
 131                   { q--; }
 132                 q--;
 133               }
 134             mspref(y_MSDptr,0) = 2 + (q>>(intDsize-1));
 135             mspref(y_MSDptr,1) = q<<1;
 136           }
 137       }
 138       // Other steps.
 139       var int k;
 140       integerlength32((uint32)b_len-1,k=);
 141       // 2^(k-1) < b_len <= 2^k, so we need k steps.
 142       var uintC n = 1;
 143       for (; k>0; k--)
 144         { // n = ceiling(b_len/2^k) limbs of y have already been computed.
 145           var uintC m = ((b_len-1)>>(k-1))+1; // = ceiling(b_len/2^(k-1))
 146           // Compute zm := 2*zn - round_down(xm/2*zn*zn).
 147           cl_UDS_mul_square(y_MSDptr mspop (n+1),n+1,y2_MSDptr mspop 2*(n+1));
 148           var uintC xm_len = (m < x_len ? m+1 : x_len);
 149           cl_UDS_mul(x_MSDptr mspop xm_len,xm_len,
 150                      y2_MSDptr mspop 2*(n+1),2*n+1,
 151                      y3_MSDptr mspop (xm_len+2*n+1));
 152           // Round down by just taking the first m+1 limbs at y3_MSDptr.
 153           shift1left_loop_lsp(y_MSDptr mspop (n+1),n+1);
 154           clear_loop_msp(y_MSDptr mspop (n+1),m-n);
 155           subfrom_loop_lsp(y3_MSDptr mspop (m+1),y_MSDptr mspop (m+1),m+1);
 156           n = m;
 157         }
 158       // All n = b_len limbs of y have been computed. Divide by 2.
 159       mspref(b_MSDptr,b_len+1) =
 160         shiftrightcopy_loop_msp(y_MSDptr,b_MSDptr,b_len+1,1,0);
 161     }
 162 // Bit complexity (N := b_len): O(M(N)).
 163
 164 }  // namespace cln