src/base/low/cl_low_div.cc

   1 // Low level: division.
   2
   3 // General includes.
   4 #include "base/cl_sysdep.h"
   5
   6 // Specification.
   7 #include "base/cl_low.h"
   8
   9
  10 // Implementation.
  11
  12 #ifdef NEED_VAR_divu_16_rest
  13 uint16 divu_16_rest;
  14 #endif
  15
  16 #ifdef NEED_FUNCTION_divu_3216_1616_
  17 uint16 divu_16_rest;
  18 namespace cln {
  19 #if 1
  20 // Most processors have a good 32 by 32 bit division, use that.
  21 uint16 divu_3216_1616_ (uint32 x, uint16 y)
  22 {
  23         var uint16 q = floor(x,(uint32)y);
  24         divu_16_rest = x - (uint32)q * (uint32)y;
  25         return q;
  26 }
  27 #else
  28 // On processors without a hardware division, we have to produce the quotient's
  29 // bits individually. Basically the algorithm is like this:
  30 //     q := 0; r := x;
  31 //     if (r >= 2^15*y) { r -= 2^15*y; q |= 2^15; }
  32 //     if (r >= 2^14*y) { r -= 2^14*y; q |= 2^14; }
  33 //     ...
  34 //     if (r >= 2^0*y) { r -= 2^0*y; q |= 2^0; }
  35 // We don't want to shift 2^k*y to the right. Instead, we shift r to the left:
  36 //     q := 0; r := x;
  37 //     if (r >= 2^15*y) { r -= 2^15*y; q |= 2^15; }
  38 //     if (2*r >= 2^15*y) { 2*r -= 2^15*y; q |= 2^14; }
  39 //     ...
  40 //     if (2^15*r >= 2^15*y) { 2^15*r -= 2^15*y; q |= 2^0; }
  41 // In other terms:
  42 //     q := 0; r := x; s := 2^15*y;
  43 //     if (r >= s) { r -= s; q |= 2^15; }
  44 //     r := 2*r;
  45 //     if (r >= s) { r -= s; q |= 2^14; }
  46 //     r := 2*r;
  47 //     ...
  48 //     r := 2*r;
  49 //     if (r >= s) { r -= s; q |= 2^0; }
  50 //     r := r >> 15;
  51 // Now, we combine r and q into a single register. The bits of q won't disturb
  52 // the "r >= s" comparisons because up to the last comparisons only max 15
  53 // bits have accumulated, and s is a multiple of 2^15.
  54 // We can thus shift r and q with a single instruction.
  55 //     q := 0; r := x; s := 2^15*y;
  56 //     if (r >= s) { r -= s; r := 2*r+1; } else r := 2*r;
  57 //     if (r >= s) { r -= s; r := 2*r+1; } else r := 2*r;
  58 //     ...
  59 //     if (r >= s) { r -= s; r := 2*r+1; } else r := 2*r;
  60 //     q := r & (2^16-1); r := r >> 16;
  61 // Up to now, this is pretty standard.
  62 // On most hardware the comparison "r >= s" already implies doing a subtraction
  63 // r - s. It is wasteful to do the subtraction twice. Better do it once and
  64 // test the carry afterwards:
  65 //     q := 0; r := x; s := 2^15*y;
  66 //     r -= s; if (no subcarry) { r := 2*r+1; } else { r := 2*r+2*s; }
  67 //     r -= s; if (no subcarry) { r := 2*r+1; } else { r := 2*r+2*s; }
  68 //     ...
  69 //     r -= s; if (no subcarry) { r := 2*r+1; } else { r := 2*r+2*s; }
  70 //     q := r & (2^16-1); r := r >> 16;
  71 // In the case of carry we can combine the "+2*s" with the next "-s" operation.
  72 // So this becomes "r := 2*r+s". But note about the carries: the case
  73 // "(2*r+2*s)-s gives no subcarry" is equivalent to "2*r+s gives an addcarry".
  74 // On most processors, subcarry and addcarry are the same bit. So we turn
  75 // the subcarry into an addcarry by writing "r += -s" instead of "r -= s"
  76 // (or vice versa: writing "2*r-(-s)" instead of "2*r+s").
  77 //     q := 0; r := x; s := 2^15*y;
  78 //     r += -s;
  79 //     if (addcarry) { r := 2*r+1; r += -s; } else { r := 2*r+s; }
  80 //     if (addcarry) { r := 2*r+1; r += -s; } else { r := 2*r+s; }
  81 //     ...
  82 //     if (addcarry) { r := 2*r+1; } else { r := 2*r+2*s; }
  83 //     q := r & (2^16-1); r := r >> 16;
  84 // This algorithm is implemented in cl_asm_arm.cc and (in slightly modified
  85 // form) in cl_asm_sparc.cc.
  86 #endif
  87 }  // namespace cln
  88 #endif
  89
  90 #ifdef NEED_FUNCTION_divu_3232_3232_
  91 namespace cln {
  92 // Dies dient nur noch als Hilfsfunktion für floorD().
  93 // Die Rückgabe des Restes in divu_32_rest ist also hier nicht nötig.
  94 uint32 divu_3232_3232_(uint32 x, uint32 y)
  95 {
  96         var uint32 q;
  97         divu_3232_3232(x,y,q=,);
  98         return q;
  99 }
 100 }  // namespace cln
 101 #endif
 102
 103 #ifdef NEED_VAR_divu_32_rest
 104 uint32 divu_32_rest;
 105 #endif
 106
 107 #ifdef NEED_FUNCTION_divu_6432_3232_
 108 uint32 divu_32_rest;
 109 namespace cln {
 110 uint32 divu_6432_3232_(uint32 xhi, uint32 xlo, uint32 y)
 111 // Methode:
 112 // Wie UDS_divide mit intDsize=16, a_len=4, b_len=2.
 113 {
 114     if (y <= (uint32)(bit(16)-1))
 115         // 48-durch-16-Bit-Division,
 116         // aufgebaut aus zwei 32-durch-16-Bit-Divisionen:
 117         { var uint16 q1;
 118           var uint16 q0;
 119           var uint16 r1;
 120           divu_3216_1616(highlow32(low16(xhi),high16(xlo)),y, q1=,r1=);
 121           divu_3216_1616(highlow32(r1,low16(xlo)),y, q0=,divu_32_rest=);
 122           return highlow32(q1,q0);
 123         }
 124     // y>=2^16
 125     {// y shiften:
 126       var uintL s = 0;
 127       while ((sint32)y >= 0) { y = y<<1; s++; }
 128       // x entsprechend shiften:
 129       if (!(s==0))
 130         { xhi = (xhi << s) | (xlo >> (32-s)); xlo = xlo << s; }
 131       // 64-durch-32-Bit-Division,
 132       // aufgebaut aus zwei 48-durch-32-Bit-Divisionen.
 133       // Methode für eine 48-durch-32-Bit-Division x/y mit 0 <= x < 2^16*y :
 134       // (beta = 2^n = 2^16, n = 16)
 135       // Wir wissen beta^2/2 <= y < beta^2, Quotient  q = floor(x/y) < beta.
 136       // Schreibe  x = beta*x1 + x0  mit  x1 := floor(x/beta)
 137       // und       y = beta*y1 + y0  mit  y1 := floor(y/beta)
 138       // und bilde den Näherungs-Quotienten floor(x1/y1)
 139       // oder (noch besser) floor(x1/(y1+1)).
 140       // Wegen 0 <= x1 < 2^(2n) und 0 < 2^(n-1) <= y1 < 2^n
 141       // und  x1/(y1+1) <= x/y < x1/(y1+1) + 2
 142       // (denn x1/(y1+1) = (x1*beta)/((y1+1)*beta) <= (x1*beta)/y <= x/y
 143       // und x/y - x1/(y1+1) = (x+x*y1-x1*y)/(y*(y1+1))
 144       // = (x+x0*y1-x1*y0)/(y*(y1+1)) <= (x+x0*y1)/(y*(y1+1))
 145       // <= x/(y*(y1+1)) + x0/y = (x/y)/(y1+1) + x0/y
 146       // <= 2^n/(2^(n-1)+1) + 2^n/2^(2n-1) = 2^n/(2^(n-1)+1) + 2^(1-n) < 2 )
 147       // gilt  floor(x1/(y1+1)) <= floor(x/y) <= floor(x1/(y1+1)) + 2  .
 148       // Man bildet also  q:=floor(x1/(y1+1))  (ein Shift um n Bit oder
 149       // eine (2n)-durch-n-Bit-Division, mit Ergebnis q <= floor(x/y) < beta)
 150       // und x-q*y und muß hiervon noch höchstens 2 mal y abziehen und q
 151       // incrementieren, um den Quotienten  q = floor(x/y)  und den Rest
 152       // x-floor(x/y)*y  der Division zu bekommen.
 153       { var uint16 y1_1 = high16(y)+1; // y1+1
 154         var uint16 q1;
 155         var uint16 q0;
 156         var uint32 r;
 157         // 2^16*xhi+high16(xlo) durch y dividieren:
 158        {var uint16 r16;
 159         var uint32 r2;
 160         if (y1_1==0)
 161           { q1 = high16(xhi); r16 = low16(xhi); }
 162           else
 163           { divu_3216_1616(xhi,y1_1, q1=,r16=); }
 164         // q1 = floor(xhi/(y1+1)), r16 = xhi - (y1+1)*q1 (>=0, <=y1)
 165         // Bilde r := (2^16*xhi+high16(xlo)) - y*q1
 166         //          = 2^16*(xhi-y1*q1) + high16(xlo) - y0*q1
 167         //          = 2^16*r16 + 2^16*q1 + high16(xlo) - y0*q1 (>=0)
 168         // Dies ist < 2^16*y1 + 2^32 <= y + 2^32 <= 3*y, kann überlaufen!
 169         r = highlow32(r16,high16(xlo)); // 2^16*r16 + high16(xlo) < 2^32
 170         r2 = highlow32_0(q1) - mulu16(low16(y),q1); // 2^16*q1 - y0*q1 < 2^32
 171         // 0 <= r+r2 < 3*y. Bei der Addition auf Carry testen!
 172         // Carry -> jedenfalls y <= r+r2 < y + 2^32 <= 3*y.
 173         // kein Carry -> jedenfalls 0 <= r+r2 < 2^32 <= 2*y.
 174         if ((r += r2) < r2) // addieren, r >= 2^32 ?
 175           { q1 += 1; r -= y; }
 176         // jetzt noch 0 <= r < 2^32 <= 2*y
 177         if (r >= y)
 178           { q1 += 1; r -= y; }
 179        }// Quotient q1, Rest r fertig.
 180         // 2^16*r+low16(xlo) durch y dividieren:
 181        {var uint16 r16;
 182         var uint32 r2;
 183         if (y1_1==0)
 184           { q0 = high16(r); r16 = low16(r); }
 185           else
 186           { divu_3216_1616(r,y1_1, q0=,r16=); }
 187         // q0 = floor(r/(y1+1)), r16 = r - (y1+1)*q0 (>=0, <=y1)
 188         // Bilde r := (2^16*r+low16(xlo)) - y*q0
 189         //          = 2^16*(r-y1*q0) + low16(xlo) - y0*q0
 190         //          = 2^16*r16 + 2^16*q0 + low16(xlo) - y0*q0 (>=0)
 191         // Dies ist < 2^16*y1 + 2^32 <= y + 2^32 <= 3*y, kann überlaufen!
 192         r = highlow32(r16,low16(xlo)); // 2^16*r16 + low16(xlo) < 2^32
 193         r2 = highlow32_0(q0) - mulu16(low16(y),q0); // 2^16*q0 - y0*q0 < 2^32
 194         // 0 <= r+r2 < 3*y. Bei der Addition auf Carry testen!
 195         // Carry -> jedenfalls y <= r+r2 < y + 2^32 <= 3*y.
 196         // kein Carry -> jedenfalls 0 <= r+r2 < 2^32 <= 2*y.
 197         if ((r += r2) < r2) // addieren, r >= 2^32 ?
 198           { q0 += 1; r -= y; }
 199         // jetzt noch 0 <= r < 2^32 <= 2*y
 200         if (r >= y)
 201           { q0 += 1; r -= y; }
 202        }// Quotient q0, Rest r fertig.
 203         divu_32_rest = r >> s; // Rest
 204         return highlow32(q1,q0); // Quotient
 205 }   } }
 206 }  // namespace cln
 207 #endif
 208
 209 #ifdef NEED_VAR_divu_64_rest
 210 uint64 divu_64_rest;
 211 #endif
 212
 213 #ifdef NEED_FUNCTION_divu_6464_6464_
 214 namespace cln {
 215 uint64 divu_6464_6464_(uint64 x, uint64 y)
 216 // Methode: (beta = 2^n = 2^32, n = 32)
 217 // Falls y < beta, handelt es sich um eine 64-durch-32-Bit-Division.
 218 // Falls y >= beta:
 219 // Quotient  q = floor(x/y) < beta  (da 0 <= x < beta^2, y >= beta).
 220 // y habe genau n+k Bits (1 <= k <= n), d.h. 2^(n+k-1) <= y < 2^(n+k).
 221 // Schreibe  x = 2^k*x1 + x0  mit  x1 := floor(x/2^k)
 222 // und       y = 2^k*y1 + y0  mit  y1 := floor(y/2^k)
 223 // und bilde den Näherungs-Quotienten floor(x1/y1)
 224 // oder (noch besser) floor(x1/(y1+1)).
 225 // Wegen 0 <= x1 < 2^(2n) und 0 < 2^(n-1) <= y1 < 2^n
 226 // und  x1/(y1+1) <= x/y < x1/(y1+1) + 2
 227 // (denn x1/(y1+1) = (x1*2^k)/((y1+1)*2^k) <= (x1*2^k)/y <= x/y
 228 // und x/y - x1/(y1+1) = (x+x*y1-x1*y)/(y*(y1+1))
 229 // = (x+x0*y1-x1*y0)/(y*(y1+1)) <= (x+x0*y1)/(y*(y1+1))
 230 // <= x/(y*(y1+1)) + x0/y
 231 // <= 2^(2n)/(2^(n+k-1)*(2^(n-1)+1)) + 2^k/2^(n+k-1)
 232 // = 2^(n-k+1)/(2^(n-1)+1) + 2^(1-n) <= 2^n/(2^(n-1)+1) + 2^(1-n) < 2 )
 233 // gilt  floor(x1/(y1+1)) <= floor(x/y) <= floor(x1/(y1+1)) + 2  .
 234 // Man bildet also  q:=floor(x1/(y1+1))  (ein Shift um n Bit oder
 235 // eine (2n)-durch-n-Bit-Division, mit Ergebnis q <= floor(x/y) < beta)
 236 // und x-q*y und muss hiervon noch höchstens 2 mal y abziehen und q
 237 // incrementieren, um den Quotienten  q = floor(x/y)  und den Rest
 238 // x-floor(x/y)*y  der Division zu bekommen.
 239 {
 240   if (y <= (uint64)(((uint64)1<<32)-1))
 241     { var uint32 q1;
 242       var uint32 q0;
 243       var uint32 r1;
 244       divu_6432_3232(0,high32(x),y, q1 = , r1 = );
 245       divu_6432_3232(r1,low32(x),y, q0 = , divu_64_rest = );
 246       return highlow64(q1,q0);
 247     }
 248     else
 249     { var uint64 x1 = x; // x1 := x
 250       var uint64 y1 = y; // y1 := y
 251       var uint32 q;
 252       do { x1 = floor(x1,2); y1 = floor(y1,2); } // k erhöhen
 253          while (!(y1 <= (uint64)(((uint64)1<<32)-1))); // bis y1 < beta
 254       { var uint32 y2 = low32(y1)+1; // y1+1 bilden
 255         if (y2==0)
 256           { q = high32(x1); } // y1+1=beta -> ein Shift
 257           else
 258           { divu_6432_3232(high32(x1),low32(x1),y2,q=,); } // Division von x1 durch y1+1
 259       }
 260       // q = floor(x1/(y1+1))
 261       // x-q*y bilden (eine 32-mal-64-Bit-Multiplikation ohne Überlauf):
 262       x -= highlow64_0(mulu32_w(q,high32(y))); // q * high32(y) * beta
 263       // gefahrlos, da q*high32(y) <= q*y/beta <= x/beta < beta
 264       x -= mulu32_w(q,low32(y)); // q * low32(y)
 265       // gefahrlos, da q*high32(y)*beta + q*low32(y) = q*y <= x
 266       // Noch höchstens 2 mal y abziehen:
 267       if (x >= y)
 268         { q += 1; x -= y;
 269           if (x >= y)
 270             { q += 1; x -= y;
 271         }   }
 272       divu_64_rest = x;
 273       return (uint64)q;
 274    }
 275 }
 276 }  // namespace cln
 277 #endif
 278
 279 #ifdef NEED_FUNCTION_divu_12864_6464_
 280 namespace cln {
 281 uint64 divu_12864_6464_(uint64 xhi, uint64 xlo, uint64 y)
 282 // Methode:
 283 // Wie UDS_divide mit intDsize=32, a_len=4, b_len=2.
 284 {
 285     if (y <= (uint64)(bit(32)-1))
 286         // 96-durch-32-Bit-Division,
 287         // aufgebaut aus zwei 64-durch-32-Bit-Divisionen:
 288         { var uint32 q1;
 289           var uint32 q0;
 290           var uint32 r1;
 291           divu_6432_3232(low32(xhi),high32(xlo),y, q1=,r1=);
 292           divu_6432_3232(r1,low32(xlo),y, q0=,divu_64_rest=);
 293           return highlow64(q1,q0);
 294         }
 295     // y>=2^32
 296     {// y shiften:
 297       var uintL s = 0;
 298       while ((sint64)y >= 0) { y = y<<1; s++; }
 299       // x entsprechend shiften:
 300       if (!(s==0))
 301         { xhi = (xhi << s) | (xlo >> (64-s)); xlo = xlo << s; }
 302       // 128-durch-64-Bit-Division,
 303       // aufgebaut aus zwei 96-durch-64-Bit-Divisionen.
 304       // Methode für eine 96-durch-64-Bit-Division x/y mit 0 <= x < 2^32*y :
 305       // (beta = 2^n = 2^32, n = 32)
 306       // Wir wissen beta^2/2 <= y < beta^2, Quotient  q = floor(x/y) < beta.
 307       // Schreibe  x = beta*x1 + x0  mit  x1 := floor(x/beta)
 308       // und       y = beta*y1 + y0  mit  y1 := floor(y/beta)
 309       // und bilde den Näherungs-Quotienten floor(x1/y1)
 310       // oder (noch besser) floor(x1/(y1+1)).
 311       // Wegen 0 <= x1 < 2^(2n) und 0 < 2^(n-1) <= y1 < 2^n
 312       // und  x1/(y1+1) <= x/y < x1/(y1+1) + 2
 313       // (denn x1/(y1+1) = (x1*beta)/((y1+1)*beta) <= (x1*beta)/y <= x/y
 314       // und x/y - x1/(y1+1) = (x+x*y1-x1*y)/(y*(y1+1))
 315       // = (x+x0*y1-x1*y0)/(y*(y1+1)) <= (x+x0*y1)/(y*(y1+1))
 316       // <= x/(y*(y1+1)) + x0/y = (x/y)/(y1+1) + x0/y
 317       // <= 2^n/(2^(n-1)+1) + 2^n/2^(2n-1) = 2^n/(2^(n-1)+1) + 2^(1-n) < 2 )
 318       // gilt  floor(x1/(y1+1)) <= floor(x/y) <= floor(x1/(y1+1)) + 2  .
 319       // Man bildet also  q:=floor(x1/(y1+1))  (ein Shift um n Bit oder
 320       // eine (2n)-durch-n-Bit-Division, mit Ergebnis q <= floor(x/y) < beta)
 321       // und x-q*y und muß hiervon noch höchstens 2 mal y abziehen und q
 322       // incrementieren, um den Quotienten  q = floor(x/y)  und den Rest
 323       // x-floor(x/y)*y  der Division zu bekommen.
 324       { var uint32 y1_1 = high32(y)+1; // y1+1
 325         var uint32 q1;
 326         var uint32 q0;
 327         var uint64 r;
 328         // 2^32*xhi+high32(xlo) durch y dividieren:
 329        {var uint32 r32;
 330         var uint64 r2;
 331         if (y1_1==0)
 332           { q1 = high32(xhi); r32 = low32(xhi); }
 333           else
 334           { divu_6432_3232_w(xhi,y1_1, q1=,r32=); }
 335         // q1 = floor(xhi/(y1+1)), r32 = xhi - (y1+1)*q1 (>=0, <=y1)
 336         // Bilde r := (2^32*xhi+high32(xlo)) - y*q1
 337         //          = 2^32*(xhi-y1*q1) + high32(xlo) - y0*q1
 338         //          = 2^32*r32 + 2^32*q1 + high32(xlo) - y0*q1 (>=0)
 339         // Dies ist < 2^32*y1 + 2^64 <= y + 2^64 <= 3*y, kann überlaufen!
 340         r = highlow64(r32,high32(xlo)); // 2^32*r32 + high32(xlo) < 2^64
 341         r2 = highlow64_0(q1) - mulu32_w(low32(y),q1); // 2^32*q1 - y0*q1 < 2^64
 342         // 0 <= r+r2 < 3*y. Bei der Addition auf Carry testen!
 343         // Carry -> jedenfalls y <= r+r2 < y + 2^64 <= 3*y.
 344         // kein Carry -> jedenfalls 0 <= r+r2 < 2^64 <= 2*y.
 345         if ((r += r2) < r2) // addieren, r >= 2^64 ?
 346           { q1 += 1; r -= y; }
 347         // jetzt noch 0 <= r < 2^64 <= 2*y
 348         if (r >= y)
 349           { q1 += 1; r -= y; }
 350        }// Quotient q1, Rest r fertig.
 351         // 2^32*r+low32(xlo) durch y dividieren:
 352        {var uint32 r32;
 353         var uint64 r2;
 354         if (y1_1==0)
 355           { q0 = high32(r); r32 = low32(r); }
 356           else
 357           { divu_6432_3232_w(r,y1_1, q0=,r32=); }
 358         // q0 = floor(r/(y1+1)), r32 = r - (y1+1)*q0 (>=0, <=y1)
 359         // Bilde r := (2^32*r+low32(xlo)) - y*q0
 360         //          = 2^32*(r-y1*q0) + low32(xlo) - y0*q0
 361         //          = 2^32*r32 + 2^32*q0 + low32(xlo) - y0*q0 (>=0)
 362         // Dies ist < 2^32*y1 + 2^64 <= y + 2^64 <= 3*y, kann überlaufen!
 363         r = highlow64(r32,low32(xlo)); // 2^32*r32 + low32(xlo) < 2^64
 364         r2 = highlow64_0(q0) - mulu32_w(low32(y),q0); // 2^32*q0 - y0*q0 < 2^64
 365         // 0 <= r+r2 < 3*y. Bei der Addition auf Carry testen!
 366         // Carry -> jedenfalls y <= r+r2 < y + 2^64 <= 3*y.
 367         // kein Carry -> jedenfalls 0 <= r+r2 < 2^64 <= 2*y.
 368         if ((r += r2) < r2) // addieren, r >= 2^64 ?
 369           { q0 += 1; r -= y; }
 370         // jetzt noch 0 <= r < 2^64 <= 2*y
 371         if (r >= y)
 372           { q0 += 1; r -= y; }
 373        }// Quotient q0, Rest r fertig.
 374         divu_64_rest = r >> s; // Rest
 375         return highlow64(q1,q0); // Quotient
 376 }   } }
 377 }  // namespace cln
 378 #endif
 379