src/float/transcendental/cl_LF_pi.cc

   1 // pi().
   2
   3 // General includes.
   4 #include "cl_sysdep.h"
   5
   6 // Specification.
   7 #include "cl_F_tran.h"
   8
   9
  10 // Implementation.
  11
  12 #include "cln/lfloat.h"
  13 #include "cl_LF_tran.h"
  14 #include "cl_LF.h"
  15 #include "cln/integer.h"
  16 #include "cl_alloca.h"
  17
  18 namespace cln {
  19
  20 ALL_cl_LF_OPERATIONS_SAME_PRECISION()
  21
  22 // For the next algorithms, I warmly recommend
  23 // [Jörg Arndt: hfloat documentation, august 1996,
  24 //  http://www.tu-chemnitz.de/~arndt/ hfdoc.dvi
  25 //  But beware of the typos in his formulas! ]
  26
  27 const cl_LF compute_pi_brent_salamin (uintC len)
  28 {
  29         // Methode:
  30         // [Richard P. Brent: Fast multiple-precision evaluation of elementary
  31         //  functions. J. ACM 23(1976), 242-251.]
  32         // [Jonathan M. Borwein, Peter B. Borwein: Pi and the AGM.
  33         //  Wiley 1987. Algorithm 2.2, p. 48.]
  34         // [Jörg Arndt, formula (4.51)-(4.52).]
  35         //    pi = AGM(1,1/sqrt(2))^2 * 2/(1 - sum(k=0..infty, 2^k c_k^2)).
  36         // where the AGM iteration reads
  37         //    a_0 := 1, b_0 := 1/sqrt(2).
  38         //    a_(k+1) := (a_k + b_k)/2, b_(k+1) := sqrt(a_k*b_k).
  39         // and we set
  40         //    c_k^2 := a_k^2 - b_k^2,
  41         // i.e. c_0^2 = 1/2,
  42         //      c_(k+1)^2 = a_(k+1)^2 - b_(k+1)^2 = (a_k - b_k)^2/4
  43         //                = (a_k - a_(k+1))^2.
  44         // (Actually c_(k+1) is _defined_ as  = a_k - a_(k+1) = (a_k - b_k)/2.)
  45         // d=len, n:=intDsize*d. Verwende Long-Floats mit intDsize*(d+1)
  46         // Mantissenbits.
  47         // (let* ((a (coerce 1 'long-float)) ; 1
  48         //        (b (sqrt (scale-float a -1))) ; 2^-(1/2)
  49         //        (eps (scale-float a (- n))) ; 2^-n
  50         //        (t (scale-float a -2)) ; 1/4
  51         //        (k 0)
  52         //       )
  53         //   (loop
  54         //     (when (< (- a b) eps) (return))
  55         //     (let ((y a))
  56         //       (setq a (scale-float (+ a b) -1))
  57         //       (setq b (sqrt (* b y)))
  58         //       (setq t (- t (scale-float (expt (- a y) 2) k)))
  59         //     )
  60         //     (incf k)
  61         //   )
  62         //   (/ (expt a 2) t)
  63         // )
  64         var uintC actuallen = len + 1; // 1 guard digit
  65         var uintE uexp_limit = LF_exp_mid - intDsize*len;
  66         // Ein Long-Float ist genau dann betragsmäßig <2^-n, wenn
  67         // sein Exponent < LF_exp_mid-n = uexp_limit ist.
  68         var cl_LF a = cl_I_to_LF(1,actuallen);
  69         var cl_LF b = sqrt(scale_float(a,-1));
  70         var uintL k = 0;
  71         var cl_LF t = scale_float(a,-2);
  72         until (TheLfloat(a-b)->expo < uexp_limit) {
  73                 // |a-b| < 2^-n -> fertig
  74                 var cl_LF new_a = scale_float(a+b,-1); // (a+b)/2
  75                 b = sqrt(a*b);
  76                 var cl_LF a_diff = new_a - a;
  77                 t = t - scale_float(square(a_diff),k);
  78                 a = new_a;
  79                 k++;
  80         }
  81         var cl_LF pires = square(a)/t; // a^2/t
  82         return shorten(pires,len); // verkürzen und fertig
  83 }
  84 // Bit complexity (N := len): O(log(N)*M(N)).
  85
  86 const cl_LF compute_pi_brent_salamin_quartic (uintC len)
  87 {
  88         // See [Borwein, Borwein, section 1.4, exercise 3, p. 17].
  89         // See also [Jörg Arndt], formulas (4.52) and (3.30)[wrong!].
  90         // As above, we are using the formula
  91         //    pi = AGM(1,1/sqrt(2))^2 * 2/(1 - sum(k=0..infty, 2^k c_k^2)).
  92         // where the AGM iteration reads
  93         //    a_0 := 1, b_0 := 1/sqrt(2).
  94         //    a_(k+1) := (a_k + b_k)/2, b_(k+1) := sqrt(a_k*b_k).
  95         // But we keep computing with
  96         //    wa_k := sqrt(a_k)  and  wb_k := sqrt(b_k)
  97         // at the same time and do two iteration steps at once.
  98         // The iteration takes now takes the form
  99         //    wa_0 := 1, wb_0 := 2^-1/4,
 100         //    (wa_k, wb_k, a_k, b_k)
 101         //    -->         ((wa_k^2 + wb_k^2)/2, wa_k*wb_k)
 102         //    -->         (((wa_k + wb_k)/2)^2, sqrt(wa_k*wb_k*(wa_k^2 + wb_k^2)/2)),
 103         //    i.e. wa_(k+2) = (wa_k + wb_k)/2 and
 104         //         wb_(k+2) = sqrt(sqrt(wa_k*wb_k*(wa_k^2 + wb_k^2)/2)).
 105         // For the sum, we can group two successive items together:
 106         //   2^k * c_k^2 + 2^(k+1) * c_(k+1)^2
 107         //   = 2^k * [(a_k^2 - b_k^2) + 2*((a_k - b_k)/2)^2]
 108         //   = 2^k * [3/2*a_k^2 - a_k*b_k - 1/2*b_k^2]
 109         //   = 2^k * [2*a_k^2 - 1/2*(a_k+b_k)^2]
 110         //   = 2^(k+1) * [a_k^2 - ((a_k+b_k)/2)^2]
 111         //   = 2^(k+1) * [wa_k^4 - ((wa_k^2+wb_k^2)/2)^2].
 112         // Hence,
 113         //   pi = AGM(1,1/sqrt(2))^2 * 1/(1/2 - sum(k even, 2^k*[....])).
 114         var uintC actuallen = len + 1; // 1 guard digit
 115         var uintE uexp_limit = LF_exp_mid - intDsize*len;
 116         var cl_LF one = cl_I_to_LF(1,actuallen);
 117         var cl_LF a = one;
 118         var cl_LF wa = one;
 119         var cl_LF b = sqrt(scale_float(one,-1));
 120         var cl_LF wb = sqrt(b);
 121         // We keep a = wa^2, b = wb^2.
 122         var uintL k = 0;
 123         var cl_LF t = scale_float(one,-1);
 124         until (TheLfloat(wa-wb)->expo < uexp_limit) {
 125                 // |wa-wb| < 2^-n -> fertig
 126                 var cl_LF wawb = wa*wb;
 127                 var cl_LF new_wa = scale_float(wa+wb,-1);
 128                 var cl_LF a_b = scale_float(a+b,-1);
 129                 var cl_LF new_a = scale_float(a_b+wawb,-1);
 130                 var cl_LF new_b = sqrt(wawb*a_b);
 131                 var cl_LF new_wb = sqrt(new_b);
 132                 t = t - scale_float((a - a_b)*(a + a_b),k);
 133                 a = new_a; wa = new_wa;
 134                 b = new_b; wb = new_wb;
 135                 k += 2;
 136         }
 137         var cl_LF pires = square(a)/t;
 138         return shorten(pires,len); // verkürzen und fertig
 139 }
 140 // Bit complexity (N := len): O(log(N)*M(N)).
 141
 142 const cl_LF compute_pi_ramanujan_163 (uintC len)
 143 {
 144         // 1/pi = 1/sqrt(-1728 J)
 145         //        * sum(n=0..infty, (6n)! (A+nB) / 12^(3n) (3n)! n!^3 J^n)
 146         // mit J = -53360^3 = - (2^4 5 23 29)^3
 147         //     A = 163096908 = 2^2 3 13 1045493
 148         //     B = 6541681608 = 2^3 3^3 7 11 19 127 163
 149         // See [Jörg Arndt], formulas (4.27)-(4.30).
 150         // This is also the formula used in Pari.
 151         // The absolute value of the n-th summand is approximately
 152         //   |J|^-n * n^(-1/2) * B/(2*pi^(3/2)),
 153         // hence every summand gives more than 14 new decimal digits
 154         // in precision.
 155         // The sum is best evaluated using fixed-point arithmetic,
 156         // so that the precision is reduced for the later summands.
 157         var uintC actuallen = len + 4; // 4 guard digits
 158         var sintC scale = intDsize*actuallen;
 159         static const cl_I A = "163096908";
 160         static const cl_I B = "6541681608";
 161         //static const cl_I J1 = "10939058860032000"; // 72*abs(J)
 162         static const cl_I J2 = "333833583375"; // odd part of J1
 163         var cl_I sum = 0;
 164         var cl_I n = 0;
 165         var cl_I factor = ash(1,scale);
 166         while (!zerop(factor)) {
 167                 sum = sum + factor * (A+n*B);
 168                 factor = factor * ((6*n+1)*(2*n+1)*(6*n+5));
 169                 n = n+1;
 170                 factor = truncate1(factor,n*n*n*J2);
 171                 // Finally divide by 2^15 and change sign.
 172                 if (minusp(factor))
 173                         factor = (-factor) >> 15;
 174                 else
 175                         factor = -(factor >> 15);
 176         }
 177         var cl_LF fsum = scale_float(cl_I_to_LF(sum,actuallen),-scale);
 178         static const cl_I J3 = "262537412640768000"; // -1728*J
 179         var cl_LF pires = sqrt(cl_I_to_LF(J3,actuallen)) / fsum;
 180         return shorten(pires,len); // verkürzen und fertig
 181 }
 182 // Bit complexity (N := len): O(N^2).
 183
 184 #if defined(__mips__) && !defined(__GNUC__) // workaround SGI CC bug
 185 #define A A_fast
 186 #define B B_fast
 187 #define J3 J3_fast
 188 #endif
 189
 190 const cl_LF compute_pi_ramanujan_163_fast (uintC len)
 191 {
 192         // Same formula as above, using a binary splitting evaluation.
 193         // See [Borwein, Borwein, section 10.2.3].
 194         struct rational_series_stream : cl_pqa_series_stream {
 195                 uintC n;
 196                 static cl_pqa_series_term computenext (cl_pqa_series_stream& thisss)
 197                 {
 198                         static const cl_I A = "163096908";
 199                         static const cl_I B = "6541681608";
 200                         static const cl_I J1 = "10939058860032000"; // 72*abs(J)
 201                         var rational_series_stream& thiss = (rational_series_stream&)thisss;
 202                         var uintC n = thiss.n;
 203                         var cl_pqa_series_term result;
 204                         if (n==0) {
 205                                 result.p = 1;
 206                                 result.q = 1;
 207                         } else {
 208                                 result.p = -((cl_I)(6*n-5)*(cl_I)(2*n-1)*(cl_I)(6*n-1));
 209                                 result.q = (cl_I)n*(cl_I)n*(cl_I)n*J1;
 210                         }
 211                         result.a = A+n*B;
 212                         thiss.n = n+1;
 213                         return result;
 214                 }
 215                 rational_series_stream ()
 216                         : cl_pqa_series_stream (rational_series_stream::computenext),
 217                           n (0) {}
 218         } series;
 219         var uintC actuallen = len + 4; // 4 guard digits
 220         static const cl_I A = "163096908";
 221         static const cl_I B = "6541681608";
 222         static const cl_I J1 = "10939058860032000"; // 72*abs(J)
 223         // Evaluate a sum(0 <= n < N, a(n)/b(n) * (p(0)...p(n))/(q(0)...q(n)))
 224         // with appropriate N, and
 225         //   a(n) = A+n*B, b(n) = 1,
 226         //   p(n) = -(6n-5)(2n-1)(6n-1) for n>0,
 227         //   q(n) = 72*|J|*n^3 for n>0.
 228         var const uintC n_slope = (uintC)(intDsize*32*0.02122673)+1;
 229         // n_slope >= 32*intDsize*log(2)/log(|J|), normally n_slope = 22.
 230         var uintC N = (n_slope*actuallen)/32 + 1;
 231         // N > intDsize*log(2)/log(|J|) * actuallen, hence
 232         // |J|^-N < 2^(-intDsize*actuallen).
 233         var cl_LF fsum = eval_rational_series<false>(N,series,actuallen,actuallen);
 234         static const cl_I J3 = "262537412640768000"; // -1728*J
 235         var cl_LF pires = sqrt(cl_I_to_LF(J3,actuallen)) / fsum;
 236         return shorten(pires,len); // verkürzen und fertig
 237 }
 238 // Bit complexity (N := len): O(log(N)^2*M(N)).
 239
 240 // Timings of the above algorithms, on an i486 33 MHz, running Linux.
 241 //    N      Brent   Brent4  R 163   R 163 fast
 242 //    10     0.0079  0.0079  0.0052  0.0042
 243 //    25     0.026   0.026   0.014   0.012
 244 //    50     0.085   0.090   0.037   0.033
 245 //   100     0.29    0.29    0.113   0.098
 246 //   250     1.55    1.63    0.60    0.49
 247 //   500     5.7     5.7     2.24    1.71
 248 //  1000    21.6    22.9     8.5     5.5
 249 //  2500    89      95      49      19.6
 250 //  5000   217     218     188      49
 251 // 10000   509     540     747     117
 252 // 25000  1304    1310    4912     343
 253 // We see that
 254 //   - "Brent4" isn't worth it: No speed improvement over "Brent".
 255 //   - "R 163" is pretty fast at the beginning, but it is an O(N^2)
 256 //     algorithm, hence it loses in the end,
 257 //   - "R 163 fast", which uses the same formula as "R 163", but evaluates
 258 //     it using binary splitting, is an O(log N * M(N)) algorithm, and
 259 //     outperforms all of the others.
 260
 261 const cl_LF pi (uintC len)
 262 {
 263         var uintC oldlen = TheLfloat(cl_LF_pi)->len; // vorhandene Länge
 264         if (len < oldlen)
 265                 return shorten(cl_LF_pi,len);
 266         if (len == oldlen)
 267                 return cl_LF_pi;
 268
 269         // TheLfloat(cl_LF_pi)->len um mindestens einen konstanten Faktor
 270         // > 1 wachsen lassen, damit es nicht zu häufig nachberechnet wird:
 271         var uintC newlen = len;
 272         oldlen += floor(oldlen,2); // oldlen * 3/2
 273         if (newlen < oldlen)
 274                 newlen = oldlen;
 275
 276         // gewünschte > vorhandene Länge -> muß nachberechnen:
 277         cl_LF_pi = compute_pi_ramanujan_163_fast(newlen);
 278         return (len < newlen ? shorten(cl_LF_pi,len) : cl_LF_pi);
 279 }
 280
 281 }  // namespace cln