src/base/digitseq/cl_DS_mul_fftp.h

   1 // Fast integer multiplication using FFT in a modular ring.
   2 // Bruno Haible 5.5.1996
   3
   4 // FFT in the complex domain has the drawback that it needs careful round-off
   5 // error analysis. So here we choose another field of characteristic 0: Q_p.
   6 // Since Q_p contains exactly the (p-1)th roots of unity, we choose
   7 // p == 1 mod N and have the Nth roots of unity (N = 2^n) in Q_p and
   8 // even in Z_p. Actually, we compute in Z/(p^m Z).
   9
  10 // All operations the FFT algorithm needs is addition, subtraction,
  11 // multiplication, multiplication by the Nth root of unity and division
  12 // by N. Hence we can use the domain Z/(p^m Z) even if p is not a prime!
  13
  14 // We want to compute the convolution of N 32-bit words. The resulting
  15 // words are < (2^32)^2 * N. If is safe to compute in Z/pZ with p = 2^94 + 1
  16 // or p = 7*2^92 + 1. We choose p < 2^95 so that we can easily represent every
  17 // element of Z/pZ as three 32-bit words.
  18
  19 #if !(intDsize==32)
  20 #error "fft mod p implemented only for intDsize==32"
  21 #endif
  22
  23 #if 0
  24 typedef union {
  25         #if CL_DS_BIG_ENDIAN_P
  26         struct { uint32 w2; uint32 w1; uint32 w0; };
  27         #else
  28         struct { uint32 w0; uint32 w1; uint32 w2; };
  29         #endif
  30         uintD _w[3];
  31 } fftp_word;
  32 #else
  33 // typedef struct { uint32 w2; uint32 w1; uint32 w0; } fftp_word;
  34 // typedef struct { uint32 w0; uint32 w1; uint32 w2; } fftp_word;
  35 typedef struct { uintD _w[3]; } fftp_word;
  36 #endif
  37 #if CL_DS_BIG_ENDIAN_P
  38   #define w2 _w[0]
  39   #define w1 _w[1]
  40   #define w0 _w[2]
  41   #define W3(W2,W1,W0)  { W2, W1, W0 }
  42 #else
  43   #define w0 _w[0]
  44   #define w1 _w[1]
  45   #define w2 _w[2]
  46   #define W3(W2,W1,W0)  { W0, W1, W2 }
  47 #endif
  48
  49 #if 0
  50 // p = 19807040628566084398385987585 = 5 * 3761 * 7484047069 * 140737471578113
  51 static const fftp_word p = W3( 1L<<30, 0, 1 ); // p = 2^94 + 1
  52 #define FFT_P_94
  53 static const fftp_word fftp_roots_of_1 [24+1] =
  54   // roots_of_1[n] is a (2^n)th root of unity in Z/pZ.
  55   // (Also roots_of_1[n-1] = roots_of_1[n]^2, but we don't need this.)
  56   // (To build this table, you need to compute roots of unity modulo the
  57   // factors of p and combine them using the Chinese Remainder Theorem.
  58   // Or ask me for "quadmod.lsp".)
  59   {
  60     W3( 0x00000000, 0x00000000, 0x00000001 ), //                             1
  61     W3( 0x0000003F, 0xFFFFFFFF, 0xFF800000 ), //        1180591620717402914816
  62     W3( 0x20000040, 0x00004000, 0x00000001 ), //  9903521494874733285348474881
  63     W3( 0x3688E9A7, 0xDD78E2A9, 0x1E75974D ), // 16877707849775746711303853901
  64     W3( 0x286E6589, 0x5E86C1E0, 0x42710379 ), // 12512861726041464545960067961
  65     W3( 0x00D79325, 0x1A884885, 0xEA46D6C5 ), //   260613923531515619478787781
  66     W3( 0x1950B480, 0xC387CEE5, 0xA69C443F ), //  7834691712342412468047070271
  67     W3( 0x19DC9D08, 0x11CADC6A, 0x5BA8B123 ), //  8003830486242687653832601891
  68     W3( 0x21D6D905, 0xB8BAC7C3, 0xC3841613 ), // 10472740308573592285123712531
  69     W3( 0x27D73986, 0x6AF6BD27, 0x7A6D7909 ), // 12330106088710388189231937801
  70     W3( 0x20D4698B, 0x0039D457, 0xA092AECF ), // 10160311000635748689099534031
  71     W3( 0x049BD1C4, 0xA94F001A, 0xFA76E358 ), //  1426314143682376031341568856
  72     W3( 0x26DD7228, 0x09400257, 0x9BB49CB9 ), // 12028142067661291067236719801
  73     W3( 0x12DAA9AD, 0xAF9435A9, 0xD50FF483 ), //  5835077289334326375656453251
  74     W3( 0x0B7CDA03, 0x9418702E, 0x7CD934CA ), //  3555271451571910239441204426
  75     W3( 0x2D272FCF, 0xB8644522, 0x68EAD40B ), // 13974199331913037576372147211
  76     W3( 0x00EDA06E, 0x0114DA26, 0xE8D84BA9 ), //   287273027105701319912475561
  77     W3( 0x2219C2C4, 0xFD3145C6, 0xDD019359 ), // 10553633252320053510122083161
  78     W3( 0x1764F007, 0x4F5D5FD4, 0xDAB10AFC ), //  7240181310654329198595869436
  79     W3( 0x01AA13EE, 0x2D1CD906, 0x11D5B1EB ), //   515096517694807745704079851
  80     W3( 0x27038944, 0x5A37BAAD, 0x5CECA64C ), // 12074190385578921562318087756
  81     W3( 0x2459CF22, 0xF625FD38, 0xADB48511 ), // 11250032926302238120667809041
  82     W3( 0x25B6C6A8, 0xD684063F, 0x7ABAD1EF ), // 11671908005633729316324561391
  83     W3( 0x1C1A2BC6, 0x12B253F1, 0x0D1BBCB7 ), //  8697219061868963805380983991
  84     W3( 0x198F3FE2, 0x5EE9919F, 0x535E80D5 }  //  7910303322630257758732976341
  85   };
  86 // Sadly, this p doesn't work because we don't find a (2^n)th root of unity w
  87 // such that  w^(2^(n-1)) = -1  mod p. However, our algorithm below assumes
  88 // that  w^(2^(n-1)) = -1...
  89 #else
  90 // p = 34662321099990647697175478273, a prime
  91 static const fftp_word p = W3( 7L<<28, 0, 1 ); // p = 7 * 2^92 + 1
  92 #define FFT_P_92
  93 static const fftp_word fftp_roots_of_1 [92+1] =
  94   // roots_of_1[n] is a (2^n)th root of unity in Z/pZ.
  95   // (Also roots_of_1[n-1] = roots_of_1[n]^2, but we don't need this.)
  96   {
  97     W3( 0x00000000, 0x00000000, 0x00000001 ), //                             1
  98     W3( 0x70000000, 0x00000000, 0x00000000 ), // 34662321099990647697175478272
  99     W3( 0x064AF70F, 0x997E62CE, 0x77953100 ), //  1947537281862369253065568512
 100     W3( 0x261B8E96, 0xC3AD4296, 0xDA1BFA93 ), // 11793744727492885369350519443
 101     W3( 0x096EA949, 0x6EDCAF05, 0x47C92A4F ), //  2919146363086089454841571919
 102     W3( 0x366A8C3F, 0x7BF1436D, 0x2333BE9E ), // 16840998969615256469762195102
 103     W3( 0x27569FA8, 0xAE1775F1, 0xB21956A0 ), // 12174636971387721414084220576
 104     W3( 0x16CABB8B, 0xBAA59813, 0x62FCBCD9 ), //  7053758891710792545762852057
 105     W3( 0x1AE130A3, 0xF909B101, 0xB6BA30CF ), //  8318848263123793919933558991
 106     W3( 0x32AE8FEE, 0x6B1A656B, 0xED02BF24 ), // 15685283280129931240441823012
 107     W3( 0x2D1EE047, 0x5AEDC882, 0x8E96BCCC ), // 13964152342912072497719852236
 108     W3( 0x222A18FD, 0x3BF40635, 0xBFDEA8AD ), // 10573383226491471052459124909
 109     W3( 0x10534EE6, 0xED5A55D4, 0x06AE2155 ), //  5052473604609413010647032149
 110     W3( 0x02F3BFA3, 0x2D816786, 0xE6C27B3C ), //   913643975905572976593107772
 111     W3( 0x0B0CD0A5, 0x9A1FF4F7, 0x2624A5E1 ), //  3419827524917244902802499041
 112     W3( 0x257A492F, 0x156C141C, 0xFC5D75F4 ), // 11598779914676604137587439092
 113     W3( 0x061FB92A, 0xB1A1F41A, 0x7006920F ), //  1895261184698485907279745551
 114     W3( 0x2A4E1471, 0xDDB96073, 0xD8DDBB71 ), // 13092763174215078887900953457
 115     W3( 0x213B469E, 0xD72A84CA, 0xAAA477F2 ), // 10284665443205365583657072626
 116     W3( 0x1D7EF67C, 0x3DC2DA37, 0x4C86E9DC ), //  9128553932091860654576036316
 117     W3( 0x0CB7AA67, 0x2E087ED8, 0x2675D6E3 ), //  3935858248479385987820410595
 118     W3( 0x00BD7B24, 0x68388052, 0x57FFFB10 ), //   229068502577238003716979472
 119     W3( 0x1E1724A6, 0xBA587C3D, 0x0C12825B ), //  9312528669272006966417457755
 120     W3( 0x20595EF0, 0xC89DA33B, 0x3CB5583B ), // 10011563056352601486430394427
 121     W3( 0x15E730B2, 0x6D34E9EB, 0x71CCE555 ), //  6778677035560020292206912853
 122     W3( 0x015EFDBB, 0xC0A80C3B, 0xE4B1E017 ), //   424322259008787317821399063
 123     W3( 0x1B81FC63, 0x0C694944, 0x8EB481BF ), //  8513238559382277026756198847
 124     W3( 0x1AF53421, 0x5DCAA1A4, 0xD0C15A03 ), //  8343043259718611508685527555
 125     W3( 0x2F2B6B58, 0xBB60E464, 0x37A7DE2E ), // 14598286201875835624993840686
 126     W3( 0x27B4AB13, 0x54617640, 0xE86E757A ), // 12288329911800070034603013498
 127     W3( 0x041A31D2, 0xF0AC8E3C, 0x8AA4FD27 ), //  1269607397711669380834983207
 128     W3( 0x1A52F484, 0x39AC5917, 0x34E3F1F7 ), //  8146896869111203814625767927
 129     W3( 0x048FC120, 0x50F6ECBF, 0x268D86A8 ), //  1411728444351387120148776616
 130     W3( 0x27A2C427, 0x001F1239, 0x93380047 ), // 12266687669072434694473646151
 131     W3( 0x2E7E8DFB, 0x2411A754, 0xE12A9B1D ), // 14389305591459206001391737629
 132     W3( 0x29F14702, 0x40B3E1E2, 0xF7D71A8D ), // 12980571854778363745245010573
 133     W3( 0x3158DCE7, 0x8B8FEB32, 0x1DE35D24 ), // 15272194145252623177165790500
 134     W3( 0x12484C07, 0x437ED373, 0x9E45F602 ), //  5658131869639928287764805122
 135     W3( 0x1AEAE06E, 0xB905C908, 0x4389BF5F ), //  8330558749711089231534341983
 136     W3( 0x27BC0045, 0x43024FEB, 0xEC880258 ), // 12297194714773858676269122136
 137     W3( 0x2EFE1CBC, 0x0D2FAA94, 0xB4EA69A6 ), // 14543513305163560781242591654
 138     W3( 0x0B0D3D8B, 0xD779F105, 0x920367FA ), //  3420341787669373425792804858
 139     W3( 0x2D4D7BA9, 0x0970D8CF, 0x8CE6D7EC ), // 14020496699328277892009744364
 140     W3( 0x00DC5971, 0x0209470E, 0x713F2B27 ), //   266386055561000736260041511
 141     W3( 0x27E54E26, 0x53BA0137, 0xDD6740B3 ), // 12347128447319282384829366451
 142     W3( 0x2143A889, 0x8F2B57F5, 0xFB8181C1 ), // 10294799249108063706647986625
 143     W3( 0x1125419F, 0x5C4E0608, 0xE0AC0396 ), //  5306285315793562029414679446
 144     W3( 0x15B61D90, 0x63A27BB0, 0x26402B32 ), //  6719349317556695539371748146
 145     W3( 0x03B582FC, 0x419EF656, 0xB06BBC35 ), //  1147889163765050226454019125
 146     W3( 0x08FF62E1, 0xA3BB1145, 0xDA998F77 ), //  2784623116803271439773437815
 147     W3( 0x101978AF, 0xF93CBFA1, 0xB788B5A3 ), //  4982553232749484200897852835
 148     W3( 0x061334DE, 0x8FE5C6E9, 0x2B2309D6 ), //  1880129318103954373583505878
 149     W3( 0x343C6E7C, 0x8019BB43, 0xD954E744 ), // 16166277816826816936484857668
 150     W3( 0x06506A03, 0x0E6DE333, 0xF8011494 ), //  1954124751724394051182597268
 151     W3( 0x34892A42, 0x6502DAA3, 0x8FDA6971 ), // 16259042912153157504364865905
 152     W3( 0x0EF2C4BD, 0xF42D9711, 0xC32CEA49 ), //  4626279273705729025744104009
 153     W3( 0x24511305, 0x4F1EAE2C, 0x62FB10F4 ), // 11239473167855288013010178292
 154     W3( 0x14E5A052, 0xF1748A9C, 0xDD536730 ), //  6467301317787608309692589872
 155     W3( 0x0621D0A7, 0x0A5188AF, 0x7316C352 ), //  1897789944553576071437927250
 156     W3( 0x234498F0, 0xDF078E95, 0x6FEED50B ), // 10914904542475816633386325259
 157     W3( 0x029E4925, 0x948D6D57, 0xD4DF93A6 ), //   810325725128913871737688998
 158     W3( 0x11BB3805, 0x0589D746, 0x852F3E2F ), //  5487578840386649632552205871
 159     W3( 0x1D4370CA, 0xA4441B85, 0xC9606FE0 ), //  9056595957858187419376971744
 160     W3( 0x1C536F7D, 0x77D44926, 0x8DDB8932 ), //  8766447615182890705620797746
 161     W3( 0x3498CE71, 0xB726A4D3, 0xF4F3C813 ), // 16277952140466335672647796755
 162     W3( 0x1E4A297E, 0xAC13196E, 0xFACD8102 ), //  9374206759006667727054930178
 163     W3( 0x0E7C2CCC, 0xC940C98B, 0x0BC0CA49 ), //  4482908500893894680116251209
 164     W3( 0x124CF912, 0xD84438FD, 0x9C03585F ), //  5663784755954194195257972831
 165     W3( 0x06180FF8, 0xD447BEBE, 0xDB8821E7 ), //  1885999704184999223512015335
 166     W3( 0x1ED2EB11, 0x0687EC7C, 0xBE3436C8 ), //  9539534786948152514714023624
 167     W3( 0x30EBB35C, 0x59616A3C, 0x502CBB52 ), // 15140225046175435352009653074
 168     W3( 0x33E24883, 0xEDA36D60, 0xA25C8E5F ), // 16057295180155395438855097951
 169     W3( 0x0D879ED9, 0x076BAB06, 0x9BE12AA2 ), //  4187260250707927256570866338
 170     W3( 0x1A1B6C9C, 0x0966383B, 0x54123A87 ), //  8079764146434082816365050503
 171     W3( 0x31BD863A, 0xA2A6505C, 0xD759E6CF ), // 15393886339893077529104869071
 172     W3( 0x3209AF0A, 0x5E5055A1, 0x480AF03F ), // 15485957428841754012708171839
 173     W3( 0x1A4CC03C, 0xC8AA650B, 0x7F4DBCE9 ), //  8139396433274519652257348841
 174     W3( 0x3596471F, 0xB99D2EA3, 0xA3433E0C ), // 16584380266717730797139475980
 175     W3( 0x28E87642, 0x98E21FCE, 0xDE1B53EA ), // 12660429650750886812340409322
 176     W3( 0x20161DB9, 0xCDC199E9, 0x0A6BEDF2 ), //  9930257058416521571476434418
 177     W3( 0x1D0DC095, 0x2C40D22B, 0x088549BA ), //  8991690766592354745340742074
 178     W3( 0x2FCC953C, 0xA8B62408, 0x50FC4C29 ), // 14793121080372138443684989993
 179     W3( 0x0F854B39, 0xF659B4B5, 0xD2B0A6AC ), //  4803417528030967235217499820
 180     W3( 0x30E087D4, 0x02F3BBAB, 0xBA503373 ), // 15126721285415891216108630899
 181     W3( 0x0DAF660C, 0x26B99C42, 0x98B8BE05 ), //  4235349051642660841298902533
 182     W3( 0x0ED6AE0E, 0xCD02982A, 0xD233F0D9 ), //  4592322227691334993146278105
 183     W3( 0x3415EB9B, 0x4B61C19F, 0xB21F1255 ), // 16119720573722492095181034069
 184     W3( 0x1015A729, 0x20A1FAA2, 0x0D094529 ), //  4977936993224010619482096937
 185     W3( 0x1D2E3AD2, 0x7093579F, 0x1C93C97B ), //  9030953651705465548198627707
 186     W3( 0x130EAA8F, 0x859C980F, 0xD9E7E8ED ), //  5897945597894388791627999469
 187     W3( 0x2B7CA1C8, 0xFC34C5B5, 0x9C0B1C0C ), // 13458526232475976507763399692
 188     W3( 0x22367055, 0xA53B526A, 0x7505EABE ), // 10588302813110450634719881918
 189     W3( 0x344FEF55, 0x0B77067F, 0x38999E77 )  // 16189855864848287589134343799
 190   };
 191 #endif
 192
 193 // Define this if you want the external loops instead of inline operations.
 194 #define FFTP_EXTERNAL_LOOPS
 195
 196 // Define this for (cheap) consistency checks.
 197 //#define DEBUG_FFTP
 198
 199 // Define this for extensive consistency checks.
 200 //#define DEBUG_FFTP_OPERATIONS
 201
 202 // Define the algorithm of the backward FFT:
 203 // Either FORWARD (a normal FFT followed by a permutation)
 204 // or     RECIPROOT (an FFT with reciprocal root of unity)
 205 // or     CLEVER (an FFT with reciprocal root of unity but clever computation
 206 //                of the reciprocals).
 207 // Drawback of FORWARD: the permutation pass.
 208 // Drawback of RECIPROOT: need all the powers of the root, not only half of them.
 209 #define FORWARD   42
 210 #define RECIPROOT 43
 211 #define CLEVER    44
 212 #define FFTP_BACKWARD CLEVER
 213
 214 // r := a + b
 215 static inline void add (const fftp_word& a, const fftp_word& b, fftp_word& r)
 216 {
 217 #ifdef FFTP_EXTERNAL_LOOPS
 218         add_loop_lsp(arrayLSDptr(a._w,3),arrayLSDptr(b._w,3),arrayLSDptr(r._w,3),3);
 219 #else
 220         var uint32 tmp;
 221
 222         tmp = a.w0 + b.w0;
 223         if (tmp >= a.w0) {
 224                 // no carry
 225                 r.w0 = tmp;
 226                 tmp = a.w1 + b.w1;
 227                 if (tmp >= a.w1) goto no_carry_1; else goto carry_1;
 228         } else {
 229                 // carry
 230                 r.w0 = tmp;
 231                 tmp = a.w1 + b.w1 + 1;
 232                 if (tmp > a.w1) goto no_carry_1; else goto carry_1;
 233         }
 234         if (1) {
 235                 no_carry_1: // no carry
 236                 r.w1 = tmp;
 237                 tmp = a.w2 + b.w2;
 238         } else {
 239                 carry_1: // carry
 240                 r.w1 = tmp;
 241                 tmp = a.w2 + b.w2 + 1;
 242         }
 243         r.w2 = tmp;
 244 #endif
 245 }
 246
 247 // r := a - b
 248 static inline void sub (const fftp_word& a, const fftp_word& b, fftp_word& r)
 249 {
 250 #ifdef FFTP_EXTERNAL_LOOPS
 251         sub_loop_lsp(arrayLSDptr(a._w,3),arrayLSDptr(b._w,3),arrayLSDptr(r._w,3),3);
 252 #else
 253         var uint32 tmp;
 254
 255         tmp = a.w0 - b.w0;
 256         if (tmp <= a.w0) {
 257                 // no carry
 258                 r.w0 = tmp;
 259                 tmp = a.w1 - b.w1;
 260                 if (tmp <= a.w1) goto no_carry_1; else goto carry_1;
 261         } else {
 262                 // carry
 263                 r.w0 = tmp;
 264                 tmp = a.w1 - b.w1 - 1;
 265                 if (tmp < a.w1) goto no_carry_1; else goto carry_1;
 266         }
 267         if (1) {
 268                 no_carry_1: // no carry
 269                 r.w1 = tmp;
 270                 tmp = a.w2 - b.w2;
 271         } else {
 272                 carry_1: // carry
 273                 r.w1 = tmp;
 274                 tmp = a.w2 - b.w2 - 1;
 275         }
 276         r.w2 = tmp;
 277 #endif
 278 }
 279
 280 // b := a >> 1
 281 static inline void shift (const fftp_word& a, fftp_word& b)
 282 {
 283 #ifdef FFTP_EXTERNAL_LOOPS
 284         #ifdef DEBUG_FFTP
 285         if (shiftrightcopy_loop_msp(arrayMSDptr(a._w,3),arrayMSDptr(b._w,3),3,1,0))
 286                 throw runtime_exception();
 287         #else
 288         shiftrightcopy_loop_msp(arrayMSDptr(a._w,3),arrayMSDptr(b._w,3),3,1,0);
 289         #endif
 290 #else
 291         var uint32 tmp, carry;
 292
 293         tmp = a.w2;
 294         b.w2 = a.w2 >> 1;
 295         carry = tmp << 31;
 296         tmp = a.w1;
 297         b.w1 = (tmp >> 1) | carry;
 298         carry = tmp << 31;
 299         tmp = a.w0;
 300         b.w0 = (tmp >> 1) | carry;
 301         #ifdef DEBUG_FFTP
 302         carry = tmp << 31;
 303         if (carry)
 304                 throw runtime_exception();
 305         #endif
 306 #endif
 307 }
 308
 309 #ifdef DEBUG_FFTP_OPERATIONS
 310 #define check_fftp_word(x)  if (compare_loop_msp(arrayMSDptr((x)._w,3),arrayMSDptr(p._w,3),3) >= 0) throw runtime_exception()
 311 #else
 312 #define check_fftp_word(x)
 313 #endif
 314
 315 // r := (a + b) mod p
 316 static inline void addp (const fftp_word& a, const fftp_word& b, fftp_word& r)
 317 {
 318         check_fftp_word(a); check_fftp_word(b);
 319 #ifdef FFTP_EXTERNAL_LOOPS
 320         add(a,b, r);
 321         if (compare_loop_msp(arrayMSDptr(r._w,3),arrayMSDptr(p._w,3),3) >= 0)
 322                 sub(r,p, r);
 323 #else
 324         add(a,b, r);
 325         if ((r.w2 > p.w2)
 326             || ((r.w2 == p.w2)
 327                 && ((r.w1 > p.w1)
 328                     || ((r.w1 == p.w1)
 329                         && (r.w0 >= p.w0)))))
 330                 sub(r,p, r);
 331 #endif
 332         check_fftp_word(r);
 333 }
 334
 335 // r := (a - b) mod p
 336 static inline void subp (const fftp_word& a, const fftp_word& b, fftp_word& r)
 337 {
 338         check_fftp_word(a); check_fftp_word(b);
 339         sub(a,b, r);
 340         if ((sint32)r.w2 < 0)
 341                 add(r,p, r);
 342         check_fftp_word(r);
 343 }
 344
 345 // r := (a * b) mod p
 346 static void mulp (const fftp_word& a, const fftp_word& b, fftp_word& r)
 347 {
 348         check_fftp_word(a); check_fftp_word(b);
 349 #if defined(FFT_P_94)
 350         var uintD c[6];
 351         var uintD* const cLSDptr = arrayLSDptr(c,6);
 352         // Multiply the two words, using the standard method.
 353         mulu_2loop(arrayLSDptr(a._w,3),3, arrayLSDptr(b._w,3),3, cLSDptr);
 354         // c[0..5] now contains the product.
 355         // Divide by p.
 356         // To divide c (0 <= c < p^2) by p = 2^n+1,
 357         // we set q := floor(c/2^n) and r := c - q*p = (c mod 2^n) - q.
 358         // If this becomes negative, set r := r + p (at most twice).
 359         // (This works because  floor(c/p) <= q <= floor(c/p)+2.)
 360         // (Actually, here, 0 <= c <= (p-1)^2, hence
 361         // floor(c/p) <= q <= floor(c/p)+1, so we have
 362         // to set r := r + p at most once!)
 363         // n = 94 = 3*32-2 = 2*32+30.
 364         shiftleft_loop_lsp(cLSDptr lspop 3,3,2,lspref(cLSDptr,2)>>30);
 365         lspref(cLSDptr,2) &= bit(30)-1;
 366         // c[0..2] now contains q, c[3..5] contains (c mod 2^n).
 367         #if 0
 368         if (compare_loop_msp(cLSDptr lspop 6,arrayMSDptr(p._w,3),3) >= 0) // q >= p ?
 369                 subfrom_loop_lsp(arrayLSDptr(p._w,3),cLSDptr lspop 3,3); // q -= p;
 370         #endif
 371         if (subfrom_loop_lsp(cLSDptr lspop 3,cLSDptr,3)) // (c mod 2^n) - q
 372                 addto_loop_lsp(arrayLSDptr(p._w,3),cLSDptr,3);
 373         r.w2 = lspref(cLSDptr,2); r.w1 = lspref(cLSDptr,1); r.w0 = lspref(cLSDptr,0);
 374 #elif defined(FFT_P_92)
 375         var uintD c[7];
 376         var uintD* const cLSDptr = arrayLSDptr(c,7);
 377         // Multiply the two words, using the standard method.
 378         mulu_2loop(arrayLSDptr(a._w,3),3, arrayLSDptr(b._w,3),3, cLSDptr);
 379         // c[1..6] now contains the product.
 380         // Divide by p.
 381         // To divide c (0 <= c < p^2) by p = 7*2^n+1,
 382         // we set q := floor(floor(c/2^n)/7) and
 383         // r := c - q*p = (floor(c/2^n) mod 7)*2^n + (c mod 2^n) - q.
 384         // If this becomes negative, set r := r + p.
 385         // (As above, since 0 <= c <= (p-1)^2, we have
 386         // floor(c/p) <= q <= floor(c/p)+1, so we have
 387         // to set r := r + p at most once!)
 388         // n = 92 = 3*32-4 = 2*32+28.
 389         lspref(cLSDptr,6) = shiftleft_loop_lsp(cLSDptr lspop 3,3,4,lspref(cLSDptr,2)>>28);
 390         lspref(cLSDptr,2) &= bit(28)-1;
 391         // c[0..3] now contains floor(c/2^n), c[4..6] contains (c mod 2^n).
 392         var uintD remainder = divu_loop_msp(7,cLSDptr lspop 7,4);
 393         lspref(cLSDptr,2) |= remainder << 28;
 394         // c[0..3] now contains q, c[4..6] contains (c mod 7*2^n).
 395         #ifdef DEBUG_FFTP
 396         if (lspref(cLSDptr,6) > 0)
 397                 throw runtime_exception();
 398         #endif
 399         #if 0
 400         if (compare_loop_msp(cLSDptr lspop 6,arrayMSDptr(p._w,3),3) >= 0) // q >= p ?
 401                 subfrom_loop_lsp(arrayLSDptr(p._w,3),cLSDptr lspop 3,3); // q -= p;
 402         #endif
 403         if (subfrom_loop_lsp(cLSDptr lspop 3,cLSDptr,3)) // (c mod 2^n) - q
 404                 addto_loop_lsp(arrayLSDptr(p._w,3),cLSDptr,3);
 405         r.w2 = lspref(cLSDptr,2); r.w1 = lspref(cLSDptr,1); r.w0 = lspref(cLSDptr,0);
 406 #else
 407 #error "mulp not implemented for this prime"
 408 #endif
 409         if ((sint32)r.w2 < 0)
 410                 throw runtime_exception();
 411         check_fftp_word(r);
 412 }
 413 #ifdef DEBUG_FFTP_OPERATIONS
 414 static void mulp_doublecheck (const fftp_word& a, const fftp_word& b, fftp_word& r)
 415 {
 416         fftp_word zero, ma, mb, or;
 417         subp(a,a, zero);
 418         subp(zero,a, ma);
 419         subp(zero,b, mb);
 420         mulp(ma,mb, or);
 421         mulp(a,b, r);
 422         if (compare_loop_msp(arrayMSDptr(r._w,3),arrayMSDptr(or._w,3),3))
 423                 throw runtime_exception();
 424 }
 425 #define mulp mulp_doublecheck
 426 #endif /* DEBUG_FFTP_OPERATIONS */
 427
 428 // b := (a / 2) mod p
 429 static inline void shiftp (const fftp_word& a, fftp_word& b)
 430 {
 431         check_fftp_word(a);
 432         if (a.w0 & 1) {
 433                 var fftp_word a_even;
 434                 add(a,p, a_even);
 435                 shift(a_even, b);
 436         } else
 437                 shift(a, b);
 438         check_fftp_word(b);
 439 }
 440
 441 #ifndef _BIT_REVERSE
 442 #define _BIT_REVERSE
 443 // Reverse an n-bit number x. n>0.
 444 static uintC bit_reverse (uintL n, uintC x)
 445 {
 446         var uintC y = 0;
 447         do {
 448                 y <<= 1;
 449                 y |= (x & 1);
 450                 x >>= 1;
 451         } while (!(--n == 0));
 452         return y;
 453 }
 454 #endif
 455
 456 // Compute an convolution mod p using FFT: z[0..N-1] := x[0..N-1] * y[0..N-1].
 457 static void fftp_convolution (const uintL n, const uintC N, // N = 2^n
 458                               fftp_word * x, // N words
 459                               fftp_word * y, // N words
 460                               fftp_word * z  // N words result
 461                              )
 462 {
 463         CL_ALLOCA_STACK;
 464         #if (FFTP_BACKWARD == RECIPROOT) || defined(DEBUG_FFTP)
 465         var fftp_word* const w = cl_alloc_array(fftp_word,N);
 466         #else
 467         var fftp_word* const w = cl_alloc_array(fftp_word,(N>>1)+1);
 468         #endif
 469         var uintC i;
 470         // Initialize w[i] to w^i, w a primitive N-th root of unity.
 471         w[0] = fftp_roots_of_1[0];
 472         w[1] = fftp_roots_of_1[n];
 473         #if (FFTP_BACKWARD == RECIPROOT) || defined(DEBUG_FFTP)
 474         for (i = 2; i < N; i++)
 475                 mulp(w[i-1],fftp_roots_of_1[n], w[i]);
 476         #else // need only half of the roots
 477         for (i = 2; i < N>>1; i++)
 478                 mulp(w[i-1],fftp_roots_of_1[n], w[i]);
 479         #endif
 480         #ifdef DEBUG_FFTP
 481         // Check that w is really a primitive N-th root of unity.
 482         {
 483                 var fftp_word w_N;
 484                 mulp(w[N-1],fftp_roots_of_1[n], w_N);
 485                 if (!(w_N.w2 == 0 && w_N.w1 == 0 && w_N.w0 == 1))
 486                         throw runtime_exception();
 487                 w_N = w[N>>1];
 488                 if (!(w_N.w2 == p.w2 && w_N.w1 == p.w1 && w_N.w0 == p.w0 - 1))
 489                         throw runtime_exception();
 490         }
 491         #endif
 492         var bool squaring = (x == y);
 493         // Do an FFT of length N on x.
 494         {
 495                 var sintL l;
 496                 /* l = n-1 */ {
 497                         var const uintC tmax = N>>1; // tmax = 2^(n-1)
 498                         for (var uintC t = 0; t < tmax; t++) {
 499                                 var uintC i1 = t;
 500                                 var uintC i2 = i1 + tmax;
 501                                 // Butterfly: replace (x(i1),x(i2)) by
 502                                 // (x(i1) + x(i2), x(i1) - x(i2)).
 503                                 var fftp_word tmp;
 504                                 tmp = x[i2];
 505                                 subp(x[i1],tmp, x[i2]);
 506                                 addp(x[i1],tmp, x[i1]);
 507                         }
 508                 }
 509                 for (l = n-2; l>=0; l--) {
 510                         var const uintC smax = (uintC)1 << (n-1-l);
 511                         var const uintC tmax = (uintC)1 << l;
 512                         for (var uintC s = 0; s < smax; s++) {
 513                                 var uintC exp = bit_reverse(n-1-l,s) << l;
 514                                 for (var uintC t = 0; t < tmax; t++) {
 515                                         var uintC i1 = (s << (l+1)) + t;
 516                                         var uintC i2 = i1 + tmax;
 517                                         // Butterfly: replace (x(i1),x(i2)) by
 518                                         // (x(i1) + w^exp*x(i2), x(i1) - w^exp*x(i2)).
 519                                         var fftp_word tmp;
 520                                         mulp(x[i2],w[exp], tmp);
 521                                         subp(x[i1],tmp, x[i2]);
 522                                         addp(x[i1],tmp, x[i1]);
 523                                 }
 524                         }
 525                 }
 526         }
 527         // Do an FFT of length N on y.
 528         if (!squaring) {
 529                 var sintL l;
 530                 /* l = n-1 */ {
 531                         var uintC const tmax = N>>1; // tmax = 2^(n-1)
 532                         for (var uintC t = 0; t < tmax; t++) {
 533                                 var uintC i1 = t;
 534                                 var uintC i2 = i1 + tmax;
 535                                 // Butterfly: replace (y(i1),y(i2)) by
 536                                 // (y(i1) + y(i2), y(i1) - y(i2)).
 537                                 var fftp_word tmp;
 538                                 tmp = y[i2];
 539                                 subp(y[i1],tmp, y[i2]);
 540                                 addp(y[i1],tmp, y[i1]);
 541                         }
 542                 }
 543                 for (l = n-2; l>=0; l--) {
 544                         var const uintC smax = (uintC)1 << (n-1-l);
 545                         var const uintC tmax = (uintC)1 << l;
 546                         for (var uintC s = 0; s < smax; s++) {
 547                                 var uintC exp = bit_reverse(n-1-l,s) << l;
 548                                 for (var uintC t = 0; t < tmax; t++) {
 549                                         var uintC i1 = (s << (l+1)) + t;
 550                                         var uintC i2 = i1 + tmax;
 551                                         // Butterfly: replace (y(i1),y(i2)) by
 552                                         // (y(i1) + w^exp*y(i2), y(i1) - w^exp*y(i2)).
 553                                         var fftp_word tmp;
 554                                         mulp(y[i2],w[exp], tmp);
 555                                         subp(y[i1],tmp, y[i2]);
 556                                         addp(y[i1],tmp, y[i1]);
 557                                 }
 558                         }
 559                 }
 560         }
 561         // Multiply the transformed vectors into z.
 562         for (i = 0; i < N; i++)
 563                 mulp(x[i],y[i], z[i]);
 564         // Undo an FFT of length N on z.
 565         {
 566                 var uintL l;
 567                 for (l = 0; l < n-1; l++) {
 568                         var const uintC smax = (uintC)1 << (n-1-l);
 569                         var const uintC tmax = (uintC)1 << l;
 570                         #if FFTP_BACKWARD != CLEVER
 571                         for (var uintC s = 0; s < smax; s++) {
 572                                 var uintC exp = bit_reverse(n-1-l,s) << l;
 573                                 #if FFTP_BACKWARD == RECIPROOT
 574                                 if (exp > 0)
 575                                         exp = N - exp; // negate exp (use w^-1 instead of w)
 576                                 #endif
 577                                 for (var uintC t = 0; t < tmax; t++) {
 578                                         var uintC i1 = (s << (l+1)) + t;
 579                                         var uintC i2 = i1 + tmax;
 580                                         // Inverse Butterfly: replace (z(i1),z(i2)) by
 581                                         // ((z(i1)+z(i2))/2, (z(i1)-z(i2))/(2*w^exp)).
 582                                         var fftp_word sum;
 583                                         var fftp_word diff;
 584                                         addp(z[i1],z[i2], sum);
 585                                         subp(z[i1],z[i2], diff);
 586                                         shiftp(sum, z[i1]);
 587                                         mulp(diff,w[exp], diff); shiftp(diff, z[i2]);
 588                                 }
 589                         }
 590                         #else // FFTP_BACKWARD == CLEVER: clever handling of negative exponents
 591                         /* s = 0, exp = 0 */ {
 592                                 for (var uintC t = 0; t < tmax; t++) {
 593                                         var uintC i1 = t;
 594                                         var uintC i2 = i1 + tmax;
 595                                         // Inverse Butterfly: replace (z(i1),z(i2)) by
 596                                         // ((z(i1)+z(i2))/2, (z(i1)-z(i2))/(2*w^exp)),
 597                                         // with exp <-- 0.
 598                                         var fftp_word sum;
 599                                         var fftp_word diff;
 600                                         addp(z[i1],z[i2], sum);
 601                                         subp(z[i1],z[i2], diff);
 602                                         shiftp(sum, z[i1]);
 603                                         shiftp(diff, z[i2]);
 604                                 }
 605                         }
 606                         for (var uintC s = 1; s < smax; s++) {
 607                                 var uintC exp = bit_reverse(n-1-l,s) << l;
 608                                 exp = (N>>1) - exp; // negate exp (use w^-1 instead of w)
 609                                 for (var uintC t = 0; t < tmax; t++) {
 610                                         var uintC i1 = (s << (l+1)) + t;
 611                                         var uintC i2 = i1 + tmax;
 612                                         // Inverse Butterfly: replace (z(i1),z(i2)) by
 613                                         // ((z(i1)+z(i2))/2, (z(i1)-z(i2))/(2*w^exp)),
 614                                         // with exp <-- (N/2 - exp).
 615                                         var fftp_word sum;
 616                                         var fftp_word diff;
 617                                         addp(z[i1],z[i2], sum);
 618                                         subp(z[i2],z[i1], diff); // note that w^(N/2) = -1
 619                                         shiftp(sum, z[i1]);
 620                                         mulp(diff,w[exp], diff); shiftp(diff, z[i2]);
 621                                 }
 622                         }
 623                         #endif
 624                 }
 625                 /* l = n-1 */ {
 626                         var const uintC tmax = N>>1; // tmax = 2^(n-1)
 627                         for (var uintC t = 0; t < tmax; t++) {
 628                                 var uintC i1 = t;
 629                                 var uintC i2 = i1 + tmax;
 630                                 // Inverse Butterfly: replace (z(i1),z(i2)) by
 631                                 // ((z(i1)+z(i2))/2, (z(i1)-z(i2))/2).
 632                                 var fftp_word sum;
 633                                 var fftp_word diff;
 634                                 addp(z[i1],z[i2], sum);
 635                                 subp(z[i1],z[i2], diff);
 636                                 shiftp(sum, z[i1]);
 637                                 shiftp(diff, z[i2]);
 638                         }
 639                 }
 640         }
 641         #if FFTP_BACKWARD == FORWARD
 642         // Swap z[i] and z[N-i] for 0 < i < N/2.
 643         for (i = (N>>1)-1; i > 0; i--) {
 644                 var fftp_word tmp = z[i];
 645                 z[i] = z[N-i];
 646                 z[N-i] = tmp;
 647         }
 648         #endif
 649 }
 650
 651 static void mulu_fft_modp (const uintD* sourceptr1, uintC len1,
 652                            const uintD* sourceptr2, uintC len2,
 653                            uintD* destptr)
 654 // Es ist 2 <= len1 <= len2.
 655 {
 656         // Methode:
 657         // source1 ist ein Stück der Länge N1, source2 ein oder mehrere Stücke
 658         // der Länge N2, mit N1+N2 <= N, wobei N Zweierpotenz ist.
 659         // sum(i=0..N-1, x_i b^i) * sum(i=0..N-1, y_i b^i) wird errechnet,
 660         // indem man die beiden Polynome
 661         // sum(i=0..N-1, x_i T^i), sum(i=0..N-1, y_i T^i)
 662         // multipliziert, und zwar durch Fourier-Transformation (s.o.).
 663         var uint32 n;
 664         integerlengthC(len1-1, n=); // 2^(n-1) < len1 <= 2^n
 665         var uintC len = (uintC)1 << n; // kleinste Zweierpotenz >= len1
 666         // Wählt man N = len, so hat man ceiling(len2/(len-len1+1)) * FFT(len).
 667         // Wählt man N = 2*len, so hat man ceiling(len2/(2*len-len1+1)) * FFT(2*len).
 668         // Wir wählen das billigere von beiden:
 669         // Bei ceiling(len2/(len-len1+1)) <= 2 * ceiling(len2/(2*len-len1+1))
 670         // nimmt man N = len, bei ....... > ........ dagegen N = 2*len.
 671         // (Wahl von N = 4*len oder mehr bringt nur in Extremfällen etwas.)
 672         if (len2 > 2 * (len-len1+1) * (len2 <= (2*len-len1+1) ? 1 : ceiling(len2,(2*len-len1+1)))) {
 673                 n = n+1;
 674                 len = len << 1;
 675         }
 676         var const uintC N = len; // N = 2^n
 677         CL_ALLOCA_STACK;
 678         var fftp_word* const x = cl_alloc_array(fftp_word,N);
 679         var fftp_word* const y = cl_alloc_array(fftp_word,N);
 680         #ifdef DEBUG_FFTP
 681         var fftp_word* const z = cl_alloc_array(fftp_word,N);
 682         #else
 683         var fftp_word* const z = x; // put z in place of x - saves memory
 684         #endif
 685         var uintD* const tmpprod = cl_alloc_array(uintD,len1+1);
 686         var uintP i;
 687         var uintC destlen = len1+len2;
 688         clear_loop_lsp(destptr,destlen);
 689         do {
 690                 var uintC len2p; // length of a piece of source2
 691                 len2p = N - len1 + 1;
 692                 if (len2p > len2)
 693                         len2p = len2;
 694                 // len2p = min(N-len1+1,len2).
 695                 if (len2p == 1) {
 696                         // cheap case
 697                         var uintD* tmpptr = arrayLSDptr(tmpprod,len1+1);
 698                         mulu_loop_lsp(lspref(sourceptr2,0),sourceptr1,tmpptr,len1);
 699                         if (addto_loop_lsp(tmpptr,destptr,len1+1))
 700                                 if (inc_loop_lsp(destptr lspop (len1+1),destlen-(len1+1)))
 701                                         throw runtime_exception();
 702                 } else {
 703                         var uintC destlenp = len1 + len2p - 1;
 704                         // destlenp = min(N,destlen-1).
 705                         var bool squaring = ((sourceptr1 == sourceptr2) && (len1 == len2p));
 706                         // Fill factor x.
 707                         {
 708                                 for (i = 0; i < len1; i++) {
 709                                         x[i].w0 = lspref(sourceptr1,i);
 710                                         x[i].w1 = 0;
 711                                         x[i].w2 = 0;
 712                                 }
 713                                 for (i = len1; i < N; i++) {
 714                                         x[i].w0 = 0;
 715                                         x[i].w1 = 0;
 716                                         x[i].w2 = 0;
 717                                 }
 718                         }
 719                         // Fill factor y.
 720                         if (!squaring) {
 721                                 for (i = 0; i < len2p; i++) {
 722                                         y[i].w0 = lspref(sourceptr2,i);
 723                                         y[i].w1 = 0;
 724                                         y[i].w2 = 0;
 725                                 }
 726                                 for (i = len2p; i < N; i++) {
 727                                         y[i].w0 = 0;
 728                                         y[i].w1 = 0;
 729                                         y[i].w2 = 0;
 730                                 }
 731                         }
 732                         // Multiply.
 733                         if (!squaring)
 734                                 fftp_convolution(n,N, &x[0], &y[0], &z[0]);
 735                         else
 736                                 fftp_convolution(n,N, &x[0], &x[0], &z[0]);
 737                         #ifdef DEBUG_FFTP
 738                         // Check result.
 739                         for (i = 0; i < N; i++)
 740                                 if (!(z[i].w2 < N))
 741                                         throw runtime_exception();
 742                         #endif
 743                         // Add result to destptr[-destlen..-1]:
 744                         {
 745                                 var uintD* ptr = destptr;
 746                                 // ac2|ac1|ac0 are an accumulator.
 747                                 var uint32 ac0 = 0;
 748                                 var uint32 ac1 = 0;
 749                                 var uint32 ac2 = 0;
 750                                 var uint32 tmp;
 751                                 for (i = 0; i < destlenp; i++) {
 752                                         // Add z[i] to the accumulator.
 753                                         tmp = z[i].w0;
 754                                         if ((ac0 += tmp) < tmp) {
 755                                                 if (++ac1 == 0)
 756                                                         ++ac2;
 757                                         }
 758                                         tmp = z[i].w1;
 759                                         if ((ac1 += tmp) < tmp)
 760                                                 ++ac2;
 761                                         tmp = z[i].w2;
 762                                         ac2 += tmp;
 763                                         // Add the accumulator's least significant word to destptr:
 764                                         tmp = lspref(ptr,0);
 765                                         if ((ac0 += tmp) < tmp) {
 766                                                 if (++ac1 == 0)
 767                                                         ++ac2;
 768                                         }
 769                                         lspref(ptr,0) = ac0;
 770                                         lsshrink(ptr);
 771                                         ac0 = ac1;
 772                                         ac1 = ac2;
 773                                         ac2 = 0;
 774                                 }
 775                                 // ac2 = 0.
 776                                 if (ac1 > 0) {
 777                                         if (!((i += 2) <= destlen))
 778                                                 throw runtime_exception();
 779                                         tmp = lspref(ptr,0);
 780                                         if ((ac0 += tmp) < tmp)
 781                                                 ++ac1;
 782                                         lspref(ptr,0) = ac0;
 783                                         lsshrink(ptr);
 784                                         tmp = lspref(ptr,0);
 785                                         ac1 += tmp;
 786                                         lspref(ptr,0) = ac1;
 787                                         lsshrink(ptr);
 788                                         if (ac1 < tmp)
 789                                                 if (inc_loop_lsp(ptr,destlen-i))
 790                                                         throw runtime_exception();
 791                                 } else if (ac0 > 0) {
 792                                         if (!((i += 1) <= destlen))
 793                                                 throw runtime_exception();
 794                                         tmp = lspref(ptr,0);
 795                                         ac0 += tmp;
 796                                         lspref(ptr,0) = ac0;
 797                                         lsshrink(ptr);
 798                                         if (ac0 < tmp)
 799                                                 if (inc_loop_lsp(ptr,destlen-i))
 800                                                         throw runtime_exception();
 801                                 }
 802                         }
 803                         #ifdef DEBUG_FFTP
 804                         // If destlenp < N, check that the remaining z[i] are 0.
 805                         for (i = destlenp; i < N; i++)
 806                                 if (z[i].w2 > 0 || z[i].w1 > 0 || z[i].w0 > 0)
 807                                         throw runtime_exception();
 808                         #endif
 809                 }
 810                 // Decrement len2.
 811                 destptr = destptr lspop len2p;
 812                 destlen -= len2p;
 813                 sourceptr2 = sourceptr2 lspop len2p;
 814                 len2 -= len2p;
 815         } while (len2 > 0);
 816 }
 817
 818 #undef FFT_P_94
 819 #undef FFT_P_92
 820 #undef w0
 821 #undef w1
 822 #undef w2
 823 #undef W3