-
Notifications
You must be signed in to change notification settings - Fork 10
/
zcubes.c
640 lines (558 loc) · 35.4 KB
/
zcubes.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
#ifdef VERIFY
#define SOFTASSERTS 1 // unlike asserts, softasserts won't be compiled unless SOFTASSERTS is define, this needs to happen before cstd.h is included
#endif
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/sysinfo.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <gmp.h>
#include "primes.h" // interface to primsieve and implementation of prime pipes for multi-threading/processing
#include "m64.h" // 64-bit Montgomery arithmetic
#include "b32.h" // 32-bit Barrett arithmetic (including functions for fast CRT-ing)
#include "bitmap.h" // simple bitmap test/set/lookup and enumeration
#include "cstd.h" // define SOFTASSERTS before this inlude if you want them on
#include "report.h" // reporting and output functions
#include "kdata.h" // loads cubic-reciprocity constraints for k, precomputes admissible d|k
#include "cbrts.h" // code for accessing precomputed cuberoots, inverses modulo small d, static cpmax, cdmin, sdmin, sdtab, ... declared here
/*
Copyright (c) 2019-2020 Andrew R. Booker and Andrew V. Sutherland
See LICENSE file for license details.
*/
// these globals are used both here and in zcheck.h so they need to be declared before the inlude below
static uint64_t dmax; // must be < 2^63
static uint64_t pdmin; // d divisible by p in [pdmin,dmax] must be prime
static uint64_t bpmin; // d divisible by p in [bpmin,dmax] are prime and greater than zmax >> ZMANYBITS (we never call zrcheckmany for these)
static int zmaxbits; // this can run up to ZMAXBITS
static uint128_t zmax128; // zmax128 <= ZMAX, zmin128 is a lower bound on dmax/(2^(1/3)-1)
static long double zmaxld; // long doubles only have 64 bits of integer precision (16 bit exponent), and we may truncate to double in certain situations
// We add a fudge factor to handle this (zmaxld is zmax128*(1+2^-62) + 1
#include "zcheck.h" // code for testing z's in arithmetic progressions and splitting long progressions
static uint64_t *rbuf; // local to this module
/*
This is the main module for zcubes, which has the following command line interfaces
zcubesx n k pmin pmax dmax zmax [options]
where x is the empty string or one of r,v,d,p indicating reporting, verifciation, debugging, or profiling (see make file)
If specified options is a number from 1 to 6 which will cause the program to only perform a subset of its functions (e.g. just enuemrate primes,
just compute cuberoots mod primes, just enumerate arithmetic progressions, etc...), see report.h for a complete list of options.
The parent process creates a prime pipe to enuemrate all primes p in [pmin,pmax] and creates n child processes to read the pripe and a separate sibling
to feed the pipe (this sibling is the only process that will call primesieve -- this is both more efficient and reduces the memory footprint).
The parent thread simply waits for all its children to finish, but it watches for aborts (if any child aborts due to an assert failure the parent
will immediately log an error and kill all the children -- this should never happen but is done to prevent zombies and not waste energy).
For all p in the interval [pmin,pmax] every d <= dmax not dividing k with largest prime factor p will be processed, meaning that we will check all
pairs (d,z) with z <= zmax,
The processing of the primes p is divided into six possible phases (in a large run only one or two will apply to a particular job).
* Cached phase (small primes p whose cuberoots have already been precomputed -- this also applies to powers of p and all prime power divisors of d/p)
* Uncached phase (cuberoots mod p > sqrt(dmax) have not been precomputed, nor have cuberoots mod d/p, but cuberoots mod prime power divisors of d/p have)
* Cocached phase (cuberoots mod p > sqrt(dmax) have not been precomputed but cuberoots modulo d/p have)
* Near prime phase (primes p close enough to dmax that in addition to cuberoots we have precomputed a complete table of inverses mod d/p
* Prime phase (primes p that are so close to dmax that we must have d=p)
* Bigprime phase (primes p for which we must have d=p and zmax/p is small enough that we know we don't want to split progressions)
For each p, the enumeration of the admissible d <= dmax with p as its largest prime divisor proceeds in two separate steps
* the part of d coprime to k is generated as the product of a power of p and powers of smaller primes, none of which divide k
for d not a power of p this step is handled by the functions enumd and enumcd (enumd may call enumcd or call itself recusrively)
* the part of d dividing k is generated by enumerating divisors d' of k that for each prime q|k satisfy v_q(d') in {0,v_q(k)}
this step is handled by the functions prockd which calls procd for each d'
Once both parts of d have been computed/chosen (the choice of gcd(k,d) is simply an index into a table of divisors of k), d is processed
by the function procd immediately below, or by procdcoprime for d coprime to k, or by procdbigprime for large prime d close to zmax.
Each of these functions will call one of zrcheckone, zrcheckafew, or zcrchecklift to check the resulting arithmetic progressions for z.
*/
#define MAXK 1000
#define IBATCH 256
#define CUBEROOT_BUFSIZE 88573 // 1+3+3^2+...+3^9+3^10, here 3^10 is max # cuberoots of k mod d for admissible k < 1000 and d < 2^63 coprime to k
// process d < 2^63 specified by (a,ki), where a is coprime to k and ki indexes an admissible factor of k (stored in kdtab)
static inline void procd (unsigned ki, uint64_t a, uint64_t za[], uint32_t ca)
{
uint64_t d, n;
uint32_t b, m;
unsigned i, si, mi, b2, b7;
softassert (verify_cuberoots_64(za,ca,a));
softassert (ki >= 0 && ki <= kdcnt && a <= kdmax[ki]);
d = a*kdtab[ki].d;
if ( ! report_d (d, ca*kdtab[ki].n) ) return;
si = sgnz_index(d);
mi = kdtab[ki].fi; m = k27ftab[mi].m;
b7 = ( onezmod7(d,si) ? 7 : 1 ); // for k = +/-2 mod 7, for 3/7 d will have z=0 mod 7
b2 = 1+(m&1&d); // note that m will be even whenever k is
b = m*b2*b7; // b will be 162 if k=3, otherwise 9, 18 or 126
// if we are reasonable close to zmax, just use z's mod a and b (no change in the number of arithmetic progressions but it may reduce their length)
// the term 4-log2(ca) is meant to make us more willing to spend time lifting when we have more arithmetic progressions that can benefit
n = fastceilboundl(zmaxld/((long double)a*b));
if ( n <= ZSHORT || (n*ca) <= ZFEW ) {
uint32_t zb[K27MAXN];
struct k27frec *x = k27ftab+mi;
uint64_t minv = x->minv[0]; softassert(minv);
uint16_t dm = b32_red(d,m,minv); softassert (mod3(dm) && (x->ztab[dm]||(m==k27&&dm==1)));
uint32_t cb = x->zcnts[dm];
uint16_t *r = k27zs + x->ztab[dm];
for ( i = 0 ; i < cb ; i++ ) zb[i] = r[i];
uint32_t ainvb = x->itab[b32_red(a,m,minv)]; softassert(b32_red(ainvb*b32_red(a,m,minv),m,minv)==1);
if ( b2 > 1 ) {
for ( i = 0 ; i < cb ; i++ ) if ( zb[i]&1 ) zb[i] += m; // lift to z = 0 mod 2
if ( !(ainvb&1) ) ainvb += m;
softassert (b32_red(ainvb*b32_red(a,b2*m,x->minv[1]),b2*m,x->minv[1])==1);
}
uint64_t binv = x->minv[2*(b7>1)+b2-1];
if ( b7 > 1 ) {
uint32_t b2m = b2*m;
for ( i = 0 ; i < cb ; i++ ) zb[i] = crt7(zb[i],b2m,0); // lift to 0 mod 7
ainvb = crt7 (ainvb, b2m, inv7(a));
softassert (b32_red(ainvb*b32_red(a,b,binv),b,binv)==1);
}
if ( (uint128_t)a*b > zmax128 ) zrcheckone (d, si, a, za, ca, b, zb, cb, ainvb, binv);
else zrcheckafew (d, si, a, za, ca, b, zb, cb, ainvb, binv, n);
} else {
// Lift progressions using cubic reciprocity constraints and auxiliary primes, then check
zrchecklift (d, si, ki, a, za, ca);
}
profile_checkpoint (); // if we are profiling and have collected enough information, this will end the run
}
// process d < 2^63 coprime to k
static inline void procdcoprime (uint64_t d, uint64_t z[], uint32_t c)
{
uint32_t b;
unsigned si, mi;
softassert (verify_cuberoots_64(z,c,d));
if ( ! report_d (d,c) ) return;
si = sgnz_index(d);
mi = (km1&d&1) + 2*( onezmod7(d,si) ? 1 : 0 );
b = km[mi]; softassert(b);
// if we are reasonable close to zmax, just use z's mod a and b (no change in the number of arithmetic progressions but it may reduce their length)
// the term 4-log2(ca) is meant to make us more willing to spend time lifting when we have more arithmetic progressions that can benefit
uint64_t l = fastceilboundl(zmaxld/((long double)d*b));
if ( l <= ZSHORT || l*c <= ZFEW ) {
uint64_t binv = kminv[mi];
uint32_t db = b32_red(d,b,binv);
uint32_t *zb = kmztab[mi]+db;
uint32_t dinvb = kmitab[mi][db]; softassert (b32_red(d*dinvb,b,binv)==1);
if ( (uint128_t)d*b > zmax128 ) zrcheckone (d, si, d, z, c, b, zb, 1, dinvb, binv);
else zrcheckafew (d, si, d, z, c, b, zb, 1, dinvb, binv, l);
} else {
// Lift progressions using cubic reciprocity constraints and auxiliary primes, then check
zrchecklift (d, si, 0, d, z, c);
}
profile_checkpoint (); // if we are profiling and have collected enough information, this will end the run
}
// process large prime d < 2^63 (large means close enough to zmax that we don't want to think about lifting other than modulo b where cb=1
// this means b=162 if k=3, b=126 if k = +/- 2 mod 7 and onezmod(d,si) is set, and b=18 otherwise
static inline void procdbigprime (uint64_t d, uint64_t z[], uint32_t c, uint32_t si, uint32_t mi, uint32_t l)
{
softassert (mi < 4 && km[mi]);
softassert (verify_cuberoots_64(z,c,d));
if ( ! report_d (d, c) ) return;
uint64_t binv = kminv[mi];
uint32_t b = km[mi], db = b32_red(d,b,binv), dinvb = kmitab[mi][db], *zb = kmztab[mi]+db;
if ( l==1 ) zrcheckone (d, si, d, z, c, b, zb, 1, dinvb, binv);
else zrcheckafew (d, si, d, z, c, b, zb, 1, dinvb, binv, l);
profile_checkpoint (); // if we are profiling and have collected enough information, this will end the run
}
// process d and all multiples d*m with m an admissible divisor of k (which is automatically coprime to d)
static inline void prockd (uint64_t d, uint64_t zd[], unsigned n)
{
procdcoprime (d, zd, n);
for ( int i = 1 ; d <= kdmax[i] ; i++ ) procd (i,d,zd,n);
}
// processes admissible dmin > cdmin (so d*cdmax >= dmax) with smallest prime divisor p (which may be less than cdmax)
// zd is a list of n cuberoots of k mod d, p is largest p|d, r is workspace for CRT-lifted cuberoots
static void inline enumcd (uint64_t d, uint64_t p, uint64_t zd[], uint32_t n, uint64_t *r)
{
struct cdrec *z[IBATCH];
struct cdrec *x;
uint64_t ai[IBATCH];
uint64_t dinv, R, R2, R3;
uint64_t *s;
uint32_t i,j,m;
softassert (d >= cdmin);
if ( ! (x = cdentry (p-1,d,dmax)) ) return;
softassert((uint128_t)d*x->d <= dmax);
softassert(x->p < p);
if ( d < sdmin ) { dinv = m64_pinv(d); R = m64_R(d); R2 = m64_R2(R,d); R3 = m64_R3 (R2,d,dinv); } else { dinv=R=R2=R3=0; }
for ( m = 0 ;;) { // terminates below when x hits the bottom of the cache, with x->d = 0
if ( !x->d || m == IBATCH ) {
if ( !m ) return;
softassert(dinv);
m64_inv_array (ai,ai,m,R,R2,R3,d,dinv);
for ( i = 0 ; i < m ; i++ ) {
uint64_t a = z[i]->d, u = a*m64_to_ui(ai[i],d,dinv) - 1, ab = a*d;
for ( s = r, j = 0 ; j < z[i]->n ; j++ ) {
uint64_t nza = a-cdroots[z[i]->r+j];
for ( int ii = 0 ; ii < n ; ii++ ) *s++ = fcrt64(u,nza,zd[ii],ab);
}
prockd (ab,r,s-r);
}
if ( !x->d ) return;
m = 0;
}
softassert((uint128_t)d*x->d <= dmax);
softassert(x->p < p);
if ( x->d <= sdmax ) {
struct sdrec *y = sdtab+x->sdpi;
softassert (y->d == x->d);
uint64_t sdinv = y->dinv;
uint64_t dinvsd = sdinvs[y->i+b32_red(d,y->d,sdinv)];
for ( i = 0, s = r ; i < n ; i++ ) for ( j = 0 ; j < x->n ; j++ ) *s++ = b32_crt64 (zd[i],d,sdroots[y->r+j],y->d,dinvsd,sdinv);
prockd (d*y->d,r,s-r);
} else {
ai[m] = m64_from_ui_R2 (x->d,R2,d,dinv); z[m] = x;
m++;
}
for ( x-- ; x->p >= p ; x-- );
}
}
// recursively enumerate admissible multiples of d by taking on prime powers; recursion ends with a call to enumcd
// zd is a list of n cuberoots of k mod d, p is the smallest prime divisor of d, r is workspace for CRT-lifted cuberoots
static void enumd (uint64_t d, uint64_t p, uint64_t zd[], uint32_t n, uint64_t *r)
{
uint64_t dinv, R, R2, R3;
uint64_t qq[IBATCH],ai[IBATCH], qz[3];
uint32_t qpi[IBATCH], qe[IBATCH];
uint64_t a,u,nza,ab,q,*s;
uint32_t i,j,e,qn,m,pi;
if ( d >= cdmin ) { enumcd (d,p,zd,n,r); return; }
softassert (p <= cpmax || (uint128_t)d*cpmax >= dmax );
if ( ! (pi = pimaxp (p-1,d,dmax)) ) return;
dinv = m64_pinv(d); R = m64_R(d); R2 = m64_R2(R,d); R3 = m64_R3 (R2,d,dinv);
q = cptab[pi]; e = 1;
for ( m = 0 ;; m++ ) { // terminates below when pi hits 0
if ( ! pi || m == IBATCH ) {
if ( !m ) return;
m64_inv_array (ai,ai,m,R,R2,R3,d,dinv);
for ( i = 0 ; i < m ; i++ ) {
a = qq[i]; u = a*m64_to_ui(ai[i],d,dinv) - 1; ab = a*d;
qn = cached_cuberoots_modq (qz,qpi[i],qe[i]);
s = r;
for ( j = 0 ; j < qn ; j++ ) { nza = a-qz[j]; for ( int ii = 0 ; ii < n ; ii++ ) *s++ = fcrt64(u,nza,zd[ii],ab); }
prockd (ab,r,s-r);
if ( ab >= cdmin ) enumcd (ab,cptab[qpi[i]],r,s-r,s);
else enumd (ab,cptab[qpi[i]],r,s-r,s);
}
if (!pi) return;
m = 0;
}
softassert((uint128_t)d*q <= dmax);
// for small q we could look up inverses here
qq[m] = q; qpi[m] = pi; qe[m] = e;
ai[m] = m64_from_ui_R2 (q,R2,d,dinv);
q *= cptab[pi]; e++;
if ( (uint128_t)d*q > dmax ) { q = cptab[--pi]; e = 1; }
}
}
static void precompute (uint32_t k, uint64_t pmin, uint64_t pmax)
{
char zbuf[64];
precompute_kdata (k, dmax);
precompute_zchecks (k);
precompute_cuberoots(k, pmin, pmax, dmax);
pdmin = 1 + dmax / (kdmin ? _min(kdmin,cptab[1]) : cptab[1]); // d >= pdmin must be prime not dividing k (and > 3)
if ( pdmin <= k ) pdmin = k+1;
bpmin = fastceilboundl(zmaxld/((km1&1?km2:km1)*ZSHORT)); // for d >= bpmin we will never use zrcheckmany
if ( bpmin <= 7 ) bpmin = 11;
report_printf ("LIMITS:pmin=%lu:pmax%lu:dmax=%lu:zmax=%s:cpmax=%u:cqmax=%lu:cdmax=%u:cdmin=%lu:sdmin=%lu:pdmin=%lu:bpmin=%lu\n", pmin, pmax, dmax, itoa128(zbuf,zmax128), cpmax, cqmax, cdmax, cdmin, sdmin, pdmin, bpmin);
}
void allocate_private_buffers (void)
{
rbuf = private_malloc (CUBEROOT_BUFSIZE*sizeof(*rbuf));
zabuf[0] = private_malloc((1<<ZBUFBITS)*sizeof(*zabuf[0]));
zabuf[1] = private_malloc((1<<ZBUFBITS)*sizeof(*zabuf[0]));
zbbuf[0] = private_malloc((1<<ZBUFBITS)*sizeof(*zbbuf[1]));
zbbuf[1] = private_malloc((1<<ZBUFBITS)*sizeof(*zbbuf[1]));
bm0buf = private_malloc((1<<(BMBITS-3)));
bm1buf = private_malloc((1<<(BMBITS-3)));
}
void free_private_buffers (void)
{
private_free (rbuf, CUBEROOT_BUFSIZE*sizeof(*rbuf));
private_free (zabuf[0], (1<<ZBUFBITS)*sizeof(*zabuf[0]));
private_free (zabuf[1], (1<<ZBUFBITS)*sizeof(*zabuf[0]));
private_free (zbbuf[0],(1<<ZBUFBITS)*sizeof(*zbbuf[1]));
private_free (zbbuf[1], (1<<ZBUFBITS)*sizeof(*zbbuf[1]));
private_free (bm0buf,(1<<(BMBITS-3)));
private_free (bm1buf,(1<<(BMBITS-3)));
}
// Used when largest p|d is fixed to a single prime p0 and we are iterating over the second largest prime
// In this scenario we assume all the primes involved are cached (and smaller than sqrt(dmax))
static void process_subprimes (uint32_t p0, uint32_t *itabp0, primes_pipe_ctx_t *pipe, int jobid, uint64_t *r)
{
uint64_t p0inv, p, q, pmax, dmax0;
uint32_t z0[3];
uint64_t z[3], zz[3];
uint32_t i,j,m,n,n0,pi,qinvp0;
pmax = pipe->end;
assert (pmax <= p0 && p0 <= cpmax);
pi = pimaxp (p0,1,dmax);
if ( primes_next_prime(p0-1) != p0 ) { report_printf ("Nothing to do for nonprime p0=%u\n", p0); return; }
if ( p0 > 1 && mod3(p0) == 1 && ! has_cuberoots_modp(K,p0) ) { report_printf ("Nothing to do, there are no cuberoots of k=%u mod p0=%u\n", K, p0); return; }
assert (cptab[pi]==p0);
dmax0 = dmax / p0;
p0inv = p0 > 2 ? b32_inv (p0) : (1UL<<63);
n0 = cached_cuberoots_modq (z,pi,1); assert (n0>0);
if ( ! n0 ) return;
for ( i = 0 ; i < n0 ; i++ ) z0[i] = (uint32_t) z[i];
p = primes_read_pipe(pipe,jobid);
pi = pimaxp(p,1,dmax0);
if ( pmax == p0 ) pmax--;
for ( ; p <= pmax && p < p0 ; p = primes_read_pipe (pipe,jobid) ) {
report_p (p); // we need to report p for checkpointing purposes by report.h knows not to increment pcnt
while ( pi <= cpcnt && cptab[pi] < p ) pi++; // a linear scan is almost certainly faaster than using pimaxp
if ( pi > cpcnt || cptab[pi] > p ) continue; // this may happen if there are no cuberoots of k mod p (even when pi > cpcnt)
for ( i=1,q=p ; (uint128_t)q*p <= dmax0 ; i++, q*= p ); // determine the largest power q of p that we need
n = ( cached_cuberoots_e (pi) >= i ? cached_cuberoots_modq (z,pi,i) : cuberoots_modq (z,K,p,i) ); // get cuberoots mod q=p^i, cached if possible
assert (n>0); m = n*n0;
for ( uint64_t pp=p ; pp < q ; pp*=p ) {
for ( i = 0 ; i < n ; i++ ) zz[i] = z[i]%pp; // each mod takes under 20 cycles, not worth trying to optimize
qinvp0 = itabp0[b32_red(pp,p0,p0inv)]; softassert (b32_red((uint64_t)qinvp0*b32_red(pp,p0,p0inv),p0,p0inv)==1);
for ( i = 0 ; i < n ; i++ ) for ( j = 0 ; j < n0 ; j++ ) r[i*n0+j] = b32_crt64(zz[i],pp,z0[j],p0,qinvp0,p0inv);
prockd (pp*p0,r,m); enumd(pp*p0,p,r,m,r+m); // process all d divisible by pp*p0 (prockd handles cofactors dividing k, enumd the rest)
}
qinvp0 = itabp0[b32_red(q,p0,p0inv)]; softassert (b32_red((uint64_t)qinvp0*b32_red(q,p0,p0inv),p0,p0inv)==1);
for ( i = 0 ; i < n ; i++ ) for ( j = 0 ; j < n0 ; j++ ) r[i*n0+j] = b32_crt64(z[i],q,z0[j],p0,qinvp0,p0inv);
prockd (q*p0,r,m); enumd(q*p0,p,r,m,r+m); // process all d divisible by q*p0 (prockd handles cofactors dividing k, enumd the rest)
}
// if p0 pops output the pipe we need to handle d=p and d divisible by p^2
if ( p==p0 ) {
report_p (p0); // report.h will increment pcnt for p=p0
while ( pi <= cpcnt && cptab[pi] < p ) pi++; // a linear scan is almost certainly faaster than using pimaxp
for ( i=1,q=p ; (uint128_t)q*p <= dmax ; i++, q*= p ); // determine the largest power q of p0 that we need
n = ( cached_cuberoots_e (pi) >= i ? cached_cuberoots_modq (z,pi,i) : cuberoots_modq (z,K,p0,i) ); // get cuberoots mod q=p^i, cached if possible
assert (n>0);
report_c (n); // we do report cuberoots for p=p0
for ( uint64_t pp=p ; pp < q ; pp*=p ) {
for ( i = 0 ; i < n ; i++ ) zz[i] = z[i]%pp; // each mod takes under 20 cycles, not worth trying to optimize
prockd (pp,zz,n); if ( pp > p ) enumd(pp,p,zz,n,r); // note that here we do not call enumd for pp=p, we effectively implemented this above
}
prockd (q,z,n); if ( q > p ) enumd(q,p,z,n,r); // note that here we do not call enumd for q=p, we effectively implemented this above
}
assert (p > pmax);
}
// This the main loop for each child thread (or the single main thread for n=1)
// For each p in the pipe (all p in [pmin,pmax] if we are the only core) processed all d with largest prime divisor p
static void process_primes (primes_pipe_ctx_t *pipe, int jobid, uint64_t *r)
{
uint64_t p, q, pmax;
uint64_t z[3], zz[3];
uint32_t i,j,n;
pmax = pipe->end;
p = primes_read_pipe(pipe,jobid);
if ( p > pmax ) return;
// Note that the code below relies on the fact that primes_read_pipe returns 2^64-1 on end of pipe
// So even if pipe->end < cpmax (for example), we will get a return value > cpmax at the end of the pipe
// Phase 1: primes p <= cpmax (note cpmax >= sqrt(dmax)) which were considerd during the precomputation phase
// For these primes we know cuberoots mod p (and some powers of p but not necessarily up to dmax, this depends on pmin)
// We are guaranteed that for any power of a prime q < p appearing in a cofactor we will have cached cuberoots (which enumd will use)
if ( p <= cpmax ) {
uint32_t pi = pimaxp(pipe->start,1,dmax);
for ( ; p <= cpmax && p <= pmax ; p = primes_read_pipe (pipe,jobid) ) {
if ( ! report_p(p) ) continue;
while ( pi <= cpcnt && cptab[pi] < p ) pi++; // a linear scan is almost certainly faaster than using pimaxp
if ( pi > cpcnt || cptab[pi] > p ) continue; // this may happen if there are no cuberoots of k mod p (even when pi > cpcnt)
for ( i=1, q=p ; (uint128_t)q*p <= dmax ; i++, q*= p ); // determine the largest power q of p that we need
// we could gain perhaps a factor of two in the line below in the uncached case by lifting the cuberoots mod p we have precomputed
// the benefit of doing this is negligible (for primes this small there are many d per p)
n = ( cached_cuberoots_e (pi) >= i ? cached_cuberoots_modq (z,pi,i) : cuberoots_modq (z,K,p,i) ); // get cuberoots mod q=p^i, cached if possible
assert (n>0);
if ( ! report_c (n) ) continue;
for ( uint64_t pp=p ; pp < q ; pp*=p ) {
for ( i = 0 ; i < n ; i++ ) zz[i] = z[i]%pp; // each mod takes under 20 cycles, not worth trying to optimize
prockd (pp,zz,n); enumd(pp,p,zz,n,r); // process all d divisible by pp < q (prockd handles cofactors dividing k, enumd the rest)
}
prockd (q,z,n); enumd(q,p,z,n,r); // process all d divisible by pp < q (prockd handles cofactors dividing k, enumd the rest)
}
}
if ( ! report_phase(PHASE_CACHED) ) goto done;
if ( p > pmax ) goto done;
// at this point we assume all primes up to cpmax >= sqrt(dmax) have been processed
// henceforth all d will be the product of a prime p > sqrt(dmax) and a sqrt(dmax)-smooth cofactor
assert (p > sqrt(dmax));
assert (3*p > K); // to avoid checking for p|k below we want to assume all such p are cached
// Phase 2: primes p in (cpmax,cdmin) -- both p > sqrt(dmax) and cofactors may be uncached (for dmax < CDMAX^2 there will be no such p)
// For these primes we compute cuberoots and then call enumd to recursively tack on powers of smaller primes (enumd will check for cached cuberoots)
for ( softassert (p>cpmax); p < cdmin && p <= pmax ; p = primes_read_pipe (pipe,jobid) ) {
if ( ! report_p(p) ) continue;
n = cuberoots_modp (z,K,p);
if ( ! n || ! report_c(n) ) continue;
prockd (p,z,n); enumd (p,p,z,n,r); // process all d divisible by pp < q (prockd handles cofactors dividing k, enumd the rest)
}
report_phase (PHASE_UNCACHED);
if ( p > pmax ) goto done;
// Phase 3: primes in [cdmin,sdmin) -- for these p all possible cofactors have cached cuberoots, but not necessarily cached inverses
// For these we compute cuberoots and then call (inline) enumcd to tack on precomputed roots modulo all admissible cofactors d with p*d <= dmax
for ( softassert (p>=cdmin) ; p < sdmin && p <= pmax; p = primes_read_pipe (pipe,jobid) ) {
if ( ! report_p (p) ) continue;
n = cuberoots_modp (z,K,p);
if ( ! n || ! report_c (n) ) continue;
prockd (p,z,n); enumcd (p,p,z,n,r); // process all d divisible by pp < q (prockd handles cofactors dividing k, enumd the rest)
}
report_phase (PHASE_COCACHED);
if ( p > pmax ) goto done;
// Phasse 4: primes in [sdmin,pdmin) -- for these p all possible cofactors have cached cuberoots and inverses
// For these primes we compute cuberoots and process all admissible cofactors using cached cuberoots and inverses
struct sdrec *x;
int pimax = sdcnt;
for ( softassert (p>=sdmin) ; p < pdmin && p <= pmax ; p = primes_read_pipe (pipe,jobid) ) {
if ( ! report_p(p) ) continue;
n = cuberoots_modp (z,K,p);
if ( !n || !report_c(n) ) continue;
prockd (p,z,n); // process all d that are p times a (possibly trivial) cofactor dividing k
while ( pimax && (uint128_t)p*sdtab[pimax].d > dmax ) pimax--;
for ( x = sdtab+pimax ; x > sdtab ; x-- ) {
uint64_t dinv = x->dinv;
uint64_t pinvb = sdinvs[x->i+b32_red(p,x->d,dinv)], *s = r;
for ( i = 0 ; i < n ; i++ ) for ( j = 0 ; j < x->n ; j++ ) *s++ = b32_crt64 (z[i],p,sdroots[x->r+j],x->d,pinvb,dinv);
prockd (p*x->d,r,s-r); // process all d that are p*x->d times a (possibly trivial) cofactor dividing k
}
}
report_phase (PHASE_NEARPRIME);
if ( p > pmax ) goto done;
// Phasse 5: primes in [pdmin,bpmin) -- we must have d=p, no cofactors are possible
// For these primes we just compute cuberoots and process d=p prime using procdcoprime
for ( softassert (p>=pdmin) ; p < bpmin && p <= pmax ; p = primes_read_pipe (pipe,jobid) ) {
if ( ! report_p(p) ) continue;
n = cuberoots_modp (z,K,p);
if (! n || ! report_c(n) ) continue;
procdcoprime (p,z,n); // process d = p
}
report_phase (PHASE_PRIME);
if ( p > pmax ) goto done;
// Phasse 6: primes in (bpmin,pmax] -- we have d=p prime and close enough to zmax that we don't want to split arithmetic progressions in order to lift them
// For these primes we just compute cuberoots and process d=p using procdprime (which will call zrcheckone or zrcheckafew but never zrcheckmany)
uint32_t si, mi = km1&1, m = km[mi]; // by default we mod km[mi] = 18 or 162 (if k=3)
softassert (m && !(m&1));
uint32_t l = fastceilboundl(zmaxld/((long double)p*m)); // l = length of arithmetic progressions for current p
softassert ( l <= ZSHORT && (uint128_t)l*p*m > zmax128 );
uint64_t lpmax = (uint128_t)(l-1)*m*pmax > zmax128 ? fastceilboundl (zmaxld/((long double)m*(l-1))) : pmax;
if ( mod7(K*K) != 4 ) {
for ( softassert (p >= bpmin) ; p <= pmax ; p = primes_read_pipe (pipe,jobid) ) {
if ( ! report_p(p) ) continue;
n = cuberoots_modp (z,K,p);
if ( !n || ! report_c(n) ) continue;
si = sgnz_index(p);
if ( p > lpmax ) { l = fastceilboundl(zmaxld/((long double)p*m)); lpmax = (uint128_t)(l-1)*m*pmax > zmax128 ? fastceilboundl (zmaxld/((long double)m*(l-1))) : pmax; }
procdbigprime (p,z,n,si,mi,l);
}
} else {
uint32_t mi7 = mi+2, m7 = km[mi7];
softassert (m7 && !(m7&1) && !mod7(m7)); // in fact m7=126
uint32_t l7 = fastceilboundl(zmaxld/((long double)p*m7)); // l = length of arithmetic progressions for current p
uint64_t lpmax7 = (uint128_t)(l-1)*m7*pmax > zmax128 ? fastceilboundl (zmaxld/((long double)m7*(l-1))) : pmax;
for ( softassert (p >= bpmin) ; p <= pmax ; p = primes_read_pipe (pipe,jobid) ) {
if ( ! report_p(p) ) continue;
n = cuberoots_modp (z,K,p);
if ( !n || ! report_c(n) ) continue;
si = sgnz_index(p);
if ( (j=onezmod7(p,si)) ) {
if ( p > lpmax7 ) { l7 = fastceilboundl(zmaxld/((long double)p*m7)); lpmax7 = (uint128_t)(l-1)*m7*pmax > zmax128 ? fastceilboundl (zmaxld/((long double)m7*(l-1))) : pmax; }
i = mi7; j = l7;
} else {
if ( p > lpmax ) { l = fastceilboundl(zmaxld/((long double)p*m)); lpmax = (uint128_t)(l-1)*m*pmax > zmax128 ? fastceilboundl (zmaxld/((long double)m*(l-1))) : pmax; }
i = mi; j = l;
}
procdbigprime (p,z,n,si,i,j);
}
}
report_phase (PHASE_BIGPRIME);
done:
assert (p > pmax);
}
int main (int argc, char *argv[])
{
uint64_t pmin, pmax, start_pmin;
uint32_t p0, *itabp0=0;
char *s;
int k, n, opts, cores, status;
if ( argc < 7 ) { fprintf (stderr," zcubes n k pmin pmax dmax zmax [options]\n (version %s)\n", VERSION_STRING); return 0; }
cores = atoi(argv[1]);
assert (cores >= 0);
if ( profiling() && cores != 1 ) { fprintf (stderr, "Setting cores to 1 for profiling.\n"); cores = 1; }
n = get_nprocs();
if ( ! cores ) { cores = n; report_printf ("Using %d threads.\n", cores); }
else { if ( cores > n ) fprintf (stderr, "WARNING: specified number of cores %d exceeds number of cores %d available\n", cores, n); }
k = atoi(argv[2]); if ( k < 0 || ! goodk(k) ) { fprintf (stderr, "ERROR: k=%d must be a cubefree postive integer <= 1000 congruent to 3 or 6 mod 9.\n",k); return -1; }
dmax = strto64(argv[5]);
if ( dmax > DMAX ) { fprintf (stderr, "ERROR: dmax = %lu cannot exceed DMAX = %lu\n", dmax, DMAX); return -1; }
if ( (s=strchr(argv[3],'x')) ) {
if ( memcmp(argv[3],argv[4],s+1-argv[3]) != 0 ) { fprintf (stderr, "ERROR: pmax=%s not valid for pmin=%s (if pmin=p0xq we require pmax=p0xr with r>=q)\n", argv[4], argv[3]); return -1; }
*s = '\0'; p0 = atoi(argv[3]);
if ( p0 < 2 ) { fprintf (stderr, "ERROR: p0=%u must be at least 2\n", p0); return -1; }
pmin = atoi(s+1);
pmax = atoi(argv[4]+(s+1-argv[3]));
if ( pmax > p0 ) { fprintf (stderr, "ERROR: We must have pmax=%u*%lu <= %u*%u\n", p0, pmax, p0, p0); return -1; }
if ( profiling() || (argc > 7 && atoi(argv[7])) ) { fprintf (stderr, "ERROR: Profiling and options are not permitted for pmin=%ux%lu pmax=%ux%lu\n", p0, pmin, p0, pmax); return -1; }
} else {
pmin = strto64(argv[3]);
pmax = strto64(argv[4]);
if ( cores > 1 && pmin == pmax && pmax <= sqrt(dmax) ) { p0 = pmin; pmin = 2; } else p0 = 1;
}
if ( p0 > 1 && primes_next_prime(p0-1) != p0 ) { fprintf (stderr, "WARNING: p0=%u is not prime\n", p0); }
if ( p0 > 1 && mod3(p0) == 1 && ! has_cuberoots_modp(k,p0) ) { fprintf (stderr, "WARNING: There are no cuberoots of k=%u mod p0=%u\n", k, p0); }
if ( p0 > 1 && !(k%p0) ) { fprintf (stderr, "ERROR: p0=%u divides k=%u, this case is not currently supported\n", p0, k); return -1; }
if ( pmin < 2 ) pmin = 2;
if ( pmax < pmin ) { fprintf (stderr, "ERROR: We must have pmin=%lu <= pmax=%lu and pmax > 1\n", pmin, pmax); return -1; }
zmax128 = strto128(argv[6]);
zmaxbits = ui128_len(zmax128);
if ( zmaxbits > ZMAXBITS ) { fprintf (stderr, "ERROR: zmax = %s cannot exceed 2^%d.\n", argv[6], ZMAXBITS); return -1; }
assert (zmax128 < ZMAX);
zmaxld = (long double) (zmax128 + (zmax128>>62) + 1); // add a fudge factor to account for the loss of precision
assert (zmaxld > zmax128);
if ( reporting() ) opts = argc > 7 ? atoi(argv[7]) : 0; else { opts = 0; if ( argc > 7 ) fprintf (stderr, "WARNING: Ignoring option %d with reporting off.\n", opts); }
if ( sqrt(dmax) < p0 ) { fprintf (stderr, "ERROR: We must have p0=%u <= sqrt(dmax)=%.1f\n", p0, sqrt(dmax)); return -1; }
if ( pmax < pmin || dmax < p0*pmax || zmax128 < dmax ) { char buf[64]; fprintf (stderr, "ERROR: We must have pmin=%lu <= pmax=%lu <= dmax=%lu <= zmax=%s\n", pmin, pmax, dmax, itoa128(buf,zmax128)); return -1; }
long double zminld = 3.847322101863072639L*dmax;
if ( zminld > zmaxld ) { fprintf (stderr, "WARNING: for dmax=%lud we have zminld=%Lf > zmaxld=%.0Lf, you should increase zmax or decrease dmax\n", dmax, zminld, zmaxld); if ( ! opts ) return -1; }
output_start (cores, k, p0, pmin, pmax, dmax, zmax128, opts);
start_pmin = report_start (cores, k, p0, pmin, pmax, dmax, zmax128, opts);
precompute (k, p0>1?p0:pmin, p0>1?p0:pmax);
if ( p0 > 1 ) {
itabp0 = shared_malloc (p0*sizeof(*itabp0));
uint32_t *w = malloc (2*p0*sizeof(*w));
invtab32 (itabp0, p0, &p0, 1, w);
free (w);
}
report_printf("Shared memory usage is %.3f MB\n", (double)shared_bytes()/(1<<20));
assert (!private_bytes());
if ( ! report_phase (PHASE_PRECOMPUTE) ) { report_end(); exit (0); }
if ( profiling() ) {
allocate_private_buffers (); profile_start (); process_primes (primes_create_pipe(start_pmin,pmax,0,0,0),0,rbuf); profile_end (); free_private_buffers();
assert(0); // we should never get here, we should terminate in the call to profile_end above
}
pid_t pids[cores+1];
primes_pipe_ctx_t *pipe = primes_create_pipe (start_pmin, pmax, cores, 0, 0);
for ( int i = 0 ; i < cores ; i++ ) {
if ( !(pids[i]=fork()) ) {
allocate_private_buffers();
if ( !i ) report_printf("Private memory usage is %d * %.3f MB = %.3f MB\n", cores, (double)private_bytes()/(1<<20), (double)(cores*private_bytes())/(1<<20));
report_job_start (i);
if ( p0 > 1 ) process_subprimes (p0, itabp0, pipe, i, rbuf); else process_primes (pipe, i, rbuf);
report_job_end (i);
free_private_buffers();
primes_close_pipe (pipe, i);
_exit (0);
}
if ( pids[i] < 0 ) { for ( int j = 0 ; j < i ; j++ ) { kill (pids[j],SIGTERM); } exit (-1); }
}
// create a separate child to feed the rest (this is the only one that will call primesieve)
if ( !(pids[cores]=fork()) ) {
while (primes_feed_pipe(pipe)); // if a job aborts we may wait forever here, but parent will kill everyone if this happens
primes_destroy_pipe (pipe); // this will wait for our siblings to call primes_close_pipe
_exit (0);
}
// if any child exits abnormally (e.g. due to an assert failure) kill them all and output an error
while ( wait(&status) > 0 ) if ( !WIFEXITED(status) || WEXITSTATUS(status) ) {
for ( int i = 0 ; i <= cores ; i++ ) { kill (pids[i],SIGTERM); }
output_end (cores, k, p0, pmin, pmax, dmax, zmax128, opts, 1);
exit (-1);
}
report_end ();
if ( reporting() && argc > 7 ) { // check for predictions specified on the command line that we want to compare against
uint64_t pcnt=0, ccnt=0, dcnt=0, rcnt=0;
for ( int i = 7 ; i < argc ; i++ ) {
if ( memcmp(argv[i],"pcnt=",5) == 0 ) pcnt = strto64(argv[i]+5);
if ( memcmp(argv[i],"ccnt=",5) == 0 ) ccnt = strto64(argv[i]+5);
if ( memcmp(argv[i],"dcnt=",5) == 0 ) dcnt = strto64(argv[i]+5);
if ( memcmp(argv[i],"rcnt=",5) == 0 ) rcnt = strto64(argv[i]+5);
}
report_comparisons (pcnt,ccnt,dcnt,rcnt);
}
output_end (cores, k, p0, pmin, pmax, dmax, zmax128, opts, 0);
exit (0);
}