Alex said:
Tim Hochberg wrote:
...
Still thinking of gmpy's primitives, I came up with a variant of this...:
[Nice compact version using gmpy snipped]
Just to keep my hand in, I'm attaching a version based on the idea above
which is in turn based on Emile's and Lulu's ideas for compression.
This produces data files that total just over 6 GB for all primes up to
1E8. The initial run is quite slow, but after that it can determine the
nth prime, or the number of primes below some number in about 0.5 ms on
my box.
The first run, building the file primes.dat (just 23048 bytes), took
almost 7 minutes elapsed time on my machine (250 CPU seconds). But
any successive run is extremely fast:
[alex@lancelot perex]$ time python -O prio.py
5762 primes stored (1 every 1000), highest is 99991609
55555th prime is 686671
Up to 555555, there are 45741 primes
0.06user 0.00system 0:00.14elapsed 41%CPU (0avgtext+0avgdata 0maxresident)k
0inputs+0outputs (494major+297minor)pagefaults 0swaps
[alex@lancelot perex]$
I've coded this rapidly and sloppily, so I wouldn't be surprised if
there were off-by-one errors here and there (needs to be checked against
some other, independent way to generate/check primes!), but I think the
fundamental ideas are sound.
Well one of us is off by one as I get 45740 primes below 55555. However,
I get 686671 as the 55555th prime, so we agree there. I would not be at
all suprised if it is mine that is off by one.
-tim
import array, bisect, psyco, struct
def eratosthenes(max=None):
'''Yields the sequence of prime numbers via the Sieve of Eratosthenes.'''
# Mostly based on Alex Marteli's version from the Python Cookbook. The max
# attribute, which I addded, trims the memory usage substantially when
# computing lots of primes. Of course you need to know your max prime in
# advance.
D = {} # map composite integers to primes witnessing their compositeness
q = 2 # first integer to test for primality
while (max is None) or (q <= max):
p = D.pop(q, None)
if p:
x = p + q
while x in D:
x += p
if (x is None) or (x <= max):
D[x] = p
else:
q2 = q*q
if (max is None) or (q2 <= max):
D[q2] = q
yield q
q += 1
psyco.bind(eratosthenes)
def encode(delta):
# To encode N-bytes, preface with (N-1) leading zero-bytes
enc = []
delta /= 2
while delta:
delta, byte = divmod(delta, 128)
enc.append(byte)
enc.reverse()
result = ['\0'] * (len(enc) - 1)
for byte in enc[:-1]:
result.append(chr(byte+1))
result.append(chr(enc[-1]))
return ''.join(result)
def decode(chars):
chars = chars.lstrip('\0')
bytes = []
for c in chars[:-1]:
bytes.append(ord(c)-1)
bytes.append(ord(chars[-1]))
value = bytes[0]
for b in bytes[1:]:
value *= 128
value += b
return 2 * value
def pop_delta(data):
# return delta, data_without_delta
index = 0
while data[index] == '\0':
index += 1
index = 2*index + 1
return decode(data[:index]), data[index:]
def encode_primes(n, chunk, basename):
primes = eratosthenes(n)
index_file = open(basename+'.index', 'wb')
everyn_file = open(basename+'.everyn', 'wb')
encoded_file = open(basename+'.encoded', 'wb')
index_file.write(struct.pack('l', chunk))
everyn_file.write(struct.pack('l', chunk))
# We reserve some space now and come back later and fill in the length
encoded_file.write(struct.pack('l', 0))
# We write to the files a chunk at a time.
encoded_chunk = ''
primes.next() # skip 2
p0 = primes.next()
index = 0
for i, p in enumerate(primes):
delta = p - p0
if not (i % chunk):
index += len(encoded_chunk)
index_file.write(struct.pack('l', index))
everyn_file.write(struct.pack('l', p0))
encoded_file.write(encoded_chunk)
encoded_chunk = ''
encoded_chunk += encode(delta)# This is supposed to be fast under psyco
p0 = p
# write leftover data
index += len(encoded_chunk)
index_file.write(struct.pack('l', index))
encoded_file.write(encoded_chunk)
encoded_file.seek(0)
encoded_file.write(struct.pack('l', i))
psyco.bind(encode_primes)
class Primes:
def __init__(self, basename):
sizel = struct.calcsize('l')
index_file = open(basename+'.index', 'rb')
[self.chunk] = struct.unpack('l', index_file.read(sizel))
index_data = index_file.read()
self.indices = struct.unpack("%sl" % (len(index_data) // sizel), index_data)
index_file.close()
everyn_file = open(basename+'.everyn', 'rb')
[chunk] = struct.unpack('l', everyn_file.read(sizel))
assert self.chunk == chunk, "inconsistent chunk sizes (%s vs %s)" % (self.chunk, chunk)
everyn_data = everyn_file.read()
self.everyn = struct.unpack("%sl" % (len(everyn_data) // sizel), everyn_data)
everyn_file.close()
encoded_file = open(basename+'.encoded', 'rb')
[self.nprimes] = struct.unpack('l', encoded_file.read(sizel))
self.encoded = encoded_file.read()
def __len__(self):
return self.nprimes
def __getitem__(self, n):
if n < 0:
n += self.nprimes
if not isinstance(n, int) and (0 <= n < self._len):
raise IndexError()
if n == 0:
return 2
cindex, pindex = divmod((n-1), self.chunk)
return self.decode_chunk(cindex)[pindex]
def decode_chunk(self, n):
cprimes = [self.everyn[n]]
data = self.encoded[self.indices[n]:self.indices[n+1]]
while data:
delta, data = pop_delta(data)
cprimes.append(cprimes[-1] + delta)
return cprimes
def upto(self, n):
if n < 2:
return 0
elif n < 3:
return 1
else:
n0 = max(bisect.bisect_right(self.everyn, n) - 1, 0)
chunk = self.decode_chunk(n0)
return bisect.bisect_right(chunk, n) + n0*self.chunk + 1
basename = "packedprimes"
try:
primes = Primes(basename)
except: # be more specific...
encode_primes(int(1e8), 64, basename)
primes = Primes(basename)
if __name__ == "__main__":
for n in 1, 2, 2.5, 3, 3.5, 5, 13, 13.5, 555555:
print "Up to", n, ", there are %d primes" % primes.upto(n)
print primes[55555-1]
import timeit
t = timeit.Timer("packedprimes.primes.upto(555555)", "import packedprimes")
print "seconds per call to upto =", t.timeit(number=1000) / 1000
t = timeit.Timer("packedprimes.primes[99834]", "import packedprimes")
print "seconds per call to [] =", t.timeit(number=1000) / 1000
#~ eprimes = eratosthenes(int(1e5))
#~ for i, p in enumerate(eprimes):
#~ assert p == primes
, "mismatch at prime[%s] (%s vs %s)" % (i, p, primes)
#~ print "OK"