Implementing file reading in C/Python

J

Johannes Bauer

Hello group,

I've come from C/C++ and am now trying to code some Python because I
absolutely love the language. However I still have trouble getting
Python code to run efficiently. Right now I have a easy task: Get a
file, split it up into a million chunks, count the most prominent
character in each chunk and output that value into a file - in other
words: Say we have a 2 GB file, we evaluate what character is most
prominent in filepos [0, 2048[ - say it's a "A", then put a 65 in there
(ord("A")).

I've first tried Python. Please don't beat me, it's slow as hell and
probably a horrible solution:

#!/usr/bin/python
import sys
import os

f = open(sys.argv[1], "r")
filesize = os.stat(sys.argv[1])[6]

width = 1024
height = 1024
pixels = width * height
blocksize = filesize / width / height

print("Filesize : %d" % (filesize))
print("Image size : %dx%d" % (width, height))
print("Bytes per Pixel: %d" % (blocksize))

picture = { }
havepixels = 0
while True:
data = f.read(blocksize)
if len(data) <= 0: break

datamap = { }
for i in range(len(data)):
datamap[ord(data)] = datamap.get(data, 0) + 1

maxchr = None
maxcnt = None
for (char, count) in datamap.items():
if (maxcnt is None) or (count > maxcnt):
maxcnt = count
maxchr = char

most = maxchr

posx = havepixels % width
posy = havepixels / width

havepixels += 1
if (havepixels % 1024) == 0:
print("Progresss %s: %.1f%%" % (sys.argv[1], 100.0 * havepixels / pixels))

picture[(posx, posy)] = most

pic = open(sys.argv[1] + ".pgm", "w")
pic.write("P2\n")
pic.write("# CREATOR: Crappyass Python Script\n")
pic.write("%d %d\n" % (width, height))
pic.write("255\n")
for y in range(height):
for x in range(width):
pos = (x, y)
most = picture.get(pos, -1)
pic.write("%d\n" % (most))

As this was horribly slow (20 Minutes for a 2GB file) I coded the whole
thing in C also:

#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <stdlib.h>

#define BLOCKSIZE 2048

int main(int argc, char **argv) {
unsigned int count[256];
int width, height;
FILE *f;
FILE *in;
width = 1024;
height = 1024;
char temp[2048];

if (argc != 2) { fprintf(stderr, "Argument?\n"); exit(2); }

in = fopen(argv[1], "r");
if (!in) { perror("fopen"); exit(1); }

snprintf(temp, 255, "%s.pgm", argv[1]);
f = fopen(temp, "w");
if (!f) { perror("fopen"); exit(1); }

fprintf(f, "P2\n");
fprintf(f, "# CREATOR: C\n");
fprintf(f, "%d %d\n", width, height);
fprintf(f, "255\n");

width = 1024;
height = 1024;
while (fread(temp, 1, sizeof(temp), in) == sizeof(temp)) {
int i;
memset(count, 0, sizeof(count));
for (i = 0; i < sizeof(temp); i++) {
count[(int)temp]++;
}

int greatest;
int maxcount;

greatest = 0;
maxcount = count[0];
for (i = 1; i < 256; i++) {
if (count > maxcount) {
maxcount = count;
greatest = i;
}
}

fprintf(f, "%d\n", greatest);
}

fclose(f);
fclose(in);
return 0;
}

Which takes about 40 seconds. I want the niceness of Python but a little
more speed than I'm getting (I'd settle for factor 2 or 3 slower, but
factor 30 is just too much).

Can anyone point out how to solve this efficiently in Python?

Kind regards,
Johannes
 
M

MRAB

Johannes said:
Hello group,

I've come from C/C++ and am now trying to code some Python because I
absolutely love the language. However I still have trouble getting
Python code to run efficiently. Right now I have a easy task: Get a
file, split it up into a million chunks, count the most prominent
character in each chunk and output that value into a file - in other
words: Say we have a 2 GB file, we evaluate what character is most
prominent in filepos [0, 2048[ - say it's a "A", then put a 65 in there
(ord("A")).

I've first tried Python. Please don't beat me, it's slow as hell and
probably a horrible solution:

#!/usr/bin/python
import sys
import os

f = open(sys.argv[1], "r")
filesize = os.stat(sys.argv[1])[6]

width = 1024
height = 1024
pixels = width * height
blocksize = filesize / width / height

print("Filesize : %d" % (filesize))
print("Image size : %dx%d" % (width, height))
print("Bytes per Pixel: %d" % (blocksize))

picture = { }
havepixels = 0
while True:
data = f.read(blocksize)
if len(data) <= 0: break

datamap = { }
for i in range(len(data)):
datamap[ord(data)] = datamap.get(data, 0) + 1

maxchr = None
maxcnt = None
for (char, count) in datamap.items():
if (maxcnt is None) or (count > maxcnt):
maxcnt = count
maxchr = char

most = maxchr

posx = havepixels % width
posy = havepixels / width

havepixels += 1
if (havepixels % 1024) == 0:
print("Progresss %s: %.1f%%" % (sys.argv[1], 100.0 * havepixels / pixels))

picture[(posx, posy)] = most

pic = open(sys.argv[1] + ".pgm", "w")
pic.write("P2\n")
pic.write("# CREATOR: Crappyass Python Script\n")
pic.write("%d %d\n" % (width, height))
pic.write("255\n")
for y in range(height):
for x in range(width):
pos = (x, y)
most = picture.get(pos, -1)
pic.write("%d\n" % (most))

As this was horribly slow (20 Minutes for a 2GB file) I coded the whole
thing in C also:

#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <stdlib.h>

#define BLOCKSIZE 2048

int main(int argc, char **argv) {
unsigned int count[256];
int width, height;
FILE *f;
FILE *in;
width = 1024;
height = 1024;
char temp[2048];

if (argc != 2) { fprintf(stderr, "Argument?\n"); exit(2); }

in = fopen(argv[1], "r");
if (!in) { perror("fopen"); exit(1); }

snprintf(temp, 255, "%s.pgm", argv[1]);
f = fopen(temp, "w");
if (!f) { perror("fopen"); exit(1); }

fprintf(f, "P2\n");
fprintf(f, "# CREATOR: C\n");
fprintf(f, "%d %d\n", width, height);
fprintf(f, "255\n");

width = 1024;
height = 1024;
while (fread(temp, 1, sizeof(temp), in) == sizeof(temp)) {
int i;
memset(count, 0, sizeof(count));
for (i = 0; i < sizeof(temp); i++) {
count[(int)temp]++;
}

int greatest;
int maxcount;

greatest = 0;
maxcount = count[0];
for (i = 1; i < 256; i++) {
if (count > maxcount) {
maxcount = count;
greatest = i;
}
}

fprintf(f, "%d\n", greatest);
}

fclose(f);
fclose(in);
return 0;
}

Which takes about 40 seconds. I want the niceness of Python but a little
more speed than I'm getting (I'd settle for factor 2 or 3 slower, but
factor 30 is just too much).

Can anyone point out how to solve this efficiently in Python?

Have a look at psyco: http://psyco.sourceforge.net/
 
J

James Mills

Hello group,
Hello.

(...)

Which takes about 40 seconds. I want the niceness of Python but a little
more speed than I'm getting (I'd settle for factor 2 or 3 slower, but
factor 30 is just too much).

Can anyone point out how to solve this efficiently in Python?

Johannes, your 2 programs, 1 in Python and the
other in C do _not_ produce the same result.

I have tested this against a randomly generated
file from /dev/urandom (10M). Yes the Python
one is much slower, but I believe it's bebcause
the Python implementation is _correct_ where
teh C one is _wrong_ :)

The resulting test.bin.pgm from python is exactly
3.5M (from 10M). The resulting test.bin.pgm from
the C version is 16K.

Something is not quite right here :)

cheers
James
 
J

James Mills

Uhh, yes, you're right there... I must admit that I was too lazy to
include all the stat headers and to a proper st_size check in the C
version (just a quick hack), so it's practically hardcoded.

With files of exactly 2GB in size the results should be the same (more
or less, +- 1 line doesn't matter really), because 2 GB / 2048 (the
buffer) = 1 Million.

Sorry I didn't mention that, it was really kind of sloppy,
quick-and-dirty C writing on my part. But you're right, the Python
implementation does what is actually supposed to happen.

I shall attempt to optimize this :)
I have a funny feeling you might be caught up with
some features of Python - one notable one being that
some things in Python are immutable.

psyco might help here though ...

cheers
James
 
J

Johannes Bauer

James said:
I have tested this against a randomly generated
file from /dev/urandom (10M). Yes the Python
one is much slower, but I believe it's bebcause
the Python implementation is _correct_ where
teh C one is _wrong_ :)

The resulting test.bin.pgm from python is exactly
3.5M (from 10M). The resulting test.bin.pgm from
the C version is 16K.

Something is not quite right here :)

Uhh, yes, you're right there... I must admit that I was too lazy to
include all the stat headers and to a proper st_size check in the C
version (just a quick hack), so it's practically hardcoded.

With files of exactly 2GB in size the results should be the same (more
or less, +- 1 line doesn't matter really), because 2 GB / 2048 (the
buffer) = 1 Million.

Sorry I didn't mention that, it was really kind of sloppy,
quick-and-dirty C writing on my part. But you're right, the Python
implementation does what is actually supposed to happen.

Kind regards,
Johannes
 
J

James Mills

I shall attempt to optimize this :)
I have a funny feeling you might be caught up with
some features of Python - one notable one being that
some things in Python are immutable.

psyco might help here though ...

What does this little tool do anyway ?
It's very interesting the images it creates
out of files. What is this called ?

I'm curious :) I haven't had much tiem to
optimize it yet - I'll try to when I get home from work.

cheers
James
 
M

Marc 'BlackJack' Rintsch

I've first tried Python. Please don't beat me, it's slow as hell and
probably a horrible solution:

#!/usr/bin/python
import sys
import os

f = open(sys.argv[1], "r")

Mode should be 'rb'.
filesize = os.stat(sys.argv[1])[6]

`os.path.getsize()` is a little bit more readable.
width = 1024
height = 1024
pixels = width * height
blocksize = filesize / width / height

print("Filesize : %d" % (filesize)) print("Image size : %dx%d"
% (width, height)) print("Bytes per Pixel: %d" % (blocksize))

Why parentheses around ``print``\s "argument"? In Python <3 ``print`` is
a statement and not a function.
picture = { }
havepixels = 0
while True:
data = f.read(blocksize)
if len(data) <= 0: break

if data:
break

is enough.
datamap = { }
for i in range(len(data)):
datamap[ord(data)] = datamap.get(data, 0) + 1


Here you are creating a list full of integers to use them as index into
`data` (twice) instead of iterating directly over the elements in
`data`. And you are calling `ord()` for *every* byte in the file
although you just need it for one value in each block. If it's possible
to write the raw PGM format this conversion wouldn't be necessary at all.

For the `datamap` a `collections.defaultdict()` might be faster.
maxchr = None
maxcnt = None
for (char, count) in datamap.items():
if (maxcnt is None) or (count > maxcnt):
maxcnt = count
maxchr = char

Untested:

maxchr = max((i, c) for c, i in datamap.iteritems())[1]
most = maxchr
Why?

posx = havepixels % width
posy = havepixels / width

posx, posy = divmod(havepixels, width)

Don't know if this is faster though.
havepixels += 1
if (havepixels % 1024) == 0:
print("Progresss %s: %.1f%%" % (sys.argv[1], 100.0 * havepixels /
pixels))

picture[(posx, posy)] = most

Why are you using a dictionary as "2d array"? In the C code you simply
write the values sequentially, why can't you just use a flat list and
append here?

Ciao,
Marc 'BlackJack' Rintsch
 
J

James Mills

Why parentheses around ``print``\s "argument"? In Python <3 ``print`` is
a statement and not a function.

Not true as of 2.6+ and 3.0+

print is now a function.

cheers
James
 
M

Marc 'BlackJack' Rintsch

On Fri, Jan 9, 2009 at 7:15 PM, Marc 'BlackJack' Rintsch


Not true as of 2.6+ and 3.0+

print is now a function.

Please read again what I wrote.

Ciao,
Marc 'BlackJack' Rintsch
 
M

Marc 'BlackJack' Rintsch

datamap = { }
for i in range(len(data)):
datamap[ord(data)] = datamap.get(data, 0) + 1


Here is an error by the way: You call `ord()` just on the left side of
the ``=``, so all keys in the dictionary are mapped to ones after the
loop which gives a pretty boring PGM. :)

Ciao,
Marc 'BlackJack' Rintsch
 
S

Steven D'Aprano

Not true as of 2.6+ and 3.0+

print is now a function.


Not so. print is still a statement in 2.6.

$ python2.6
Python 2.6.1 (r261:67515, Dec 24 2008, 00:33:13)
[GCC 4.1.2 20070502 (Red Hat 4.1.2-12)] on linux2
Type "help", "copyright", "credits" or "license" for more information.23
 
S

Steven D'Aprano

if data:
break

is enough.


You've reversed the sense of the test. The OP exits the loop when data is
empty, you exit the loop when it *isn't* empty.
 
M

Marc 'BlackJack' Rintsch

As this was horribly slow (20 Minutes for a 2GB file) I coded the whole
thing in C also:

Yours took ~37 minutes for 2 GiB here. This "just" ~15 minutes:

#!/usr/bin/env python
from __future__ import division, with_statement
import os
import sys
from collections import defaultdict
from functools import partial
from itertools import imap


def iter_max_values(blocks, block_count):
for i, block in enumerate(blocks):
histogram = defaultdict(int)
for byte in block:
histogram[byte] += 1

yield max((count, byte)
for value, count in histogram.iteritems())[1]

if i % 1024 == 0:
print 'Progresss: %.1f%%' % (100 * i / block_count)


def write_pgm(filename, width, height, pixel_values):
with open(filename, 'w') as pgm_file:
pgm_file.write('P2\n'
'# CREATOR: Crappyass Python Script\n'
'%d %d\n'
'255\n' % (width, height))
pgm_file.writelines('%d\n' % value for value in pixel_values)


def main():
filename = sys.argv[1]
filesize = os.path.getsize(filename)

width = 1024
height = 1024
pixels = width * height
blocksize = filesize // width // height

print 'Filesize : %d' % filesize
print 'Image size : %dx%d' % (width, height)
print 'Bytes per Pixel: %d' % blocksize

with open(filename, 'rb') as data_file:
blocks = iter(partial(data_file.read, blocksize), '')
pixel_values = imap(ord, iter_max_values(blocks, pixels))
write_pgm(filename + '.pgm', width, height, pixel_values)


if __name__ == '__main__':
main()


Ciao,
Marc 'BlackJack' Rintsch
 
S

Steve Holden

Steven said:
Not true as of 2.6+ and 3.0+

print is now a function.


Not so. print is still a statement in 2.6.

$ python2.6
Python 2.6.1 (r261:67515, Dec 24 2008, 00:33:13)
[GCC 4.1.2 20070502 (Red Hat 4.1.2-12)] on linux2
Type "help", "copyright", "credits" or "license" for more information.23
C:\Users\sholden>\python26\python
Python 2.6 (r26:66721, Oct 2 2008, 11:35:03) [MSC v.1500 32 bit
(Intel)] on win32
Type "help", "copyright", "credits" or "license" for more information.
OK, I confess I missed out

regards
Steve
 
M

mk

Johannes said:
Which takes about 40 seconds. I want the niceness of Python but a little
more speed than I'm getting (I'd settle for factor 2 or 3 slower, but
factor 30 is just too much).

This probably doesn't contribute much, but have you tried using Python
profiler? You might have *something* wrong that eats up a lot of time in
the code.

The factor of 30 indeed does not seem right -- I have done somewhat
similar stuff (calculating Levenshtein distance [edit distance] on words
read from very large files), coded the same algorithm in pure Python and
C++ (using linked lists in C++) and Python version was 2.5 times slower.

Regards,
mk
 
J

Johannes Bauer

Marc said:
f = open(sys.argv[1], "r")

Mode should be 'rb'.
Check.
filesize = os.stat(sys.argv[1])[6]

`os.path.getsize()` is a little bit more readable.
Check.
print("Filesize : %d" % (filesize)) print("Image size : %dx%d"
% (width, height)) print("Bytes per Pixel: %d" % (blocksize))

Why parentheses around ``print``\s "argument"? In Python <3 ``print`` is
a statement and not a function.

I write all new code to work under Python3.0. Actually I develop on
Python 3.0 but the code is currently deployed onto 2.6.
picture = { }
havepixels = 0
while True:
data = f.read(blocksize)
if len(data) <= 0: break

if data:
break

is enough.
datamap = { }
for i in range(len(data)):
datamap[ord(data)] = datamap.get(data, 0) + 1


Here you are creating a list full of integers to use them as index into
`data` (twice) instead of iterating directly over the elements in
`data`. And you are calling `ord()` for *every* byte in the file
although you just need it for one value in each block. If it's possible
to write the raw PGM format this conversion wouldn't be necessary at all.


OK, those two are just stupid, you're right. I changed it to:

datamap = { }
for i in data:
datamap = datamap.get(i, 0) + 1

array = sorted([(b, a) for (a, b) in datamap.items()], reverse=True)
most = ord(array[0][1])
pic.write("%d\n" % (most))

For the `datamap` a `collections.defaultdict()` might be faster.

Tried that, not much of a change.
maxchr = None
maxcnt = None
for (char, count) in datamap.items():
if (maxcnt is None) or (count > maxcnt):
maxcnt = count
maxchr = char

Untested:

maxchr = max((i, c) for c, i in datamap.iteritems())[1]

This is nice, I use it - the sort thing was a workaround anyways.

I don't really know anymore :-\
posx, posy = divmod(havepixels, width)

That's a nice one.
Why are you using a dictionary as "2d array"? In the C code you simply
write the values sequentially, why can't you just use a flat list and
append here?

Yup, I changed the Python code to behave the same way the C code did -
however overall it's not much of an improvement: Takes about 15 minutes
to execute (still factor 23).

Thanks for all your pointers!

Kind regards,
Johannes
 
J

Johannes Bauer

James said:
What does this little tool do anyway ?
It's very interesting the images it creates
out of files. What is this called ?

It has no particular name. I was toying around with the Princeton Cold
Boot Attack (http://citp.princeton.edu/memory/). In particular I was
interested in how much memory is erased when I would (on my system)
enable the slow POST (which counts through all RAM three times).

I downloaded the provided utitilities, dumped my system memory via PXE
boot onto another system after resetting it hard in the middle of a
running Linux session. I did sync, though. Praise all journaling
filesystems.

As a 2GB file is not really of much use for telling where something is
and where isn't, I thought of that picture coloring. In a 1024x1024
picture a pixel is 2048 bytes with 2GB of RAM, so exactly half a page.
This is sufficiently high resolution to detect what's in there.
I'm curious :) I haven't had much tiem to
optimize it yet - I'll try to when I get home from work.

Thanks for your effort, I appreciate it... hope my work leads to some
meaningful results. Currently it looks (*cough* if there aren't bugs in
my picture code) as if my PC would reset the whole RAM anyways, although
I do not have any ECC. Strange.

Kind regards,
Johannes
 

Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only take a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Members online

Forum statistics

Threads
474,039
Messages
2,570,376
Members
47,029
Latest member
EmiliaSton

Latest Threads

Top