J
Johannes Bauer
Hello group,
I've come from C/C++ and am now trying to code some Python because I
absolutely love the language. However I still have trouble getting
Python code to run efficiently. Right now I have a easy task: Get a
file, split it up into a million chunks, count the most prominent
character in each chunk and output that value into a file - in other
words: Say we have a 2 GB file, we evaluate what character is most
prominent in filepos [0, 2048[ - say it's a "A", then put a 65 in there
(ord("A")).
I've first tried Python. Please don't beat me, it's slow as hell and
probably a horrible solution:
#!/usr/bin/python
import sys
import os
f = open(sys.argv[1], "r")
filesize = os.stat(sys.argv[1])[6]
width = 1024
height = 1024
pixels = width * height
blocksize = filesize / width / height
print("Filesize : %d" % (filesize))
print("Image size : %dx%d" % (width, height))
print("Bytes per Pixel: %d" % (blocksize))
picture = { }
havepixels = 0
while True:
data = f.read(blocksize)
if len(data) <= 0: break
datamap = { }
for i in range(len(data)):
datamap[ord(data)] = datamap.get(data, 0) + 1
maxchr = None
maxcnt = None
for (char, count) in datamap.items():
if (maxcnt is None) or (count > maxcnt):
maxcnt = count
maxchr = char
most = maxchr
posx = havepixels % width
posy = havepixels / width
havepixels += 1
if (havepixels % 1024) == 0:
print("Progresss %s: %.1f%%" % (sys.argv[1], 100.0 * havepixels / pixels))
picture[(posx, posy)] = most
pic = open(sys.argv[1] + ".pgm", "w")
pic.write("P2\n")
pic.write("# CREATOR: Crappyass Python Script\n")
pic.write("%d %d\n" % (width, height))
pic.write("255\n")
for y in range(height):
for x in range(width):
pos = (x, y)
most = picture.get(pos, -1)
pic.write("%d\n" % (most))
As this was horribly slow (20 Minutes for a 2GB file) I coded the whole
thing in C also:
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <stdlib.h>
#define BLOCKSIZE 2048
int main(int argc, char **argv) {
unsigned int count[256];
int width, height;
FILE *f;
FILE *in;
width = 1024;
height = 1024;
char temp[2048];
if (argc != 2) { fprintf(stderr, "Argument?\n"); exit(2); }
in = fopen(argv[1], "r");
if (!in) { perror("fopen"); exit(1); }
snprintf(temp, 255, "%s.pgm", argv[1]);
f = fopen(temp, "w");
if (!f) { perror("fopen"); exit(1); }
fprintf(f, "P2\n");
fprintf(f, "# CREATOR: C\n");
fprintf(f, "%d %d\n", width, height);
fprintf(f, "255\n");
width = 1024;
height = 1024;
while (fread(temp, 1, sizeof(temp), in) == sizeof(temp)) {
int i;
memset(count, 0, sizeof(count));
for (i = 0; i < sizeof(temp); i++) {
count[(int)temp]++;
}
int greatest;
int maxcount;
greatest = 0;
maxcount = count[0];
for (i = 1; i < 256; i++) {
if (count > maxcount) {
maxcount = count;
greatest = i;
}
}
fprintf(f, "%d\n", greatest);
}
fclose(f);
fclose(in);
return 0;
}
Which takes about 40 seconds. I want the niceness of Python but a little
more speed than I'm getting (I'd settle for factor 2 or 3 slower, but
factor 30 is just too much).
Can anyone point out how to solve this efficiently in Python?
Kind regards,
Johannes
I've come from C/C++ and am now trying to code some Python because I
absolutely love the language. However I still have trouble getting
Python code to run efficiently. Right now I have a easy task: Get a
file, split it up into a million chunks, count the most prominent
character in each chunk and output that value into a file - in other
words: Say we have a 2 GB file, we evaluate what character is most
prominent in filepos [0, 2048[ - say it's a "A", then put a 65 in there
(ord("A")).
I've first tried Python. Please don't beat me, it's slow as hell and
probably a horrible solution:
#!/usr/bin/python
import sys
import os
f = open(sys.argv[1], "r")
filesize = os.stat(sys.argv[1])[6]
width = 1024
height = 1024
pixels = width * height
blocksize = filesize / width / height
print("Filesize : %d" % (filesize))
print("Image size : %dx%d" % (width, height))
print("Bytes per Pixel: %d" % (blocksize))
picture = { }
havepixels = 0
while True:
data = f.read(blocksize)
if len(data) <= 0: break
datamap = { }
for i in range(len(data)):
datamap[ord(data)] = datamap.get(data, 0) + 1
maxchr = None
maxcnt = None
for (char, count) in datamap.items():
if (maxcnt is None) or (count > maxcnt):
maxcnt = count
maxchr = char
most = maxchr
posx = havepixels % width
posy = havepixels / width
havepixels += 1
if (havepixels % 1024) == 0:
print("Progresss %s: %.1f%%" % (sys.argv[1], 100.0 * havepixels / pixels))
picture[(posx, posy)] = most
pic = open(sys.argv[1] + ".pgm", "w")
pic.write("P2\n")
pic.write("# CREATOR: Crappyass Python Script\n")
pic.write("%d %d\n" % (width, height))
pic.write("255\n")
for y in range(height):
for x in range(width):
pos = (x, y)
most = picture.get(pos, -1)
pic.write("%d\n" % (most))
As this was horribly slow (20 Minutes for a 2GB file) I coded the whole
thing in C also:
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <stdlib.h>
#define BLOCKSIZE 2048
int main(int argc, char **argv) {
unsigned int count[256];
int width, height;
FILE *f;
FILE *in;
width = 1024;
height = 1024;
char temp[2048];
if (argc != 2) { fprintf(stderr, "Argument?\n"); exit(2); }
in = fopen(argv[1], "r");
if (!in) { perror("fopen"); exit(1); }
snprintf(temp, 255, "%s.pgm", argv[1]);
f = fopen(temp, "w");
if (!f) { perror("fopen"); exit(1); }
fprintf(f, "P2\n");
fprintf(f, "# CREATOR: C\n");
fprintf(f, "%d %d\n", width, height);
fprintf(f, "255\n");
width = 1024;
height = 1024;
while (fread(temp, 1, sizeof(temp), in) == sizeof(temp)) {
int i;
memset(count, 0, sizeof(count));
for (i = 0; i < sizeof(temp); i++) {
count[(int)temp]++;
}
int greatest;
int maxcount;
greatest = 0;
maxcount = count[0];
for (i = 1; i < 256; i++) {
if (count > maxcount) {
maxcount = count;
greatest = i;
}
}
fprintf(f, "%d\n", greatest);
}
fclose(f);
fclose(in);
return 0;
}
Which takes about 40 seconds. I want the niceness of Python but a little
more speed than I'm getting (I'd settle for factor 2 or 3 slower, but
factor 30 is just too much).
Can anyone point out how to solve this efficiently in Python?
Kind regards,
Johannes