C++ programming challenge

Peter Jansson · Jun 10, 2009

Dear news group,

I have created a small programming challenge for those of you who are
interested in challenging your Standard C++ programming skills. The
challenge is about counting character frequency in large texts,
perhaps useful for spam filtering or classical crypto analysis. You
can read more about it here:

http://blog.p-jansson.com/2009/06/programming-challenge-letter-frequency.html

With kind regards,
Peter Jansson

Ioannis Vranos · Jun 10, 2009

Peter said:
Dear news group,

I have created a small programming challenge for those of you who are
interested in challenging your Standard C++ programming skills. The
challenge is about counting character frequency in large texts,
perhaps useful for spam filtering or classical crypto analysis. You
can read more about it here:

http://blog.p-jansson.com/2009/06/programming-challenge-letter-frequency.html

With kind regards,
Peter Jansson

May you provide the about.com relevant URL?

--
Ioannis A. Vranos

C95 / C++03 Developer

http://www.cpp-software.net

Peter Jansson · Jun 10, 2009

May you provide the about.com relevant URL?

--
Ioannis A. Vranos

C95 / C++03 Developer

http://www.cpp-software.net

There is no relevant about.com relevant URL at the time of this
writing. If they approve and use my challenge suggestion, I will
update my weblog post accordingly.

Chris M. Thomasson · Jun 10, 2009

Peter Jansson said:
Dear news group,

I have created a small programming challenge for those of you who are
interested in challenging your Standard C++ programming skills. The
challenge is about counting character frequency in large texts,
perhaps useful for spam filtering or classical crypto analysis. You
can read more about it here:

http://blog.p-jansson.com/2009/06/programming-challenge-letter-frequency.html

Here is a stupid simple solution; needs ASCII:
____________________________________________________________________
#include <stdio.h>

#define BUFFER 32768U

static
unsigned long
count_chars(
void const* buf,
size_t size,
unsigned long* counts
) {
unsigned char const* cur = buf;
unsigned char const* const end = cur + size;
unsigned long total = 0;
while (cur < end) {
if (*cur >= 'a' && *cur <= 'a' + 26) {
++total;
++counts[*cur - 'a'];
} else if (*cur >= 'A' && *cur < 'A' + 26) {
++total;
++counts[*cur - 'A'];
}
++cur;
}
return total;
}

int main(void) {
size_t i;
FILE* file;
unsigned long total = 0;
unsigned long counts[26] = { 0 };

file = fopen("./data.txt", "rb");

if (file) {
size_t size;
static unsigned char buf[BUFFER];

while ((size = fread(buf, 1, BUFFER, file))) {
total += count_chars(buf, size, counts);
printf("\rcharacters processed: %lu", total);
if (feof(file) || ferror(file)) break;
}

fclose(file);
}

puts("\n____________________________________________________");

for (i = 0; i < 26; ++i) {
printf("%c %%%f\n", i + 'a',
(total) ? ((double)counts / total) * 100 : 0.0);
}

return 0;
}
____________________________________________________________________

Of course, if you wanted to parse huge multi-gigabyte files, this simplistic
approach would not be ideal. I can think of several different ways to
improve this thing through parallelization and memory mapped files. But,
that would not be standard C or C++.

Chris M. Thomasson · Jun 10, 2009

Chris M. Thomasson said:
Here is a stupid simple solution; needs ASCII:
____________________________________________________________________
#include <stdio.h>

#define BUFFER 32768U

static
unsigned long
count_chars(
void const* buf,
size_t size,
unsigned long* counts
) {
unsigned char const* cur = buf;
unsigned char const* const end = cur + size;
unsigned long total = 0;
while (cur < end) {
if (*cur >= 'a' && *cur <= 'a' + 26) {

^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

ARGHGHGHAHREHAH!!!!

If course that should be:

if (*cur >= 'a' && *cur < 'a' + 26) {

++total;
++counts[*cur - 'a'];
} else if (*cur >= 'A' && *cur < 'A' + 26) {
++total;
++counts[*cur - 'A'];
}
++cur;
}
return total;
}
[...]

I repost program:
____________________________________________________________________
#include <stdio.h>
#include <assert.h>

#define BUFFER 32768U

static
unsigned long
count_chars(
void const* buf,
size_t size,
unsigned long* counts
) {
unsigned char const* cur = buf;
unsigned char const* const end = cur + size;
unsigned long total = 0;
while (cur < end) {
if (*cur >= 'a' && *cur < 'a' + 26) {
assert(*cur - 'a' >= 0 && *cur - 'a' < 26);
++total;
++counts[*cur - 'a'];
} else if (*cur >= 'A' && *cur < 'A' + 26) {
assert(*cur - 'A' >= 0 && *cur - 'A' < 26);
++total;
++counts[*cur - 'A'];
}
++cur;
}
return total;
}

int main(void) {
size_t i;
FILE* file;
unsigned long total = 0;
unsigned long counts[26] = { 0 };

file = fopen("c:/newvz/vztest/data.txt", "rb");

if (file) {
size_t size;
static unsigned char buf[BUFFER];

while ((size = fread(buf, 1, BUFFER, file))) {
total += count_chars(buf, size, counts);
printf("\rcharacters processed: %lu", total);
if (feof(file) || ferror(file)) break;
}

fclose(file);
}

puts("\n____________________________________________________");

for (i = 0; i < 26; ++i) {
printf("%c %%%f\n", i + 'a',
(total) ? ((double)counts / total) * 100 : 0.0);
}

return 0;
}

____________________________________________________________________

Sorry about that retarded mistake!

;^(...

Ioannis Vranos · Jun 10, 2009

Peter said:
Dear news group,

I have created a small programming challenge for those of you who are
interested in challenging your Standard C++ programming skills. The
challenge is about counting character frequency in large texts,
perhaps useful for spam filtering or classical crypto analysis. You
can read more about it here:

http://blog.p-jansson.com/2009/06/programming-challenge-letter-frequency.html

With kind regards,
Peter Jansson

When you say characters" you mean letters, symbols, everything in the basic character set (1-127), or it may
include wide_characters?

--
Ioannis A. Vranos

C95 / C++03 Developer

http://www.cpp-software.net

Ioannis Vranos · Jun 10, 2009

Ioannis said:
When you say characters" you mean letters, symbols, everything in the
basic character set (1-127), or it may include wide_characters?

Since this is comp.lang.c++, I will post a high-level C++ solution.

Preliminary codes, while waiting for an answer.

#include <map>
#include <fstream>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <string>
#include <cctype>
#include <ctime>

int main(int argc, char **argv)
{
using namespace std;

map<char, unsigned long> characterFrequencies;

// The array where the read characters will be stored.
char buffer[BUFSIZ];

// If argc!= 2, then either the number of arguments is not correct, or the platform does not
// support arguments.
if(argc!= 2)
{
cerr<< "\nUsage: "<< argv[0]<< " fileNameToRead\n\n";

return EXIT_FAILURE;
}

// We disable synchronisation with stdio, to speed up C++ I/O.
ios_base::sync_with_stdio(false);

string characters= "ABDEFGHIJKLMNOPQRSTUVWXYZ";

clock_t time1, time2;

// We start timing.
time1= clock();

// We open the file
ifstream inputFile(argv[argc -1]);

// An error happened
if(not inputFile.good())
{
cerr<< "\nCould not open file for reading, exiting...\n\n";

return EXIT_FAILURE;
}

do
{
inputFile.read(buffer, sizeof(buffer));

for(streamsize i= 0; i< inputFile.gcount(); ++i)
++characterFrequencies[ buffer ];

}while(not inputFile.eof());

// Since rule 1 is: "Your program should be case insensitive when it counts letters",
// we add the results of lowercase characters and their equivallent uppercase letters together.
cout<<"\n\n\nThe letter frequencies are:\n";

for(string::size_type i= 0; i< characters.size(); ++i)
cout<< characters<< ": "<< characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ]<< "\n";

// We "stop" timing.
time2= clock();

// We convert he timing to seconds.
double totalTimeInSeconds= (time2- time1)/ CLOCKS_PER_SEC;

cout<<"\n\nThe whole process took "<< fixed<< totalTimeInSeconds<< " seconds\n";

cout<<"\n\nHave a nice day!\n";
}

--
Ioannis A. Vranos

C95 / C++03 Developer

http://www.cpp-software.net

Ioannis Vranos · Jun 10, 2009

Peter said:
Dear news group,

I have created a small programming challenge for those of you who are
interested in challenging your Standard C++ programming skills. The
challenge is about counting character frequency in large texts,
perhaps useful for spam filtering or classical crypto analysis. You
can read more about it here:

http://blog.p-jansson.com/2009/06/programming-challenge-letter-frequency.html

With kind regards,
Peter Jansson

High level C++ solution:

Code 1:

#include <map>
#include <fstream>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <string>
#include <cctype>
#include <ctime>

int main(int argc, char **argv)
{
using namespace std;

// Warning: long double has problems with MINGW compiler for Windows.
map<char, long double> characterFrequencies;

// The array where the read characters will be stored.
char buffer[BUFSIZ];

// If argc!= 2, then either the number of arguments is not correct, or the platform does not
// support arguments.
if(argc!= 2)
{
cerr<< "\nUsage: "<< argv[0]<< " fileNameToRead\n\n";

return EXIT_FAILURE;
}

// We disable synchronisation with stdio, to speed up C++ I/O.
ios_base::sync_with_stdio(false);

string characters= "ABDEFGHIJKLMNOPQRSTUVWXYZ";

clock_t time1, time2;

// We start timing.
time1= clock();

// We open the file
ifstream inputFile(argv[argc -1]);

// An error happened
if(not inputFile.good())
{
cerr<< "\nCould not open file for reading, exiting...\n\n";

return EXIT_FAILURE;
}

do
{
inputFile.read(buffer, sizeof(buffer));

for(streamsize i= 0; i< inputFile.gcount(); ++i)
++characterFrequencies[ buffer ];

}while(not inputFile.eof());

// Since rule 1 is: "Your program should be case insensitive when it counts letters",
// we add the results of lowercase characters and their equivallent uppercase letters together.
cout<<fixed<< "\n\n\nThe letter frequencies are:\n";

long double totalcharacterFrequencies= 0;

for(string::size_type i= 0; i< characters.size(); ++i)
totalcharacterFrequencies+= characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ];

for(string::size_type i= 0; i< characters.size(); ++i)
cout<< characters<< ": "<< (characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ])/ totalcharacterFrequencies* 100<< "%\n";

// We "stop" timing.
time2= clock();

// We convert the timing to seconds.
double totalTimeInSeconds= (time2- time1)/ CLOCKS_PER_SEC;

cout<<"\n\nThe whole process took "<< fixed<< totalTimeInSeconds<< " seconds.\n";

cout<<"\n\nHave a nice day!\n";
}

Code 2:

The above code more "inlined" (denoted with "==>").

This code is not any faster with my compiler (as it shouldn't), than the original code above.

#include <map>
#include <fstream>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <string>
#include <cctype>
#include <ctime>

int main(int argc, char **argv)
{
using namespace std;

// Warning: long double has problems with MINGW compiler for Windows.
map<char, long double> characterFrequencies;

// The array where the read characters will be stored.
char buffer[BUFSIZ];

// If argc!= 2, then either the number of arguments is not correct, or the platform does not
// support arguments.
if(argc!= 2)
{
cerr<< "\nUsage: "<< argv[0]<< " fileNameToRead\n\n";

return EXIT_FAILURE;
}

// We disable synchronisation with stdio, to speed up C++ I/O.
ios_base::sync_with_stdio(false);

string characters= "ABDEFGHIJKLMNOPQRSTUVWXYZ";

clock_t time1, time2;

// We start timing.
time1= clock();

// We open the file
ifstream inputFile(argv[argc -1]);

// An error happened
if(not inputFile.good())
{
cerr<< "\nCould not open file for reading, exiting...\n\n";

return EXIT_FAILURE;
}

do
{
inputFile.read(buffer, sizeof(buffer));

==> ADDED: const streamsize SIZE= inputFile.gcount();

==> CHANGED: for(streamsize i= 0; i< SIZE; ++i)
++characterFrequencies[ buffer ];

}while(not inputFile.eof());

// Since rule 1 is: "Your program should be case insensitive when it counts letters",
// we add the results of lowercase characters and their equivallent uppercase letters together.
cout<<fixed<< "\n\n\nThe letter frequencies are:\n";

long double totalcharacterFrequencies= 0;

for(string::size_type i= 0; i< characters.size(); ++i)
totalcharacterFrequencies+= characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ];

for(string::size_type i= 0; i< characters.size(); ++i)
cout<< characters<< ": "<< (characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ])/ totalcharacterFrequencies* 100<< "%\n";

// We "stop" timing.
time2= clock();

// We convert the timing to seconds.
double totalTimeInSeconds= (time2- time1)/ CLOCKS_PER_SEC;

cout<<"\n\nThe whole process took "<< fixed<< totalTimeInSeconds<< " seconds.\n";

cout<<"\n\nHave a nice day!\n";
}

--
Ioannis A. Vranos

C95 / C++03 Developer

http://www.cpp-software.net

Ioannis Vranos · Jun 10, 2009

Corrected:

Peter said:
Dear news group,

I have created a small programming challenge for those of you who are
interested in challenging your Standard C++ programming skills. The
challenge is about counting character frequency in large texts,
perhaps useful for spam filtering or classical crypto analysis. You
can read more about it here:

http://blog.p-jansson.com/2009/06/programming-challenge-letter-frequency.html

With kind regards,
Peter Jansson

High level C++ solution:

Code 1:

#include <map>
#include <fstream>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <string>
#include <cctype>
#include <ctime>

int main(int argc, char **argv)
{
using namespace std;

// Warning: long double has problems with MINGW compiler for Windows.
map<char, long double> characterFrequencies;

// The array where the read characters will be stored.
char buffer[BUFSIZ];

// If argc!= 2, then either the number of arguments is not correct, or the platform does not
// support arguments.
if(argc!= 2)
{
cerr<< "\nUsage: "<< argv[0]<< " fileNameToRead\n\n";

return EXIT_FAILURE;
}

// We disable synchronisation with stdio, to speed up C++ I/O.
ios_base::sync_with_stdio(false);

string characters= "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

clock_t time1, time2;

// We start timing.
time1= clock();

// We open the file
ifstream inputFile(argv[argc -1]);

// An error happened
if(not inputFile.good())
{
cerr<< "\nCould not open file for reading, exiting...\n\n";

return EXIT_FAILURE;
}

do
{
inputFile.read(buffer, sizeof(buffer));

for(streamsize i= 0; i< inputFile.gcount(); ++i)
++characterFrequencies[ buffer ];

}while(not inputFile.eof());

// Since rule 1 is: "Your program should be case insensitive when it counts letters",
// we add the results of lowercase characters and their equivallent uppercase letters together.
cout<<fixed<< "\n\n\nThe letter frequencies are:\n";

long double totalcharacterFrequencies= 0;

for(string::size_type i= 0; i< characters.size(); ++i)
totalcharacterFrequencies+= characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ];

for(string::size_type i= 0; i< characters.size(); ++i)
cout<< characters<< ": "<< (characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ])/ totalcharacterFrequencies* 100<< "%\n";

// We "stop" timing.
time2= clock();

// We convert the timing to seconds.
double totalTimeInSeconds= (time2- time1)/ CLOCKS_PER_SEC;

cout<<"\n\nThe whole process took "<< fixed<< totalTimeInSeconds<< " seconds.\n";

cout<<"\n\nHave a nice day!\n";
}

Code 2:

The above code more "inlined" (denoted with "==>").

This code is not any faster with my compiler (as it shouldn't), than the original code above.

#include <map>
#include <fstream>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <string>
#include <cctype>
#include <ctime>

int main(int argc, char **argv)
{
using namespace std;

// Warning: long double has problems with MINGW compiler for Windows.
map<char, long double> characterFrequencies;

// The array where the read characters will be stored.
char buffer[BUFSIZ];

// If argc!= 2, then either the number of arguments is not correct, or the platform does not
// support arguments.
if(argc!= 2)
{
cerr<< "\nUsage: "<< argv[0]<< " fileNameToRead\n\n";

return EXIT_FAILURE;
}

// We disable synchronisation with stdio, to speed up C++ I/O.
ios_base::sync_with_stdio(false);

string characters= "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

clock_t time1, time2;

// We start timing.
time1= clock();

// We open the file
ifstream inputFile(argv[argc -1]);

// An error happened
if(not inputFile.good())
{
cerr<< "\nCould not open file for reading, exiting...\n\n";

return EXIT_FAILURE;
}

do
{
inputFile.read(buffer, sizeof(buffer));

==> ADDED: const streamsize SIZE= inputFile.gcount();

==> CHANGED: for(streamsize i= 0; i< SIZE; ++i)
++characterFrequencies[ buffer ];

}while(not inputFile.eof());

// Since rule 1 is: "Your program should be case insensitive when it counts letters",
// we add the results of lowercase characters and their equivallent uppercase letters together.
cout<<fixed<< "\n\n\nThe letter frequencies are:\n";

long double totalcharacterFrequencies= 0;

for(string::size_type i= 0; i< characters.size(); ++i)
totalcharacterFrequencies+= characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ];

for(string::size_type i= 0; i< characters.size(); ++i)
cout<< characters<< ": "<< (characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ])/ totalcharacterFrequencies* 100<< "%\n";

// We "stop" timing.
time2= clock();

// We convert the timing to seconds.
double totalTimeInSeconds= (time2- time1)/ CLOCKS_PER_SEC;

cout<<"\n\nThe whole process took "<< fixed<< totalTimeInSeconds<< " seconds.\n";

cout<<"\n\nHave a nice day!\n";
}

--
Ioannis A. Vranos

C95 / C++03 Developer

http://www.cpp-software.net

Bart van Ingen Schenau · Jun 10, 2009

Peter said:
Dear news group,

I have created a small programming challenge for those of you who are
interested in challenging your Standard C++ programming skills. The
challenge is about counting character frequency in large texts,
perhaps useful for spam filtering or classical crypto analysis. You
can read more about it here:

http://blog.p-jansson.com/2009/06/programming-challenge-letter- frequency.html

With kind regards,
Peter Jansson

Lets see how the std algorithms measure up to the rest (only tested on
its own source so far):

#include <fstream>
#include <iostream>
#include <numeric>
#include <map>
#include <ctype.h>
#include <iterator>
#include <algorithm>
#include <time.h>

struct accum_original
{
std::map<char, long> counts;
long total;
accum_original() : counts(), total(0) {}
void count_char(char c)
{
total++;
counts[tolower((unsigned char)c)]++;
}
const std::map<char, long>& getCounts()
{
return counts;
}
long getTotal()
{
return total;
}
};

struct accum_fast
{
struct data {
std::map<char, long> counts;
long total;
int refcount;
data() : counts(), total(0), refcount(1) {}
}* payload;
accum_fast() : payload(new data) {}
accum_fast(const accum_fast& other) : payload(other.payload) {
payload->refcount++; }
~accum_fast() { if (--payload->refcount == 0) { delete payload; } }
void swap(accum_fast& other)
{
data* temp(other.payload);
other.payload = payload;
payload = temp;
}
accum_fast& operator=(accum_fast rhs)
{
swap(rhs);
return *this;
}
void count_char(char c)
{
payload->total++;
payload->counts[tolower((unsigned char)c)]++;
}
const std::map<char, long>& getCounts()
{
return payload->counts;
}
long getTotal()
{
return payload->total;
}
};
typedef accum_fast accum;

accum count_chars(accum acc, char c)
{
acc.count_char(c);
return acc;
}

struct scaler
{
long total;
scaler(long total_) : total(total_) {}
std:

air<const char, double> operator()(std:

air<const char, long>
value)
{
return std:

air<const char, double>(value.first,
(100.0*value.second)/total);
}
};

namespace std {
ostream& operator<<(ostream& os, const pair<const char, double>& value)
{
os << '\'' << value.first << "' " << value.second << "%\n";
return os;
}
}

int main(int argc, char** argv)
{
clock_t time1, time2;
accum acc;

if (argc < 2)
{
time1 = clock();
acc = std::accumulate(std::istream_iterator<char>(std::cin),
std::istream_iterator<char>(), acc, count_chars);
}
else
{
time1 = clock();
std::ifstream file(argv[1]);
acc = std::accumulate(std::istream_iterator<char>(file),
std::istream_iterator<char>(), acc, count_chars);
}

std::transform(acc.getCounts().begin(), acc.getCounts().end(),

std::ostream_iterator said:
(std::cout, ""),

scaler(acc.getTotal()));
time2 = clock();
std::cout << "Time taken: " << (time2 - time1) << " clock ticks = " <<
(time2 - time1)*1.0/CLOCKS_PER_SEC << " seconds" << std::endl;
}

agelos.anaptixi.net · Jun 10, 2009

Dear news group,

I have created a small programming challenge for those of you who are
interested in challenging your Standard C++ programming skills. The
challenge is about counting character frequency in large texts,
perhaps useful for spam filtering or classical crypto analysis. You
can read more about it here:

http://blog.p-jansson.com/2009/06/programming-challenge-letter-freque...

With kind regards,
Peter Jansson

Does this work?

#include <fstream>
#include <iostream>
#include <ctime>

int main(int argc, char **argv)
{
using namespace std;

int frequency[128];
memset(frequency, 0, sizeof(int) * 128);

const int BUFFER_SIZE = 4096;
char buffer[BUFFER_SIZE];
unsigned int totalChars = 0;

if (argc != 2)
{
cerr << "\nUsage: " << argv[0] << " fileNameToRead\n\n";
return EXIT_FAILURE;
}

clock_t time1, time2;

time1 = clock();

ifstream inputFile(argv[argc -1]);

if (!inputFile.good())
{
cerr << "Bad file, ciao!\n";
return EXIT_FAILURE;
}

int *p;
char *end;

do
{
inputFile.read(buffer, BUFFER_SIZE);
int bytesRead = inputFile.gcount();

totalChars += bytesRead;

end = buffer + bytesRead;
for (p = (int *)buffer; p <= (int *)(end - sizeof(int)); p++)
{
frequency[(*p & 0xff000000) >> 24]++;
frequency[(*p & 0x00ff0000) >> 16]++;
frequency[(*p & 0x0000ff00) >> 8 ]++;
frequency[(*p & 0x000000ff) ]++;
}

}while(!inputFile.eof());

char *c = (char *)p;
while (c != end)
{
frequency[*c]++;
c++;
}

inputFile.close();

double dTotalChars = (double)totalChars;

for (int i = 'A'; i <= 'Z'; i++)
{
cout << (char)i << " = " << (frequency / dTotalChars) * 100.0 <<
"%\n";
}

for (int i = 'a'; i <= 'z'; i++)
{
cout << (char)i << " = " << (frequency / dTotalChars) * 100.0 <<
"%\n";
}

time2 = clock();

double totalTimeInSeconds = (time2- time1);
cout << "Time elapsed (ms) : " << totalTimeInSeconds << endl;
}

agelos.anaptixi.net · Jun 11, 2009

Dear news group,

I have created a small programming challenge for those of you who are
interested in challenging your Standard C++ programming skills. The
challenge is about counting character frequency in large texts,
perhaps useful for spam filtering or classical crypto analysis. You
can read more about it here:

http://blog.p-jansson.com/2009/06/programming-challenge-letter-freque...

With kind regards,
Peter Jansson

Actually, just changing the line :
ifstream inputFile(argv[argc -1]);
with
ifstream inputFile(argv[argc -1],std::ios::binary);

beats any other post (on my PC at least

)

Here's the new code (just added the binary open mode)

#include <fstream>
#include <iostream>
#include <ctime>

int main(int argc, char **argv)
{
using namespace std;

int frequency[128];
memset(frequency, 0, sizeof(int) * 128);

const int BUFFER_SIZE = 4096;
char buffer[BUFFER_SIZE];
unsigned int totalChars = 0;

if (argc != 2)
{
cerr << "\nUsage: " << argv[0] << " fileNameToRead\n\n";
return EXIT_FAILURE;
}

clock_t time1, time2;

time1 = clock();

ifstream inputFile(argv[argc -1],std::ios::binary);

if (!inputFile.good())
{
cerr << "Bad file, ciao!\n";
return EXIT_FAILURE;
}

int *p;
char *end;

do
{
inputFile.read(buffer, BUFFER_SIZE);
int bytesRead = inputFile.gcount();

totalChars += bytesRead;

end = buffer + bytesRead;
for (p = (int *)buffer; p <= (int *)(end - sizeof(int)); p++)
{
frequency[(*p & 0xff000000) >> 24]++;
frequency[(*p & 0x00ff0000) >> 16]++;
frequency[(*p & 0x0000ff00) >> 8 ]++;
frequency[(*p & 0x000000ff) ]++;
}

}while(!inputFile.eof());

char *c = (char *)p;
while (c != end)
{
frequency[*c]++;
c++;
}

inputFile.close();

double dTotalChars = (double)totalChars;

for (int i = 'A'; i <= 'Z'; i++)
{
cout << (char)i << " = " << (frequency / dTotalChars) * 100.0 <<
"%\n";
}

for (int i = 'a'; i <= 'z'; i++)
{
cout << (char)i << " = " << (frequency / dTotalChars) * 100.0 <<
"%\n";
}

time2 = clock();

double totalTimeInSeconds = (time2- time1);
cout << "Time elapsed (ms) : " << totalTimeInSeconds << endl;
}

Chris M. Thomasson · Jun 11, 2009

Chris M. Thomasson said:
Chris M. Thomasson said:

Peter Jansson said:

Dear news group,

I have created a small programming challenge for those of you who are
interested in challenging your Standard C++ programming skills. The
challenge is about counting character frequency in large texts,
perhaps useful for spam filtering or classical crypto analysis. You
can read more about it here:

http://blog.p-jansson.com/2009/06/programming-challenge-letter-frequency.html

Click to expand...

Here is a stupid simple solution; needs ASCII:
[...]

Click to expand...

Here, let me repost this crappy little thing using a bigger buffer, without
using any asserts and calls to printf during the actual processing phase:
_________________________________________________________________________
#include <stdio.h>

#define BUFFER 65536U

static
unsigned long
count_chars(
void const* buf,
size_t size,
unsigned long* counts
) {
unsigned char const* cur = buf;
unsigned char const* const end = cur + size;
unsigned long total = 0;
while (cur < end) {
if (*cur >= 'a' && *cur < 'a' + 26) {
++total;
++counts[*cur - 'a'];
} else if (*cur >= 'A' && *cur < 'A' + 26) {
++total;
++counts[*cur - 'A'];
}
++cur;
}
return total;
}

int main(void) {
size_t i;
FILE* file;
unsigned long total = 0;
unsigned long counts[26] = { 0 };

file = fopen("./data.txt", "rb");

if (file) {
size_t size;
static unsigned char buf[BUFFER];

while ((size = fread(buf, 1, BUFFER, file))) {
total += count_chars(buf, size, counts);
if (feof(file) || ferror(file)) break;
}

fclose(file);
}

for (i = 0; i < 26; ++i) {
printf("%c %%%f\n", i + 'a',
(total) ? ((double)counts / total) * 100 : 0.0);
}

return 0;
}
_________________________________________________________________________

That just might shave one or two milliseconds off... lol

;^)

Ioannis Vranos · Jun 11, 2009

This is an improved version of the original program on run-time performance.

I replaced std::map with std::valarray. std::map is an associative container performing ordering, with worse
random access performance than std::vector. In usual applications it doesn't matter much, but since we use
timings, here is the updated code.

The code continues to be independent on how the basic character set is implemented (e.g. ASCII, EBCDIC).
However since other codes posted with ASCII implementation assumptions were considered equivalent by the OP to
generic portable code, I may post ASCII-specific C++ code tomorrow.

Here's the code:

#include <valarray>
#include <fstream>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <string>
#include <cctype>
#include <ctime>

int main(int argc, char **argv)
{
using namespace std;

// Warning: long double has problems with MINGW compiler for Windows.
//
// The C++ basic character set is using the value range [0, 127].
// If we used vector<long double>, it would not have any run-time difference in any modern compiler.
valarray<long double> characterFrequencies(127);

// The array where the read characters will be stored.
char buffer[BUFSIZ];

// If argc!= 2, then either the number of arguments is not correct, or the platform does not
// support arguments.
if(argc!= 2)
{
cerr<< "\nUsage: "<< argv[0]<< " fileNameToRead\n\n";

return EXIT_FAILURE;
}

// We disable synchronisation with stdio, to speed up C++ I/O.
ios_base::sync_with_stdio(false);

string characters= "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

clock_t time1, time2;

// We start timing.
time1= clock();

// We open the file
ifstream inputFile(argv[argc -1]);

// An error happened
if(not inputFile.good())
{
cerr<< "\nCould not open file for reading, exiting...\n\n";

return EXIT_FAILURE;
}

do
{
inputFile.read(buffer, sizeof(buffer));

for(streamsize i= 0; i< inputFile.gcount(); ++i)
++characterFrequencies[ buffer ];

}while(not inputFile.eof());

// Since rule 1 is: "Your program should be case insensitive when it counts letters",
// we add the results of lowercase characters and their equivallent uppercase letters together.
cout<<fixed<< "\n\n\nThe letter frequencies are:\n";

long double totalcharacterFrequencies= 0;

for(string::size_type i= 0; i< characters.size(); ++i)
totalcharacterFrequencies+= characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ];

for(string::size_type i= 0; i< characters.size(); ++i)
cout<< characters<< ": "<< (characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ])/ totalcharacterFrequencies* 100<< "%\n";

// We "stop" timing.
time2= clock();

// We convert the timing to seconds.
double totalTimeInSeconds= (time2- time1)/ CLOCKS_PER_SEC;

cout<<"\n\nThe whole process took "<< totalTimeInSeconds<< " seconds.\n";

cout<<"\n\nHave a nice day!\n";
}

--
Ioannis A. Vranos

C95 / C++03 Developer

http://www.cpp-software.net

Chris M. Thomasson · Jun 11, 2009

Peter Jansson said:
Dear news group,

I have created a small programming challenge for those of you who are
interested in challenging your Standard C++ programming skills. The
challenge is about counting character frequency in large texts,
perhaps useful for spam filtering or classical crypto analysis. You
can read more about it here:

http://blog.p-jansson.com/2009/06/programming-challenge-letter-frequency.html

Well, try this one out too:
_______________________________________________________________________________
#include <stdio.h>
#include <limits.h>

#define BUFFER 65536U

int main(int argc, char** argv) {
size_t i;
unsigned long total = 0;
unsigned long counts[UCHAR_MAX + 1] = { 0 };
FILE* file = fopen(argv[1], "rb");

if (file) {
size_t size;
static unsigned char buf[BUFFER];

while ((size = fread(buf, 1, BUFFER, file))) {

while (size) {
--size;
++counts[buf[size]];
}

if (size < BUFFER) {
if (feof(file) || ferror(file)) break;
}
}

fclose(file);
}

for (i = 0; i < 26; ++i) {
total += counts[i + 'a'];
total += counts[i + 'A'];
}

for (i = 0; i < 26; ++i) {
printf("%c %%%f\n", i + 'a',
(total) ? ((double)(counts[i + 'a'] + counts[i + 'A']) / total) *
100.0 : 0.0);
}

return 0;
}
_______________________________________________________________________________

It just might beat my previous submission by a little bit...

;^)

Ioannis Vranos · Jun 11, 2009

Corrected:

This is an improved version of the original program on run-time performance.

I replaced std::map with std::valarray. std::map is an associative container performing ordering, with worse
random access performance than std::vector. In usual applications it doesn't matter much, but since we use
timings, here is the updated code.

The code continues to be independent on how the basic character set is implemented (e.g. ASCII, EBCDIC).
However since other codes posted with ASCII implementation assumptions were considered equivalent by the OP to
generic portable code, I may post ASCII-specific C++ code tomorrow.

Here's the code:

#include <valarray>
#include <fstream>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <string>
#include <cctype>
#include <ctime>

int main(int argc, char **argv)
{
using namespace std;

// Warning: long double has problems with MINGW compiler for Windows.
//
// The C++ basic character set is using the value range [0, 127].
// If we used vector<long double>, it would not have any run-time difference in any modern compiler.
valarray<long double> characterFrequencies(128);

// The array where the read characters will be stored.
char buffer[BUFSIZ];

// If argc!= 2, then either the number of arguments is not correct, or the platform does not
// support arguments.
if(argc!= 2)
{
cerr<< "\nUsage: "<< argv[0]<< " fileNameToRead\n\n";

return EXIT_FAILURE;
}

// We disable synchronisation with stdio, to speed up C++ I/O.
ios_base::sync_with_stdio(false);

string characters= "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

clock_t time1, time2;

// We start timing.
time1= clock();

// We open the file
ifstream inputFile(argv[argc -1]);

// An error happened
if(not inputFile.good())
{
cerr<< "\nCould not open file for reading, exiting...\n\n";

return EXIT_FAILURE;
}

do
{
inputFile.read(buffer, sizeof(buffer));

for(streamsize i= 0; i< inputFile.gcount(); ++i)
++characterFrequencies[ buffer ];

}while(not inputFile.eof());

// Since rule 1 is: "Your program should be case insensitive when it counts letters",
// we add the results of lowercase characters and their equivallent uppercase letters together.
cout<<fixed<< "\n\n\nThe letter frequencies are:\n";

long double totalcharacterFrequencies= 0;

for(string::size_type i= 0; i< characters.size(); ++i)
totalcharacterFrequencies+= characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ];

for(string::size_type i= 0; i< characters.size(); ++i)
cout<< characters<< ": "<< (characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ])/ totalcharacterFrequencies* 100<< "%\n";

// We "stop" timing.
time2= clock();

// We convert the timing to seconds.
double totalTimeInSeconds= (time2- time1)/ CLOCKS_PER_SEC;

cout<<"\n\nThe whole process took "<< totalTimeInSeconds<< " seconds.\n";

cout<<"\n\nHave a nice day!\n";
}

--
Ioannis A. Vranos

C95 / C++03 Developer

http://www.cpp-software.net

giannis t · Jun 11, 2009

check this out:

#include <stdlib.h>
#include <iostream>
#include <string.h>
#include <iomanip>
#include <math.h>
#include <fstream>

const int MAX_TEXT_LENGTH = 8192;

int freq[256];
double pctFreq[256];

int main(int argc, char** argv) {
for (int i = 0; i < 256; i++) {
freq = 0;
pctFreq = 0;
}
unsigned char text[MAX_TEXT_LENGTH];
int count = 0;
char c;
int diff = 'a' - 'A';
std::ios_base::sync_with_stdio(false);
std::ifstream file(argv[1], std::ios::binary);
for (; file.read((char*) text, MAX_TEXT_LENGTH) {
for (std::streamsize i(0), num(file.gcount()); i < num; ++i) {
c = text;
if ((c >= 'A') && (c <= 'Z')) {
++freq[c];
count++;
} else if ((c >= 'a') && (c <= 'z')) {
++freq[c - diff];
count++;
}

}
}
file.close();
for (int i = ('A'); i <= 'Z'; i++) {
pctFreq = round((double(freq) / count)*10000) / 100;
std::cout << i << ": " << std::setw(5) << std::setfill('0') <<
std::fixed << std::setprecision(2) << pctFreq << "%\n";
}
return (EXIT_SUCCESS);
}

Ioannis Vranos · Jun 11, 2009

giannis said:
check this out:

#include <stdlib.h>
#include <iostream>
#include <string.h>
#include <iomanip>
#include <math.h>
#include <fstream>

const int MAX_TEXT_LENGTH = 8192;

int freq[256];
double pctFreq[256];

int main(int argc, char** argv) {
for (int i = 0; i < 256; i++) {
freq = 0;
pctFreq = 0;
}
unsigned char text[MAX_TEXT_LENGTH];
int count = 0;
char c;
int diff = 'a' - 'A';
std::ios_base::sync_with_stdio(false);
std::ifstream file(argv[1], std::ios::binary);
for (; file.read((char*) text, MAX_TEXT_LENGTH) {
for (std::streamsize i(0), num(file.gcount()); i < num; ++i) {
c = text;
if ((c >= 'A') && (c <= 'Z')) {
++freq[c];
count++;
} else if ((c >= 'a') && (c <= 'z')) {
++freq[c - diff];
count++;
}

}
}
file.close();
for (int i = ('A'); i <= 'Z'; i++) {
pctFreq = round((double(freq) / count)*10000) / 100;
std::cout << i << ": " << std::setw(5) << std::setfill('0') <<
std::fixed << std::setprecision(2) << pctFreq << "%\n";
}
return (EXIT_SUCCESS);
}

Reading a text file as binary, is not guaranteed to work. It just happens that binary reading is implemented
as text reading in your platform.

--
Ioannis A. Vranos

C95 / C++03 Developer

http://www.cpp-software.net

Ioannis Vranos · Jun 11, 2009

Ioannis said:
giannis said:

check this out:

#include <stdlib.h>
#include <iostream>
#include <string.h>
#include <iomanip>
#include <math.h>
#include <fstream>

const int MAX_TEXT_LENGTH = 8192;

int freq[256];
double pctFreq[256];

int main(int argc, char** argv) {
for (int i = 0; i < 256; i++) {
freq = 0;
pctFreq = 0;
}
unsigned char text[MAX_TEXT_LENGTH];
int count = 0;
char c;
int diff = 'a' - 'A';
std::ios_base::sync_with_stdio(false);
std::ifstream file(argv[1], std::ios::binary);
for (; file.read((char*) text, MAX_TEXT_LENGTH) {
for (std::streamsize i(0), num(file.gcount()); i < num; ++i) {
c = text;
if ((c >= 'A') && (c <= 'Z')) {
++freq[c];
count++;
} else if ((c >= 'a') && (c <= 'z')) {
++freq[c - diff];
count++;
}

}
}
file.close();
for (int i = ('A'); i <= 'Z'; i++) {
pctFreq = round((double(freq) / count)*10000) / 100;
std::cout << i << ": " << std::setw(5) << std::setfill('0') <<
std::fixed << std::setprecision(2) << pctFreq << "%\n";
}
return (EXIT_SUCCESS);
}

Click to expand...

Reading a text file as binary, is not guaranteed to work. It just
happens that binary reading is implemented as text reading in your
platform.

.... and is platform specific (ASCII).

--
Ioannis A. Vranos

C95 / C++03 Developer

http://www.cpp-software.net

Ioannis Vranos · Jun 11, 2009

Actually, just changing the line :
ifstream inputFile(argv[argc -1]);
with
ifstream inputFile(argv[argc -1],std::ios::binary);

beats any other post (on my PC at least )

Here's the new code (just added the binary open mode)

#include <fstream>
#include <iostream>
#include <ctime>

int main(int argc, char **argv)
{
using namespace std;

int frequency[128];
memset(frequency, 0, sizeof(int) * 128);

const int BUFFER_SIZE = 4096;
char buffer[BUFFER_SIZE];
unsigned int totalChars = 0;

if (argc != 2)
{
cerr << "\nUsage: " << argv[0] << " fileNameToRead\n\n";
return EXIT_FAILURE;
}

clock_t time1, time2;

time1 = clock();

ifstream inputFile(argv[argc -1],std::ios::binary);

if (!inputFile.good())
{
cerr << "Bad file, ciao!\n";
return EXIT_FAILURE;
}

int *p;
char *end;

do
{
inputFile.read(buffer, BUFFER_SIZE);
int bytesRead = inputFile.gcount();

totalChars += bytesRead;

end = buffer + bytesRead;
for (p = (int *)buffer; p <= (int *)(end - sizeof(int)); p++)
{
frequency[(*p & 0xff000000) >> 24]++;
frequency[(*p & 0x00ff0000) >> 16]++;
frequency[(*p & 0x0000ff00) >> 8 ]++;
frequency[(*p & 0x000000ff) ]++;
}

}while(!inputFile.eof());

char *c = (char *)p;
while (c != end)
{
frequency[*c]++;
c++;
}

inputFile.close();

double dTotalChars = (double)totalChars;

for (int i = 'A'; i <= 'Z'; i++)
{
cout << (char)i << " = " << (frequency / dTotalChars) * 100.0 <<
"%\n";
}

for (int i = 'a'; i <= 'z'; i++)
{
cout << (char)i << " = " << (frequency / dTotalChars) * 100.0 <<
"%\n";
}

time2 = clock();

double totalTimeInSeconds = (time2- time1);
cout << "Time elapsed (ms) : " << totalTimeInSeconds << endl;
}

It doesn't compile.

john@john-laptop:~/Projects/anjuta/cpp/src$ g++ main.cc -o foobar main.cc: In function ¡int main(int, char**)¢:
main.cc:10: error: ¡memset¢ was not declared in this scope
main.cc:19: error: ¡EXIT_FAILURE¢ was not declared in this scope
main.cc:31: error: ¡EXIT_FAILURE¢ was not declared in this scope
john@john-laptop:~/Projects/anjuta/cpp/src$

--
Ioannis A. Vranos

C95 / C++03 Developer

http://www.cpp-software.net

C programming in 2011	148	May 26, 2011
Ann: Registration for the Python Game Programming Challenge is nowopen!	0	Jul 29, 2005
Requirement @ Appulse	0	Jul 24, 2008
Some errors in MIT's intro C++ course	109	Sep 8, 2010
Looking for C++ Professionals (Bangalore)	4	Aug 17, 2007
C language now truly universal	0	Jan 1, 2011
Seek Contract Programming Work - 17 Years Experience	0	Feb 22, 2005
Fundamentals of Financial Management Concise 7e Brigham Houston	0	May 1, 2011

C++ programming challenge

Peter Jansson

Ioannis Vranos

Peter Jansson

Chris M. Thomasson

Chris M. Thomasson

Ioannis Vranos

Ioannis Vranos

Ioannis Vranos

Ioannis Vranos

Bart van Ingen Schenau

agelos.anaptixi.net

agelos.anaptixi.net

Chris M. Thomasson

Ioannis Vranos

Chris M. Thomasson

Ioannis Vranos

giannis t

Ioannis Vranos

Ioannis Vranos

Ioannis Vranos

Ask a Question

Similar Threads

Members online

Forum statistics

Latest Threads