C++ programming challenge

I

Ioannis Vranos

Peter said:
Dear news group,

I have created a small programming challenge for those of you who are
interested in challenging your Standard C++ programming skills. The
challenge is about counting character frequency in large texts,
perhaps useful for spam filtering or classical crypto analysis. You
can read more about it here:

http://blog.p-jansson.com/2009/06/programming-challenge-letter-frequency.html

With kind regards,
Peter Jansson


May you provide the about.com relevant URL?



--
Ioannis A. Vranos

C95 / C++03 Developer

http://www.cpp-software.net
 
C

Chris M. Thomasson

Peter Jansson said:
Dear news group,

I have created a small programming challenge for those of you who are
interested in challenging your Standard C++ programming skills. The
challenge is about counting character frequency in large texts,
perhaps useful for spam filtering or classical crypto analysis. You
can read more about it here:

http://blog.p-jansson.com/2009/06/programming-challenge-letter-frequency.html

Here is a stupid simple solution; needs ASCII:
____________________________________________________________________
#include <stdio.h>


#define BUFFER 32768U


static
unsigned long
count_chars(
void const* buf,
size_t size,
unsigned long* counts
) {
unsigned char const* cur = buf;
unsigned char const* const end = cur + size;
unsigned long total = 0;
while (cur < end) {
if (*cur >= 'a' && *cur <= 'a' + 26) {
++total;
++counts[*cur - 'a'];
} else if (*cur >= 'A' && *cur < 'A' + 26) {
++total;
++counts[*cur - 'A'];
}
++cur;
}
return total;
}


int main(void) {
size_t i;
FILE* file;
unsigned long total = 0;
unsigned long counts[26] = { 0 };

file = fopen("./data.txt", "rb");

if (file) {
size_t size;
static unsigned char buf[BUFFER];

while ((size = fread(buf, 1, BUFFER, file))) {
total += count_chars(buf, size, counts);
printf("\rcharacters processed: %lu", total);
if (feof(file) || ferror(file)) break;
}

fclose(file);
}

puts("\n____________________________________________________");

for (i = 0; i < 26; ++i) {
printf("%c %%%f\n", i + 'a',
(total) ? ((double)counts / total) * 100 : 0.0);
}

return 0;
}
____________________________________________________________________



Of course, if you wanted to parse huge multi-gigabyte files, this simplistic
approach would not be ideal. I can think of several different ways to
improve this thing through parallelization and memory mapped files. But,
that would not be standard C or C++.
 
C

Chris M. Thomasson

Chris M. Thomasson said:
Here is a stupid simple solution; needs ASCII:
____________________________________________________________________
#include <stdio.h>


#define BUFFER 32768U


static
unsigned long
count_chars(
void const* buf,
size_t size,
unsigned long* counts
) {
unsigned char const* cur = buf;
unsigned char const* const end = cur + size;
unsigned long total = 0;
while (cur < end) {
if (*cur >= 'a' && *cur <= 'a' + 26) {
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^



ARGHGHGHAHREHAH!!!!


If course that should be:


if (*cur >= 'a' && *cur < 'a' + 26) {



++total;
++counts[*cur - 'a'];
} else if (*cur >= 'A' && *cur < 'A' + 26) {
++total;
++counts[*cur - 'A'];
}
++cur;
}
return total;
}
[...]



I repost program:
____________________________________________________________________
#include <stdio.h>
#include <assert.h>


#define BUFFER 32768U


static
unsigned long
count_chars(
void const* buf,
size_t size,
unsigned long* counts
) {
unsigned char const* cur = buf;
unsigned char const* const end = cur + size;
unsigned long total = 0;
while (cur < end) {
if (*cur >= 'a' && *cur < 'a' + 26) {
assert(*cur - 'a' >= 0 && *cur - 'a' < 26);
++total;
++counts[*cur - 'a'];
} else if (*cur >= 'A' && *cur < 'A' + 26) {
assert(*cur - 'A' >= 0 && *cur - 'A' < 26);
++total;
++counts[*cur - 'A'];
}
++cur;
}
return total;
}


int main(void) {
size_t i;
FILE* file;
unsigned long total = 0;
unsigned long counts[26] = { 0 };

file = fopen("c:/newvz/vztest/data.txt", "rb");

if (file) {
size_t size;
static unsigned char buf[BUFFER];

while ((size = fread(buf, 1, BUFFER, file))) {
total += count_chars(buf, size, counts);
printf("\rcharacters processed: %lu", total);
if (feof(file) || ferror(file)) break;
}

fclose(file);
}

puts("\n____________________________________________________");

for (i = 0; i < 26; ++i) {
printf("%c %%%f\n", i + 'a',
(total) ? ((double)counts / total) * 100 : 0.0);
}

return 0;
}

____________________________________________________________________





Sorry about that retarded mistake!


;^(...
 
I

Ioannis Vranos

Peter said:
Dear news group,

I have created a small programming challenge for those of you who are
interested in challenging your Standard C++ programming skills. The
challenge is about counting character frequency in large texts,
perhaps useful for spam filtering or classical crypto analysis. You
can read more about it here:

http://blog.p-jansson.com/2009/06/programming-challenge-letter-frequency.html

With kind regards,
Peter Jansson


When you say characters" you mean letters, symbols, everything in the basic character set (1-127), or it may
include wide_characters?




--
Ioannis A. Vranos

C95 / C++03 Developer

http://www.cpp-software.net
 
I

Ioannis Vranos

Ioannis said:
When you say characters" you mean letters, symbols, everything in the
basic character set (1-127), or it may include wide_characters?


Since this is comp.lang.c++, I will post a high-level C++ solution.


Preliminary codes, while waiting for an answer.



#include <map>
#include <fstream>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <string>
#include <cctype>
#include <ctime>


int main(int argc, char **argv)
{
using namespace std;

map<char, unsigned long> characterFrequencies;


// The array where the read characters will be stored.
char buffer[BUFSIZ];


// If argc!= 2, then either the number of arguments is not correct, or the platform does not
// support arguments.
if(argc!= 2)
{
cerr<< "\nUsage: "<< argv[0]<< " fileNameToRead\n\n";

return EXIT_FAILURE;
}




// We disable synchronisation with stdio, to speed up C++ I/O.
ios_base::sync_with_stdio(false);


string characters= "ABDEFGHIJKLMNOPQRSTUVWXYZ";

clock_t time1, time2;


// We start timing.
time1= clock();

// We open the file
ifstream inputFile(argv[argc -1]);


// An error happened
if(not inputFile.good())
{
cerr<< "\nCould not open file for reading, exiting...\n\n";

return EXIT_FAILURE;
}


do
{
inputFile.read(buffer, sizeof(buffer));


for(streamsize i= 0; i< inputFile.gcount(); ++i)
++characterFrequencies[ buffer ];

}while(not inputFile.eof());



// Since rule 1 is: "Your program should be case insensitive when it counts letters",
// we add the results of lowercase characters and their equivallent uppercase letters together.
cout<<"\n\n\nThe letter frequencies are:\n";




for(string::size_type i= 0; i< characters.size(); ++i)
cout<< characters<< ": "<< characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ]<< "\n";


// We "stop" timing.
time2= clock();

// We convert he timing to seconds.
double totalTimeInSeconds= (time2- time1)/ CLOCKS_PER_SEC;

cout<<"\n\nThe whole process took "<< fixed<< totalTimeInSeconds<< " seconds\n";

cout<<"\n\nHave a nice day!\n";
}





--
Ioannis A. Vranos

C95 / C++03 Developer

http://www.cpp-software.net
 
I

Ioannis Vranos

Peter said:
Dear news group,

I have created a small programming challenge for those of you who are
interested in challenging your Standard C++ programming skills. The
challenge is about counting character frequency in large texts,
perhaps useful for spam filtering or classical crypto analysis. You
can read more about it here:

http://blog.p-jansson.com/2009/06/programming-challenge-letter-frequency.html

With kind regards,
Peter Jansson


High level C++ solution:


Code 1:


#include <map>
#include <fstream>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <string>
#include <cctype>
#include <ctime>


int main(int argc, char **argv)
{
using namespace std;


// Warning: long double has problems with MINGW compiler for Windows.
map<char, long double> characterFrequencies;


// The array where the read characters will be stored.
char buffer[BUFSIZ];


// If argc!= 2, then either the number of arguments is not correct, or the platform does not
// support arguments.
if(argc!= 2)
{
cerr<< "\nUsage: "<< argv[0]<< " fileNameToRead\n\n";

return EXIT_FAILURE;
}




// We disable synchronisation with stdio, to speed up C++ I/O.
ios_base::sync_with_stdio(false);


string characters= "ABDEFGHIJKLMNOPQRSTUVWXYZ";

clock_t time1, time2;


// We start timing.
time1= clock();

// We open the file
ifstream inputFile(argv[argc -1]);


// An error happened
if(not inputFile.good())
{
cerr<< "\nCould not open file for reading, exiting...\n\n";

return EXIT_FAILURE;
}


do
{
inputFile.read(buffer, sizeof(buffer));


for(streamsize i= 0; i< inputFile.gcount(); ++i)
++characterFrequencies[ buffer ];

}while(not inputFile.eof());



// Since rule 1 is: "Your program should be case insensitive when it counts letters",
// we add the results of lowercase characters and their equivallent uppercase letters together.
cout<<fixed<< "\n\n\nThe letter frequencies are:\n";


long double totalcharacterFrequencies= 0;

for(string::size_type i= 0; i< characters.size(); ++i)
totalcharacterFrequencies+= characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ];


for(string::size_type i= 0; i< characters.size(); ++i)
cout<< characters<< ": "<< (characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ])/ totalcharacterFrequencies* 100<< "%\n";


// We "stop" timing.
time2= clock();

// We convert the timing to seconds.
double totalTimeInSeconds= (time2- time1)/ CLOCKS_PER_SEC;


cout<<"\n\nThe whole process took "<< fixed<< totalTimeInSeconds<< " seconds.\n";

cout<<"\n\nHave a nice day!\n";
}




Code 2:

The above code more "inlined" (denoted with "==>").

This code is not any faster with my compiler (as it shouldn't), than the original code above.



#include <map>
#include <fstream>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <string>
#include <cctype>
#include <ctime>


int main(int argc, char **argv)
{
using namespace std;


// Warning: long double has problems with MINGW compiler for Windows.
map<char, long double> characterFrequencies;


// The array where the read characters will be stored.
char buffer[BUFSIZ];


// If argc!= 2, then either the number of arguments is not correct, or the platform does not
// support arguments.
if(argc!= 2)
{
cerr<< "\nUsage: "<< argv[0]<< " fileNameToRead\n\n";

return EXIT_FAILURE;
}




// We disable synchronisation with stdio, to speed up C++ I/O.
ios_base::sync_with_stdio(false);


string characters= "ABDEFGHIJKLMNOPQRSTUVWXYZ";

clock_t time1, time2;


// We start timing.
time1= clock();

// We open the file
ifstream inputFile(argv[argc -1]);


// An error happened
if(not inputFile.good())
{
cerr<< "\nCould not open file for reading, exiting...\n\n";

return EXIT_FAILURE;
}



do
{
inputFile.read(buffer, sizeof(buffer));

==> ADDED: const streamsize SIZE= inputFile.gcount();


==> CHANGED: for(streamsize i= 0; i< SIZE; ++i)
++characterFrequencies[ buffer ];

}while(not inputFile.eof());



// Since rule 1 is: "Your program should be case insensitive when it counts letters",
// we add the results of lowercase characters and their equivallent uppercase letters together.
cout<<fixed<< "\n\n\nThe letter frequencies are:\n";


long double totalcharacterFrequencies= 0;

for(string::size_type i= 0; i< characters.size(); ++i)
totalcharacterFrequencies+= characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ];


for(string::size_type i= 0; i< characters.size(); ++i)
cout<< characters<< ": "<< (characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ])/ totalcharacterFrequencies* 100<< "%\n";


// We "stop" timing.
time2= clock();

// We convert the timing to seconds.
double totalTimeInSeconds= (time2- time1)/ CLOCKS_PER_SEC;


cout<<"\n\nThe whole process took "<< fixed<< totalTimeInSeconds<< " seconds.\n";

cout<<"\n\nHave a nice day!\n";
}




--
Ioannis A. Vranos

C95 / C++03 Developer

http://www.cpp-software.net
 
I

Ioannis Vranos

Corrected:


Peter said:
Dear news group,

I have created a small programming challenge for those of you who are
interested in challenging your Standard C++ programming skills. The
challenge is about counting character frequency in large texts,
perhaps useful for spam filtering or classical crypto analysis. You
can read more about it here:

http://blog.p-jansson.com/2009/06/programming-challenge-letter-frequency.html

With kind regards,
Peter Jansson


High level C++ solution:


Code 1:


#include <map>
#include <fstream>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <string>
#include <cctype>
#include <ctime>


int main(int argc, char **argv)
{
using namespace std;


// Warning: long double has problems with MINGW compiler for Windows.
map<char, long double> characterFrequencies;


// The array where the read characters will be stored.
char buffer[BUFSIZ];


// If argc!= 2, then either the number of arguments is not correct, or the platform does not
// support arguments.
if(argc!= 2)
{
cerr<< "\nUsage: "<< argv[0]<< " fileNameToRead\n\n";

return EXIT_FAILURE;
}




// We disable synchronisation with stdio, to speed up C++ I/O.
ios_base::sync_with_stdio(false);


string characters= "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

clock_t time1, time2;


// We start timing.
time1= clock();

// We open the file
ifstream inputFile(argv[argc -1]);


// An error happened
if(not inputFile.good())
{
cerr<< "\nCould not open file for reading, exiting...\n\n";

return EXIT_FAILURE;
}


do
{
inputFile.read(buffer, sizeof(buffer));


for(streamsize i= 0; i< inputFile.gcount(); ++i)
++characterFrequencies[ buffer ];

}while(not inputFile.eof());



// Since rule 1 is: "Your program should be case insensitive when it counts letters",
// we add the results of lowercase characters and their equivallent uppercase letters together.
cout<<fixed<< "\n\n\nThe letter frequencies are:\n";


long double totalcharacterFrequencies= 0;

for(string::size_type i= 0; i< characters.size(); ++i)
totalcharacterFrequencies+= characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ];


for(string::size_type i= 0; i< characters.size(); ++i)
cout<< characters<< ": "<< (characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ])/ totalcharacterFrequencies* 100<< "%\n";


// We "stop" timing.
time2= clock();

// We convert the timing to seconds.
double totalTimeInSeconds= (time2- time1)/ CLOCKS_PER_SEC;


cout<<"\n\nThe whole process took "<< fixed<< totalTimeInSeconds<< " seconds.\n";

cout<<"\n\nHave a nice day!\n";
}




Code 2:

The above code more "inlined" (denoted with "==>").

This code is not any faster with my compiler (as it shouldn't), than the original code above.



#include <map>
#include <fstream>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <string>
#include <cctype>
#include <ctime>


int main(int argc, char **argv)
{
using namespace std;


// Warning: long double has problems with MINGW compiler for Windows.
map<char, long double> characterFrequencies;


// The array where the read characters will be stored.
char buffer[BUFSIZ];


// If argc!= 2, then either the number of arguments is not correct, or the platform does not
// support arguments.
if(argc!= 2)
{
cerr<< "\nUsage: "<< argv[0]<< " fileNameToRead\n\n";

return EXIT_FAILURE;
}




// We disable synchronisation with stdio, to speed up C++ I/O.
ios_base::sync_with_stdio(false);


string characters= "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

clock_t time1, time2;


// We start timing.
time1= clock();

// We open the file
ifstream inputFile(argv[argc -1]);


// An error happened
if(not inputFile.good())
{
cerr<< "\nCould not open file for reading, exiting...\n\n";

return EXIT_FAILURE;
}



do
{
inputFile.read(buffer, sizeof(buffer));

==> ADDED: const streamsize SIZE= inputFile.gcount();


==> CHANGED: for(streamsize i= 0; i< SIZE; ++i)
++characterFrequencies[ buffer ];

}while(not inputFile.eof());



// Since rule 1 is: "Your program should be case insensitive when it counts letters",
// we add the results of lowercase characters and their equivallent uppercase letters together.
cout<<fixed<< "\n\n\nThe letter frequencies are:\n";


long double totalcharacterFrequencies= 0;

for(string::size_type i= 0; i< characters.size(); ++i)
totalcharacterFrequencies+= characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ];


for(string::size_type i= 0; i< characters.size(); ++i)
cout<< characters<< ": "<< (characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ])/ totalcharacterFrequencies* 100<< "%\n";


// We "stop" timing.
time2= clock();

// We convert the timing to seconds.
double totalTimeInSeconds= (time2- time1)/ CLOCKS_PER_SEC;


cout<<"\n\nThe whole process took "<< fixed<< totalTimeInSeconds<< " seconds.\n";

cout<<"\n\nHave a nice day!\n";
}




--
Ioannis A. Vranos

C95 / C++03 Developer

http://www.cpp-software.net
 
B

Bart van Ingen Schenau

Peter said:
Dear news group,

I have created a small programming challenge for those of you who are
interested in challenging your Standard C++ programming skills. The
challenge is about counting character frequency in large texts,
perhaps useful for spam filtering or classical crypto analysis. You
can read more about it here:

http://blog.p-jansson.com/2009/06/programming-challenge-letter- frequency.html

With kind regards,
Peter Jansson

Lets see how the std algorithms measure up to the rest (only tested on
its own source so far):

#include <fstream>
#include <iostream>
#include <numeric>
#include <map>
#include <ctype.h>
#include <iterator>
#include <algorithm>
#include <time.h>

struct accum_original
{
std::map<char, long> counts;
long total;
accum_original() : counts(), total(0) {}
void count_char(char c)
{
total++;
counts[tolower((unsigned char)c)]++;
}
const std::map<char, long>& getCounts()
{
return counts;
}
long getTotal()
{
return total;
}
};

struct accum_fast
{
struct data {
std::map<char, long> counts;
long total;
int refcount;
data() : counts(), total(0), refcount(1) {}
}* payload;
accum_fast() : payload(new data) {}
accum_fast(const accum_fast& other) : payload(other.payload) {
payload->refcount++; }
~accum_fast() { if (--payload->refcount == 0) { delete payload; } }
void swap(accum_fast& other)
{
data* temp(other.payload);
other.payload = payload;
payload = temp;
}
accum_fast& operator=(accum_fast rhs)
{
swap(rhs);
return *this;
}
void count_char(char c)
{
payload->total++;
payload->counts[tolower((unsigned char)c)]++;
}
const std::map<char, long>& getCounts()
{
return payload->counts;
}
long getTotal()
{
return payload->total;
}
};
typedef accum_fast accum;

accum count_chars(accum acc, char c)
{
acc.count_char(c);
return acc;
}

struct scaler
{
long total;
scaler(long total_) : total(total_) {}
std::pair<const char, double> operator()(std::pair<const char, long>
value)
{
return std::pair<const char, double>(value.first,
(100.0*value.second)/total);
}
};

namespace std {
ostream& operator<<(ostream& os, const pair<const char, double>& value)
{
os << '\'' << value.first << "' " << value.second << "%\n";
return os;
}
}

int main(int argc, char** argv)
{
clock_t time1, time2;
accum acc;

if (argc < 2)
{
time1 = clock();
acc = std::accumulate(std::istream_iterator<char>(std::cin),
std::istream_iterator<char>(), acc, count_chars);
}
else
{
time1 = clock();
std::ifstream file(argv[1]);
acc = std::accumulate(std::istream_iterator<char>(file),
std::istream_iterator<char>(), acc, count_chars);
}

std::transform(acc.getCounts().begin(), acc.getCounts().end(),
std::ostream_iterator said:
(std::cout, ""),
scaler(acc.getTotal()));
time2 = clock();
std::cout << "Time taken: " << (time2 - time1) << " clock ticks = " <<
(time2 - time1)*1.0/CLOCKS_PER_SEC << " seconds" << std::endl;
}
 
A

agelos.anaptixi.net

Dear news group,

I have created a small programming challenge for those of you who are
interested in challenging your Standard C++ programming skills. The
challenge is about counting character frequency in large texts,
perhaps useful for spam filtering or classical crypto analysis. You
can read more about it here:

http://blog.p-jansson.com/2009/06/programming-challenge-letter-freque...

With kind regards,
Peter Jansson

Does this work? :p

#include <fstream>
#include <iostream>
#include <ctime>

int main(int argc, char **argv)
{
using namespace std;

int frequency[128];
memset(frequency, 0, sizeof(int) * 128);

const int BUFFER_SIZE = 4096;
char buffer[BUFFER_SIZE];
unsigned int totalChars = 0;

if (argc != 2)
{
cerr << "\nUsage: " << argv[0] << " fileNameToRead\n\n";
return EXIT_FAILURE;
}

clock_t time1, time2;

time1 = clock();

ifstream inputFile(argv[argc -1]);

if (!inputFile.good())
{
cerr << "Bad file, ciao!\n";
return EXIT_FAILURE;
}

int *p;
char *end;

do
{
inputFile.read(buffer, BUFFER_SIZE);
int bytesRead = inputFile.gcount();

totalChars += bytesRead;

end = buffer + bytesRead;
for (p = (int *)buffer; p <= (int *)(end - sizeof(int)); p++)
{
frequency[(*p & 0xff000000) >> 24]++;
frequency[(*p & 0x00ff0000) >> 16]++;
frequency[(*p & 0x0000ff00) >> 8 ]++;
frequency[(*p & 0x000000ff) ]++;
}

}while(!inputFile.eof());

char *c = (char *)p;
while (c != end)
{
frequency[*c]++;
c++;
}

inputFile.close();

double dTotalChars = (double)totalChars;

for (int i = 'A'; i <= 'Z'; i++)
{
cout << (char)i << " = " << (frequency / dTotalChars) * 100.0 <<
"%\n";
}

for (int i = 'a'; i <= 'z'; i++)
{
cout << (char)i << " = " << (frequency / dTotalChars) * 100.0 <<
"%\n";
}

time2 = clock();

double totalTimeInSeconds = (time2- time1);
cout << "Time elapsed (ms) : " << totalTimeInSeconds << endl;
}
 
A

agelos.anaptixi.net

Dear news group,

I have created a small programming challenge for those of you who are
interested in challenging your Standard C++ programming skills. The
challenge is about counting character frequency in large texts,
perhaps useful for spam filtering or classical crypto analysis. You
can read more about it here:

http://blog.p-jansson.com/2009/06/programming-challenge-letter-freque...

With kind regards,
Peter Jansson

Actually, just changing the line :
ifstream inputFile(argv[argc -1]);
with
ifstream inputFile(argv[argc -1],std::ios::binary);

beats any other post (on my PC at least :))

Here's the new code (just added the binary open mode)

#include <fstream>
#include <iostream>
#include <ctime>

int main(int argc, char **argv)
{
using namespace std;

int frequency[128];
memset(frequency, 0, sizeof(int) * 128);

const int BUFFER_SIZE = 4096;
char buffer[BUFFER_SIZE];
unsigned int totalChars = 0;

if (argc != 2)
{
cerr << "\nUsage: " << argv[0] << " fileNameToRead\n\n";
return EXIT_FAILURE;
}

clock_t time1, time2;

time1 = clock();

ifstream inputFile(argv[argc -1],std::ios::binary);

if (!inputFile.good())
{
cerr << "Bad file, ciao!\n";
return EXIT_FAILURE;
}

int *p;
char *end;

do
{
inputFile.read(buffer, BUFFER_SIZE);
int bytesRead = inputFile.gcount();

totalChars += bytesRead;

end = buffer + bytesRead;
for (p = (int *)buffer; p <= (int *)(end - sizeof(int)); p++)
{
frequency[(*p & 0xff000000) >> 24]++;
frequency[(*p & 0x00ff0000) >> 16]++;
frequency[(*p & 0x0000ff00) >> 8 ]++;
frequency[(*p & 0x000000ff) ]++;
}

}while(!inputFile.eof());

char *c = (char *)p;
while (c != end)
{
frequency[*c]++;
c++;
}

inputFile.close();

double dTotalChars = (double)totalChars;

for (int i = 'A'; i <= 'Z'; i++)
{
cout << (char)i << " = " << (frequency / dTotalChars) * 100.0 <<
"%\n";
}

for (int i = 'a'; i <= 'z'; i++)
{
cout << (char)i << " = " << (frequency / dTotalChars) * 100.0 <<
"%\n";
}

time2 = clock();

double totalTimeInSeconds = (time2- time1);
cout << "Time elapsed (ms) : " << totalTimeInSeconds << endl;
}
 
C

Chris M. Thomasson

Chris M. Thomasson said:
Chris M. Thomasson said:
Peter Jansson said:
Dear news group,

I have created a small programming challenge for those of you who are
interested in challenging your Standard C++ programming skills. The
challenge is about counting character frequency in large texts,
perhaps useful for spam filtering or classical crypto analysis. You
can read more about it here:

http://blog.p-jansson.com/2009/06/programming-challenge-letter-frequency.html

Here is a stupid simple solution; needs ASCII:
[...]

Here, let me repost this crappy little thing using a bigger buffer, without
using any asserts and calls to printf during the actual processing phase:
_________________________________________________________________________
#include <stdio.h>


#define BUFFER 65536U


static
unsigned long
count_chars(
void const* buf,
size_t size,
unsigned long* counts
) {
unsigned char const* cur = buf;
unsigned char const* const end = cur + size;
unsigned long total = 0;
while (cur < end) {
if (*cur >= 'a' && *cur < 'a' + 26) {
++total;
++counts[*cur - 'a'];
} else if (*cur >= 'A' && *cur < 'A' + 26) {
++total;
++counts[*cur - 'A'];
}
++cur;
}
return total;
}


int main(void) {
size_t i;
FILE* file;
unsigned long total = 0;
unsigned long counts[26] = { 0 };

file = fopen("./data.txt", "rb");

if (file) {
size_t size;
static unsigned char buf[BUFFER];

while ((size = fread(buf, 1, BUFFER, file))) {
total += count_chars(buf, size, counts);
if (feof(file) || ferror(file)) break;
}

fclose(file);
}

for (i = 0; i < 26; ++i) {
printf("%c %%%f\n", i + 'a',
(total) ? ((double)counts / total) * 100 : 0.0);
}

return 0;
}
_________________________________________________________________________




That just might shave one or two milliseconds off... lol


;^)
 
I

Ioannis Vranos

This is an improved version of the original program on run-time performance.

I replaced std::map with std::valarray. std::map is an associative container performing ordering, with worse
random access performance than std::vector. In usual applications it doesn't matter much, but since we use
timings, here is the updated code.


The code continues to be independent on how the basic character set is implemented (e.g. ASCII, EBCDIC).
However since other codes posted with ASCII implementation assumptions were considered equivalent by the OP to
generic portable code, I may post ASCII-specific C++ code tomorrow.


Here's the code:


#include <valarray>
#include <fstream>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <string>
#include <cctype>
#include <ctime>


int main(int argc, char **argv)
{
using namespace std;


// Warning: long double has problems with MINGW compiler for Windows.
//
// The C++ basic character set is using the value range [0, 127].
// If we used vector<long double>, it would not have any run-time difference in any modern compiler.
valarray<long double> characterFrequencies(127);


// The array where the read characters will be stored.
char buffer[BUFSIZ];


// If argc!= 2, then either the number of arguments is not correct, or the platform does not
// support arguments.
if(argc!= 2)
{
cerr<< "\nUsage: "<< argv[0]<< " fileNameToRead\n\n";

return EXIT_FAILURE;
}




// We disable synchronisation with stdio, to speed up C++ I/O.
ios_base::sync_with_stdio(false);


string characters= "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

clock_t time1, time2;


// We start timing.
time1= clock();

// We open the file
ifstream inputFile(argv[argc -1]);


// An error happened
if(not inputFile.good())
{
cerr<< "\nCould not open file for reading, exiting...\n\n";

return EXIT_FAILURE;
}


do
{
inputFile.read(buffer, sizeof(buffer));


for(streamsize i= 0; i< inputFile.gcount(); ++i)
++characterFrequencies[ buffer ];

}while(not inputFile.eof());



// Since rule 1 is: "Your program should be case insensitive when it counts letters",
// we add the results of lowercase characters and their equivallent uppercase letters together.
cout<<fixed<< "\n\n\nThe letter frequencies are:\n";


long double totalcharacterFrequencies= 0;

for(string::size_type i= 0; i< characters.size(); ++i)
totalcharacterFrequencies+= characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ];


for(string::size_type i= 0; i< characters.size(); ++i)
cout<< characters<< ": "<< (characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ])/ totalcharacterFrequencies* 100<< "%\n";


// We "stop" timing.
time2= clock();


// We convert the timing to seconds.
double totalTimeInSeconds= (time2- time1)/ CLOCKS_PER_SEC;


cout<<"\n\nThe whole process took "<< totalTimeInSeconds<< " seconds.\n";

cout<<"\n\nHave a nice day!\n";
}



--
Ioannis A. Vranos

C95 / C++03 Developer

http://www.cpp-software.net
 
C

Chris M. Thomasson

Peter Jansson said:
Dear news group,

I have created a small programming challenge for those of you who are
interested in challenging your Standard C++ programming skills. The
challenge is about counting character frequency in large texts,
perhaps useful for spam filtering or classical crypto analysis. You
can read more about it here:

http://blog.p-jansson.com/2009/06/programming-challenge-letter-frequency.html

Well, try this one out too:
_______________________________________________________________________________
#include <stdio.h>
#include <limits.h>


#define BUFFER 65536U


int main(int argc, char** argv) {
size_t i;
unsigned long total = 0;
unsigned long counts[UCHAR_MAX + 1] = { 0 };
FILE* file = fopen(argv[1], "rb");

if (file) {
size_t size;
static unsigned char buf[BUFFER];

while ((size = fread(buf, 1, BUFFER, file))) {

while (size) {
--size;
++counts[buf[size]];
}

if (size < BUFFER) {
if (feof(file) || ferror(file)) break;
}
}

fclose(file);
}


for (i = 0; i < 26; ++i) {
total += counts[i + 'a'];
total += counts[i + 'A'];
}

for (i = 0; i < 26; ++i) {
printf("%c %%%f\n", i + 'a',
(total) ? ((double)(counts[i + 'a'] + counts[i + 'A']) / total) *
100.0 : 0.0);
}

return 0;
}
_______________________________________________________________________________



It just might beat my previous submission by a little bit...


;^)
 
I

Ioannis Vranos

Corrected:


This is an improved version of the original program on run-time performance.

I replaced std::map with std::valarray. std::map is an associative container performing ordering, with worse
random access performance than std::vector. In usual applications it doesn't matter much, but since we use
timings, here is the updated code.


The code continues to be independent on how the basic character set is implemented (e.g. ASCII, EBCDIC).
However since other codes posted with ASCII implementation assumptions were considered equivalent by the OP to
generic portable code, I may post ASCII-specific C++ code tomorrow.


Here's the code:


#include <valarray>
#include <fstream>
#include <cstdlib>
#include <cstdio>
#include <iostream>
#include <string>
#include <cctype>
#include <ctime>


int main(int argc, char **argv)
{
using namespace std;


// Warning: long double has problems with MINGW compiler for Windows.
//
// The C++ basic character set is using the value range [0, 127].
// If we used vector<long double>, it would not have any run-time difference in any modern compiler.
valarray<long double> characterFrequencies(128);


// The array where the read characters will be stored.
char buffer[BUFSIZ];


// If argc!= 2, then either the number of arguments is not correct, or the platform does not
// support arguments.
if(argc!= 2)
{
cerr<< "\nUsage: "<< argv[0]<< " fileNameToRead\n\n";

return EXIT_FAILURE;
}




// We disable synchronisation with stdio, to speed up C++ I/O.
ios_base::sync_with_stdio(false);


string characters= "ABCDEFGHIJKLMNOPQRSTUVWXYZ";

clock_t time1, time2;


// We start timing.
time1= clock();

// We open the file
ifstream inputFile(argv[argc -1]);


// An error happened
if(not inputFile.good())
{
cerr<< "\nCould not open file for reading, exiting...\n\n";

return EXIT_FAILURE;
}


do
{
inputFile.read(buffer, sizeof(buffer));


for(streamsize i= 0; i< inputFile.gcount(); ++i)
++characterFrequencies[ buffer ];

}while(not inputFile.eof());



// Since rule 1 is: "Your program should be case insensitive when it counts letters",
// we add the results of lowercase characters and their equivallent uppercase letters together.
cout<<fixed<< "\n\n\nThe letter frequencies are:\n";


long double totalcharacterFrequencies= 0;

for(string::size_type i= 0; i< characters.size(); ++i)
totalcharacterFrequencies+= characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ];


for(string::size_type i= 0; i< characters.size(); ++i)
cout<< characters<< ": "<< (characterFrequencies[ characters ]+ characterFrequencies[
tolower(characters) ])/ totalcharacterFrequencies* 100<< "%\n";


// We "stop" timing.
time2= clock();


// We convert the timing to seconds.
double totalTimeInSeconds= (time2- time1)/ CLOCKS_PER_SEC;


cout<<"\n\nThe whole process took "<< totalTimeInSeconds<< " seconds.\n";

cout<<"\n\nHave a nice day!\n";
}



--
Ioannis A. Vranos

C95 / C++03 Developer

http://www.cpp-software.net
 
G

giannis t

check this out:

#include <stdlib.h>
#include <iostream>
#include <string.h>
#include <iomanip>
#include <math.h>
#include <fstream>

const int MAX_TEXT_LENGTH = 8192;

int freq[256];
double pctFreq[256];

int main(int argc, char** argv) {
for (int i = 0; i < 256; i++) {
freq = 0;
pctFreq = 0;
}
unsigned char text[MAX_TEXT_LENGTH];
int count = 0;
char c;
int diff = 'a' - 'A';
std::ios_base::sync_with_stdio(false);
std::ifstream file(argv[1], std::ios::binary);
for (; file.read((char*) text, MAX_TEXT_LENGTH);) {
for (std::streamsize i(0), num(file.gcount()); i < num; ++i) {
c = text;
if ((c >= 'A') && (c <= 'Z')) {
++freq[c];
count++;
} else if ((c >= 'a') && (c <= 'z')) {
++freq[c - diff];
count++;
}

}
}
file.close();
for (int i = ('A'); i <= 'Z'; i++) {
pctFreq = round((double(freq) / count)*10000) / 100;
std::cout << i << ": " << std::setw(5) << std::setfill('0') <<
std::fixed << std::setprecision(2) << pctFreq << "%\n";
}
return (EXIT_SUCCESS);
}
 
I

Ioannis Vranos

giannis said:
check this out:

#include <stdlib.h>
#include <iostream>
#include <string.h>
#include <iomanip>
#include <math.h>
#include <fstream>

const int MAX_TEXT_LENGTH = 8192;

int freq[256];
double pctFreq[256];

int main(int argc, char** argv) {
for (int i = 0; i < 256; i++) {
freq = 0;
pctFreq = 0;
}
unsigned char text[MAX_TEXT_LENGTH];
int count = 0;
char c;
int diff = 'a' - 'A';
std::ios_base::sync_with_stdio(false);
std::ifstream file(argv[1], std::ios::binary);
for (; file.read((char*) text, MAX_TEXT_LENGTH);) {
for (std::streamsize i(0), num(file.gcount()); i < num; ++i) {
c = text;
if ((c >= 'A') && (c <= 'Z')) {
++freq[c];
count++;
} else if ((c >= 'a') && (c <= 'z')) {
++freq[c - diff];
count++;
}

}
}
file.close();
for (int i = ('A'); i <= 'Z'; i++) {
pctFreq = round((double(freq) / count)*10000) / 100;
std::cout << i << ": " << std::setw(5) << std::setfill('0') <<
std::fixed << std::setprecision(2) << pctFreq << "%\n";
}
return (EXIT_SUCCESS);
}



Reading a text file as binary, is not guaranteed to work. It just happens that binary reading is implemented
as text reading in your platform.



--
Ioannis A. Vranos

C95 / C++03 Developer

http://www.cpp-software.net
 
I

Ioannis Vranos

Ioannis said:
giannis said:
check this out:

#include <stdlib.h>
#include <iostream>
#include <string.h>
#include <iomanip>
#include <math.h>
#include <fstream>

const int MAX_TEXT_LENGTH = 8192;

int freq[256];
double pctFreq[256];

int main(int argc, char** argv) {
for (int i = 0; i < 256; i++) {
freq = 0;
pctFreq = 0;
}
unsigned char text[MAX_TEXT_LENGTH];
int count = 0;
char c;
int diff = 'a' - 'A';
std::ios_base::sync_with_stdio(false);
std::ifstream file(argv[1], std::ios::binary);
for (; file.read((char*) text, MAX_TEXT_LENGTH);) {
for (std::streamsize i(0), num(file.gcount()); i < num; ++i) {
c = text;
if ((c >= 'A') && (c <= 'Z')) {
++freq[c];
count++;
} else if ((c >= 'a') && (c <= 'z')) {
++freq[c - diff];
count++;
}

}
}
file.close();
for (int i = ('A'); i <= 'Z'; i++) {
pctFreq = round((double(freq) / count)*10000) / 100;
std::cout << i << ": " << std::setw(5) << std::setfill('0') <<
std::fixed << std::setprecision(2) << pctFreq << "%\n";
}
return (EXIT_SUCCESS);
}



Reading a text file as binary, is not guaranteed to work. It just
happens that binary reading is implemented as text reading in your
platform.



.... and is platform specific (ASCII).


--
Ioannis A. Vranos

C95 / C++03 Developer

http://www.cpp-software.net
 
I

Ioannis Vranos

Actually, just changing the line :
ifstream inputFile(argv[argc -1]);
with
ifstream inputFile(argv[argc -1],std::ios::binary);

beats any other post (on my PC at least :))

Here's the new code (just added the binary open mode)

#include <fstream>
#include <iostream>
#include <ctime>

int main(int argc, char **argv)
{
using namespace std;

int frequency[128];
memset(frequency, 0, sizeof(int) * 128);

const int BUFFER_SIZE = 4096;
char buffer[BUFFER_SIZE];
unsigned int totalChars = 0;

if (argc != 2)
{
cerr << "\nUsage: " << argv[0] << " fileNameToRead\n\n";
return EXIT_FAILURE;
}

clock_t time1, time2;

time1 = clock();

ifstream inputFile(argv[argc -1],std::ios::binary);

if (!inputFile.good())
{
cerr << "Bad file, ciao!\n";
return EXIT_FAILURE;
}

int *p;
char *end;

do
{
inputFile.read(buffer, BUFFER_SIZE);
int bytesRead = inputFile.gcount();

totalChars += bytesRead;

end = buffer + bytesRead;
for (p = (int *)buffer; p <= (int *)(end - sizeof(int)); p++)
{
frequency[(*p & 0xff000000) >> 24]++;
frequency[(*p & 0x00ff0000) >> 16]++;
frequency[(*p & 0x0000ff00) >> 8 ]++;
frequency[(*p & 0x000000ff) ]++;
}

}while(!inputFile.eof());

char *c = (char *)p;
while (c != end)
{
frequency[*c]++;
c++;
}

inputFile.close();

double dTotalChars = (double)totalChars;

for (int i = 'A'; i <= 'Z'; i++)
{
cout << (char)i << " = " << (frequency / dTotalChars) * 100.0 <<
"%\n";
}

for (int i = 'a'; i <= 'z'; i++)
{
cout << (char)i << " = " << (frequency / dTotalChars) * 100.0 <<
"%\n";
}

time2 = clock();

double totalTimeInSeconds = (time2- time1);
cout << "Time elapsed (ms) : " << totalTimeInSeconds << endl;
}



It doesn't compile.


john@john-laptop:~/Projects/anjuta/cpp/src$ g++ main.cc -o foobar main.cc: In function ¡int main(int, char**)¢:
main.cc:10: error: ¡memset¢ was not declared in this scope
main.cc:19: error: ¡EXIT_FAILURE¢ was not declared in this scope
main.cc:31: error: ¡EXIT_FAILURE¢ was not declared in this scope
john@john-laptop:~/Projects/anjuta/cpp/src$



--
Ioannis A. Vranos

C95 / C++03 Developer

http://www.cpp-software.net
 

Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only take a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Members online

Forum statistics

Threads
473,768
Messages
2,569,574
Members
45,049
Latest member
Allen00Reed

Latest Threads

Top