Larry said:
Ok, so the lines are each 192 bytes long (including the \r\n).
If you use fread() to read the data, then fread needs to read
192 bytes - NOT 190. "\r\n" is not special to fread() - it reads
raw bytes. So, if you read only 190 bytes when each 'line'
is actually 192 bytes long, then the fields for all records
except the first one will each be off by 2 bytes from the previous
record, e.g. by the time you get to record 40, your data fields
will be off by 80 bytes from where you think they are. This will
cause your sFileData structs to NOT have the contents you expect,
and may be contributing to the terrible performance that you
are seeing.
I have written 3 small programs that I will post in a few
minutes. I wrote them using a 190 byte line length (including
the trailing "\r\n"). As soon as I change them to use 192
byte lines, I'll post them. They are:
simondat.c: to create a test input data file named "simon.dat"
with 100000 records for use by the other 2 programs.
simon.cpp: uses 'char *' with new/delete for the string
fields in sFileData.
simon2.cpp: uses std::string for the string fields in
sFileData.
On my pc (an old Gateway PII 450MHZ with 384MB of RAM):
simon.cpp runs in 2.20 seconds and uses 5624KB of memory.
simon2.cpp runs in 2.22 seconds and uses 6272KB of memory.
Your mileage may vary. I'm running SuSE Linux v9.3 and
using the GCC "g++" compiler v3.3.5.
Regards,
Larry
Here are the 3 programs:
----------------------------------------
/* simondat.c Builds an MS-Windows format data file 'simon.dat' for
* use as input to the various simon* test programs.
* To compile:
* MS Windows: cl simondat.c
* Linux: gcc -o simondat simondat.c
*
* The file created, "simon.dat", is:
* Composed of 192 byte fixed length records containing ascii text.
* A trailing MS-Windows newline (the 2 char string "\r\n")
* comprises the last 2 bytes of each of the 192 byte records.
* The first 192 byte record contains the count of remaining
* records in the file as the only field in that record.
* All subsequent records follow the printf() format:
* "%-40s%-40s%-20d%-20d%70c\r\n"
* the 70 bytes that precede the "\r\n" are all blanks.
*/
#include <stdio.h>
/* Creates "simon.dat": one header record holding the record count, followed
 * by that many fixed-length data records. Every record is exactly 192 bytes
 * long, including its trailing "\r\n".
 * Returns 0 on success, 1 if the file cannot be created or fully written.
 */
int main()
{
    FILE * fp;
    unsigned int i;
    unsigned int recs = 100000;
    int failed;

    /* "wb" (binary mode) so that "\r\n" is written verbatim on every platform */
    fp = fopen("simon.dat", "wb");
    if (!fp)
    {
        perror("simon.dat");
        return 1;
    }

    /* header record: 190 chars + "\r\n" - this MUST be 192 chars total */
    fprintf(fp, "%-190u\r\n", recs);

    for (i = 0; i < recs; i++)
    {
        /* %u rather than %d: both numeric arguments are unsigned int.
         * %70c right-justifies the single blank in 70 columns, which
         * yields the 70 blanks that pad the record out to 190 chars. */
        fprintf(fp, "%-40s%-40s%-20u%-20u%70c\r\n",
                "Helo", "Goodbye", i + 1, recs - i, ' ');
    }

    /* buffered output may only fail when it is flushed, so check the
     * stream's error flag AND the result of fclose() */
    failed = ferror(fp);
    if (fclose(fp) != 0)
        failed = 1;
    if (failed)
    {
        fprintf(stderr, "error while writing simon.dat\n");
        return 1;
    }
    return 0;
}
--------------------------------------------
// simon.cpp uses 'char *' for the string fields in sFileData.
// to compile:
// MS Windows: cl simon.cpp
// Linux: g++ -o simon simon.cpp
#include <vector>
#include <iostream>
#include <fstream>
#include <stdlib.h> // for atoi/atol
#include <string.h> // for strcpy
#include <time.h> // for clock
using namespace std;
// One record parsed from the data file.
// The two string fields are heap-allocated, NUL-terminated C strings that
// this object owns (a null pointer means "not set"). Copy construction and
// assignment duplicate the strings, so every instance owns its own buffers.
struct sFileData
{
    char * sSomeString1;
    char * sSomeString2;
    int iSomeNum1;
    int iSomeNum2;

    // a fresh record has null strings and zeroed numbers
    sFileData()
        : sSomeString1(0), sSomeString2(0), iSomeNum1(0), iSomeNum2(0)
    {
    }

    // deep copy: start out empty, then let the assignment operator do the work
    sFileData(const sFileData& other)
        : sSomeString1(0), sSomeString2(0), iSomeNum1(0), iSomeNum2(0)
    {
        operator=(other);
    }

    // releases both owned strings
    ~sFileData()
    {
        CleanAll();
    }

    // deep-copy assignment; assigning an object to itself is a no-op
    const sFileData& operator=(const sFileData& other)
    {
        if (this == &other)
            return *this;

        // drop whatever we currently own before taking copies
        CleanAll();

        iSomeNum1 = other.iSomeNum1;
        iSomeNum2 = other.iSomeNum2;

        if (other.sSomeString1 != 0)
        {
            char * dup1 = new char[strlen(other.sSomeString1) + 1];
            strcpy(dup1, other.sSomeString1);
            sSomeString1 = dup1;
        }
        if (other.sSomeString2 != 0)
        {
            char * dup2 = new char[strlen(other.sSomeString2) + 1];
            strcpy(dup2, other.sSomeString2);
            sSomeString2 = dup2;
        }
        return *this;
    }

    // frees both strings and resets the pointers to null;
    // the numeric fields are left untouched
    void CleanAll()
    {
        // delete [] on a null pointer is a harmless no-op
        delete [] sSomeString1;
        sSomeString1 = 0;
        delete [] sSomeString2;
        sSomeString2 = 0;
    }

    // resets every field to null/zero WITHOUT freeing anything -
    // only safe while the object owns no strings
    void NullAll()
    {
        iSomeNum1 = 0;
        iSomeNum2 = 0;
        sSomeString1 = 0;
        sSomeString2 = 0;
    }
};
// global store of parsed records; main() appends one entry per record read
std::vector< sFileData, std::allocator<sFileData> > address_;
// Removes leading/trailing whitespace (' ', '\t', '\r', '\n') from the
// first 'sz' bytes of buf[].
//
// buf[] does not have to be nul-terminated, and a NUL byte inside buf[]
// is treated as ordinary data - it is NOT whitespace.
// The resulting content in buf[] is NOT nul-terminated, starts at buf[0],
// and comprises ONLY the number of bytes returned by this function -
// leftover 'garbage' may follow those valid bytes.
// Leading whitespace is removed by moving the contents of buf[] to lower
// indexes, e.g. if buf[] contains "  hello hi  " on entry, it will start
// with "hello hi" on exit and 8 is returned.
//
// Returns the final trimmed length; zero if buf is null, sz is zero, or
// buf[] is all whitespace.
unsigned int Trim(char * buf, unsigned int sz)
{
    // const: a string literal must never be bound to a writable 'char *'
    const char * const white = " \t\r\n";
    unsigned int pos1, pos2, len;

    // if invalid input
    if (!buf || 0 == sz)
        return 0;

    // find first non-whitespace char in buf[].
    // strchr() also "finds" '\0' (it matches the set's own terminator),
    // so NUL has to be ruled out explicitly or it would count as whitespace.
    for (pos1 = 0; pos1 < sz; pos1++)
        if ('\0' == buf[pos1] || NULL == strchr(white, buf[pos1]))
            break;

    // if buf[] is all whitespace
    if (pos1 >= sz)
        return 0;

    // find last non-whitespace char in buf[]; pos2 ends up one PAST it
    for (pos2 = sz; pos2 > pos1; pos2--)
        if ('\0' == buf[pos2 - 1] || NULL == strchr(white, buf[pos2 - 1]))
            break;

    // buf[] length less any leading/trailing whitespace
    len = pos2 - pos1;

    // if leading whitespace, move buf[] contents 'left' to eliminate it
    // (memmove, because source and destination overlap)
    if (pos1 > 0)
        memmove(buf, buf + pos1, len);

    return len;
}
// Loads "simon.dat" into the global 'address_' vector and reports how long
// the load took.
//
// The file consists of 192-byte records (including the trailing "\r\n").
// The first record holds the number of data records that follow; each data
// record holds two 40-byte text fields followed by two 20-byte numbers.
// The program pauses for a key press before and after the load so that
// memory usage can be inspected externally.
//
// Returns 0 on success, 1 if the record count cannot be read.
int main()
{
    char c;
    char buf[196];          // one 192-byte record plus room for a NUL
    clock_t cstart, cend;
    double elapsed;
    const int reclen = 192;
    unsigned long recs, i;

    std::cerr << "check initial memory usage now, then" << std::endl;
    std::cerr << "press any alpha key followed by Enter to start"
              << std::endl;
    std::cin >> c;

    cstart = clock();

    std::ifstream in("simon.dat",
                     std::ios_base::in | std::ios_base::binary);

    // read the file record count from the 1st field of the 1st record
    // (a file that could not be opened fails this read as well)
    if (in.read(buf, reclen))
    {
        // terminate the record so atol() can never run past the buffer
        buf[reclen] = '\0';
        recs = atol(buf);
    }
    else
    {
        std::cerr << "Unable to read record count from 1st record"
                  << std::endl;
        in.close();
        return 1;
    }

    // read/process all the records in the file
    for (i = 0; i < recs; ++i)
    {
        // stop at the first short/failed read: once the stream has failed
        // every later read fails too, so carrying on would only burn time
        // and make 'i' overstate the number of records processed
        if (!in.read(buf, reclen))
        {
            std::cerr << "file ended after " << i << " of " << recs
                      << " records" << std::endl;
            break;
        }
        // terminate the record so atoi() can never run past the buffer
        buf[reclen] = '\0';

        unsigned int len;
        sFileData sfd;

        // trim lead/trail whitespace from 40 bytes at buf[0]
        len = Trim(buf, 40);
        if (len)    // if it was not all whitespace
        {
            // dup the trimmed buf into sSomeString1
            sfd.sSomeString1 = new char[len + 1];
            memcpy(sfd.sSomeString1, buf, len);
            sfd.sSomeString1[len] = '\0';
        }

        // trim lead/trail whitespace from 40 bytes at buf[40]
        len = Trim(buf + 40, 40);
        if (len)    // if it was not all whitespace
        {
            // dup the trimmed buf into sSomeString2
            sfd.sSomeString2 = new char[len + 1];
            memcpy(sfd.sSomeString2, buf + 40, len);
            sfd.sSomeString2[len] = '\0';
        }

        // assign the int values from the approp locs in buf[]
        sfd.iSomeNum1 = atoi(buf + 80);
        sfd.iSomeNum2 = atoi(buf + 100);

        // push_back stores a deep copy; 'sfd' frees its own strings
        // when it goes out of scope at the end of this iteration
        address_.push_back(sfd);
#if 0
        // DEBUG buf[] parsing
        std::cout << sfd.sSomeString1 << ", " << sfd.sSomeString2
                  << ", " << sfd.iSomeNum1 << ", " << sfd.iSomeNum2
                  << std::endl;
#endif
    }
    in.close();

    cend = clock();
    elapsed = cend - cstart;
    std::cerr << "processed " << i << " records in "
              << elapsed / CLOCKS_PER_SEC << " seconds."
              << std::endl;

    std::cerr << "check final memory usage now, then" << std::endl;
    std::cerr << "press any alpha key followed by Enter to finish"
              << std::endl;
    std::cin >> c;
    return 0;
}
----------------------------------------------
// simon2.cpp uses std::string for the string fields in sFileData.
// to compile:
// MS Windows: cl simon2.cpp
// Linux: g++ -o simon2 simon2.cpp
#include <string>
#include <vector>
#include <iostream>
#include <fstream>
#include <stdlib.h> // for atoi/atol
#include <time.h> // for clock
using namespace std;
// One record parsed from the data file: two text fields and two numbers.
// The std::string members manage their own storage, so copying a record
// is a plain member-by-member copy.
struct sFileData
{
    std::string sSomeString1;
    std::string sSomeString2;
    int iSomeNum1;
    int iSomeNum2;

    // a fresh record has empty strings and zeroed numbers
    sFileData()
        : sSomeString1(), sSomeString2(), iSomeNum1(0), iSomeNum2(0)
    {
    }

    // copies every field of 'other'
    sFileData(const sFileData& other)
        : sSomeString1(other.sSomeString1),
          sSomeString2(other.sSomeString2),
          iSomeNum1(other.iSomeNum1),
          iSomeNum2(other.iSomeNum2)
    {
    }

    // nothing to release by hand - the strings clean up after themselves
    ~sFileData()
    {
    }

    // member-wise assignment; assigning an object to itself is a no-op
    const sFileData& operator=(const sFileData& other)
    {
        if (this == &other)
            return *this;

        sSomeString1 = other.sSomeString1;
        sSomeString2 = other.sSomeString2;
        iSomeNum1 = other.iSomeNum1;
        iSomeNum2 = other.iSomeNum2;
        return *this;
    }

    // zeroes the numeric fields only; the strings are left as they are
    void NullAll()
    {
        iSomeNum2 = 0;
        iSomeNum1 = 0;
    }
};
// global store of parsed records; main() appends one entry per record read
std::vector< sFileData, std::allocator<sFileData> > address_;
// Returns the 'width' bytes starting at 'field' with every leading and
// trailing character that appears in 'white' removed. The result is empty
// when the field consists of nothing but such characters.
static std::string TrimmedField(const char * field,
                                std::string::size_type width,
                                const char * white)
{
    const std::string str(field, width);

    // find FIRST non-whitespace char in the string
    const std::string::size_type first = str.find_first_not_of(white);
    if (first == std::string::npos)
        return std::string();       // all whitespace

    // find the LAST non-whitespace char in the string
    const std::string::size_type last = str.find_last_not_of(white);

    // substr() takes (position, COUNT) - not (first, last) - so the
    // inclusive range [first, last] is 'last - first + 1' chars long
    return str.substr(first, last - first + 1);
}

// Loads "simon.dat" into the global 'address_' vector and reports how long
// the load took.
//
// The file consists of 192-byte records (including the trailing "\r\n").
// The first record holds the number of data records that follow; each data
// record holds two 40-byte text fields followed by two 20-byte numbers.
// The program pauses for a key press before and after the load so that
// memory usage can be inspected externally.
//
// Returns 0 on success, 1 if the record count cannot be read.
int main()
{
    char c;
    char buf[196];          // one 192-byte record plus room for a NUL
    clock_t cstart, cend;
    double elapsed;
    const int reclen = 192;
    unsigned long recs, i;
    // whitespace chars to strip from file records
    const char * white = " \t\r\n";

    std::cerr << "check initial memory usage now, then" << std::endl;
    std::cerr << "press any alpha key followed by Enter to start"
              << std::endl;
    std::cin >> c;

    cstart = clock();

    std::ifstream in("simon.dat",
                     std::ios_base::in | std::ios_base::binary);

    // read the file record count from the 1st field of the 1st record
    // (a file that could not be opened fails this read as well)
    if (in.read(buf, reclen))
    {
        // terminate the record so atol() can never run past the buffer
        buf[reclen] = '\0';
        recs = atol(buf);
    }
    else
    {
        std::cerr << "Unable to read record count from 1st record"
                  << std::endl;
        in.close();
        return 1;
    }

    // read/process all the records in the file
    for (i = 0; i < recs; ++i)
    {
        // stop at the first short/failed read: once the stream has failed
        // every later read fails too, so carrying on would only burn time
        // and make 'i' overstate the number of records processed
        if (!in.read(buf, reclen))
        {
            std::cerr << "file ended after " << i << " of " << recs
                      << " records" << std::endl;
            break;
        }
        // terminate the record so atoi() can never run past the buffer
        buf[reclen] = '\0';

        sFileData sfd;

        // buf[0] thru buf[39] and buf[40] thru buf[79], trimmed
        sfd.sSomeString1 = TrimmedField(buf, 40, white);
        sfd.sSomeString2 = TrimmedField(buf + 40, 40, white);

        // assign the int values from the approp locs in buf[]
        sfd.iSomeNum1 = atoi(buf + 80);
        sfd.iSomeNum2 = atoi(buf + 100);

        address_.push_back(sfd);
#if 0
        // DEBUG buf[] parsing
        std::cout << sfd.sSomeString1 << ", " << sfd.sSomeString2
                  << ", " << sfd.iSomeNum1 << ", " << sfd.iSomeNum2
                  << std::endl;
#endif
    }
    in.close();

    cend = clock();
    elapsed = cend - cstart;
    std::cerr << "processed " << i << " records in "
              << elapsed / CLOCKS_PER_SEC << " seconds."
              << std::endl;

    std::cerr << "check final memory usage now, then" << std::endl;
    std::cerr << "press any alpha key followed by Enter to finish"
              << std::endl;
    std::cin >> c;
    return 0;
}