cat

CBFalconer · Mar 6, 2008

SM said:
.... snip bad quote marks ...

The system cat exploits features of the specific system that
are not available in ANSI C. For example on unix, you can
avoid stdio altogether, and do something like
read -> shared buffer -> write

You can largely do that in ANSI C, with streams. Just use getc()
and putc(). This is why these routines can be implemented as
macros. The time cost of transferring a buffer content must be
negligible compared with the cost of reading/writing disk (or
other) files.

user923005 · Mar 6, 2008

I've read parts of K&R's ANSI C v2 and this is what their cat looked
like but when I compared the speed of this code to gnu cat, it seems
very slow. How do I optimize this for greater speeds? is there an
alternative algorithm?

/*
 * Copy every remaining character of `in` to `out`, one character at a
 * time, until getc() returns EOF.
 */
void catfile(FILE *in, FILE *out) {
    int ch = getc(in);

    while (ch != EOF) {
        putc(ch, out);
        ch = getc(in);
    }
}

C:\tmp>dir dict.sql
Volume in drive C has no label.
Volume Serial Number is 0890-87CA

Directory of C:\tmp

03/01/2007 11:48 AM 7,127,408 dict.sql
1 File(s) 7,127,408 bytes
0 Dir(s) 5,202,309,120 bytes free

C:\tmp>cat dict.sql dict.out
standard cat took 1.984000 seconds
big buffer cat took 0.000000 seconds

C:\tmp>type cat.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/*
 * Copy `in` to `out` character by character, after asking stdio to
 * make both streams fully buffered with 16 KiB buffers.
 */
void catfilebuffer(FILE * in, FILE * out)
{
    enum { STREAM_BUF_SIZE = 1024 * 16 };
    int ch;

    /* Request full buffering; the setvbuf() results are not checked. */
    setvbuf(in, NULL, _IOFBF, STREAM_BUF_SIZE);
    setvbuf(out, NULL, _IOFBF, STREAM_BUF_SIZE);

    for (ch = getc(in); ch != EOF; ch = getc(in)) {
        putc(ch, out);
    }
}

/*
 * Copy `in` to `out` character by character, leaving the streams'
 * buffering exactly as the caller set it up.
 */
void catfilenobuff(FILE * in, FILE * out)
{
    for (;;) {
        int ch = getc(in);

        if (ch == EOF) {
            break;
        }
        putc(ch, out);
    }
}

/*
 * Timing harness: copies argv[1] (default stdin) to argv[2] (default
 * stdout), first with catfilenobuff() and then with catfilebuffer(),
 * and prints the time clock() reports for each call.
 */
int main(int argc, char **argv)
{
FILE *in = stdin;
FILE *out = stdout;
clock_t start,
end;
/* Multiplier that converts clock() ticks to seconds. */
static const double cps = 1.0 / CLOCKS_PER_SEC;
/* Optional first argument: name of the input file. */
if (argc > 1) {
in = fopen(argv[1], "r");
if (in == NULL) {
printf("Error opening %s\n", argv[1]);
exit(EXIT_FAILURE);
}
}
/* Optional second argument: name of the output file. */
if (argc > 2) {
out = fopen(argv[2], "w");
if (out == NULL) {
printf("Error opening %s\n", argv[2]);
exit(EXIT_FAILURE);
}
}
/* First pass: plain getc()/putc() copy. */
start = clock();
catfilenobuff(in, out);
end = clock();
printf("standard cat took %f seconds\n", (end - start) * cps);
/*
 * NOTE: `in` is not rewound before the second pass.  catfilenobuff()
 * has already read it up to EOF, so catfilebuffer() finds nothing
 * left to copy and the time printed below is not comparable with the
 * first one.
 */
start = clock();
catfilebuffer(in, out);
end = clock();
printf("big buffer cat took %f seconds\n", (end - start) * cps);
/* Flush every open output stream before returning. */
fflush(NULL);
return 0;
}

user923005 · Mar 6, 2008

I've read parts of K&R's ANSI C v2 and this is what their cat looked
like but when I compared the speed of this code to gnu cat, it seems
very slow. How do I optimize this for greater speeds? is there an
alternative algorithm?

Click to expand...

void catfile(FILE *in, FILE *out) {
register int num_char;

Click to expand...

/*Get characters*/
while ((num_char = getc(in)) != EOF) {
/*Print to standard output*/
putc(num_char, out);
}

Click to expand...

}

Click to expand...

C:\tmp>dir dict.sql
Volume in drive C has no label.
Volume Serial Number is 0890-87CA

Directory of C:\tmp

03/01/2007 11:48 AM 7,127,408 dict.sql
1 File(s) 7,127,408 bytes
0 Dir(s) 5,202,309,120 bytes free

C:\tmp>cat dict.sql dict.out
standard cat took 1.984000 seconds
big buffer cat took 0.000000 seconds

C:\tmp>type cat.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
void catfilebuffer(FILE * in, FILE * out)
{
register int num_char;

setvbuf(in, NULL, _IOFBF, 1024 * 16);
setvbuf(out, NULL, _IOFBF, 1024 * 16);

/* Get characters */
while ((num_char = getc(in)) != EOF) {
/* Print to standard output */
putc(num_char, out);
}

}

void catfilenobuff(FILE * in, FILE * out)
{
register int num_char;

/* Get characters */
while ((num_char = getc(in)) != EOF) {
/* Print to standard output */
putc(num_char, out);
}

}

int main(int argc, char **argv)
{
FILE *in = stdin;
FILE *out = stdout;
clock_t start,
end;
static const double cps = 1.0 / CLOCKS_PER_SEC;
if (argc > 1) {
in = fopen(argv[1], "r");
if (in == NULL) {
printf("Error opening %s\n", argv[1]);
exit(EXIT_FAILURE);
}
}
if (argc > 2) {
out = fopen(argv[2], "w");
if (out == NULL) {
printf("Error opening %s\n", argv[2]);
exit(EXIT_FAILURE);
}
}
start = clock();
catfilenobuff(in, out);
end = clock();
printf("standard cat took %f seconds\n", (end - start) * cps);
start = clock();
catfilebuffer(in, out);
end = clock();
printf("big buffer cat took %f seconds\n", (end - start) * cps);
fflush(NULL);
return 0;

}

Oops. Forgot to rewind. Let me try that again...

user923005 · Mar 6, 2008

I've read parts of K&R's ANSI C v2 and this is what their cat looked
like but when I compared the speed of this code to gnu cat, it seems
very slow. How do I optimize this for greater speeds? is there an
alternative algorithm?

Click to expand...

void catfile(FILE *in, FILE *out) {
register int num_char;

Click to expand...

/*Get characters*/
while ((num_char = getc(in)) != EOF) {
/*Print to standard output*/
putc(num_char, out);
}

Click to expand...

}

Click to expand...

C:\tmp>dir dict.sql
Volume in drive C has no label.
Volume Serial Number is 0890-87CA

Directory of C:\tmp

03/01/2007 11:48 AM 7,127,408 dict.sql
1 File(s) 7,127,408 bytes
0 Dir(s) 5,202,309,120 bytes free

C:\tmp>cat dict.sql dict.out
standard cat took 1.984000 seconds
big buffer cat took 0.000000 seconds

C:\tmp>type cat.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
void catfilebuffer(FILE * in, FILE * out)
{
register int num_char;

setvbuf(in, NULL, _IOFBF, 1024 * 16);
setvbuf(out, NULL, _IOFBF, 1024 * 16);

/* Get characters */
while ((num_char = getc(in)) != EOF) {
/* Print to standard output */
putc(num_char, out);
}

}

void catfilenobuff(FILE * in, FILE * out)
{
register int num_char;

/* Get characters */
while ((num_char = getc(in)) != EOF) {
/* Print to standard output */
putc(num_char, out);
}

}

int main(int argc, char **argv)
{
FILE *in = stdin;
FILE *out = stdout;
clock_t start,
end;
static const double cps = 1.0 / CLOCKS_PER_SEC;
if (argc > 1) {
in = fopen(argv[1], "r");
if (in == NULL) {
printf("Error opening %s\n", argv[1]);
exit(EXIT_FAILURE);
}
}
if (argc > 2) {
out = fopen(argv[2], "w");
if (out == NULL) {
printf("Error opening %s\n", argv[2]);
exit(EXIT_FAILURE);
}
}
start = clock();
catfilenobuff(in, out);
end = clock();
printf("standard cat took %f seconds\n", (end - start) * cps);
start = clock();
catfilebuffer(in, out);
end = clock();
printf("big buffer cat took %f seconds\n", (end - start) * cps);
fflush(NULL);
return 0;

}- Hide quoted text -

- Show quoted text -

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/*
 * Character-at-a-time copy of `in` to `out`, preceded by a request
 * for fully buffered streams with 16 KiB buffers.
 */
void catfilebuffer(FILE * in, FILE * out)
{
    const size_t buffer_bytes = 1024 * 16;
    int ch;

    /* Ask stdio for full buffering; the results are not checked. */
    setvbuf(in, NULL, _IOFBF, buffer_bytes);
    setvbuf(out, NULL, _IOFBF, buffer_bytes);

    ch = getc(in);
    while (ch != EOF) {
        putc(ch, out);
        ch = getc(in);
    }
}

/*
 * Character-at-a-time copy of `in` to `out`; the streams' buffering
 * is left untouched.
 */
void catfilenobuff(FILE * in, FILE * out)
{
    int ch;

    for (ch = getc(in); ch != EOF; ch = getc(in)) {
        putc(ch, out);
    }
}

/*
 * Timing harness: copy argv[1] (default stdin) to argv[2] (default
 * stdout) twice -- first with catfilenobuff(), then with
 * catfilebuffer() -- and print the time clock() reports for each pass.
 *
 * Between the passes, named files are closed and reopened so that
 * catfilebuffer() receives streams on which no I/O has been done yet;
 * setvbuf() is only valid on a stream in that state.  stdout is never
 * closed: it is flushed instead, because the timing lines and the
 * second copy still have to be written to it.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if a file cannot be
 * opened.
 */
int main(int argc, char **argv)
{
    FILE *in = stdin;
    FILE *out = stdout;
    clock_t start, end;
    /* Multiplier that converts clock() ticks to seconds. */
    static const double cps = 1.0 / CLOCKS_PER_SEC;

    if (argc > 1) {
        in = fopen(argv[1], "r");
        if (in == NULL) {
            printf("Error opening %s\n", argv[1]);
            exit(EXIT_FAILURE);
        }
    }
    if (argc > 2) {
        out = fopen(argv[2], "w");
        if (out == NULL) {
            printf("Error opening %s\n", argv[2]);
            exit(EXIT_FAILURE);
        }
    }

    /* Pass 1: plain getc()/putc() copy. */
    start = clock();
    catfilenobuff(in, out);
    end = clock();
    printf("standard cat took %f seconds\n", (end - start) * cps);

    /* Reset the input for pass 2. */
    if (argc > 1) {
        fclose(in);
        in = fopen(argv[1], "r");
        if (in == NULL) {
            printf("Error opening %s\n", argv[1]);
            exit(EXIT_FAILURE);
        }
    } else {
        /*
         * stdin cannot be reopened by name.  Rewinding is best effort:
         * it has no useful effect on a pipe or a terminal.
         */
        rewind(in);
    }

    /* Reset the output for pass 2. */
    if (argc > 2) {
        fclose(out);
        out = fopen(argv[2], "w");
        if (out == NULL) {
            printf("Error opening %s\n", argv[2]);
            exit(EXIT_FAILURE);
        }
    } else {
        /* Closing stdout here would make every later write invalid. */
        fflush(out);
    }

    /* Pass 2: same copy loop with explicitly sized stdio buffers. */
    start = clock();
    catfilebuffer(in, out);
    end = clock();
    printf("big buffer cat took %f seconds\n", (end - start) * cps);

    fflush(NULL);
    return 0;
}
/*
Not nearly so dramatic! ;-)
C:\tmp>cat dict.sql dict.out
standard cat took 1.968000 seconds
big buffer cat took 1.891000 seconds
*/

user923005 · Mar 6, 2008

C:\tmp>dir dict.sql
Volume in drive C has no label.
Volume Serial Number is 0890-87CA

Click to expand...

Directory of C:\tmp

Click to expand...

03/01/2007 11:48 AM 7,127,408 dict.sql
1 File(s) 7,127,408 bytes
0 Dir(s) 5,202,309,120 bytes free

Click to expand...

C:\tmp>cat dict.sql dict.out
standard cat took 1.984000 seconds
big buffer cat took 0.000000 seconds

Click to expand...

C:\tmp>type cat.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
void catfilebuffer(FILE * in, FILE * out)
{
register int num_char;

Click to expand...

setvbuf(in, NULL, _IOFBF, 1024 * 16);
setvbuf(out, NULL, _IOFBF, 1024 * 16);

Click to expand...

/* Get characters */
while ((num_char = getc(in)) != EOF) {
/* Print to standard output */
putc(num_char, out);
}

void catfilenobuff(FILE * in, FILE * out)
{
register int num_char;

Click to expand...

/* Get characters */
while ((num_char = getc(in)) != EOF) {
/* Print to standard output */
putc(num_char, out);
}

}

Click to expand...

int main(int argc, char **argv)
{
FILE *in = stdin;
FILE *out = stdout;
clock_t start,
end;
static const double cps = 1.0 / CLOCKS_PER_SEC;
if (argc > 1) {
in = fopen(argv[1], "r");
if (in == NULL) {
printf("Error opening %s\n", argv[1]);
exit(EXIT_FAILURE);
}
}
if (argc > 2) {
out = fopen(argv[2], "w");
if (out == NULL) {
printf("Error opening %s\n", argv[2]);
exit(EXIT_FAILURE);
}
}
start = clock();
catfilenobuff(in, out);
end = clock();
printf("standard cat took %f seconds\n", (end - start) * cps);
start = clock();
catfilebuffer(in, out);
end = clock();
printf("big buffer cat took %f seconds\n", (end - start) * cps);
fflush(NULL);
return 0;

Click to expand...

}- Hide quoted text -

Click to expand...

- Show quoted text -

Click to expand...

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
void catfilebuffer(FILE * in, FILE * out)
{
register int num_char;

setvbuf(in, NULL, _IOFBF, 1024 * 16);
setvbuf(out, NULL, _IOFBF, 1024 * 16);

/* Get characters */
while ((num_char = getc(in)) != EOF) {
/* Print to standard output */
putc(num_char, out);
}

}

void catfilenobuff(FILE * in, FILE * out)
{
register int num_char;

/* Get characters */
while ((num_char = getc(in)) != EOF) {
/* Print to standard output */
putc(num_char, out);
}

}

int main(int argc, char **argv)
{
FILE *in = stdin;
FILE *out = stdout;
clock_t start,
end;
static const double cps = 1.0 / CLOCKS_PER_SEC;
if (argc > 1) {
in = fopen(argv[1], "r");
if (in == NULL) {
printf("Error opening %s\n", argv[1]);
exit(EXIT_FAILURE);
}
}
if (argc > 2) {
out = fopen(argv[2], "w");
if (out == NULL) {
printf("Error opening %s\n", argv[2]);
exit(EXIT_FAILURE);
}
}
start = clock();
catfilenobuff(in, out);
end = clock();
printf("standard cat took %f seconds\n", (end - start) * cps);
rewind(in);
fclose(out);
if (argc > 2) {
out = fopen(argv[2], "w");
if (out == NULL) {
printf("Error opening %s\n", argv[2]);
exit(EXIT_FAILURE);
}
} else
out = stdout;
start = clock();
catfilebuffer(in, out);
end = clock();
printf("big buffer cat took %f seconds\n", (end - start) * cps);
fflush(NULL);
return 0;}

/*
Not nearly so dramatic! ;-)
C:\tmp>cat dict.sql dict.out
standard cat took 1.968000 seconds
big buffer cat took 1.891000 seconds
*/- Hide quoted text -

- Show quoted text -

The real problem here is that catfilebufferfgets() needs to be made
much more robust. But it does lend a lot of speed (probably due to
the much lower number of function calls and tests).

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Scratch buffer for catfilebufferfgets(): holds one fgets() chunk. */
static char string[1024 * 16];

/*
 * Copy `in` to `out` in fgets()-sized chunks, after requesting fully
 * buffered streams with 16 KiB buffers.
 *
 * Lines longer than the scratch buffer are not a problem: fgets()
 * returns them in several pieces and every piece is written out.  The
 * real limitation is that fgets()/fputs() work on C strings, so any
 * bytes that follow an embedded '\0' within a chunk are dropped -- use
 * this on text input only.
 *
 * Copying stops at end of input, on a read error, or as soon as a
 * write to `out` fails.
 */
void catfilebufferfgets(FILE * in, FILE * out)
{
    setvbuf(in, NULL, _IOFBF, 1024 * 16);
    setvbuf(out, NULL, _IOFBF, 1024 * 16);

    while (fgets(string, sizeof string, in) != NULL) {
        /* Do not keep reading once the output can no longer be written. */
        if (fputs(string, out) == EOF) {
            break;
        }
    }
}

/*
 * Copy `in` to `out` one character at a time, having first requested
 * full buffering with 16 KiB buffers on both streams.
 */
void catfilebuffer(FILE * in, FILE * out)
{
    /* The setvbuf() results are not checked. */
    setvbuf(in, NULL, _IOFBF, 16 * 1024);
    setvbuf(out, NULL, _IOFBF, 16 * 1024);

    for (;;) {
        int ch = getc(in);

        if (ch == EOF) {
            break;
        }
        putc(ch, out);
    }
}

/*
 * Copy `in` to `out` one character at a time without changing the
 * buffering of either stream.
 */
void catfilenobuff(FILE * in, FILE * out)
{
    int ch;

    while (1) {
        ch = getc(in);
        if (ch == EOF) {
            return;
        }
        putc(ch, out);
    }
}

/*
 * Get `*in` and `*out` ready for another timed pass.
 *
 * Named files (argv[1] for input, argv[2] for output) are closed and
 * reopened, so the next copy routine receives streams on which no I/O
 * has been done yet; setvbuf() is only valid on a stream in that
 * state.  The standard streams cannot be reopened by name: stdin is
 * rewound (best effort -- no useful effect on a pipe or terminal) and
 * stdout is flushed rather than closed, because it is still needed for
 * the timing lines and the next copy.
 *
 * Exits with EXIT_FAILURE if a file cannot be reopened.
 */
static void restart_streams(int argc, char **argv, FILE **in, FILE **out)
{
    if (argc > 1) {
        fclose(*in);
        *in = fopen(argv[1], "r");
        if (*in == NULL) {
            printf("Error opening %s\n", argv[1]);
            exit(EXIT_FAILURE);
        }
    } else {
        rewind(*in);
    }

    if (argc > 2) {
        fclose(*out);
        *out = fopen(argv[2], "w");
        if (*out == NULL) {
            printf("Error opening %s\n", argv[2]);
            exit(EXIT_FAILURE);
        }
    } else {
        fflush(*out);
    }
}

/*
 * Timing harness: copy argv[1] (default stdin) to argv[2] (default
 * stdout) three times -- with catfilenobuff(), catfilebuffer() and
 * catfilebufferfgets() -- and print the time clock() reports for each
 * pass.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if a file cannot be
 * opened.
 */
int main(int argc, char **argv)
{
    FILE *in = stdin;
    FILE *out = stdout;
    clock_t start, end;
    /* Multiplier that converts clock() ticks to seconds. */
    static const double cps = 1.0 / CLOCKS_PER_SEC;

    if (argc > 1) {
        in = fopen(argv[1], "r");
        if (in == NULL) {
            printf("Error opening %s\n", argv[1]);
            exit(EXIT_FAILURE);
        }
    }
    if (argc > 2) {
        out = fopen(argv[2], "w");
        if (out == NULL) {
            printf("Error opening %s\n", argv[2]);
            exit(EXIT_FAILURE);
        }
    }

    /* Pass 1: plain getc()/putc() copy. */
    start = clock();
    catfilenobuff(in, out);
    end = clock();
    printf("standard cat took %f seconds\n", (end - start) * cps);

    /* Pass 2: same loop with explicitly sized stdio buffers. */
    restart_streams(argc, argv, &in, &out);
    start = clock();
    catfilebuffer(in, out);
    end = clock();
    printf("big buffer cat took %f seconds\n", (end - start) * cps);

    /* Pass 3: fgets()/fputs() chunks with the same stdio buffers. */
    restart_streams(argc, argv, &in, &out);
    start = clock();
    catfilebufferfgets(in, out);
    end = clock();
    printf("big buffer cat using fgets took %f seconds\n", (end -
           start) * cps);

    fflush(NULL);
    return 0;
}
/*
C:\tmp>cat dict.sql dict.out
standard cat took 2.062000 seconds
big buffer cat took 2.016000 seconds
big buffer cat using fgets took 0.203000 seconds
*/

user923005 · Mar 6, 2008

On Mar 6 said:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static char string[1024 * 16];

void catfilebufferfgets(FILE * in, FILE * out)
{
setvbuf(in, NULL, _IOFBF, 1024 * 16);
setvbuf(out, NULL, _IOFBF, 1024 * 16);

/* Get characters (ERROR PRONE: {what if string > 16K}) */
while (fgets(string, sizeof string, in)) {
fputs(string, out);
}

} [snip]
/*
C:\tmp>cat dict.sql dict.out
standard cat took 2.062000 seconds
big buffer cat took 2.016000 seconds
big buffer cat using fgets took 0.203000 seconds
*/

Another important difference is that the fgets() version only works on
text files (for obvious reasons).

Jag · Mar 7, 2008

[snip]

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

Click to expand...

static char string[1024 * 16];

Click to expand...

void catfilebufferfgets(FILE * in, FILE * out)
{
setvbuf(in, NULL, _IOFBF, 1024 * 16);
setvbuf(out, NULL, _IOFBF, 1024 * 16);

Click to expand...

/* Get characters (ERROR PRONE: {what if string > 16K}) */
while (fgets(string, sizeof string, in)) {
fputs(string, out);
}

}

Click to expand...

[snip]
/*
C:\tmp>cat dict.sql dict.out
standard cat took 2.062000 seconds
big buffer cat took 2.016000 seconds
big buffer cat using fgets took 0.203000 seconds
*/

Click to expand...

Another important difference is that the fgets() version only works on
text files (for obvious reasons).

This code, like cat -n and cat -e, outputs the line number or adds a $
at the end of the line. getline is a GNU extension. I haven't used
setvbuf before. What does it do? Anyway, without setvbuf(), it
took 2.580000 seconds, but with setvbuf(), it took
1.230000 seconds. Thanks for the tip.

/*
 * Copy `in` to `out` line by line, optionally decorating each line.
 *
 * in, out   Streams to copy between.  Full buffering with BUFSIZ-sized
 *           buffers is requested on both, so the caller should pass
 *           streams on which no I/O has been performed yet.
 * nCounter  Number to give the first line that is read.
 * bLine     When true, prefix every line with its number ("%6d ").
 * bEnds     When true, put a '$' at the end of every line: before the
 *           newline, or at the very end of a final line that has none.
 *
 * The two flags are independent; with neither set the input is copied
 * unchanged.  Returns nCounter plus the number of lines read, so the
 * numbering can continue across several files.
 *
 * The loop is driven by getline()'s return value, so it stops on a
 * read error as well as at end of file.
 */
int catline(FILE *in, FILE *out, int nCounter,
            bool const bLine, bool const bEnds) {
    char *pLine = NULL;     /* buffer owned and grown by getline() */
    size_t pLen = 0;        /* current capacity of pLine */
    ssize_t nRead;          /* length of the line just read */

    setvbuf(in, NULL, _IOFBF, BUFSIZ);
    setvbuf(out, NULL, _IOFBF, BUFSIZ);

    while ((nRead = getline(&pLine, &pLen, in)) >= 0) {
        size_t bodyLen = (size_t)nRead;
        bool const hasNewline = bodyLen > 0 && pLine[bodyLen - 1] == '\n';

        /* Hold the newline back so a '$' can be placed before it. */
        if (hasNewline)
            --bodyLen;

        if (bLine)
            fprintf(out, "%6d ", nCounter);
        /* Use the known length, so embedded '\0' bytes are kept. */
        fwrite(pLine, 1, bodyLen, out);
        if (bEnds)
            putc('$', out);
        if (hasNewline)
            putc('\n', out);

        /* Increment line counter */
        ++nCounter;
    }

    /* Free the memory allocated by getline() */
    free(pLine);

    return nCounter;
}

Jag · Mar 7, 2008

[snip]

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

Click to expand...

static char string[1024 * 16];

Click to expand...

void catfilebufferfgets(FILE * in, FILE * out)
{
setvbuf(in, NULL, _IOFBF, 1024 * 16);
setvbuf(out, NULL, _IOFBF, 1024 * 16);

Click to expand...

/* Get characters (ERROR PRONE: {what if string > 16K}) */
while (fgets(string, sizeof string, in)) {
fputs(string, out);
}

}

Click to expand...

[snip]
/*
C:\tmp>cat dict.sql dict.out
standard cat took 2.062000 seconds
big buffer cat took 2.016000 seconds
big buffer cat using fgets took 0.203000 seconds
*/

Click to expand...

Another important difference is that the fgets() version only works on
text files (for obvious reasons).

Oh, I also tried setvbuf() with the code from my original post and got:
without setvbuf(), 2.440000 seconds; with setvbuf(), 0.920000 seconds.
WOW.

/*
 * Copy `in` to `out` one character at a time, after requesting full
 * buffering with BUFSIZ-sized buffers on both streams.
 */
void catfile(FILE *in, FILE *out) {
    int ch;

    /* The setvbuf() results are not checked. */
    setvbuf(in, NULL, _IOFBF, BUFSIZ);
    setvbuf(out, NULL, _IOFBF, BUFSIZ);

    for (ch = getc(in); ch != EOF; ch = getc(in)) {
        putc(ch, out);
    }
}

Falcon Kirtaran · Mar 7, 2008

Jag said:
I've read parts of K&R's ANSI C v2 and this is what their cat looked
like but when I compared the speed of this code to gnu cat, it seems
very slow. How do I optimize this for greater speeds? is there an
alternative algorithm?

void catfile(FILE *in, FILE *out) {
register int num_char;

/*Get characters*/
while ((num_char = getc(in)) != EOF) {
/*Print to standard output*/
putc(num_char, out);
}
}

Thanks.

It's fairly inefficient to get characters one by one. If you felt like
using system calls to do it, you could use read(), but then you couldn't
use FILE *. However, the only thing you really need to do is increase
your buffer size (from one), and thus you could use fgets(). 4096
bytes, if I recall, is a fairly standard block size; it's the most
efficient to read full blocks at a time.

Remember to initialise your variables!

#include <string.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Copy everything that remains in `in` to `out`, 4096 bytes at a time.
 *
 * fread()/fwrite() carry an explicit byte count, so the copy is exact
 * for binary data as well as text; a string-based fgets()/"%s" loop
 * would drop whatever follows an embedded '\0' byte.
 *
 * Copying stops at end of input, on a read error, or as soon as a
 * write to `out` comes up short.
 */
void catfile(FILE * in, FILE * out) {
    char buf[4096];     /* one block; small enough for the stack */
    size_t got;

    while ((got = fread(buf, 1, sizeof buf, in)) > 0) {
        if (fwrite(buf, 1, got, out) != got) {
            break;
        }
    }
}

Richard Heathfield · Mar 7, 2008

Falcon Kirtaran said:

It's fairly inefficient to get characters one by one. [...]
However, the only thing you really need to do is increase
your buffer size (from one), and thus you could use fgets().

Typically, input is buffered by default, so it doesn't actually make any
difference.

I compiled your function (after translating it into C90), removed the
spurious semicolon, and added a main. I then used it to cat a 4MB JPEG.

real 0m1.484s
user 0m0.010s
sys 0m0.210s

Compare:

/*
 * Copy `in` to `out` one character at a time until getc() returns EOF.
 */
void catfile(FILE * in, FILE * out) {
    int c = getc(in);

    while (c != EOF) {
        putc(c, out);
        c = getc(in);
    }
}

real 0m0.633s
user 0m0.540s
sys 0m0.070s

Chris Torek · Mar 8, 2008

[regarding speed]

It's fairly inefficient to get characters one by one. If you felt like
using system calls to do it, you could use read(), but then you couldn't
use FILE *. However, the only thing you really need to do is increase
your buffer size (from one), and thus you could use fgets().

Yes -- or, although it is sort of more aimed at binary files than
text files, you can use fread().

Both fgets() and fread() have a problem that lower-level system
functions (whether these are named read, _read, or even SYS$QIO)
tend to avoid: when using the <stdio.h> facilities, the C library
provides one layer of buffering for the input file. You, the C
programmer, must provide a second buffer into which characters are
transferred one line (fgets()) or "buffer-blob" (fread()) at a
time. This second buffer is then copied to the third buffer, again
provided by the <stdio.h> facilities, that is associated with the
output file.

As a result, when using low-level system functions as applied to
(say) an on-media (on-disk) file, one needs just two copy operations:
one from the source disk into RAM, and one from the copy made in
RAM back to the destination disk (which may be a different physical
drive, so at least *one* copy operation across some device bus was
required; and since the devices may have different bus speeds, it
is not unlikely that two separate across-the-bus copies, with
intermediate version in RAM, were appropriate). When using portable
C code, typically one winds up with at least three in-RAM copies
(stdio buffer for source file, line or block buffer for fgets/fread,
stdio buffer for destination file) and sometimes as many as five
(add two "kernel level" copies in kernel buffers -- the non-portable
version using read() or __sys_io_op() may result in these too, of
course).

Hence, you have your choice: fast, or portable. "Portable" may
well be "fast enough", of course. If you generally operate on smaller
files, the difference between doing the copy in 0.0000003 seconds
and 0.0000000005 seconds may be negligible.

Last, some comments on some of the code:

while (!feof(in)) {

Any time you see a "while" loop testing "!feof", you should suspect
the code to be wrong. It is possible that it is not wrong (as we
will see in a moment), but even if so, it can probably be improved.

The reason for this is that feof() does not predict that a future
read will work, but rather "predicts" whether a past read failed.

if (!fgets(buf, 4097, in)) break;

This terminates the loop if the fgets() returns NULL. This
occurs if:

- the fgets() encounters EOF (which will also set feof(in)), or
- the fgets() fails due to an error reading the input file (e.g.,
input coming from a floppy or CD/DVD that has gone bad).

In the second case, feof(in) would still not become true, but since
either one terminates the loop, all is OK. But this means that
the feof() test at the top of the loop is almost always pointless:
the only way for it ever to stop the loop is if the fgets() encounters
EOF in the "middle" of an input line (i.e., an input stream whose
last line does not end with newline).

In my opinion, then, the code would be improved if we simply used
the result of each fgets() call to decide whether to terminate the
loop:

while (fgets(buf, 4097, in) != NULL) {

num_char += strlen(buf);
fprintf(out, "%s", buf);
};

It seems a bit odd (but not wrong) to use fgets() for the input side,
but fprintf() instead of fputs() for the output side.

The semicolon after the close brace is unnecessary (but otherwise
harmless).

Chris Torek · Mar 9, 2008

... I haven't used setvbuf before. what does it do?

The setvbuf() function is a Standard C function. Its action is a
bit overcomplicated due to its origins -- it came from a system
whose designers tended to write functions that served their immediate
needs, without ever thinking about generalization and abstraction.
If it had a real-world counterpart, it might be a device that would
both pick out a tie *and* choose an amount of money to tip the cab
driver, on the theory that the only reason anyone ever puts on a
tie is to go out, and everyone lives in New York City and always
takes a cab anywhere they go.

The first argument to setvbuf() is a stdio stream. This stream
must be one that was "freshly opened", i.e., has not had any input
or output performed on it yet. (The three standard streams are
valid candidates as long as you have done no I/O on them yourself,
i.e., the system must act as if there are no putchar() calls before
it initially calls main(), for instance.)

If the second argument is non-NULL, it must be the address of the
first element of an array of "char" whose size is given by the
fourth argument. Thus, for instance:

char block[99];
setvbuf(file, block, _IOFBF, sizeof block);

is a correct call (albeit odd, as 99 is probably not a very good
buffer size). (The array can actually be larger than the size you
specify, so:

setvbuf(file, block, _IOFBF, 42);

is also valid in this case, but even weirder.)

The third argument must be one of the three macros:

_IONBF
_IOLBF
_IOFBF

which stand for unbuffered, line-buffered, and fully-buffered
respectively. Normally you, the C programmer, must never use
identifiers beginning with an underscore followed by an uppercase
letter, but in this case, you *must* use them.

If the fourth argument is non-zero, it is a size you, the programmer,
are "suggesting" that the stdio routines use for the underlying
file. What non-zero number is good? Well, BUFSIZ is probably not
*bad*. (It is typically 512, 1024, 4096, 16384, or some other
power of two.) Unfortunately, since it is a #define for some
integer constant, it can only be optimal for some, not all, cases.
A good stdio should pick the best buffer size automatically.

Your best bet is (in my opinion) generally to pass NULL and 0 for
the second and fourth arguments; however, these are also OK:

setvbuf(in, NULL, _IOFBF, BUFSIZ);
setvbuf(out, NULL, _IOFBF, BUFSIZ);

as they will simply force the "in" and "out" streams to be
fully-buffered. Of course, if these two streams are connected
to anything other than an "interactive device", they should be
fully-buffered anyway. Hence, in a good stdio, on typical
files, these two calls should have no real effect, except
perhaps (if BUFSIZ is less than ideal) to make things run
more slowly.

anyway, without setvbuf(), it resulted into 2.580000 seconds but
with setvbuf(), it resulted into 1.230000 seconds.

This suggests that there is something wrong (or at least "not so
good") in your stdio implementation. (But be wary of "testing
artifacts": if you run the same program, or several similar programs,
multiple times on the same files, they may produce very different
times on some runs. In particular, they may be much slower on the
first run, in which the system may have to cache the input file. Subsequent
runs can use the cached file, without ever bothering to read from
a disk file.)

while (!feof(in)) {

As I mentioned elsethread, one should always be suspicious of a
loop of this form. In this particular case, the code was OK only
if the input file has no errors. If you were to run it with input
directed to, e.g., a partly-erased floppy disk, it could loop
forever trying to read the bad part of the disk.

CBFalconer · Mar 9, 2008

Chris said:
[regarding speed]
.... snip ...

Both fgets() and fread() have a problem that lower-level system
functions (whether these are named read, _read, or even SYS$QIO)
tend to avoid: when using the <stdio.h> facilities, the C library
provides one layer of buffering for the input file. You, the C
programmer, must provide a second buffer into which characters are
transferred one line (fgets()) or "buffer-blob" (fread()) at a
time. This second buffer is then copied to the third buffer, again
provided by the <stdio.h> facilities, that is associated with the
output file.

However you omit the useful provision for getc and putc that they
can be macros, and that those macros can evaluate arguments more
than once (unique in the library). This makes it quite possible
for those to use the existing system buffer, so the user doesn't
need to provide one, yet has the detailed char by char access
needed. This means that:

while (EOF != (ch = getc(f))) putc(ch, out);

can often be the fastest available file copy mechanism.

Chris Torek · Mar 9, 2008

However you omit the useful provision for getc and putc that they
can be macros, and that those macros can evaluate arguments more
than once (unique in the library). This makes it quite possible
for those to use the existing system buffer, so the user doesn't
need to provide one, yet has the detailed char by char access
needed. This means that:

while (EOF != (ch = getc(f))) putc(ch, out);

can often be the fastest available file copy mechanism.

I did actually mean to mention this.

The big problem on POSIX systems is that getc and putc have to
be "thread-safe", which makes macro expansion unwieldy at best
and usually not-even-done. Each call is then a call, and each
call then does a "thread lock" and "thread unlock", each of which
is in turn a fairly heavy-weight operation, even when threads are
not in use.

One can work around this by writing:

while ((ch = getc_unlocked(in)) != EOF)
if (putc_unlocked(ch, out) == EOF) ... handle error ...

or, sometimes, by predefining some macro ("please leave out thread
support").

(This shows -- in my opinion -- how something "obvious" and "simple"
like requiring threads and thread-safety from the library can have
undesired side effects. It is thus a good thing that Standard C
is as loose as it is. If you want tighter specifications, which
may lead to poor performance, you can add other additional
more-burdensome standards.)

Herbert Rosenau · Mar 9, 2008

Hm. I haven't found it to be so.

while (c)
c=do_it(c);
c=do_another_thing(c);

looks too broken right away for me not to notice it (though, perhaps
now that I'm doing more Python coding work these days, that may
change?).

I used to actually always put the braces in. I've fallen out of that
practice, just because I find it slightly more readable without, for
one-line bodies.

Uh, a halfway intelligent editor will help in writing/editing source.

So my editor is set up expanding 'while' to

while (_) {
}

setting the cursor at the position represented by the underscore
character. Leaving the condition with TAB will insert an empty line,
placing the cursor in the new line directly under the 'l' of while,
so the new indent is done, ready to type. Enter will insert a new line,
holding the same indent. Shift-Enter in insert mode will insert a new
line under the closing brace and the cursor under it.

The same is done for do, for, and so on, magically. So conditional
blocks are magically written, and indenting is done automatically.

The behavior of the Enter, TAB and opening-brace keys changes
depending on the insert/overwrite mode; Enter, Shift-Enter, Ctrl-Enter
and Alt-Enter have different modes too. So typing a new program gets
easy, and editing it too.

So leaving a block out while typing is at least harder than having
it there already. The indent is set magically, so getting it misleadingly
wrong is harder than getting it right.

--
Tschau/Bye
Herbert

Visit http://www.ecomstation.de the home of german eComStation
eComStation 1.2R Deutsch ist da!

comp.lang.c Answers to Frequently Asked Questions (FAQ List)	1	Feb 1, 2004
comp.lang.c Answers (Abridged) to Frequently Asked Questions (FAQ)	0	Mar 15, 2008
comp.lang.c Answers (Abridged) to Frequently Asked Questions (FAQ)	0	Feb 1, 2008
comp.lang.c Answers (Abridged) to Frequently Asked Questions (FAQ)	0	Jan 15, 2008
comp.lang.c Answers to Frequently Asked Questions (FAQ List)	15	Apr 1, 2006
comp.lang.c Answers (Abridged) to Frequently Asked Questions (FAQ)	0	Dec 1, 2007
comp.lang.c Answers (Abridged) to Frequently Asked Questions (FAQ)	0	Sep 1, 2007
comp.lang.c Answers (Abridged) to Frequently Asked Questions (FAQ)	0	May 15, 2007

cat

CBFalconer

user923005

user923005

user923005

user923005

user923005

Jag

Jag

Falcon Kirtaran

Richard Heathfield

Chris Torek

Chris Torek

CBFalconer

Chris Torek

Herbert Rosenau

Ask a Question

Similar Threads

Members online

Forum statistics

Latest Threads