J
jacob navia
Following the series about commenting code, here is an installment about
text processing.
What do you think?
Thanks for your attention.
-----------------------------------------------------------------------
Text manipulation
Text files are a widely used format for storing data. They are usually
quite compact (no text processing formats like bold, italics, or other
font related instructions) and they are widely portable if written in
the ASCII subset of text data.
A widely used application of text files are program files. Most
programming languages (and here C is not an exception) store the
program in text format.
So let's see a simple application of a text manipulating program.
The task at hand is to prepare a C program text to be translated
into several languages. Obviously, the character string:
"Please enter the file name"
will not be readily comprehensible to a Spanish user. It would
be better if the program would show in Spain the character string:
"Entre el nombre del fichero por favor"
To prepare this translation, we need to extract all character
strings from the program text and store them in some table.
Instead of referencing directly a character string, the program
will reference a certain offset from our table. In the above
example the character string would be replaced by
StringTable[6]
To do this transformation we will write into the first line
of our program:
static char *StringTable[];
Then, in each line where a character string appears we will
replace it with an index into the string table.
printf("Please enter the file name");
will become
printf(StringTable[x]);
where "x" will be the index for that string in our table.
At the end of the file we will append the definition of our
string table with:
static char *StringTable[] = {
...,
...,
"Please enter the file name",
...,
NULL
};
After some hours of work, we come up with the following solution. We test a
bit, and it seems to work.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
4
/*
 * Copies a character constant from infile to stdout.
 *
 * The opening quote has already been consumed by the caller; it is written
 * here. A backslash is copied together with the character that follows it,
 * so an escaped quote does not terminate the constant.
 *
 * Returns the first character after the closing quote, or EOF if the input
 * ends before that character is available.
 */
static int ReadCharConstant(FILE *infile)
{
    int ch = fgetc(infile);

    putchar('\'');
    for (;;) {
        if (ch == EOF)
            return EOF;
        if (ch == '\'')
            break;
        putchar(ch);
        if (ch == '\\') {
            /* Copy the escaped character blindly, whatever it is. */
            int escaped = fgetc(infile);
            if (escaped == EOF)
                return EOF;
            putchar(escaped);
        }
        ch = fgetc(infile);
    }
    putchar('\'');
    return fgetc(infile);
}
27
/*
 * Copies a block comment from infile to stdout.
 *
 * The caller has already consumed the opening slash and star; they are
 * written here, followed by the comment text and the closing star-slash.
 *
 * Returns '/' when the comment was closed: that is the comment's own final
 * slash, which has already been written, and the character following the
 * comment has NOT been read yet. Returns EOF if the input ends inside the
 * comment.
 */
static int ReadLongComment(FILE *infile)
{
    int afterStar = 0;  /* non-zero when the previous character was '*' */
    int ch;

    fputs("/*", stdout);
    for (;;) {
        ch = fgetc(infile);
        if (ch == EOF)
            return EOF;
        if (afterStar && ch == '/')
            break;
        putchar(ch);
        afterStar = (ch == '*');
    }
    putchar('/');
    return '/';
}
50
/*
 * Copies a line comment from infile to stdout.
 *
 * The caller has already consumed the two leading slashes; they are written
 * here, followed by the rest of the line. The terminating newline is not
 * written.
 *
 * Returns the newline that ended the comment, or EOF if the input ends
 * first. A backslash at the end of the line is not treated as a
 * continuation.
 */
static int ReadLineComment(FILE *infile)
{
    int ch = fgetc(infile);

    fputs("//", stdout);
    for (; ch != '\n' && ch != EOF; ch = fgetc(infile))
        putchar(ch);
    return ch;
}
/*
 * Storage for the extracted string literals. The strings are kept back to
 * back in one heap buffer, each terminated by a zero byte.
 */
static char *stringBuffer;         /* start of the buffer, NULL when not allocated */
static char *stringBufferPointer;  /* next free byte in the buffer */
static char *stringBufferEnd;      /* one past the last allocated byte */
static size_t stringBufferSize;    /* allocated size in bytes */
static unsigned stringCount;       /* number of complete strings stored */

#define BUFFER_SIZE 1024           /* initial size and growth step, in bytes */

/*
 * Writes the definition of StringTable to stdout and releases the buffer.
 *
 * Exactly stringCount entries are written, followed by a NULL sentinel.
 * Counting the entries (instead of stopping at the first empty string)
 * keeps an empty literal "" from cutting the table short, which would make
 * every later StringTable[n] reference point at the wrong entry.
 *
 * The text of each string is written as stored, between double quotes.
 * On return all buffer bookkeeping is reset, so the buffer can be built
 * up again from scratch.
 */
static void OutputStrings(void)
{
    const char *p = stringBuffer;
    unsigned i;

    printf("\nstatic char *StringTable[]={\n");
    for (i = 0; p != NULL && i < stringCount; i++) {
        printf("\t\"%s\",\n", p);
        p += strlen(p) + 1;     /* step over the string and its terminator */
    }
    printf("\tNULL\n};\n");

    free(stringBuffer);
    stringBuffer = NULL;
    /* Do not leave dangling pointers or a stale size behind. */
    stringBufferPointer = NULL;
    stringBufferEnd = NULL;
    stringBufferSize = 0;
    stringCount = 0;
}
82 static void PutCharInBuffer(int c)
83 {
84 if (stringBufferPointer == stringBufferEnd) {
85 size_t newSize = stringBufferSize + BUFFER_SIZE;
86 char *tmp = realloc(stringBuffer,newSize);
87 if (tmp == NULL) {
88 fprintf(stderr,"Memory exhausted\n");
89 exit(EXIT_FAILURE);
90 }
91 stringBuffer = tmp;
92 stringBufferPointer = tmp+stringBufferSize;
93 stringBufferSize += BUFFER_SIZE;
94 stringBufferEnd = tmp + stringBufferSize;
95 }
96 *stringBufferPointer++ = c;
97 }
98
99 static int ReadString(FILE *infile)
100 {
101 int c;
102 if (stringBuffer == NULL) {
103 stringBuffer = malloc(BUFFER_SIZE);
104 if (stringBuffer == NULL)
105 return EOF;
106 stringBufferPointer = stringBuffer;
107 stringBufferEnd = stringBufferPointer+BUFFER_SIZE;
108 stringBufferSize = BUFFER_SIZE;
109 }
110 c = fgetc(infile);
111 while (c != EOF && c != '"') {
112 PutCharInBuffer(c);
113 if (c == '\\') {
114 c = fgetc(infile);
115 if (c != '\n')
116 PutCharInBuffer(c);
117 }
118 c = fgetc(infile);
119 }
120 if (c == EOF)
121 return EOF;
122 PutCharInBuffer(0);
123 printf("StringTable[%d]",stringCount);
124 stringCount++;
125 return fgetc(infile);
126 }
127
/*
 * Handles the token that starts with character c, copying it to stdout
 * (or, for a string literal, replacing it with a table reference).
 *
 *   '    character constant, copied unchanged
 *   "    string literal, handed to ReadString
 *   /    block comment, line comment, or a plain slash
 *   #    preprocessor directive, copied unchanged up to the end of its
 *        last line (a backslash before the newline continues the
 *        directive), so that #include "file.h" is left alone
 *   any other character is copied as is
 *
 * c must not be EOF.
 *
 * Returns the next character that still has to be processed, or EOF when
 * the input is exhausted.
 */
static int ProcessChar(int c,FILE *infile)
{
    int next;

    switch (c) {
    case '\'':
        return ReadCharConstant(infile);

    case '"':
        return ReadString(infile);

    case '/':
        next = fgetc(infile);
        if (next == '*') {
            c = ReadLongComment(infile);
            /* ReadLongComment hands back the comment's closing '/', which
             * it has already written; fetch the character that follows. */
            if (c == '/')
                c = fgetc(infile);
            return c;
        }
        if (next == '/')
            return ReadLineComment(infile);
        /* A plain slash (division): write it, and let the caller dispatch
         * on the following character, which may itself start a token. */
        putchar('/');
        return next;

    case '#': {
        int prev = c;

        putchar(c);
        for (;;) {
            c = fgetc(infile);
            if (c == EOF)
                return EOF;     /* never write EOF itself */
            putchar(c);
            if (c == '\n' && prev != '\\')
                return fgetc(infile);
            prev = c;
        }
    }

    default:
        putchar(c);
        return fgetc(infile);
    }
}
/*
 * Usage: strings <file name>
 *
 * Reads the C source file named by argv[1] ("-" means standard input) and
 * writes the transformed program to stdout: a declaration of StringTable,
 * the source text with each string literal replaced by a table reference,
 * and finally the definition of the table.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if the argument is missing, the
 * file cannot be opened, or reading it fails.
 */
int main(int argc,char *argv[])
{
    FILE *infile;
    int status = EXIT_SUCCESS;
    int c;

    if (argc < 2) {
        fprintf(stderr,"Usage: strings <file name>\n");
        return EXIT_FAILURE;
    }
    if (strcmp(argv[1], "-") == 0) {
        infile = stdin;
    } else {
        infile = fopen(argv[1], "r");
        if (infile == NULL) {
            fprintf(stderr,"Can't open %s\n",argv[1]);
            return EXIT_FAILURE;
        }
    }

    printf("static char *StringTable[];\n");
    c = fgetc(infile);
    while (c != EOF)
        c = ProcessChar(c, infile);

    /* EOF is also what a read error looks like: tell the two apart. */
    if (ferror(infile)) {
        fprintf(stderr,"Error reading %s\n",argv[1]);
        status = EXIT_FAILURE;
    }
    if (infile != stdin)
        fclose(infile);

    /* Two zero bytes mark the end of the stored strings. */
    PutCharInBuffer(0);
    PutCharInBuffer(0);
    OutputStrings();
    return status;
}
The general structure of this program is simple. We
o Open the given file to process
o We process each character
o We are interested only in the following tokens:
Char constants
Comments
Character strings
Preprocessor directives
Why those?
Char constants could contain double quotes, which would lead the other
parts of our program to see strings where there aren't any. For instance:
case'"':
would be misunderstood as the start of a never ending string.
Comments are necessary since we should not process strings in comments.
Preprocessor directives should be ignored since we do NOT want to
translate
#include "myfile.h"
Our string parsing routine stores the contents of each string in a buffer
that is grown if needed, printing into standard output only the
StringTable[x]
instead of the stored string. Each string is finished with a zero, and
after the last string we store additional zeroes to mark the end of
the buffer.
After the whole file is processed we write the contents of the buffer
in the output (written to stdout) and that was it. We have extracted
the strings into a table.
Analysis
-------
Our program seems to work, but there are several corner cases that
it doesn't handle at all.
For instance it is legal in C to write:
"String1" "String2"
and this will be understood as
"String1String2"
by the compiler. Our translation makes this into:
StringTable[0] StringTable[1]
which is a syntax error.
Another weak point is that a string can be present several times in our
table
since we do not check if the string is present before storing it in our
table.
And there are many corner cases that are just ignored. For instance you can
continue a single line comment with a backslash, a very bad idea of course
but a legal one. We do not follow comments like these:
// This is a comment \
and this line is a comment too
And (due to low level of testing) there could be a lot of hidden bugs in it.
But this should be a simple utility to quickly extract the strings from a
file without too much manual work. We know we do not use the features it
doesn't support, and it will serve our purposes well.
What is important to know is that there is always a point where we stop
developing and decide that we will move on to another thing. Either because
we get fed up or because our boss tells us that we should do xxx instead of
continuing the development of an internal utility.
In this case we stop the first development now. See the exercises for
the many ways as to how we could improve this simple program.
Exercises:
1: This filter can read from stdin and write to stdout. Add a command line
option to specify the name of an output file. How many changes would you
need to make in the code to implement that?
2: The program can store a string several times. What would be needed to
avoid that? What data structure would you recommend?
3: Implement the concatenation of strings, i.e.
"String1" "String2" --> "String1String2"
4: Seeing in the code
printf(StringTable[21]);
is not very easy to follow. Implement the change so that we would have
instead in the output:
// StringTable[21]--> "Please enter the file name"
printf(StringTable[21]);
i.e. each line would be preceded by one or several comment lines that
describe the strings being used.
5: Add an option so that the name of the string table can be changed from
"StringTable" to some other name. The reason is that a user complained
that the "new" string table destroyed her program: she had a
"StringTable"
variable in her program!
How could you do this change automatically?
6: The program needs to be part of an IDE where the IDE will need to
call the program as a routine (not as an independent program).
What would be needed to do that? What do you think about the global
variables used in the original program?
text processing.
What do you think?
Thanks for your attention.
-----------------------------------------------------------------------
Text manipulation
Text files are a widely used format for storing data. They are usually
quite compact (no text processing formats like bold, italics, or other
font related instructions) and they are widely portable if written in
the ASCII subset of text data.
A widely used application of text files are program files. Most
programming languages (and here C is not an exception) store the
program in text format.
So let's see a simple application of a text manipulating program.
The task at hand is to prepare a C program text to be translated
into several languages. Obviously, the character string:
"Please enter the file name"
will not be readily comprehensible to a Spanish user. It would
be better if the program would show in Spain the character string:
"Entre el nombre del fichero por favor"
To prepare this translation, we need to extract all character
strings from the program text and store them in some table.
Instead of referencing directly a character string, the program
will reference a certain offset from our table. In the above
example the character string would be replaced by
StringTable[6]
To do this transformation we will write into the first line
of our program:
static char *StringTable[];
Then, in each line where a character string appears we will
replace it with an index into the string table.
printf("Please enter the file name");
will become
printf(StringTable[x]);
where "x" will be the index for that string in our table.
At the end of the file we will append the definition of our
string table with:
static char *StringTable[] = {
...,
...,
"Please enter the file name",
...,
NULL
};
After some hours of work, we come up with the following solution. We test a
bit, and it seems to work.
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <strings.h>
4
/*
 * Copies a character constant from infile to stdout.
 *
 * The opening quote has already been consumed by the caller; it is written
 * here. A backslash is copied together with the character that follows it,
 * so an escaped quote does not terminate the constant.
 *
 * Returns the first character after the closing quote, or EOF if the input
 * ends before that character is available.
 */
static int ReadCharConstant(FILE *infile)
{
    int ch = fgetc(infile);

    putchar('\'');
    for (;;) {
        if (ch == EOF)
            return EOF;
        if (ch == '\'')
            break;
        putchar(ch);
        if (ch == '\\') {
            /* Copy the escaped character blindly, whatever it is. */
            int escaped = fgetc(infile);
            if (escaped == EOF)
                return EOF;
            putchar(escaped);
        }
        ch = fgetc(infile);
    }
    putchar('\'');
    return fgetc(infile);
}
27
/*
 * Copies a block comment from infile to stdout.
 *
 * The caller has already consumed the opening slash and star; they are
 * written here, followed by the comment text and the closing star-slash.
 *
 * Returns '/' when the comment was closed: that is the comment's own final
 * slash, which has already been written, and the character following the
 * comment has NOT been read yet. Returns EOF if the input ends inside the
 * comment.
 */
static int ReadLongComment(FILE *infile)
{
    int afterStar = 0;  /* non-zero when the previous character was '*' */
    int ch;

    fputs("/*", stdout);
    for (;;) {
        ch = fgetc(infile);
        if (ch == EOF)
            return EOF;
        if (afterStar && ch == '/')
            break;
        putchar(ch);
        afterStar = (ch == '*');
    }
    putchar('/');
    return '/';
}
50
/*
 * Copies a line comment from infile to stdout.
 *
 * The caller has already consumed the two leading slashes; they are written
 * here, followed by the rest of the line. The terminating newline is not
 * written.
 *
 * Returns the newline that ended the comment, or EOF if the input ends
 * first. A backslash at the end of the line is not treated as a
 * continuation.
 */
static int ReadLineComment(FILE *infile)
{
    int ch = fgetc(infile);

    fputs("//", stdout);
    for (; ch != '\n' && ch != EOF; ch = fgetc(infile))
        putchar(ch);
    return ch;
}
/*
 * Storage for the extracted string literals. The strings are kept back to
 * back in one heap buffer, each terminated by a zero byte.
 */
static char *stringBuffer;         /* start of the buffer, NULL when not allocated */
static char *stringBufferPointer;  /* next free byte in the buffer */
static char *stringBufferEnd;      /* one past the last allocated byte */
static size_t stringBufferSize;    /* allocated size in bytes */
static unsigned stringCount;       /* number of complete strings stored */

#define BUFFER_SIZE 1024           /* initial size and growth step, in bytes */

/*
 * Writes the definition of StringTable to stdout and releases the buffer.
 *
 * Exactly stringCount entries are written, followed by a NULL sentinel.
 * Counting the entries (instead of stopping at the first empty string)
 * keeps an empty literal "" from cutting the table short, which would make
 * every later StringTable[n] reference point at the wrong entry.
 *
 * The text of each string is written as stored, between double quotes.
 * On return all buffer bookkeeping is reset, so the buffer can be built
 * up again from scratch.
 */
static void OutputStrings(void)
{
    const char *p = stringBuffer;
    unsigned i;

    printf("\nstatic char *StringTable[]={\n");
    for (i = 0; p != NULL && i < stringCount; i++) {
        printf("\t\"%s\",\n", p);
        p += strlen(p) + 1;     /* step over the string and its terminator */
    }
    printf("\tNULL\n};\n");

    free(stringBuffer);
    stringBuffer = NULL;
    /* Do not leave dangling pointers or a stale size behind. */
    stringBufferPointer = NULL;
    stringBufferEnd = NULL;
    stringBufferSize = 0;
    stringCount = 0;
}
82 static void PutCharInBuffer(int c)
83 {
84 if (stringBufferPointer == stringBufferEnd) {
85 size_t newSize = stringBufferSize + BUFFER_SIZE;
86 char *tmp = realloc(stringBuffer,newSize);
87 if (tmp == NULL) {
88 fprintf(stderr,"Memory exhausted\n");
89 exit(EXIT_FAILURE);
90 }
91 stringBuffer = tmp;
92 stringBufferPointer = tmp+stringBufferSize;
93 stringBufferSize += BUFFER_SIZE;
94 stringBufferEnd = tmp + stringBufferSize;
95 }
96 *stringBufferPointer++ = c;
97 }
98
99 static int ReadString(FILE *infile)
100 {
101 int c;
102 if (stringBuffer == NULL) {
103 stringBuffer = malloc(BUFFER_SIZE);
104 if (stringBuffer == NULL)
105 return EOF;
106 stringBufferPointer = stringBuffer;
107 stringBufferEnd = stringBufferPointer+BUFFER_SIZE;
108 stringBufferSize = BUFFER_SIZE;
109 }
110 c = fgetc(infile);
111 while (c != EOF && c != '"') {
112 PutCharInBuffer(c);
113 if (c == '\\') {
114 c = fgetc(infile);
115 if (c != '\n')
116 PutCharInBuffer(c);
117 }
118 c = fgetc(infile);
119 }
120 if (c == EOF)
121 return EOF;
122 PutCharInBuffer(0);
123 printf("StringTable[%d]",stringCount);
124 stringCount++;
125 return fgetc(infile);
126 }
127
/*
 * Handles the token that starts with character c, copying it to stdout
 * (or, for a string literal, replacing it with a table reference).
 *
 *   '    character constant, copied unchanged
 *   "    string literal, handed to ReadString
 *   /    block comment, line comment, or a plain slash
 *   #    preprocessor directive, copied unchanged up to the end of its
 *        last line (a backslash before the newline continues the
 *        directive), so that #include "file.h" is left alone
 *   any other character is copied as is
 *
 * c must not be EOF.
 *
 * Returns the next character that still has to be processed, or EOF when
 * the input is exhausted.
 */
static int ProcessChar(int c,FILE *infile)
{
    int next;

    switch (c) {
    case '\'':
        return ReadCharConstant(infile);

    case '"':
        return ReadString(infile);

    case '/':
        next = fgetc(infile);
        if (next == '*') {
            c = ReadLongComment(infile);
            /* ReadLongComment hands back the comment's closing '/', which
             * it has already written; fetch the character that follows. */
            if (c == '/')
                c = fgetc(infile);
            return c;
        }
        if (next == '/')
            return ReadLineComment(infile);
        /* A plain slash (division): write it, and let the caller dispatch
         * on the following character, which may itself start a token. */
        putchar('/');
        return next;

    case '#': {
        int prev = c;

        putchar(c);
        for (;;) {
            c = fgetc(infile);
            if (c == EOF)
                return EOF;     /* never write EOF itself */
            putchar(c);
            if (c == '\n' && prev != '\\')
                return fgetc(infile);
            prev = c;
        }
    }

    default:
        putchar(c);
        return fgetc(infile);
    }
}
/*
 * Usage: strings <file name>
 *
 * Reads the C source file named by argv[1] ("-" means standard input) and
 * writes the transformed program to stdout: a declaration of StringTable,
 * the source text with each string literal replaced by a table reference,
 * and finally the definition of the table.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if the argument is missing, the
 * file cannot be opened, or reading it fails.
 */
int main(int argc,char *argv[])
{
    FILE *infile;
    int status = EXIT_SUCCESS;
    int c;

    if (argc < 2) {
        fprintf(stderr,"Usage: strings <file name>\n");
        return EXIT_FAILURE;
    }
    if (strcmp(argv[1], "-") == 0) {
        infile = stdin;
    } else {
        infile = fopen(argv[1], "r");
        if (infile == NULL) {
            fprintf(stderr,"Can't open %s\n",argv[1]);
            return EXIT_FAILURE;
        }
    }

    printf("static char *StringTable[];\n");
    c = fgetc(infile);
    while (c != EOF)
        c = ProcessChar(c, infile);

    /* EOF is also what a read error looks like: tell the two apart. */
    if (ferror(infile)) {
        fprintf(stderr,"Error reading %s\n",argv[1]);
        status = EXIT_FAILURE;
    }
    if (infile != stdin)
        fclose(infile);

    /* Two zero bytes mark the end of the stored strings. */
    PutCharInBuffer(0);
    PutCharInBuffer(0);
    OutputStrings();
    return status;
}
The general structure of this program is simple. We
o Open the given file to process
o We process each character
o We are interested only in the following tokens:
Char constants
Comments
Character strings
Preprocessor directives
Why those?
Char constants could contain double quotes, which would lead the other
parts of our program to see strings where there aren't any. For instance:
case'"':
would be misunderstood as the start of a never ending string.
Comments are necessary since we should not process strings in comments.
Preprocessor directives should be ignored since we do NOT want to
translate
#include "myfile.h"
Our string parsing routine stores the contents of each string in a buffer
that is grown if needed, printing into standard output only the
StringTable[x]
instead of the stored string. Each string is finished with a zero, and
after the last string we store additional zeroes to mark the end of
the buffer.
After the whole file is processed we write the contents of the buffer
in the output (written to stdout) and that was it. We have extracted
the strings into a table.
Analysis
-------
Our program seems to work, but there are several corner cases that
it doesn't handle at all.
For instance it is legal in C to write:
"String1" "String2"
and this will be understood as
"String1String2"
by the compiler. Our translation makes this into:
StringTable[0] StringTable[1]
which is a syntax error.
Another weak point is that a string can be present several times in our
table
since we do not check if the string is present before storing it in our
table.
And there are many corner cases that are just ignored. For instance you can
continue a single line comment with a backslash, a very bad idea of course
but a legal one. We do not follow comments like these:
// This is a comment \
and this line is a comment too
And (due to low level of testing) there could be a lot of hidden bugs in it.
But this should be a simple utility to quickly extract the strings from a
file without too much manual work. We know we do not use the features it
doesn't support, and it will serve our purposes well.
What is important to know is that there is always a point where we stop
developing and decide that we will move on to another thing. Either because
we get fed up or because our boss tells us that we should do xxx instead of
continuing the development of an internal utility.
In this case we stop the first development now. See the exercises for
the many ways as to how we could improve this simple program.
Exercises:
1: This filter can read from stdin and write to stdout. Add a command line
option to specify the name of an output file. How many changes would you
need to make in the code to implement that?
2: The program can store a string several times. What would be needed to
avoid that? What data structure would you recommend?
3: Implement the concatenation of strings, i.e.
"String1" "String2" --> "String1String2"
4: Seeing in the code
printf(StringTable[21]);
is not very easy to follow. Implement the change so that we would have
instead in the output:
// StringTable[21]--> "Please enter the file name"
printf(StringTable[21]);
i.e. each line would be preceded by one or several comment lines that
describe the strings being used.
5: Add an option so that the name of the string table can be changed from
"StringTable" to some other name. The reason is that a user complained
that the "new" string table destroyed her program: she had a
"StringTable"
variable in her program!
How could you do this change automatically?
6: The program needs to be part of an IDE where the IDE will need to
call the program as a routine (not as an independent program).
What would be needed to do that? What do you think about the global
variables used in the original program?