embarrassing spaghetti code needs stylistic advice

luser-ex-troll · Mar 19, 2009

Hello all.
I have a problem of a somewhat different kind than the usual post. My
code works! It's just appallingly ugly. With my attention focused
sharply on clear and consistent data structures, the most important
function in my nascent postscript interpreter, the lexical scanner,
has degenerated into spaghetti.

It happened incrementally so I didn't really worry
about it until it became overwhelmingly obvious
that what I've got is terribly, horribly ugly.

I realize that this is a large post, but I couldn't
trim it any shorter without making it either
incomplete (and non-functional) or no longer
representative of the problem.

Specifically the problem is the toke function
which scans a string or file to create an object
(tag-union, variant-record). It's constructed
as a series of tests and loops within a big loop,
but uses goto to change its mind about what
type of object it has found (eg. '+' followed
by a digit is a noise character introducing the
number, but followed by anything else, it's an
executable name).

I can't seem to think of a control structure to replace it with that
affords the same flexibility.

tia.
lxt
ps. feel free to trim the entire code from any
responses. I realize it's quite long for this
medium.
/* tokentest.c
the scanner playpen
*/

#include <ctype.h>
#include <stdbool.h> //true false
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
//#include "object.h"
/* object.h
global constants
object structures and typedefs
*/

//limits
#define MAXNAMES 1000
#define MAXTOKEN 256
#define OSSIZE 500
#define ESSIZE 250
#define DSSIZE 20

/* Objects */

#define Types \
X(null, int dummy) \
X(mark, int dummy2) \
X(boolean, bool b) \
X(integer, int i) \
X(real, float f) \
X(name, int n) \
X(string, String *s) \
X(array, Array *a) \
X(dict, Dict *d) \
X(operator, Operator op) \
X(file, FILE *file) \
X(font, void *font) \
X(packedarray, void *pa) \
X(save, void *save) \

struct s_operator {
char *name;
void (*fp)();
};

typedef struct s_object Object;
typedef struct s_string String;
typedef struct s_array Array;
typedef struct s_dict Dict;
typedef struct s_operator Operator;
struct s_object {
#define X(a, b) a ## type ,
enum e_type { Types } type;
#undef X
unsigned char flags;
#define READ 1
#define WRITE 2
#define EXEC 4
#define COMP 8
#define X(a, b) b;
union { Types } u;
#undef X
};

struct s_string {
int ref;
size_t length;
struct s_string *copyof;
char *s; };

struct s_array {
int ref;
size_t length;
struct s_array *copyof;
Object *a; };

struct s_pair { Object key, value; };
struct s_dict {
int ref;
size_t length;
size_t maxlength;
struct s_pair *p; };

// Singular Objects
Object null;
Object mark;

// exported functions
int error (char *fmt, ...);
Object boolean (char b);
Object integer (int i);
Object real (float f);

char *names[MAXNAMES];
//int nameslen;
Object name (char *s);

Object stringn (int n);
Object string (char *s);
void dec_string (String *s);
void inc_string (String *s);
String * substring (String *s, size_t offset, size_t length);

Object array (int n);
void dec_array (Array *a);
void inc_array (Array *a);
Object car (Array *a);
Array * cdr (Array *a);
Array * subarray (Array *a, size_t offset, size_t length);

Object dict (int n);
int eq (Object a, Object b);
struct
s_pair * lookup (Dict *d, Object key);
bool define (Dict *d, Object key, Object value);
void dec_dict (Dict *d);
void inc_dict (Dict *d);

void dec (Object *o);
void inc (Object *o);

Object executable (Object o);
Object operator (char *name, void (*fp)());

/* eof: object.h */

//#include "system.h"
/* system.h
stacks and operators
*/

#define X(a, b) #a "type",
char *typestring[] = { Types }; //names for enum e_type type member of
Object
#undef X
int defer_exec;
int defer_paren;
int quitflag;

Object os[OSSIZE];
Object *tos = os;
#define push(obj) \
(tos != os+OSSIZE)? *(tos++) = obj: (error("stackoverflow"),null)
#define pop ( (tos!=os)? (*(--tos)): (error("stackunderflow"),null) )

Object es[ESSIZE];
Object *tes = es;
#define pushe(obj) \
(tes != es+ESSIZE)? *(tes++) = obj: (error("execstackoverflow"),null)
#define pope ( (tes!=es)? (*(--tes)): (error
("execstackunderflow"),null) )

Object ds[DSSIZE];
Object *tds = ds;
#define pushd(obj) \
(tds != ds+DSSIZE)? *(tds++) = obj: (error("dictstackoverflow"),null)
#define popd ( (tds!=ds)? (*(--tds)): (error
("dictstackunderflow"),null) )

/* operator helpers */

#define stackunder(n,op) ( (tos-os >= n)?: error("stackunderflow in "
#op) )
#define typecheck(ob,tp,op) \
( (ob.type == tp ## type)?: error("typecheck in " #op) )
#define xcheck(ob,op) \
(ob.flags & EXEC)? 0: error("typecheck in " #op)

/* Operators */

/* Miscellaneous Operators */
void Oprompt ();

/* eof system.h */

int sgetc(String *s) {
if (s->length == 0) return EOF;
s->length--;
return *(s->s++);
//s->s++;
//return s->s[-1];
}

int Snext(Object s) {
return sgetc(s.u.s);
}

void Sback(int c, Object s) {
s.u.s->length++;
*(--(s.u.s->s)) = c; //back it up, follow the pointer, store
}

int Fnext(Object f) {
return fgetc(f.u.file);
}

void Fback(int c, Object f) {
ungetc(c, f.u.file);
}

// called by Otoken, below
Object toke(Object src, int (*next)(Object), void (*back)(int,
Object)) {
int i;
int d = 0;
bool negate = false;
char *punct = "()<>[]{}/%";
char s[MAXTOKEN];
char *sp = s;
#define NEXT if ((i=next(src)) == EOF) goto fail
#define NEXTor if ((i=next(src)) == EOF)
#define BACK back(i,src)

while ( (i = next(src)) != EOF ) {
top:
if(i == '\n') { Oprompt(); } //newline
if(isspace(i)) continue; //whitespace _/comments
if(i == '%') { do { NEXT; } while(i != '\n'); goto top; }
if(i == '+') { //optional +
NEXTor goto single;
if(!isdigit(i)) { BACK; i = '+'; goto aname; }
i -= '0';
goto digit; }

if(i == '-') { //optional -
NEXTor goto single;
if(!isdigit(i)) { BACK; i = '-'; goto aname; }
i -= '0'; negate = true;
goto digit; }

if(isdigit(i)) { //digits
do {
i -= '0';
d *= 10;
digit: d += i;
NEXTor goto digitskipback;
if (i == '.') goto real;
if (i == '#') goto radix;
//TODO E notation
} while (isdigit(i));
BACK;
digitskipback:
if (negate) d *= -1;
return integer(d); }

goto after_real;
real: { float f; //b/c f is a FILE *
int e;
f = (float)d; //the positive integer so far
d = 0;
e = 1;
NEXTor goto floatskipback;
while(isdigit(i)) {
i -= '0';
d *= 10;
e *= 10;
d += i;
NEXTor goto floatskipback;
}
//TODO E notation
BACK;
floatskipback:
f += (float)d/(float)e;
if (negate) f *= -1;
return real(f); }
after_real:

goto after_radix;
radix: { int r = d;
if (r > 36) error("badradix syntaxerror in token");
if (r < 2) error("badradix syntaxerror in token");
NEXTor goto radixskipback;
d = 0;
do {
if (isdigit(i)) i -= '0';
else if (islower(i)) i -= 'a'+10;
else if (isupper(i)) i -= 'A'+10;
else error("badradixdigit syntaxerror in token");
d *= r;
d += i;
NEXTor goto radixskipback;
} while(isalnum(i));
BACK;
radixskipback:
return integer(d); }
after_radix:

if(i == '(') { // string
defer_paren = 1;
NEXTor goto syntaxerror;
if (i == ')') defer_paren--;
while (defer_paren) {
if (i == '\n') Oprompt();
if (i == '(') defer_paren++;
//TODO octal and hex
if (i == '\\') {
NEXTor goto syntaxerror;
switch(i) {
case '\n': Oprompt(); goto skip;
case 'a': i = '\a'; break;
case 'b': i = '\b'; break;
case 'f': i = '\f'; break;
case 'n': i = '\n'; break;
case 'r': i = '\r'; break;
case 't': i = '\t'; break;
case 'v': i = '\v'; break;
case '(': case ')':
case '\'': case '\"':
case '?': case '\\': break;
default: error("syntaxerror (string\\escape) in token");
}
}
*sp++ = (char)i;
if (sp-s > MAXTOKEN) error("limitcheck in token");
skip: NEXTor goto syntaxerror;
if (i == ')') defer_paren--;
}
*sp++ = 0;
//no BACK! eat the paren
return string(s); }

if(i == '/') { // literal name
NEXTor goto litnameskipback;
do {
*sp++ = (char)i;
NEXTor goto litnameskipback;
} while(isgraph(i) && strchr(punct,i)==NULL );
BACK;
litnameskipback:
*sp = 0;
return name(s); }

if(strchr("[]", i)) { // array
single: s[0] = (char)i; s[1] = 0;
return executable(name(s)); }

if(i == '{') { //procedures
typedef struct s_cord Fish;
struct s_cord { Object o; struct s_cord *link; };
Fish *head, *tail;
Object o, fin;
size_t i, len = 0;

fin = name("}"); /* make a list */
(void)((head=malloc(sizeof *head)) ||error("VMerror in token"));
tail = head;
do { tail->o = toke(src,next,back);
if ( eq(tail->o,fin) ) break;
len++;
(void)((tail->link=malloc(sizeof *tail)) ||error("VMerror in
token"));
tail = tail->link;
tail->link = NULL; /* possibly unnecessary */
} while(1);

o = array((int)len); /* turn list into array */
tail = head; /* fish becomes worm which eats itself */
for(i=0;i<len;i++) {
o.u.a->a = tail->o;
head = tail->link;
free(tail);
tail = head;
}
free(head); //"}" equiv to free(tail), but this looks more
symmetrical
return executable(o);
}

if(i == '}') {
return executable(name("}"));
}

if(isgraph(i)) { //executable names
do {
aname: *sp++ = (char)i;
NEXTor goto nameskipback;
} while(isgraph(i) && !isspace(i) && strchr(punct,i)==NULL );
BACK;
nameskipback:
*sp = 0;
return executable(name(s)); }

syntaxerror:
error("syntaxerror in token");
} //while

fail:
return null;
}

void Otoken() {
Object o;
Object src;
stackunder(1,token);
src = pop;
switch(src.type) {
case stringtype: push(src);
o = toke(src, Snext, Sback);
dec(&src);
break;
case filetype:
o = toke(src, Fnext, Fback);
break;
default: error("typecheck in token");
}

if (o.type == nulltype) { push(boolean(false)); }
else {
if(eq(o,name("}"))) { error("unmatchedmark in token"); }
else { push(o); push(boolean(true)); }
}
}

int main() {
bool done = false;

push(string("this is a string"));
while(!done) {
Otoken(); //executable names
if (pop.u.b) { //check boolean return value
Object o;
o = pop;
if (o.type == nametype) {
printf("!grOK: name, %s\n", names[o.u.n]);
}
} else {
printf("!grNAK: failed to read a token");
done = true;
}
}

return 0;
}

/* eof token.c */

/* object.c
error function (to avoid a main.h or misc.c)
object allocators
and storage for singular objects null and mark
*/

#include <float.h> //FLT_EPSILON
#include <math.h> //fabsf
#include <stdarg.h> //...
#include <stdbool.h> //true false
#include <stdio.h> //vfprintf
#include <stdlib.h> //exit malloc free
#include <string.h> //strcmp strdup
//#include "object.h"

int error(char *fmt, ...) {
va_list argptr;
va_start( argptr, fmt );
(void)vfprintf(stderr, fmt, argptr);
(void)fputc('\n',stderr);
va_end(argptr);
exit(EXIT_FAILURE);
}

/* Singular objects */

Object null = { .type = nulltype, .flags = 0, .u.dummy = 0};
Object mark = { .type = marktype, .flags = 0, .u.dummy2 = 0};

/* Object Allocators and Convenience Functions */

Object boolean (char b) {
Object o = { .type = booleantype, .flags = 0, .u.b = b };
return o;
}

Object integer (int i) {
Object o = { .type = integertype, .flags = 0, .u.i = i };
return o;
}

Object real (float f) {
Object o = { .type = realtype, .flags = 0, .u.f = f };
return o;
}

char *names[MAXNAMES];
int nameslen = 0;
Object name (char *s) {
Object o = { .type = nametype, .flags = 0, .u.dummy = 0 };
int i;
for (i=0; i<nameslen; i++) { //look
if (strcmp(s, names) == 0) { //found
o.u.n = i;
return o;
}
}
o.u.n = i; //new
names = strdup(s);
nameslen++;
return o;
}

Object stringn (int n) {
Object o = { .type = stringtype, .flags = COMP, .u.dummy = 0 };
(void)((o.u.s = (String *)malloc(sizeof *o.u.s))
|| error("VMerror in stringn"));
o.u.s->ref = 1;
o.u.s->length = (size_t)n;
o.u.s->copyof = NULL;
(void)((o.u.s->s = malloc((size_t)n+1))
|| error("VMerror in stringn"));
return o; }

String *substring(String *s, size_t offset, size_t length) {
String *new;
if (offset+length > s->length)
error("rangecheck in substring");
(void)((new = malloc(sizeof *new))
|| error("VMerror in substring"));
new->ref = 1;
new->length = length;
new->copyof = s;
new->s = s->s + offset;
return new;
}

Object string (char *s) {
Object o;
size_t n;
n = strlen(s);
o = stringn((int)n);
strcpy(o.u.s->s, s);
//make substring so you can play with the pointer
//and dec can still free it later.
o.u.s = substring(o.u.s, 0, o.u.s->length);
return o; }

void dec_string(String *s) {
if (--s->ref == 0) {
if (s->copyof) dec_string(s->copyof);
else free(s->s);
free(s);
}
}

void inc_string(String *s) { s->ref++; }

Object array (int n) {
Object o = { .type = arraytype, .flags = COMP, .u.dummy = 0 };
(void)((o.u.a = (Array *)malloc(sizeof *o.u.a))
|| error("VMerror in array"));
o.u.a->ref = 1;
o.u.a->length = (size_t)n;
o.u.a->copyof = NULL;
(void)((o.u.a->a = (Object *)calloc((size_t)n, sizeof o))
|| error("VMerror in array"));
return o; }

void dec_array(Array *a) {
if (--a->ref == 0) {
int i;
for (i=0; i < (int)a->length; i++) {
//kill elements
dec(a->a + i);
}
if(a->copyof) dec_array(a->copyof);
else free(a->a);
free(a);
}
}

void inc_array(Array *a) { a->ref++; }

Array *subarray(Array *a, size_t offset, size_t length) {
Array *new;
if (offset+length > a->length)
error("rangecheck in subarray");
(void)((new = malloc(sizeof *new))
|| error("VMerror in subarray"));
new->ref = 1;
new->length = length;
new->copyof = a;
inc_array(a);
new->a = a->a + offset;
return new;
}

Object car(Array *a) { return a->a[0]; }

Array *cdr(Array *a) { return subarray(a, 1, a->length-1); }

Object dict (int n) {
Object o = { .type = dicttype, .flags = COMP, .u.dummy = 0 };
(void)((o.u.d = (Dict *)malloc(sizeof *o.u.d))
|| error("VMerror in dict"));
o.u.d->ref = 1;
o.u.d->maxlength = (size_t)n;
o.u.d->length = 0;
(void)((o.u.d->p = (struct s_pair *)calloc((size_t)n,sizeof *o.u.d-

p))

Click to expand...

|| error("VMerror in dict"));
return o; }

int eq (Object a, Object b) {
if (a.type != b.type) { return false; }
switch(a.type) {
case nulltype:
case marktype: return true;
case booleantype: return a.u.b == b.u.b;
case nametype: //ints
case integertype: return a.u.i == b.u.i;
case realtype: return (fabsf(a.u.f - b.u.f) > FLT_EPSILON);
case stringtype: return (strcmp(a.u.s->s, b.u.s->s) == 0);
case arraytype: //composites (pointers)
case filetype:
case dicttype: return a.u.d == b.u.d;
case operatortype: return a.u.op.fp == b.u.op.fp;
default:
return false;
}
}

struct s_pair *lookup (Dict *d, Object key) {
struct s_pair *p = NULL;
int i;
for (i=0; i < (int)d->length; i++) {
if (eq(d->p.key,key)) {
p = &d->p;
break;
}
}
return p;
}

bool define(Dict *d, Object key, Object value) {
struct s_pair *p;
p = lookup(d, key);
if (p) {
dec(&p->value);
p->value = value;
return true;
} else {
if (d->length >= d->maxlength) {
//error("dictfull in define");
return false;
}
p = &d->p[d->length++];
inc(&key);
p->key = key;
inc(&value);
p->value = value;
return true;
}
}

void dec_dict(Dict *d) {
if (--d->ref == 0) {
int i;
for (i=0; i < (int)d->length; i++) {
//kill elements
dec(&d->p.key);
dec(&d->p.value);
}
free(d->p);
free(d);
}
}

void inc_dict(Dict *d) { d->ref++; }

void dec(Object *o) {
if (o->flags & COMP ) { //if Composite
switch(o->type) { //decrement the ref
case stringtype: dec_string(o->u.s); break;
case arraytype: dec_array(o->u.a); break;
case dicttype: dec_dict(o->u.d); break;
default: break;
}
}
}

void inc(Object *o) {
if (o->flags & COMP) {
switch(o->type) {
case stringtype: inc_string(o->u.s); break;
case arraytype: inc_array(o->u.a); break;
case dicttype: inc_dict(o->u.d); break;
default: break;
}
}
}

Object executable (Object o) { o.flags |= EXEC; return o; }

Object operator (char *name, void (*fp)()) {
Object o = { .type = operatortype, .flags = EXEC,
.u.op = { .name = name, .fp = fp } };
return o;
}

/* eof: object.c */

/* from system.c */
void Oprompt() {
printf("> "); fflush(stdout);
}
/* end excerpt from system.c */

This ends the obnoxiously long message.

Mark Wooding · Mar 20, 2009

Tetsuya said:
Next time use pastebin.com please.

Disagree strongly. The message is asking an interesting question, and
will be archived forever. The archive becomes much less valuable
without the actual code in question.

Also, including the original code makes quoting portions of it in
criticism easier.

-- [mdw]

Guest · Mar 20, 2009

I have a problem of a somewhat different kind than the usual post. My
code works! It's just appallingly ugly. With my attention focused
sharply on clear and consistent data structures, the most important
function in my nascent postscript interpreter, the lexical scanner,
has degenerated into spaghetti.

<snip>

ooo! haven't seen code like that in a while! With gotos as well!
I might have a go later but at the moment all I do is give general
advice. Write some tests. Write lots of tests. Every time you make
a small change run all the tests.

In the past I've also drawn flow charts. At least then you can see the
structure of the whole program at once. Try and remove chunks into
separate functions that

1. have a single entrance and exit
2. perform a single function well

pass parameters, don't use global data

Bartc · Mar 20, 2009

luser-ex-troll said:
I have a problem of a somewhat different kind than the usual post. My
code works! It's just appallingly ugly. With my attention focused
sharply on clear and consistent data structures, the most important
function in my nascent postscript interpreter, the lexical scanner,
has degenerated into spaghetti.

NEXTor goto radixskipback;

<snip lots of similar code>

This is for your Postscript interpreter?

Considering Postscript itself doesn't have Goto at all, you're setting
yourself bad examples.

Possibly the code is a consequence of using C which I don't consider very
flexible when developing code. (If you'd started off with Python for
example, this problem would not have come up since it doesn't have Goto
either).

The specific function you mentioned seemed some sort of lexical parser. I've
written loads without needing lots of gotos (one or two are OK).

Use return statements instead (so add more functions). Syntax errors in
tokens I usually deal with by returning a special error token (leaving it to
the caller to report the error).

And I'd get rid of the BACK/NEXT macros which are distorting the syntactical
structure of the code so that the statement type is not recognisable.

luser-ex-troll · Mar 20, 2009

luser-ex-troll said:

Hello all.
I have a problem of a somewhat different kind than the usual post.
My code works!

Click to expand...

Delighted to hear it! (Incidentally, relying as you do on C99
features will restrict the portability of your code and reduce the
number of people able to help you with it. If those are not
concerns for you, then obviously that's not an issue.)

It's just appallingly ugly.
Concur.

With my attention
focused sharply on clear and consistent data structures, the most
important function in my nascent postscript interpreter, the
lexical scanner, has degenerated into spaghetti.

Click to expand...

Again, concur.

It happened incrementally so I didn't really worry
about it until it became overwhelmingly obvious
that what I've got is terribly, horribly ugly.

Click to expand...

I was going to suggest a state machine, but one doesn't normally
need one for something as simple as lexing. A simple loop is
usually enough.

I think you're trying to do too much in your scanner - you're not
just scanning the input for lexemes, you're also trying to
interpret stuff as you go along. To some extent that /can/ be done
neatly, but if you must do it, do it something like this:

while(!error && (p = scanner_getlexeme(stream)) != NULL)
{
n = token_gettype(p);
q = (*token_interpret[n])(p);
error = token_store(q, n, tokenlist);

}

Thanks. I like it. On first read I thought I had an objection
that the various ways of terminating a token should be tied
to the semantic meaning. But it's is probably just an artifact
of my intuitive understanding of the postscript behavior.
With this way it looks like everything is more strongly
dependent on the enum of types; which seems ideal.

lxt

luser-ex-troll · Mar 20, 2009

<snip lots of similar code>

This is for your Postscript interpreter?

Considering Postscript itself doesn't have Goto at all, you're setting
yourself bad examples.

Well, yeah. You're totally right. But it didn't seem so wrong
when I wrote the first one, and the second. Then suddenly it
was no longer obvious how to tease a structure back out of it.

Possibly the code is a consequence of using C which I don't consider very
flexible when developing code. (If you'd started off with Python for
example, this problem would not have come up since it doesn't have Goto
either).

Agreed. But I don't like block structure being controlled by
indentation. I suppose that's obvious from the code.

The specific function you mentioned seemed some sort of lexical parser. I've
written loads without needing lots of gotos (one or two are OK).

See? But what about the third?! The fourth?!

Use return statements instead (so add more functions). Syntax errors in
tokens I usually deal with by returning a special error token (leaving it to
the caller to report the error).

Agreed, but I'm looking for a nice way to organize and dispatch
those functions, rather than merely translating the problem into
function spaghetti.

And I'd get rid of the BACK/NEXT macros which are distorting the syntactical
structure of the code so that the statement type is not recognisable.

Roger that. Artifact from the version that used macros for the string/
file overloading instead of function pointers. At that
point the served a useful purpose by jumping out of the spaghetti like
uh, meatballs.

lxt

Phil Carmody · Mar 21, 2009

Richard Heathfield said:
Tetsuya said:

Why? What's wrong with Usenet?

I can't believe that hasn't elicited any kind of answer yet!

Phil

gw7rib · Mar 21, 2009

I have a problem of a somewhat different kind than the usual post. My
code works! It's just appallingly ugly. With my attention focused
sharply on clear and consistent data structures, the most important
function in my nascent postscript interpreter, the lexical scanner,
has degenerated into spaghetti.

The book "BCPL: the language and its compiler", by Martin Richards and
Coklin Whitby-Strevens, includes the code for a lexical scanner, with
plenty of comments about it. If you can get hold of this book, it
might be worth reading. BCPL is a fore-runner of C.

I've only skimmed through your code, but a few points struck me:

int Snext(Object s) {
return sgetc(s.u.s);

}

void Sback(int c, Object s) {
s.u.s->length++;
*(--(s.u.s->s)) = c; //back it up, follow the pointer, store

}

You're worryuing here about reading values and then needing to back up
a bit. The code in the book avoids the need to do this. There is a
function called RCH which will read a character in and put it in a
global variable CH. (Yes, I know globals are disapproved of these
days, but bear with me...) The function NEXTSYMB, which reads the next
tioken from the source, assumes that there is a character in CH which
has not yet been processed. It processes it, reaing more characters if
necessary using RCH, and it calls RCH at least once so that, when it
finishes, there is an unprocessed charcter left in CH. Thus you just
need to call RCH once at the beginning, and then you call NEXTSYMB
continually to get the tokens.

For example, suppose that a '<' character can either be the start of
<=, the start of <<, or just a less-than sign. You do the following
(I've converted this bit into C, and also fixed a couple of stylistic
points):

case '>':
RCH();
if (CH == '=') { RCH(); return S_LE; }
if (CH == '>') { RCH(); return S_LSHIFT; }
return S_LS;

Either way, the first unprocessed character is in CH afterwards.

#define NEXT if ((i=next(src)) == EOF) goto fail
#define NEXTor if ((i=next(src)) == EOF)

You seem very worried about reading in an EOF character. This seems a
bit unnecessary. At any point in the processing, it seems that one of
three things can be the case:

a) the characters you have processed so far need to be followed by
something of a specific type, and it is an error if they're not;
b) the characters you have processed so far may or may not be followed
by something of a specific type, if they are then process that, if
not, leave what follows to be processed next time round;
c) the characters you have processed so far are complete in themselves
and what follows is something separate.

In none of these cases does there seem to be any need to check
specifically whether what follows includes an EOF. Simply treat it as
any character which is different from what is allowed to follow.

This may in fact improve any error messages that you show the user -
there will be more of "You should have provided a ***, and didn't" and
less of "Unexpected end of file".

You should only need to worry about an EOF if i is EOF when you start
the loop - in which case you simply return null.

Specifically the problem is the toke function
which scans a string or file to create an object
(tag-union, variant-record). It's constructed
as a series of tests and loops within a big loop,
but uses goto to change its mind about what
type of object it has found (eg. '+' followed
by a digit is a noise character introducing the
number, but followed by anything else, it's an
executable name).

if(i == '+') { //optional +
NEXTor goto single;
if(!isdigit(i)) { BACK; i = '+'; goto aname; }
i -= '0';
goto digit; }

This doesn't really seem necessary. Either '+' is followed by a digit,
or it isn't. If it is, the digit (and any subsequent digits) are
processed exactly the same way as if the '+' wasn't there. So I think
you may just need a "continue" here intead of the "goto digit" - start
the processing off again, this time looking at the first digit rather
than the '+'. It may mean testing whether the first digit is a digit
twice, but that's hardly the biggest waste on the planet, is it?

[If you do read the book, note that it is itself not perfect. For one
thing, NEXTSYMB returns its result by a global, which seems an
unnecessary piece of horribleness. Also, instead of the neat code
above, it actually uses RETRUN to leave the function without doing a
RCH, and BREAK to leave the SWITCHON (equivalent to a switch) where it
hits a RCH at the end of the function.]

Anyhow, hope that helps.
Paul.

luser-ex-troll · Mar 21, 2009

I have a problem of a somewhat different kind than the usual post. My
code works! It's just appallingly ugly. With my attention focused
sharply on clear and consistent data structures, the most important
function in my nascent postscript interpreter, the lexical scanner,
has degenerated into spaghetti.

Click to expand...

The book "BCPL: the language and its compiler", by Martin Richards and
Coklin Whitby-Strevens, includes the code for a lexical scanner, with
plenty of comments about it. If you can get hold of this book, it
might be worth reading. BCPL is a fore-runner of C.

I've only skimmed through your code, but a few points struck me:

int Snext(Object s) {
return sgetc(s.u.s);

void Sback(int c, Object s) {
s.u.s->length++;
*(--(s.u.s->s)) = c; //back it up, follow the pointer, store

Click to expand...

}

Click to expand...

You're worryuing here about reading values and then needing to back up
a bit. The code in the book avoids the need to do this. There is a
function called RCH which will read a character in and put it in a
global variable CH. (Yes, I know globals are disapproved of these
days, but bear with me...) The function NEXTSYMB, which reads the next
tioken from the source, assumes that there is a character in CH which
has not yet been processed. It processes it, reaing more characters if
necessary using RCH, and it calls RCH at least once so that, when it
finishes, there is an unprocessed charcter left in CH. Thus you just
need to call RCH once at the beginning, and then you call NEXTSYMB
continually to get the tokens.

For example, suppose that a '<' character can either be the start of
<=, the start of <<, or just a less-than sign. You do the following
(I've converted this bit into C, and also fixed a couple of stylistic
points):

case '>':
RCH();
if (CH == '=') { RCH(); return S_LE; }
if (CH == '>') { RCH(); return S_LSHIFT; }
return S_LS;

Either way, the first unprocessed character is in CH afterwards.

#define NEXT if ((i=next(src)) == EOF) goto fail
#define NEXTor if ((i=next(src)) == EOF)

Click to expand...

You seem very worried about reading in an EOF character. This seems a
bit unnecessary. At any point in the processing, it seems that one of
three things can be the case:

a) the characters you have processed so far need to be followed by
something of a specific type, and it is an error if they're not;
b) the characters you have processed so far may or may not be followed
by something of a specific type, if they are then process that, if
not, leave what follows to be processed next time round;
c) the characters you have processed so far are complete in themselves
and what follows is something separate.

In none of these cases does there seem to be any need to check
specifically whether what follows includes an EOF. Simply treat it as
any character which is different from what is allowed to follow.

This may in fact improve any error messages that you show the user -
there will be more of "You should have provided a ***, and didn't" and
less of "Unexpected end of file".

You should only need to worry about an EOF if i is EOF when you start
the loop - in which case you simply return null.

Specifically the problem is the toke function
which scans a string or file to create an object
(tag-union, variant-record). It's constructed
as a series of tests and loops within a big loop,
but uses goto to change its mind about what
type of object it has found (eg. '+' followed
by a digit is a noise character introducing the
number, but followed by anything else, it's an
executable name).
if(i == '+') { //optional +
NEXTor goto single;
if(!isdigit(i)) { BACK; i = '+'; goto aname; }
i -= '0';
goto digit; }

Click to expand...

This doesn't really seem necessary. Either '+' is followed by a digit,
or it isn't. If it is, the digit (and any subsequent digits) are
processed exactly the same way as if the '+' wasn't there. So I think
you may just need a "continue" here intead of the "goto digit" - start
the processing off again, this time looking at the first digit rather
than the '+'. It may mean testing whether the first digit is a digit
twice, but that's hardly the biggest waste on the planet, is it?

[If you do read the book, note that it is itself not perfect. For one
thing, NEXTSYMB returns its result by a global, which seems an
unnecessary piece of horribleness. Also, instead of the neat code
above, it actually uses RETRUN to leave the function without doing a
RCH, and BREAK to leave the SWITCHON (equivalent to a switch) where it
hits a RCH at the end of the function.]

Anyhow, hope that helps.
Paul.

Yes. I'll add that to my bookfetch list at alibris. BCPL was
interpreted, wasn't it?

As far as improving the error messages, I'm somewhat restricted by the
behavior dictated by the Adobe spec, but I think I can add a field of
extra detail into the report. The error function I posted is just a
stub.

The big stumbling block, as I see it now, is my use of 3 kinds of test
on the character in question: if (i == 'x'), strchr("string", i), and
isalpha(i). It seems if I just pick one, I can organize the tests into
a grammar structure and drastically simplify the code.

Maybe it's time to draw flowcharts...

lxt

luser-ex-troll · Mar 21, 2009

I've barely begun this second attempt and already I want to write
gotos.

Does the following look foredoomed to devolve?

#include <stdio.h>
#include <string.h>

#define space " \t\r\n\f"
#define delim "()<>[]{}/%"
#define crlf "\r\n"
#define digit "0123456789"
#define hex digit "abcdef" "ABCDEF"
#define alpha "abcdefghijklmnopqrstuvwxyz"
#define Alpha "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#define alnum digit alpha Alpha
#define pm "+-"
#define dot "."
#define rad "#"
#define raddoteE "#.eE"
#define eE "eE"
#define epm "e+-"
#define Epm "E+-"

enum e_lext { fail, decimal, radix, real, string, hexstring, name };

typedef struct {
char *pre; /*optional prefix*/
char *chug; /*things to accept*/
int elsewise; /*thing to try if unacceptable*/
char *other; /*transfer to different test by offset matched*/
enum e_lext type; /*type to be interpreted*/
} test;

test tests[] = {
/*0*/{.type = fail, .pre = space, .chug = NULL, .elsewise =
1, .other = NULL },
/*1*/{.type = decimal, .pre = pm, .chug = digit, .elsewise = 6, .other
= raddoteE },
/*2*/{.type = radix, .pre = NULL, .chug = alnum, .elsewise =
6, .other = NULL },
/*3*/{.type = real, .pre = NULL, .chug = digit, .elsewise =
6, .other = eE },
/*4*/{.type = real, .pre = pm, .chug = digit, .elsewise =
6, .other = NULL },
/*5*/{.type = real, .pre = pm, .chug = digit, .elsewise =
6, .other = NULL },
/*6*/{.type = string, .pre = "(", .chug = NULL, .elsewise =
8, .other = NULL },
/*7*/{.type = hexstring, .pre = "<", .chug = hex, .elsewise =
8, .other = NULL },
/*8*/{.type = name, .pre = "/", .chug = alnum, .elsewise =
9, .other = NULL },
/*9*/{.type = fail, .pre = NULL, .chug = NULL, .elsewise =
10, .other = NULL }
};

#define NBUF 256

int main() {
char buf[NBUF] = "";
char *s = buf;
int i;
int testing=0;
char *off;

while( (i=getchar()) != EOF) {
top:
if (tests[testing].pre) /* try this */
while (strchr(tests[testing].pre,i)) {
*s++ = (char)i; *s = 0; i=getchar();
}

if (tests[testing].chug) /* try that */
while (strchr(tests[testing].chug,i)) {
*s++ = (char)i; *s = 0; i=getchar();
}

if (tests[testing].other) { // try the other */
off = strchr(tests[testing].other,i);
if (off) { // transfer to special test
testing += (int) (off-tests[testing].other) + 1;
*s++ = (char)i; *s = 0; i=getchar();
goto top;
}
}

if (s == buf) {
if (testing == 10) {
printf("fail: unable to grok the stream\n");
break;
} else {
testing = tests[testing].elsewise;
goto top;
}
} else {
ungetc(i,stdin);
printf("grok: %s\n", buf);
s = buf;
testing = 0;
}

} //while
return 0;
} //main

//eof

Flash Gordon · Mar 21, 2009

luser-ex-troll said:
I've barely begun this second attempt and already I want to write
gotos.

Ones you don't need...

Does the following look foredoomed to devolve?

int main() {
char buf[NBUF] = "";
char *s = buf;

Personally I would be more inclined to do use an index than a pointer.

int i;
int testing=0;
char *off;

while( (i=getchar()) != EOF) {
top:

*s++ = (char)i; *s = 0; i=getchar();

Why the cast?

goto top;

Loose the i=getchar() and replace the goto with a continue.

goto top;

Do an ungetc() followed by a continue

<snip>

You could also do with breaking the code down in to functions rather
than writing one massive function.

luser-ex-troll · Mar 22, 2009

luser-ex-troll said:

Imagine a C-like language that is exactly the same as C except that
it has no goto, switchless break, or continue. How would you write
your program in such a language?

Not sure, but equally unsure I'd want to use such a language. I think
I'm beginning to truly appreciate the dangers of goto. I also think
that a jump to the top and jump out of loop are the 1 or 2 exceptions
to the rule that may be worthwhile.

But here's take 3 with no gotos, and with much cleaner control flow, I
hope.

--
loose-rocks-trawl

#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

int pm (int i) { return strchr("+-",i) != 0; }
int space (int i) { return strchr(" \t\n\r\f",i) != 0; }
int dot (int i) { return i == '.'; }
int e (int i) { return strchr("eE",i) != 0; }
int rad (int i) { return i == '#'; }
int notraddote (int i) { return strchr("#.eE",i) == 0; }
int lparen (int i) { return i == '('; }
int notrparen (int i) { return i != ')'; }
int rparen (int i) { return i == ')'; }
int lt (int i) { return i == '<'; }
int gt (int i) { return i == '>'; }
int delim (int i) { return strchr("()<>[]{}/%",i) != 0; }
int regular (int i) { return !space(i) && !delim(i); }
int slash (int i) { return i == '/'; }

typedef struct {
int (*fp)(int); //function predicate
int yes,no; //transition
bool pass; //store char if accepted
} test;

#define SYNE -1
#define DECI -2
#define RADI -3
#define REAL -4
#define STRG -5
#define HEXS -6
#define NAME -7

test tests[] = {
/* 0*/ {.fp= space, .yes= 0, .no= 1, .pass=false }, /*sp* */

//decimal
/* 1*/ {.fp=pm, .yes= 2, .no= 3, .pass=true },
/* 2*/ {.fp=isdigit, .yes= 4, .no=19, .pass=true }, //[+-]
[^0-9]: name
/* 3*/ {.fp=isdigit, .yes= 4, .no=13, .pass=true }, //[^0-9]:
string?
/* 4*/ {.fp=isdigit, .yes= 4, .no= 5, .pass=true }, //[0-9]+
/* 5*/ {.fp=notraddote, .yes=DECI, .no= 6, .pass=false }, //[0-9]+
[^0-9]

//radix
/* 6*/ {.fp=rad, .yes= 7, .no= 8, .pass=true }, /*([#]?)
*/
/* 7*/ {.fp=isalnum, .yes= 7, .no=RADI, .pass=true }, /*([0-9a-z]
*) */

//real
/* 8*/ {.fp=dot, .yes= 9, .no=10, .pass=true }, /*([.]?)
*/
/* 9*/ {.fp=isdigit, .yes= 9, .no=10, .pass=true }, /*([0-9]?)
*/

//exponential
/*10*/ {.fp=e, .yes=11, .no=12, .pass=true }, /*([eE]?)
*/
/*11*/ {.fp=pm, .yes=12, .no=12, .pass=true }, /*[eE]
([+-]?) */
/*12*/ {.fp=isdigit, .yes=12, .no=REAL, .pass=true }, /*([0-9]?)
*/

//string
/*13*/ {.fp=lparen, .yes=14, .no=16, .pass=false }, /*[(]?*/
/*14*/ {.fp=notrparen, .yes=14, .no=15, .pass=true }, /*([^)]?)
*/
/*15*/ {.fp=rparen, .yes=STRG, .no=SYNE, .pass=false }, /*[)]?*/

//hexstring
/*16*/ {.fp=lt, .yes=17, .no=19, .pass=false }, /*[<]?*/
/*17*/ {.fp=isxdigit, .yes=17, .no=18, .pass=true }, /*([0-9a-f]
*)*/
/*18*/ {.fp=gt, .yes=HEXS, .no=SYNE, .pass=false }, /*[>]?*/

//name
/*19*/ {.fp=slash, .yes=20, .no=20, .pass=true }, /*([/]?)*/
/*20*/ {.fp=regular, .yes=20, .no=NAME, .pass=true } /*([^sp
delim]*)*/
};

#define NBUF 256

int main() {
int i;
char buf[NBUF] = "";
char *s = buf;
int state = 0;

while ( (i=getchar()) != EOF ) {

if ( (*tests[state].fp)(i) ) {
if (tests[state].pass) {
*s++ = i; *s = 0;
}
state = tests[state].yes;
} else {
state = tests[state].no;
ungetc(i,stdin);
}

if (state < 0) {
char *typestring;
if (state == -1) { break; }
switch(state) {
case DECI: typestring = "decimal"; break;
case RADI: typestring = "radix"; break;
case REAL: typestring = "real"; break;
case STRG: typestring = "string"; break;
case HEXS: typestring = "hexstring"; break;
case NAME: typestring = "name"; break;
}
printf("grok: %d %s %s\n", state, typestring, buf);
state = 0;
s = buf; *s = 0;
}

} //while
printf("fail: 0x%x\n",(unsigned)i);
return 0;
} //main

//eof

Bartc · Mar 23, 2009

luser-ex-troll said:
Agreed. But I don't like block structure being controlled by
indentation. I suppose that's obvious from the code.

Not controlled. But represented. How else would you show the code structure?
(Imagine reading a novel, with lots of dialogue, typeset as a single giant
paragraph.)

See? But what about the third?! The fourth?!

By the third or fourth goto, you will either see a pattern emerging, or
start to realise it's getting out of hand and needs a slightly different
approach. Although this is true of a lot of coding.

CBFalconer · Mar 23, 2009

Richard said:
That's a rather dubious comparison. Novels have chapter
headings and paragraphs; they don't have the equivalent of
levels of indentation. They may have flashbacks but they are
structured to be read sequentially.

Have you never ignored a Usenet message that is written as a solid
block, possibly withour either sentence of paragraph delimitation?

luser-ex-troll · Mar 23, 2009

Have you never ignored a Usenet message that is written as a solid
block, possibly withour either sentence of paragraph delimitation?

Sounds like Finnegan's Wake! ;{>

Richard Bos · Mar 24, 2009

Bartc said:
Not controlled.

Wrong. In Python, indentation _does_ control the block structure, unlike
in sane languages or even in C.

Richard

luser-ex-troll · Mar 24, 2009

Alright now. I think I've turned a corner with this. And, of course,
everyone who said "use functions!" was spot on.

Hopefully this looks more C-worthy.

--
lexit-real

#include <ctype.h>
#include <stdio.h>
#include <string.h>

int issign (int c){return !!strchr("+-",c);}
int israd (int c){return !!strchr("#", c);}
int isdot (int c){return !!strchr(".", c);}
int ise (int c){return !!strchr("eE",c);}
int isdelim (int c){return !!strchr("()<>{}[]%/",c);}
int isregular(int c){return !isspace(c);}

typedef struct test test;
struct test {
int (*fp)(int); int y,n;
};

test decimal[] = {
/* 0*/ { issign, 1, 1 },
/* 1*/ { isdigit, 2, -1 },
/* 2*/ { isdigit, 2, -1 }, //success
};
int dec_accept(int i){ return i==2; }

test radix[] = {
/* 0*/ { isdigit, 1, -1 },
/* 1*/ { isdigit, 1, 2 },
/* 2*/ { israd, 3, -1 },
/* 3*/ { isdigit, 4, -1 },
/* 4*/ { isdigit, 4, -1 }, //success
};
int rad_accept(int i){ return i==4; }

test real[] = {
/* 0*/ { issign, 1, 1 },
/* 1*/ { isdigit, 2, 4 },
/* 2*/ { isdigit, 2, 3 },
/* 3*/ { isdot, 6, 7 }, //success
/* 4*/ { isdot, 5, -1 },
/* 5*/ { isdigit, 6, -1 },
/* 6*/ { isdigit, 6, 7 }, //success
/* 7*/ { ise, 8, -1 },
/* 8*/ { issign, 9, 9 },
/* 9*/ { isdigit, 10, -1 },
/*10*/ { isdigit, 10, -1 }, //success
};
int real_accept(int i){switch(i){case 3: case 6:case 10:return 1;}
return 0;}

int check(char *buf, test *fsm, int(*yes)(int)){ char *s = buf; int
sta = 0;
while(sta!=-1 && *s) {
if (fsm[sta].fp(*s))
{ sta = fsm[sta].y; s++; }
else { sta = fsm[sta].n; }
}
sta=yes(sta);
return sta; }

int grok(char *buf) {
if (check(buf,decimal,dec_accept)) { printf( "dec: %s\n",
buf); return 0; }
else if (check(buf,radix, rad_accept)) { printf( "rad: %s\n",
buf); return 0; }
else if (check(buf,real, real_accept)) { printf("real: %s\n",
buf); return 0; }
else { printf("grok? %s\n", buf); return -1; }
}

int puff(char *buf, int nbuf) { char *s = buf; int c;
while ((c=getchar()) != EOF) {
if(isspace(c) || isdelim(c))
break;
if(nbuf < s-buf-1)
return -1;
*s++ = c;
}
*s++ = 0;
return 0; }

int toke(char *buf, int nbuf) { char *s=buf; int sta = 0;
while(isspace(*s=getchar())) /**/;
if( (sta=puff(buf+1,nbuf-1)) == -1) return -1;
sta = grok(buf);
return sta; }

#define NBUF 10
int main() { char buf[NBUF] = ""; int sta;
while ( (sta=toke(buf,NBUF)) != -1 )
/**/;
return 0; }

luser-ex-troll · Mar 25, 2009

correction; last 2 lines of radix test should use isalnum rather than
isdigit:
/* 3*/ { isalnum, 4, -1 },
/* 4*/ { isalnum, 4, -1 }, //success

oh, and here are the regular expressions that the three machines are
intended to match:
decimals: ^[+-]?d+$
radix: ^d+[#][a-Z0-9]+$
real: ^[+-]?(d+.d*)|(d*.d+)([eE][+-]?d+)?$

Keith Thompson · Mar 25, 2009

luser-ex-troll said:
correction; last 2 lines of radix test should use isalnum rather than
isdigit:
/* 3*/ { isalnum, 4, -1 },
/* 4*/ { isalnum, 4, -1 }, //success

oh, and here are the regular expressions that the three machines are
intended to match:
decimals: ^[+-]?d+$
radix: ^d+[#][a-Z0-9]+$
real: ^[+-]?(d+.d*)|(d*.d+)([eE][+-]?d+)?$

I assume d is intended to represent a decimal digit. In the regular
expression syntaxes I've seen, that's represented as \d; d represents
the letter d itself.

luser-ex-troll · Mar 25, 2009

luser-ex-troll said:
luser-ex-troll said:

correction; last 2 lines of radix test should use isalnum rather than
isdigit:
/* 3*/ { isalnum, 4, -1 },
/* 4*/ { isalnum, 4, -1 }, //success

Click to expand...

oh, and here are the regular expressions that the three machines are
intended to match:
decimals: ^[+-]?d+$
radix: ^d+[#][a-Z0-9]+$
real: ^[+-]?(d+.d*)|(d*.d+)([eE][+-]?d+)?$

Click to expand...

I assume d is intended to represent a decimal digit. In the regular
expression syntaxes I've seen, that's represented as \d; d represents
the letter d itself.

Yes, precisely. Apologies.

Unable to read input from keyboard, in below C code, for a BST.	0	Jul 20, 2025
Need Helping adding Square root code to an existing calculator. (Absolute begginer?)	0	Jan 12, 2025
BITCOIN PROGRAMMING - CODE INCLUDED - needs slight modification in linux terminal - NSA please do not block	0	Nov 1, 2024
reference-counted dynamic tag-union objects	21	Mar 3, 2009
Building a Professional Neural Network Framework: Full C++ Implementation with Windows GUI and Real-Time Training Visualization	11	Jun 14, 2026
Database Manager: A C++ Console Application	14	May 12, 2025
Rich Text Format (RTF) Document Builder in C++: Code and Features	0	Sep 28, 2025
Privacy Shield A Clean, C++ Win32 Tool for Temporarily Masking Windows	4	Mar 25, 2026

embarrassing spaghetti code needs stylistic advice

luser-ex-troll

Mark Wooding

Guest

Bartc

luser-ex-troll

luser-ex-troll

Phil Carmody

gw7rib

luser-ex-troll

luser-ex-troll

Flash Gordon

luser-ex-troll

Bartc

CBFalconer

luser-ex-troll

Richard Bos

luser-ex-troll

luser-ex-troll

Keith Thompson

luser-ex-troll

Ask a Question

Similar Threads

Members online

Forum statistics

Latest Threads