A simple parser

Discussion in 'C Programming' started by jacob navia, Oct 14, 2006.

  1. jacob navia

    jacob navia Guest

    Hi guys

    I have written this small parser to print out the functions defined in a
    C file. This is an example of parsing in C, that I want to add to my
    tutorial. Comments (and bug reports) are welcome.

    -------------------------------------------------------------cut here

    /* A simple scanner that will take a file of C source code and
    print the names of all functions therein, in the following format:
    "Function XXXX found line dddd .... ddddd"
    Algorithm. It scans for a terminating parentheses and an immediately
    following opening brace. Comments can appear between the closing
    paren and the opening braces, but no other characters besides white
    space. Functions must have the correct prototype, K & R syntax
    is not supported.
    */
    #include <stdio.h>
    #define MAXID 1024 // Longest Identifier we support. Sorry
    // Java guys...
    static char IdBuffer[MAXID]; // Buffer for remembering the function name
    static int line = 1; // We start at line 1

    // This function reads a character and if
    // it is \n it bumps the line counter
    static int Fgetc(FILE *f)
    {
    int c = fgetc(f);
    if (c == '\n')
    line++;
    return c;
    }

    // Return 1 if the character is a legal C identifier
    // character, zero if not. The parameter "start"
    // means if an identifier START character
    // (numbers) is desired.
    static int IsIdentifier(int c,int start)
    {
    if (c >= 'a' && c <= 'z')
    return 1;
    if (c >= 'A' && c <= 'Z')
    return 1;
    if (start == 0 && c >= '0' && c <= '9')
    return 1;
    if (c == '_')
    return 1;
    return 0;
    }

    // Just prints the function name
    static int PrintFunction(FILE *f)
    {
    printf("Function %s found line %d ...",IdBuffer,line);
    return Fgetc(f);
    }

    // Reads a global identifier into our name buffer
    static int ReadId(char c,FILE *f)
    {
    int i = 1;
    IdBuffer[0] = c;
    while (i < MAXID-1) {
    c = Fgetc(f);
    if (c != EOF) {
    if (IsIdentifier(c,0))
    IdBuffer[i++] = c;
    else break;
    }
    else break;
    }
    IdBuffer = 0;
    return c;
    }


    static int ParseString(FILE *f) // Skips strings
    {
    int c = Fgetc(f);
    while (c != EOF && c != '"') {
    if (c == '\\')
    c = Fgetc(f);
    if (c != EOF)
    c = Fgetc(f);
    }
    if (c == '"')
    c = Fgetc(f);
    return c;
    }

    static int ParseComment(FILE *f) // Skips comments
    {
    int c = Fgetc(f);
    restart:
    while (c != '*') {
    c = Fgetc(f);
    if (c == EOF)
    return EOF;
    }
    c = Fgetc(f);
    if (c == '/')
    return Fgetc(f);
    else goto restart;
    }


    static int ParseCppComment(FILE *f) // Skips // comments
    {
    int c = Fgetc(f);
    while (c != EOF && c != '\n') {
    if (c == '\\')
    c = Fgetc(f);
    if (c != EOF)
    c = Fgetc(f);
    }
    if (c == '\n')
    c = Fgetc(f);
    return c;
    }

    // Skips white space and comments
    static int SkipWhiteSpace(int c,FILE *f) {
    if (c > ' ')
    return c;
    while (c <= ' ') {
    c = Fgetc(f);
    if (c == '/') {
    c = Fgetc(f);
    if (c == '*')
    c = ParseComment(f);
    else if (c == '/')
    c = ParseCppComment(f);
    }
    }
    return c;
    }

    // Skips chars between simple quotes
    static int ParseQuotedChar(FILE *f)
    {
    int c = Fgetc(f);
    while (c != EOF && c != '\'') {
    if (c == '\\')
    c = Fgetc(f);
    if (c != EOF)
    c = Fgetc(f);
    }
    if (c == '\'')
    c = Fgetc(f);
    return c;
    }


    int main(int argc,char *argv[])
    {
    if (argc == 1) {
    printf("Usage: %s <file.c>\n",argv[0]);
    return 1;
    }
    FILE *f = fopen(argv[1],"r");
    if (f == NULL) {
    printf("Can't find %s\n",argv[1]);
    return 2;
    }
    int c = Fgetc(f);
    int level = 0;
    int parenlevel = 0;
    int inFunction = 0;
    while (c != EOF) {
    // Note that each of the switches must advance the
    // character read so that we avoid an infinite loop.
    switch (c) {
    case '"':
    c = ParseString(f);
    break;
    case '/':
    c = Fgetc(f);
    if (c == '*')
    c = ParseComment(f);
    else if (c == '/')
    c = ParseCppComment(f);
    break;
    case '\'':
    c = ParseQuotedChar(f);
    break;
    case '{':
    level++;
    c = Fgetc(f);
    break;
    case '}':
    if (level == 1 && inFunction) {
    printf(" %d\n",line);
    inFunction = 0;
    }
    if (level > 0)
    level--;
    c = Fgetc(f);
    break;
    case '(':
    parenlevel++;
    c = Fgetc(f);
    break;
    case ')':
    if (parenlevel > 0)
    parenlevel--;
    c = Fgetc(f);
    if ((parenlevel|level) == 0) {
    c = SkipWhiteSpace(c,f);
    if (c == '{') {
    level++;
    inFunction = 1;
    c = PrintFunction(f);
    }
    }
    break;
    default:
    if ((level | parenlevel) == 0 &&
    IsIdentifier(c,1))
    c = ReadId(c,f);
    else c = Fgetc(f);
    }
    }
    fclose(f);
    return 0;
    }
     
    jacob navia, Oct 14, 2006
    #1
    1. Advertising

  2. jacob navia wrote:
    >It scans for a terminating parentheses and an immediately
    > following opening brace.


    Um, can't if() and while() statements have that sequence?


    A better strategy might be to count curly brackets. If the nesting
    level of curlys is zero, then you're at the top level. Anything that
    has parens is probably a function declaration.

    Of course there's the problem of macros,which I'm sure you don't want
    to handle yourself. Maybe run cpp and pipe its output to your program?
     
    Ancient_Hacker, Oct 15, 2006
    #2
    1. Advertising

  3. jacob navia

    CBFalconer Guest

    jacob navia wrote:
    >
    > I have written this small parser to print out the functions
    > defined in a C file. This is an example of parsing in C, that I
    > want to add to my tutorial. Comments (and bug reports) are welcome.
    >

    .... snip ...
    >
    > // Return 1 if the character is a legal C identifier
    > // character, zero if not. The parameter "start"
    > // means if an identifier START character
    > // (numbers) is desired.
    > static int IsIdentifier(int c,int start)
    > {
    > if (c >= 'a' && c <= 'z')
    > return 1;
    > if (c >= 'A' && c <= 'Z')
    > return 1;
    > if (start == 0 && c >= '0' && c <= '9')
    > return 1;
    > if (c == '_')
    > return 1;
    > return 0;
    > }


    Just this one example will do. Obvious problems:

    1. Excessive indentation. Later it makes lines much too long.
    Use spaces, not tabs, and indent by 3 or 4 places.

    2. Use of // comments in Usenet. Causes nasty line wrapping
    problems.

    3. Faulty code. There is no guarantee 'a' .. 'z' etc. are
    contiguous, in any specific order, etc.

    4. Doesn't handle EOF.

    Here is a sample of some code that may do what you want. It
    doesn't allow line continuation within identifiers. You would
    probably need to control '{' and '}' matching within skiptoident.

    /* skips and echoes all characters that cannot start an
    identifier. echoto may be NULL, to suppress any echoing
    Returns the starting char for the following identifier,
    which may be EOF and is still in the input stream.
    */
    static int skiptoident(FILE *f, FILE *echoto)
    {
    int ch;

    do {
    ch = getc(f);
    if (('_' == ch) || isalpha(ch)) break;
    if (echoto && (EOF != ch)) putc(ch, echoto);
    } while (EOF != ch);
    ungetc(ch, f);
    return ch;
    } /* skiptoident */

    /* ----------------- */

    /* generic function to extract the next identifier from a stream
    Identifiers can begin with _ or alpha, and continue until a
    non-alpha non _ non-numeric char is encountered.
    '\0' terminate the string.
    Leading chars that do not fit an identifier are echoed.
    echoto may be NULL, to suppress any echoing
    Returns char terminating the identifier, which is 'ungotten'
    buf[bmax] should be valid storage space.
    */
    static int getident(char *buf, int bmax, FILE *f, FILE *echoto)
    {
    int ch, ix;

    if (EOF == skiptoident(f, echoto)) ch = EOF;
    else ch = getc(f);
    /* Here the input stream has been absorbed and echoed */
    /* up to the first character of an identifier, in ch */
    ix = 0;
    while (('_' == ch) || isalnum(ch)) { /* skips on eof */
    buf[ix++] = ch;
    ch = getc(f);
    if (ix >= bmax) break;
    }
    buf[ix] = '\0';
    ungetc(ch, f);
    return ch;
    } /* getident */

    These are extracts from id2id which has to do some elementary C
    lexing, and is available on my site:

    <http://cbfalconer.home.att.net/download/

    PS: I find you have committed the sin of multi-posting, so I have
    cross-posted this response. Luckily I didn't go back on line
    before checking the lcc group.

    --
    "I don't know where bin Laden is. I have no idea and really
    don't care. It's not that important." - G.W. Bush, 2002-03-13
    "No, we've had no evidence that Saddam Hussein was involved
    with September the 11th." - George Walker Bush 2003-09-17
     
    CBFalconer, Oct 15, 2006
    #3
  4. jacob navia said:

    > Hi guys
    >
    > I have written this small parser to print out the functions defined in a
    > C file. This is an example of parsing in C, that I want to add to my
    > tutorial. Comments (and bug reports) are welcome.


    foo.c:12: parse error before `/'
    foo.c:17: stray '\' in program
    foo.c:54: warning: type defaults to `int' in declaration of `IdBuffer'
    foo.c:54: warning: ANSI C forbids zero-size array `IdBuffer'
    foo.c:54: `c' undeclared here (not in a function)
    foo.c:54: ANSI C forbids data definition with no type or storage class
    foo.c:55: parse error before `while'
    foo.c:64: `i' undeclared here (not in a function)
    foo.c:64: warning: type defaults to `int' in declaration of `IdBuffer'
    foo.c:64: variable `IdBuffer' has initializer but incomplete type
    foo.c:64: conflicting types for `IdBuffer'
    foo.c:54: previous declaration of `IdBuffer'
    foo.c:64: ANSI C forbids data definition with no type or storage class
    foo.c:65: parse error before `return'
    foo.c:69: parse error before `/'
    foo.c:83: parse error before `/'
    foo.c:92: warning: type defaults to `int' in declaration of `c'
    foo.c:92: warning: implicit declaration of function `Fgetc'
    foo.c:92: `f' undeclared here (not in a function)
    foo.c:92: initializer element is not constant
    foo.c:92: ANSI C forbids data definition with no type or storage class
    foo.c:93: parse error before `if'
    foo.c:99: parse error before `/'
    foo.c: In function `main':
    foo.c:152: parse error before `*'
    foo.c:153: `f' undeclared (first use in this function)
    foo.c:153: (Each undeclared identifier is reported only once
    foo.c:153: for each function it appears in.)
    foo.c:157: parse error before `int'
    foo.c:162: parse error before `/'
    foo.c:168: case label not within a switch statement
    foo.c:171: warning: implicit declaration of function `ParseComment'
    foo.c:173: warning: implicit declaration of function `ParseCppComment'
    foo.c:175: case label not within a switch statement
    foo.c:176: warning: implicit declaration of function `ParseQuotedChar'
    foo.c:178: case label not within a switch statement
    foo.c:179: `level' undeclared (first use in this function)
    foo.c:182: case label not within a switch statement
    foo.c:183: `inFunction' undeclared (first use in this function)
    foo.c:184: `line' undeclared (first use in this function)
    foo.c:191: case label not within a switch statement
    foo.c:192: `parenlevel' undeclared (first use in this function)
    foo.c:195: case label not within a switch statement
    foo.c:200: warning: implicit declaration of function `SkipWhiteSpace'
    foo.c:204: warning: implicit declaration of function `PrintFunction'
    foo.c:208: default label not within a switch statement
    foo.c:210: warning: implicit declaration of function `IsIdentifier'
    foo.c:211: warning: implicit declaration of function `ReadId'
    foo.c: At top level:
    foo.c:215: warning: type defaults to `int' in declaration of `fclose'
    foo.c:215: warning: parameter names (without types) in function declaration
    foo.c:215: ANSI C forbids data definition with no type or storage class
    foo.c:216: parse error before `return'
    make: *** [foo.o] Error 1

    --
    Richard Heathfield
    "Usenet is a strange place" - dmr 29/7/1999
    http://www.cpax.org.uk
    email: rjh at above domain (but drop the www, obviously)
     
    Richard Heathfield, Oct 15, 2006
    #4
  5. Richard Heathfield <> writes:
    > jacob navia said:
    >
    >> Hi guys
    >>
    >> I have written this small parser to print out the functions defined in a
    >> C file. This is an example of parsing in C, that I want to add to my
    >> tutorial. Comments (and bug reports) are welcome.

    >
    > foo.c:12: parse error before `/'
    > foo.c:17: stray '\' in program

    [51 lines deleted]
    > make: *** [foo.o] Error 1


    Every one of those errors is caused by two things: "//" comments and
    mixed declarations and statements.

    The code compiles without error with "gcc -std=c99". It also compiles
    without error with "gcc -ansi -pedantic" if I remove the "//" comments
    and move a few object declarations.

    It is, as far as I can tell, valid C99.

    I understand and agree with your reasons for using a strict C90
    compiler, but not everyone does so.

    --
    Keith Thompson (The_Other_Keith) <http://www.ghoti.net/~kst>
    San Diego Supercomputer Center <*> <http://users.sdsc.edu/~kst>
    We must do something. This is something. Therefore, we must do this.
     
    Keith Thompson, Oct 15, 2006
    #5
  6. Keith Thompson said:

    > Richard Heathfield <> writes:
    >> jacob navia said:
    >>
    >>> Hi guys
    >>>
    >>> I have written this small parser to print out the functions defined in a
    >>> C file. This is an example of parsing in C, that I want to add to my
    >>> tutorial. Comments (and bug reports) are welcome.

    >>
    >> foo.c:12: parse error before `/'
    >> foo.c:17: stray '\' in program

    > [51 lines deleted]
    >> make: *** [foo.o] Error 1

    >
    > Every one of those errors is caused by two things: "//" comments and
    > mixed declarations and statements.


    <shrug> I figured it had to be something like that. So - does anyone have a
    conforming C99 compiler that we can use to test Mr Navia's code? No? Oh
    well.

    > The code compiles without error with "gcc -std=c99".


    Er, so what? Despite the misleading switch-name, gcc is not a C99-conforming
    compiler.

    > It also compiles
    > without error with "gcc -ansi -pedantic" if I remove the "//" comments
    > and move a few object declarations.


    That, in my view, is the author's job, not yours or mine. (And it shows that
    the C99isms have been adopted gratuitously, presumably to make the code
    less portable. Why anyone should wish to do this is beyond me.)

    > It is, as far as I can tell, valid C99.
    >
    > I understand and agree with your reasons for using a strict C90
    > compiler, but not everyone does so.


    Well, my principal reason for using a strict C90 compiler is simple - it's
    so that I can be sure that /my/ code doesn't use any C99isms, because I
    want /my/ code to compile everywhere, not just on the vanishingly small
    number of conforming C99 implementations.

    --
    Richard Heathfield
    "Usenet is a strange place" - dmr 29/7/1999
    http://www.cpax.org.uk
    email: rjh at above domain (but drop the www, obviously)
     
    Richard Heathfield, Oct 15, 2006
    #6
  7. jacob navia

    jacob navia Guest

    Ancient_Hacker wrote:
    > jacob navia wrote:
    >
    >>It scans for a terminating parentheses and an immediately
    >> following opening brace.

    >
    >
    > Um, can't if() and while() statements have that sequence?
    >
    >
    > A better strategy might be to count curly brackets. If the nesting
    > level of curlys is zero, then you're at the top level. Anything that
    > has parens is probably a function declaration.


    Well I *do* count parens and brackets.... See the code

    >
    > Of course there's the problem of macros,which I'm sure you don't want
    > to handle yourself. Maybe run cpp and pipe its output to your program?
    >


    Macros will be assumed function calls, but not function definitions...

    I should ignore # lines, that would make it more robust.
     
    jacob navia, Oct 15, 2006
    #7
  8. jacob navia

    jacob navia Guest

    CBFalconer wrote:
    > jacob navia wrote:
    >
    >>I have written this small parser to print out the functions
    >>defined in a C file. This is an example of parsing in C, that I
    >>want to add to my tutorial. Comments (and bug reports) are welcome.
    >>

    >
    > ... snip ...
    >
    >>// Return 1 if the character is a legal C identifier
    >>// character, zero if not. The parameter "start"
    >>// means if an identifier START character
    >>// (numbers) is desired.
    >>static int IsIdentifier(int c,int start)
    >>{
    >> if (c >= 'a' && c <= 'z')
    >> return 1;
    >> if (c >= 'A' && c <= 'Z')
    >> return 1;
    >> if (start == 0 && c >= '0' && c <= '9')
    >> return 1;
    >> if (c == '_')
    >> return 1;
    >> return 0;
    >>}

    >
    >
    > Just this one example will do. Obvious problems:
    >
    > 1. Excessive indentation. Later it makes lines much too long.
    > Use spaces, not tabs, and indent by 3 or 4 places.
    >


    OK

    > 2. Use of // comments in Usenet. Causes nasty line wrapping
    > problems.
    >


    OK

    > 3. Faulty code. There is no guarantee 'a' .. 'z' etc. are
    > contiguous, in any specific order, etc.
    >


    Mmmm, it *could* be, but I have never found a machine where they aren't
    contiguous...

    > 4. Doesn't handle EOF.
    >


    Why? It returns 0 at EOF, and EOF is NOT an identifier char, so it is
    correct...

    > Here is a sample of some code that may do what you want. It
    > doesn't allow line continuation within identifiers. You would
    > probably need to control '{' and '}' matching within skiptoident.
    >
    > /* skips and echoes all characters that cannot start an
    > identifier. echoto may be NULL, to suppress any echoing
    > Returns the starting char for the following identifier,
    > which may be EOF and is still in the input stream.
    > */
    > static int skiptoident(FILE *f, FILE *echoto)
    > {
    > int ch;
    >
    > do {
    > ch = getc(f);
    > if (('_' == ch) || isalpha(ch)) break;
    > if (echoto && (EOF != ch)) putc(ch, echoto);
    > } while (EOF != ch);
    > ungetc(ch, f);
    > return ch;
    > } /* skiptoident */
    >


    This is interesting since it would eliminate the need for a
    buffer. The problem is that

    typedef struct _tagFoo {
    int a;
    } FOO;

    would print the _tagfoo...

    I can't decide whether an indentifier is really a function until
    I see a ')' followed by {, so I can't put it out immediately.

    But I see the point of '_' || isalpha(c)

    It is much better. Thanks.


    > /* ----------------- */
    >
    > /* generic function to extract the next identifier from a stream
    > Identifiers can begin with _ or alpha, and continue until a
    > non-alpha non _ non-numeric char is encountered.
    > '\0' terminate the string.
    > Leading chars that do not fit an identifier are echoed.
    > echoto may be NULL, to suppress any echoing
    > Returns char terminating the identifier, which is 'ungotten'
    > buf[bmax] should be valid storage space.
    > */
    > static int getident(char *buf, int bmax, FILE *f, FILE *echoto)
    > {
    > int ch, ix;
    >
    > if (EOF == skiptoident(f, echoto)) ch = EOF;
    > else ch = getc(f);
    > /* Here the input stream has been absorbed and echoed */
    > /* up to the first character of an identifier, in ch */
    > ix = 0;
    > while (('_' == ch) || isalnum(ch)) { /* skips on eof */
    > buf[ix++] = ch;
    > ch = getc(f);
    > if (ix >= bmax) break;
    > }
    > buf[ix] = '\0';
    > ungetc(ch, f);
    > return ch;
    > } /* getident */
    >
    > These are extracts from id2id which has to do some elementary C
    > lexing, and is available on my site:
    >
    > <http://cbfalconer.home.att.net/download/
    >
    > PS: I find you have committed the sin of multi-posting, so I have
    > cross-posted this response. Luckily I didn't go back on line
    > before checking the lcc group.
    >
     
    jacob navia, Oct 15, 2006
    #8
  9. jacob navia

    jacob navia Guest

    Keith Thompson wrote:
    > Richard Heathfield <> writes:
    >
    >>jacob navia said:
    >>
    >>
    >>>Hi guys
    >>>
    >>>I have written this small parser to print out the functions defined in a
    >>>C file. This is an example of parsing in C, that I want to add to my
    >>>tutorial. Comments (and bug reports) are welcome.

    >>
    >>foo.c:12: parse error before `/'
    >>foo.c:17: stray '\' in program

    >
    > [51 lines deleted]
    >
    >>make: *** [foo.o] Error 1

    >
    >
    > Every one of those errors is caused by two things: "//" comments and
    > mixed declarations and statements.
    >
    > The code compiles without error with "gcc -std=c99". It also compiles
    > without error with "gcc -ansi -pedantic" if I remove the "//" comments
    > and move a few object declarations.
    >
    > It is, as far as I can tell, valid C99.
    >
    > I understand and agree with your reasons for using a strict C90
    > compiler, but not everyone does so.
    >


    Since you compiled Keith, can you pass it through some
    C code... Does it work?

    jacob
     
    jacob navia, Oct 15, 2006
    #9
  10. jacob navia said:

    > CBFalconer wrote:


    <snip>

    >> 3. Faulty code. There is no guarantee 'a' .. 'z' etc. are
    >> contiguous, in any specific order, etc.
    >>

    >
    > Mmmm, it *could* be, but I have never found a machine where they aren't
    > contiguous...


    That doesn't mean such machines don't exist. I've spent several years
    working on such machines. Look up "EBCDIC" in Google.

    <snip>

    --
    Richard Heathfield
    "Usenet is a strange place" - dmr 29/7/1999
    http://www.cpax.org.uk
    email: rjh at above domain (but drop the www, obviously)
     
    Richard Heathfield, Oct 15, 2006
    #10
  11. jacob navia

    Cong Wang Guest

    On Oct 15, 2:42 pm, Richard Heathfield <>
    wrote:
    > jacob navia said:
    >
    > > CBFalconer wrote:<snip>

    >
    > >> 3.  Faulty code.  There is no guarantee 'a' .. 'z' etc. are
    > >> contiguous, in any specific order, etc.

    >
    > > Mmmm, it *could* be, but I have never found a machine where they aren't
    > > contiguous...That doesn't mean such machines don't exist. I've spent several years

    > working on such machines. Look up "EBCDIC" in Google.
    >
    >


    Yeah. In fact, C99 gives us little guarantee about how characters are
    portable. The following is found in C99 5.2.1:

    A byte with all bits set to 0, called the null character,
    shall exist in the basic execution character set; it is used to
    terminate a character string.

    Both the basic source and basic execution character sets shall have the
    following members: the 26 uppercase letters of the Latin alphabet
    A B C D E F G H I J K L M
    N O P Q R S T U V W X Y Z
    the 26 lowercase letters of the Latin alphabet
    a b c d e f g h i j k l m
    n o p q r s t u v w x y z
    the 10 decimal digits
    0 1 2 3 4 5 6 7 8 9
    the following 29 graphic characters
    ! " # % & ' ( ) * + , - . /
    :
    ; < = > ? [ \ ] ^ _ { | } ~
    the space character, and control characters representing horizontal
    tab, vertical tab, and form feed. The representation of each member of
    the source and execution basic
    character sets shall ï¬t in a byte. In both the source and execution
    basic character sets, the value of each character after 0 in the above
    list of decimal digits shall be one greater than
    the value of the previous. In source ï¬les, there shall be some way of
    indicating the end of each line of text; this International Standard
    treats such an end-of-line indicator as if it
    were a single new-line character. In the basic execution character set,
    there shall be control characters representing alert, backspace,
    carriage return, and new line.
     
    Cong Wang, Oct 15, 2006
    #11
  12. jacob navia

    Malcolm Guest

    "Keith Thompson" <> wrote in message
    >
    > Every one of those errors is caused by two things: "//" comments and
    > mixed declarations and statements.
    >

    I decided that surely, slash slash comments were so widespread by now that I
    could use them.
    Only to have my code break on the next complier, a parallel job.
    --
    www.personal.leeds.ac.uk/~bgy1mm
    freeware games to download.
     
    Malcolm, Oct 15, 2006
    #12
  13. jacob navia

    Guest

    On Sun, 15 Oct 2006 06:42:06 +0000, Richard Heathfield
    <> wrote:

    >jacob navia said:
    >
    >> CBFalconer wrote:

    >
    ><snip>
    >
    >>> 3. Faulty code. There is no guarantee 'a' .. 'z' etc. are
    >>> contiguous, in any specific order, etc.
    >>>

    >>
    >> Mmmm, it *could* be, but I have never found a machine where they aren't
    >> contiguous...

    >
    >That doesn't mean such machines don't exist. I've spent several years
    >working on such machines. Look up "EBCDIC" in Google.
    ><snip>

    Also BCD, as used on a 14xx series system. (came before "EBCDIC") :)
    --
    ArarghMail610 at [drop the 'http://www.' from ->] http://www.arargh.com
    BCET Basic Compiler Page: http://www.arargh.com/basic/index.html

    To reply by email, remove the garbage from the reply address.
     
    , Oct 15, 2006
    #13
  14. On Sun, 15 Oct 2006 07:55:01 +0200, jacob navia wrote:
    >CBFalconer wrote:
    >> 3. Faulty code. There is no guarantee 'a' .. 'z' etc. are
    >> contiguous, in any specific order, etc.

    >
    >Mmmm, it *could* be, but I have never found a machine where they aren't
    >contiguous...


    What's wrong with isalpha and similar functions?

    Best wishes,
    Roland Pibinger
     
    Roland Pibinger, Oct 15, 2006
    #14
  15. jacob navia

    jacob navia Guest

    Roland Pibinger wrote:
    > On Sun, 15 Oct 2006 07:55:01 +0200, jacob navia wrote:
    >
    >>CBFalconer wrote:
    >>
    >>>3. Faulty code. There is no guarantee 'a' .. 'z' etc. are
    >>>contiguous, in any specific order, etc.

    >>
    >>Mmmm, it *could* be, but I have never found a machine where they aren't
    >>contiguous...

    >
    >
    > What's wrong with isalpha and similar functions?
    >
    > Best wishes,
    > Roland Pibinger


    Yes, I will change that as proposed by Chuck.
     
    jacob navia, Oct 15, 2006
    #15
  16. On Sun, 15 Oct 2006 00:27:43 +0200, jacob navia wrote:
    >I have written this small parser to print out the functions defined in a
    >C file. This is an example of parsing in C, that I want to add to my
    >tutorial. Comments (and bug reports) are welcome.

    [...]
    >static char IdBuffer[MAXID]; // Buffer for remembering the function name
    >static int line = 1; // We start at line 1


    Why statics? They make your code non-reusable. BTW, if you also made
    FILE *f and int c static here you (almost) wouldn't have to pass any
    arguments to your functions. IMO, non-const globals (statics) should
    be avoided in a C tutorial (because they should be avoided in a C
    program).

    >static int Fgetc(FILE *f)


    Why are the functions static? That's only confusing for a newbie.

    >static int ParseComment(FILE *f) // Skips comments
    >{
    > int c = Fgetc(f);
    >restart:
    > while (c != '*') {
    > c = Fgetc(f);
    > if (c == EOF)
    > return EOF;
    > }
    > c = Fgetc(f);
    > if (c == '/')
    > return Fgetc(f);
    > else goto restart;
    >}


    Hmm, goto in a C tutorial? You could add an exercise for the reader:
    'Enhance the clarity of this function by rewriting it with one return
    statement and without using goto'.

    Best regards,
    Roland Pibinger
     
    Roland Pibinger, Oct 15, 2006
    #16
  17. writes:

    > On Sun, 15 Oct 2006 06:42:06 +0000, Richard Heathfield
    > <> wrote:
    >
    >>jacob navia said:
    >>
    >>> CBFalconer wrote:

    >>
    >><snip>
    >>
    >>>> 3. Faulty code. There is no guarantee 'a' .. 'z' etc. are
    >>>> contiguous, in any specific order, etc.
    >>>>
    >>>
    >>> Mmmm, it *could* be, but I have never found a machine where they aren't
    >>> contiguous...

    >>
    >>That doesn't mean such machines don't exist. I've spent several years
    >>working on such machines. Look up "EBCDIC" in Google.
    >><snip>

    > Also BCD, as used on a 14xx series system. (came before "EBCDIC") :)


    BCD is worse, not only there are not contiguous, they aren't even in order.

    Yours,

    --
    Jean-Marc
     
    Jean-Marc Bourguet, Oct 15, 2006
    #17
  18. jacob navia

    jacob navia Guest

    Roland Pibinger wrote:
    > On Sun, 15 Oct 2006 00:27:43 +0200, jacob navia wrote:
    >
    >>I have written this small parser to print out the functions defined in a
    >>C file. This is an example of parsing in C, that I want to add to my
    >>tutorial. Comments (and bug reports) are welcome.

    >
    > [...]
    >
    >>static char IdBuffer[MAXID]; // Buffer for remembering the function name
    >>static int line = 1; // We start at line 1

    >
    >
    > Why statics? They make your code non-reusable. BTW, if you also made
    > FILE *f and int c static here you (almost) wouldn't have to pass any
    > arguments to your functions. IMO, non-const globals (statics) should
    > be avoided in a C tutorial (because they should be avoided in a C
    > program).
    >


    If you make global variables visible by other parts of the
    program this can lead to name conflicts.

    Making global variables static limits their scope and
    allows for code reuse. Actually I believe the
    contrary is true. I think the code is more reusable
    BECAUSE it will export just ONE function.

    >
    >>static int Fgetc(FILE *f)

    >
    >
    > Why are the functions static? That's only confusing for a newbie.
    >


    No. See above.

    >
    >>static int ParseComment(FILE *f) // Skips comments
    >>{
    >> int c = Fgetc(f);
    >>restart:
    >> while (c != '*') {
    >> c = Fgetc(f);
    >> if (c == EOF)
    >> return EOF;
    >> }
    >> c = Fgetc(f);
    >> if (c == '/')
    >> return Fgetc(f);
    >> else goto restart;
    >>}

    >
    >
    > Hmm, goto in a C tutorial? You could add an exercise for the reader:
    > 'Enhance the clarity of this function by rewriting it with one return
    > statement and without using goto'.
    >
    > Best regards,
    > Roland Pibinger



    I believe goto is not bad when used correctly. It is part of the
    language anyway, and if used correctly it is perfectly OK.

    Care to solve your problem?

    I will add it to the tutorial.
     
    jacob navia, Oct 15, 2006
    #18
  19. jacob navia

    jacob navia Guest

    Jean-Marc Bourguet wrote:
    > writes:
    >
    >
    >>On Sun, 15 Oct 2006 06:42:06 +0000, Richard Heathfield
    >><> wrote:
    >>
    >>
    >>>jacob navia said:
    >>>
    >>>
    >>>>CBFalconer wrote:
    >>>
    >>><snip>
    >>>
    >>>>>3. Faulty code. There is no guarantee 'a' .. 'z' etc. are
    >>>>>contiguous, in any specific order, etc.
    >>>>>
    >>>>
    >>>>Mmmm, it *could* be, but I have never found a machine where they aren't
    >>>>contiguous...
    >>>
    >>>That doesn't mean such machines don't exist. I've spent several years
    >>>working on such machines. Look up "EBCDIC" in Google.
    >>><snip>

    >>
    >>Also BCD, as used on a 14xx series system. (came before "EBCDIC") :)

    >
    >
    > BCD is worse, not only there are not contiguous, they aren't even in order.
    >
    > Yours,
    >


    Well, EBCDIC was a 7 bit code, used for punched cards. The eighth bit
    was there to signal the card reader that a character was in that column.

    Using only 7 bits, the codes are continuous. When punched cards weren't
    so much used (approx beginning of the 80es) IBM added foreign language
    characters in those positions.

    This has a maybe anectodical importance, but its practical impact is ...

    Anyway Chuck was right and I changed that.
     
    jacob navia, Oct 15, 2006
    #19
  20. jacob navia <> writes:

    > Jean-Marc Bourguet wrote:
    >> writes:
    >>
    >>>On Sun, 15 Oct 2006 06:42:06 +0000, Richard Heathfield
    >>><> wrote:
    >>>
    >>>
    >>>>jacob navia said:
    >>>>
    >>>>
    >>>>>CBFalconer wrote:
    >>>>
    >>>><snip>
    >>>>
    >>>>>>3. Faulty code. There is no guarantee 'a' .. 'z' etc. are
    >>>>>>contiguous, in any specific order, etc.
    >>>>>>
    >>>>>
    >>>>>Mmmm, it *could* be, but I have never found a machine where they aren't
    >>>>>contiguous...
    >>>>
    >>>> That doesn't mean such machines don't exist. I've spent several years
    >>>> working on such machines. Look up "EBCDIC" in Google.
    >>>><snip>
    >>>
    >>>Also BCD, as used on a 14xx series system. (came before "EBCDIC") :)

    >> BCD is worse, not only there are not contiguous, they aren't even in
    >> order.


    BTW, BCD is a 6 bits code which would be unsuitable for C, there are no
    lower case letters and the digits are contiguous but not in numerical
    order (0 is after 9)

    > Well, EBCDIC was a 7 bit code, used for punched cards.


    While it is true that EBCDIC was strongly constrained by punched cards,
    EBCDIC was a 8 bit code since its inconception. And the number of
    codepoint defined was not constrained by punched cards considerations but
    by keyboard, printer and typewriter one.

    > The eighth bit
    > was there to signal the card reader that a character was in that column.
    > Using only 7 bits, the codes are continuous.


    I wonder if you ever saw a table. Here is one, now just give me the bit to
    ignore so that A-Z are contiguous:

    0 1 2 3 4 5 6 7 8 9 A B C D E F
    0 & - 0
    1 / a j A J 1
    2 b k s B K S 2
    3 c l t C L T 3
    4 d m u D M U 4
    5 e n v E N V 5
    6 f o w F O W 6
    7 g p x G P X 7
    8 h q y H Q Y 8
    9 i r z I R Z 9
    A NL NL
    B . NU , NU
    C < * % NU
    D ( ) _ '
    E + ; > =
    F | ~ ? NL


    > When punched cards weren't so much used (approx beginning of the 80es)
    > IBM added foreign language characters in those positions.


    National were present since its inconception. The table shows the reserved
    position with NL (national lowercase) NU (national uppercase). The constraint
    constraint

    > This has a maybe anectodical importance, but its practical impact is ...


    EBCDIC is still alive on IBM mainframes. And I'd not be surprised if the
    volume of data stored in EBCDIC was still greater than the volume of data
    stored in codesets compatible with ISO-646.

    Yours,

    --
    Jean-Marc
     
    Jean-Marc Bourguet, Oct 15, 2006
    #20
    1. Advertising

Want to reply to this thread or ask your own question?

It takes just 2 minutes to sign up (and it's free!). Just click the sign up button to choose a username and then you can ask your own questions on the forum.
Similar Threads
  1. Bernd Oninger
    Replies:
    0
    Views:
    764
    Bernd Oninger
    Jun 9, 2004
  2. ZOCOR

    XML Parser VS HTML Parser

    ZOCOR, Oct 3, 2004, in forum: Java
    Replies:
    11
    Views:
    821
    Paul King
    Oct 5, 2004
  3. Bernd Oninger
    Replies:
    0
    Views:
    815
    Bernd Oninger
    Jun 9, 2004
  4. Joel Hedlund
    Replies:
    2
    Views:
    511
    Joel Hedlund
    Nov 11, 2006
  5. Joel Hedlund
    Replies:
    0
    Views:
    309
    Joel Hedlund
    Nov 11, 2006
Loading...

Share This Page