Sorry for the late reply! Anyhow...
Groovy hepcat Franken Sense was jivin' in comp.lang.c on Thu, 7 May 2009
4:39 pm. It's a cool scene! Dig it.
How do I write a get_script() for data that look like these:
44:004:037 Having land, sold it, and brought the money, and laid it at
the apostles' feet.
44:005:001 But a certain man named Ananias, with Sapphira his wife,
sold
a possession,
44:005:002 And kept back part of the price, his wife also being privy
to
it, and brought a certain part, and laid it at the
apostles' feet.
I don't just want a line, so I can't read until '\n'. If I took it in
a char at a time, how do I write the control so that it stops when I
get to a string of ten digits or colons?
So, each part consists of a group of three numbers delimited by
colons, followed by one or more lines of text, right? So use that
format to write a simple parser. It isn't really all that hard. You
just need to learn the basics.
First, try a Google search for BNF (or Backus-Naur form). It's a
format for specifying language grammars for parsers. (You may already
have come across BNF.) The following EBNF (extended BNF, search for
s026153_ISO_IEC_14977_1996(E).pdf for the ISO spec.) could be used for
your parser.
script = verse, {verse};
verse = versenum, text;
versenum = decnum, colon-symbol, decnum, colon-symbol, decnum;
text = textline, {textline};
Simple, eh? This says that a "script" is one or more instances of a
"verse"; a "verse" is a "versenum" followed by "text"; "versenum" is
a "decnum" followed by a "colon-symbol", another "decnum", a second
"colon-symbol" and a third "decnum"; and "text" is one or more
instances of a "textline". The symbols decnum, colon-symbol and
textline are terminal symbols (items that can't be broken down into
smaller parts). This small parser will be very easy to create.
But first you need a lexical scanner that can return tokens (terminal
symbols) in the form of decnum (a decimal number), colon-symbol (a
literal ':' character) and textline (a line of text). Blank lines can
simply be ignored by your scanner or treated as normal textlines.
Your scanner needs to know how to recognise each token type. That's
very simple in this case. A decnum begins with a decimal digit
(including 0), and ends just before the first character that is not a
decimal digit. A colon-symbol is simply a single ':'. And a textline is
any sequence of text beginning with any character that is not a decimal
digit or ':', and ends with a '\n'.
Next, your parser needs to call your scanner to get tokens, one by
one. It must verify that the token is of the type expected. The parsing
technique known as "recursive descent parsing" is pretty easy to
understand.
You write a function for each non-terminal symbol, and each gets
tokens from the lexical scanner routine, checks them for validity and
takes some action, including calling other non-terminal handlers,
performing semantic actions, checking for errors and displaying
diagnostic output. Your parser would, perhaps, store the verses in some
easily searchable form. (I'm assuming that's what you want, to search
for verses by number.)
Anyhow, there are texts and tutorials on parsing. Search for Compiler
Construction by Niklaus Wirth. The author has made it available for
free download in PDF format. (Sorry, can't remember the URL.) This will
give you a good tutorial on the subject. You could easily write a
simple parser with the knowledge you'll gain from it. The following
pseudocode may give you some idea:
/* Standard headers used by the scanner and parser below. */
#include <ctype.h>   /* isspace, isdigit */
#include <stdio.h>   /* FILE, getc, ungetc, fopen, fclose, fprintf, EOF */
#include <stdlib.h>  /* realloc, free, EXIT_FAILURE */
#include <string.h>  /* strlen, strcat */

/* Size in bytes of the scanner's text buffer, terminator included. */
#define MAX_TOKEN_LEN 100

/*
 * Status returned by the parse_* functions on failure.  It has to be
 * non-zero (0 means success) and different from EOF, because callers
 * compare a returned status against both of those values.  EOF is
 * negative, so a positive constant satisfies both requirements.
 */
enum {errorcode = 1};

/* Kinds of terminal symbol the scanner reports; end marks end of input. */
enum toktype {decnum, colon_symbol, textline, end};

/* One token as produced by the scanner. */
struct token
{
    enum toktype type;
    union
    {
        char *text;    /* Set for textline tokens. */
        int num;       /* Set for decnum tokens. */
    } value;
    int line;          /* Input line number associated with the token. */
};

/* The single token of lookahead shared by the scanner and the parser. */
static struct token lookahead;

/* Input stream the scanner reads from; opened by main(). */
static FILE *fp;
int next_token(void)
{
static char buf[MAX_TOKEN_LEN];
int c, n;
static int line = 1;
lookahead.line = line;
/* Skip leading white space. */
while(isspace(c = getc(fp)))
{
if('\n' == c)
line++;
}
switch(c)
{
case EOF:
lookahead.type = end;
break;
case ':'
lookahead.type = colon_symbol;
break;
case '0': case '1': case '2': ... case '9':
lookahead.type = decnum;
n = c - '0';
while(isnum(c = getc(fp)))
n = n * 10 + c - '0';
lookahead.value.num = n;
/* Last character read was not a digit, so ungetc it. */
ungetc(c, fp);
break;
case '\n':
line++;
break; /* Ignore blank lines. */
default:
lookahead.type = textline;
buf[0] = c;
for(n = 1; (buf[n] = c) != '\n'; c = getc(fp))
;
buf[n] = '\0';
lookahead.value.text = buf;
line++;
break;
}
return 0;
}
/*
 * Write a diagnostic to stderr, prefixed with the line number stored in
 * the current lookahead token.
 *
 * txt: the message to print; it is not modified.
 */
void print_error(const char *txt)
{
    /* The conversion for unsigned long is %lu; "%ul" would be %u followed
       by a literal 'l' and would not match the argument type. */
    fprintf(stderr, "Line %lu: %s\n", (unsigned long)lookahead.line, txt);
}
/*
 * Parse the "text" non-terminal: one or more consecutive textline tokens.
 *
 * txt: in/out.  *txt must be NULL or a heap-allocated string.  Each line
 *      is appended to it, with a single space between lines because the
 *      scanner strips the line break.  The buffer is grown with realloc,
 *      so the caller owns *txt and must eventually free it.
 *
 * Returns 0 on success.  Returns errorcode if the lookahead is not a
 * textline on entry or if memory runs out; in the latter case *txt still
 * holds whatever was collected so far and remains valid.
 */
int parse_text(char **txt)
{
    /* A text must begin with a textline. */
    if(lookahead.type != textline)
    {
        print_error("Expected a line of text.");
        return errorcode;
    }
    do
    {
        size_t old_len = *txt ? strlen(*txt) : 0;
        size_t add_len = strlen(lookahead.value.text);
        size_t sep = old_len ? 1 : 0;
        /* Go through a temporary so *txt is not lost if realloc fails. */
        char *grown = realloc(*txt, old_len + sep + add_len + 1);

        if(NULL == grown)
        {
            print_error("Out of memory.");
            return errorcode;
        }
        if(sep)
            grown[old_len] = ' ';
        /* Terminate explicitly: memory fresh from realloc is not
           initialised, so strcat needs a valid empty string to extend. */
        grown[old_len + sep] = '\0';
        strcat(grown + old_len + sep, lookahead.value.text);
        *txt = grown;
        next_token();
    }while(lookahead.type == textline);
    return 0;
}
/*
 * Require the lookahead to be a decnum: store its value through out and
 * advance to the next token.  Returns 0, or errorcode after reporting.
 */
static int expect_decnum(int *out)
{
    if(decnum != lookahead.type)
    {
        print_error("Expected a decimal number.");
        return errorcode;
    }
    *out = lookahead.value.num;
    next_token();
    return 0;
}

/*
 * Require the lookahead to be a colon-symbol and advance past it.
 * Returns 0, or errorcode after reporting.
 */
static int expect_colon(void)
{
    if(colon_symbol != lookahead.type)
    {
        print_error("Expected a ':'.");
        return errorcode;
    }
    next_token();
    return 0;
}

/*
 * Parse the "versenum" non-terminal: decnum ':' decnum ':' decnum.
 *
 * a, b, c: receive the three numbers in order of appearance.  When an
 *          error occurs, the outputs for numbers already read have been
 *          written and the remaining ones are left untouched.
 *
 * Returns 0 on success, errorcode at the first token that does not fit.
 */
int parse_versenum(int *a, int *b, int *c)
{
    /* || short-circuits, so scanning stops at the first failing step. */
    if(expect_decnum(a)
    || expect_colon()
    || expect_decnum(b)
    || expect_colon()
    || expect_decnum(c))
        return errorcode;
    return 0;
}
/*
 * Parse the "verse" non-terminal: one versenum followed by its text, then
 * pass the result to store_verse_in_easily_searchable_format().
 *
 * Exactly one verse is consumed per call; repeating over the whole input
 * is the job of parse_script().
 *
 * Returns 0 on success, errorcode if either part fails to parse.
 */
int parse_verse(void)
{
    int a, b, c;
    char *txt = NULL;

    if(0 != parse_versenum(&a, &b, &c))
        return errorcode;

    if(0 != parse_text(&txt))
    {
        /* Nothing was stored, so release any text gathered so far. */
        free(txt);
        return errorcode;
    }

    /* NOTE(review): txt is heap memory and is assumed to be taken over by
       the store routine; if that routine copies instead, free it here. */
    store_verse_in_easily_searchable_format(a, b, c, txt);
    return 0;
}
/*
 * Parse the "script" non-terminal: one or more verses, up to end of input.
 *
 * An input with no tokens at all is rejected, since at least one verse is
 * required.  A status of 0 or EOF from parse_verse() is accepted; any
 * other status aborts the parse.
 *
 * Returns 0 on success, errorcode on failure.
 */
int parse_script(void)
{
    int status;

    if(end == lookahead.type)
    {
        print_error("Expected a decimal number.");
        return errorcode;
    }

    /* The first verse is mandatory, hence the test at the bottom. */
    do
    {
        status = parse_verse();
        if(!(0 == status || EOF == status))
            return errorcode;
    }while(end != lookahead.type);

    return 0;
}
/*
 * Open the input file, prime the lookahead token and parse the script.
 *
 * Returns EXIT_SUCCESS when the whole file parses, EXIT_FAILURE when the
 * file cannot be opened or the parse reports an error.
 */
int main(void)
{
    int rtn = EXIT_SUCCESS;

    /* fopen takes the file name first and the mode second. */
    fp = fopen("your_file.txt", "r");
    if(NULL == fp)
    {
        perror("your_file.txt");
        return EXIT_FAILURE;
    }

    /* Initialise the lookahead. */
    next_token();

    /* parse_script() returns 0 on success and non-zero on failure. */
    if(0 != parse_script())
        rtn = EXIT_FAILURE;

    fclose(fp);
    return rtn;
}