simple BST implementation

M

Mark

Hello

I've implemented a very simple and small binary-search tree library as a
training exercise. I'm posting it here together with a testing program and
expect to receive critics and useful recommendations :)

It compiles fine with the following GCC options:

-std=c99 -pedantic -W -Wall -Wcast-align -Wcast-qual -Wmissing-prototypes -Wshadow
-Wnested-externs -Wstrict-prototypes -Waggregate-return -O0 -ggdb

I put in various checkpoints tracing messages which help me to better
understand BST and to debug, it can be disabled with -DWITH_TRACE compiler
flag. I'm yet to decide where to carefully put 'asserts'.

/* tree.h */
#ifndef TREE_H
#define TREE_H

#include <stddef.h> /* for size_t */

/* error codes */
enum {
T_ESUCCESS,
T_EINVALID,
T_EBADPTR,
T_EDUP,
T_ENOMEM,
T_EMPTYTREE
};

struct tree_node {
void *data;
size_t size; /* for future to indicate the data size */
struct tree_node *left;
struct tree_node *right;
};

struct tree_node *tree_allocnode(void *data);

int tree_addnode(struct tree_node **root, struct tree_node *node,
int(*cmp_fn)(const void *d1, const void *d2));

void tree_freenode(struct tree_node *node);

struct tree_node *tree_lookup(struct tree_node *root, const void *data,
int(*cmp_fn)(const void *d1, const void *d2));

void tree_freetree(struct tree_node *node, void (*fn)(void *data));

int tree_traverse(struct tree_node *root,
int(*fn)(struct tree_node *node, void *arg), void *arg);

#endif

/* tree.c */
#ifdef WITH_TRACE
#include <stdarg.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include "tree.h"

#ifdef WITH_TRACE
#define TRACE(...) (printf("TRACE: "), printf(__VA_ARGS__), printf("\n"))
#else
#define TRACE(...)
#endif

struct tree_node *tree_allocnode(void *data)
{
struct tree_node *node = NULL;

node = malloc(sizeof *node);
if (node != NULL)
{
if (data != NULL)
{
node->data = data;
node->left = node->right = NULL;
TRACE("node @ %p, data=%s", (void *)node, (char *)data);
}
else
{
free(node);
node = NULL;
}
}

return node;
}

int tree_addnode(struct tree_node **root, struct tree_node *node,
int(*cmp_fn)(const void *d1, const void *d2))
{
int res = T_ESUCCESS;
int d;

if (node != NULL)
{
/* empty tree case */
if (NULL == *root)
{
*root = node;
node->left = node->right = NULL;
TRACE("root @ %p", (void *)*root);
}
else
{
d = cmp_fn(node->data, (*root)->data);
if (d < 0)
{
tree_addnode(&(*root)->left, node, cmp_fn);
TRACE("node(%p)->left=%p", (void *)(*root), (void
*)(*root)->left);
}
else if (d > 0)
{
tree_addnode(&(*root)->right, node, cmp_fn);
TRACE("node(%p)->right=%p", (void *)(*root), (void
*)(*root)->right);
}
else /* d == 0, i.e. duplicate entry */
{
res = T_EDUP;
}
}
}
else
{
res = T_EBADPTR;
}

return res;
}

void tree_freenode(struct tree_node *node)
{
if (node != NULL)
{
if (node->data != NULL)
{
free(node->data);
}
free(node);
}
}

struct tree_node *tree_lookup(struct tree_node *root, const void *data,
int(*cmp_fn)(const void *d1, const void *d2))
{
struct tree_node *node = NULL;
int d;

if (root != NULL)
{
d = cmp_fn(data, root->data);
if (d > 0)
{
return tree_lookup(root->right, data, cmp_fn);
}
else if (d < 0)
{
return tree_lookup(root->left, data, cmp_fn);
}
else
{
node = root;
TRACE("node found %p", (void *)node);
}
}

return node;
}

void tree_freetree(struct tree_node *node, void (*fn)(void *data))
{
if (node != NULL)
{
if (fn != NULL)
{
fn(node->data);
}
TRACE("deleting node(%p)->left=%p", (void *)node, (void
*)node->left);
tree_freetree(node->left, fn);
TRACE("deleting node(%p)->right=%p", (void *)node, (void
*)node->right);
tree_freetree(node->right, fn);

free(node);
}
}

int tree_traverse(struct tree_node *root,
int(*fn)(struct tree_node *root, void *arg), void *arg)
{
int res = T_ESUCCESS;

if (root != NULL)
{
tree_traverse(root->left, fn, arg);
fn(root, arg);
tree_traverse(root->right, fn, arg);
}
else
{
res = T_EMPTYTREE;
}

return res;
}

/* tree-test.c */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "tree.h"

static int tdata1[] = { 10, 6, 900, 7, 1, 57, 928, 1, 987 };
static char *tdata[] = { "yellow", "apple", "chess", "penguin", "car",
"mixer", "Zoo", "loop", "zebra" };
#define DNUM (sizeof(tdata) / sizeof(tdata[0]))

static int comp(const void *d1, const void *d2)
{
int res;

res = strcmp((const char *)d1, (const char *)d2);
if (res > 0) {
printf("%s > %s\n", (const char *)d1, (const char *)d2);
return 1;
} else if (res < 0) {
printf("%s < %s\n", (const char *)d1, (const char *)d2);
return -1;
} else {
printf("%s = %s\n", (const char *)d1, (const char *)d2);
return 0;
}
}

static int comp1(const void *d1, const void *d2)
{
if (*(const int *)d1 < *(const int *)d2) {
printf("%d < %d\n", *(const int *)d1, *(const int *)d2);
return -1;
}
else if (*(const int *)d1 > *(const int *)d2) {
printf("%d > %d\n", *(const int *)d1, *(const int *)d2);
return 1;
}
else { /* if (*(const int *)d1 == *(const int * )d2) */
printf("%d = %d\n", *(const int *)d1, *(const int *)d2);
return 0;
}
}

static int walk(struct tree_node *node, void *arg)
{
printf(arg, (char *)node->data);
return 0;
}

int main(void)
{
struct tree_node *tptr, *root = NULL;
unsigned int i;

puts("Populating the tree...\n");
for (i = 0; i < DNUM; i++) {
puts("-------------------------------------");
tptr = tree_allocnode(tdata);
if (NULL == tptr)
{
puts("Failed to allocate memory!");
exit(EXIT_FAILURE);
}
tree_addnode(&root, tptr, comp);
}

puts("Traversing the tree...\n");
tree_traverse(root, walk, "data=%s\n");
printf("found node %p\n", (void *)tree_lookup(root, tdata[3], comp));
puts("Destroying the tree...\n");
tree_freetree(root, NULL);
puts("Traversing the tree...\n");
tree_traverse(root, walk, "data=%s\n");

return 0;
}
 
S

Seebs

/* error codes */
enum {
T_ESUCCESS,

No.

"EFOO" is widely recognized as indicating an *error*.

Success is not an error, normally.
T_EINVALID,
T_EBADPTR,
T_EDUP,
T_ENOMEM,
T_EMPTYTREE

It seems to me that you might see whether there's enough standard/common
errno values defined that you could just use errno.
struct tree_node {
void *data;
size_t size; /* for future to indicate the data size */

If you're not using it, don't put it here, IMHO.
struct tree_node *tree_allocnode(void *data);

I'm not totally sold on this, although I'm not totally unsold on it
either.
int tree_addnode(struct tree_node **root, struct tree_node *node,
int(*cmp_fn)(const void *d1, const void *d2));

I'd probably do these two differently.

tree_node *tree_addnode(struct tree_node *root, void *data, int (*cmp_fn))...

But even this is error-prone. In this case, I think it makes more sense
to distinguish betwen the tree and its nodes.

So I recommend:

struct tree {
int (*cmp_fn)(const void *d1, const void *d2);
struct tree_node *root;
}

Then:
struct tree *tree_new(int (*cmp_fn)(const void *d1, const void *d2));

.... and consider
typedef int (*tree_cmp)(const void *d1, const_void *d2);

Some people recommend omitting the parameter names in prototypes, although
I'm currently leaning against it.

So I guess:
typedef int (*tree_cmp)(const void *d1, const_void *d2);
struct tree {
tree_cmp cmp;
struct tree_node *root;
}

then:

struct tree *tree_new(tree_cmp cmp);
int tree_add(struct tree *t, void *data);
struct tree_node *tree_root(struct tree *t);
struct tree_node *tree_lookup(struct tree *t, const void *key);

.... But you will, of course, note that this means you can't grab a subtree
and do a lookup. But that can be addressed:

struct tree_node *tree_node_lookup(struct tree_node *tn, const void *key);

in which case:
struct tree_node *tree_lookup(struct tree *t, const void *key) {
if (!t || !t->root)
return NULL;
return tree_node_lookup(t->root, key);
}

The other corresponding change, then:

int tree_add(struct tree *t, const void *data) {
... do the node allocation in here, don't make the user
do it separately.
}

-s
 
S

Seebs

No. EFOO is reserved for the implementation. Bad Seebs - no biscuit!

Yeah, too terse, bad me. "T_E*" good, "ESUCCESS" bad, "T_ESUCCESS" still
bad because people will see "T_E*" and assume it's an [E]rorr.
It is very unlikely that the standard ones will suffice. I could only
find three, EDOM, EILSEQ, and ERANGE. The moment you add "common"
ones, you make your program less portable.
True.

He's probably going to be using it in the very next mod, so it's not
such a big deal. (Yes, okay, he could #if 0 it for now.)
Agreed.

I would take a pointer to a tree structure (which manages the root and
any housekeeping information needed by the tree), the data, the size,
and an optional void * for side-channel information. (Consequently, I
would add an extra void * to the comparison function.)

Good point about the side-channel. Hmm.

Actually, that's an interesting question. Possibly the function should
take:
* item 1
* item 2
* tree
* aux

Because the tree might be relevant in some cases to the comparison function.
(Not that I can think of them, but...)
Um, fair enough to factor it out, but tree_add should call
tree_allocate - one should not have to call it separately in advance.
Is that what you meant?

Yes.

At which point, I honestly don't think the user ever has a reason to
call tree_allocate, because the only thing you can usefully do with that
new node, normally, is add it to a tree.

(Interesting question: Imagine a tree comparison function which, for any
two items in a tree, tells you whether the first item comes before, or
after, or is in the same place as the second function. Now reverse it and
try to sort by it...)

-s
 
N

Nick Keighley

it's not uncommon (nor very wrong) to lump "success" in with the
"error" codes.
Though I must I admit I prefer to call them "return codes"

Tough call, arguments on both sides, and I think the last time I
thought about this I was, like you, edging towards including names.

I tend to omit the names unless putting them in adds useful
information.
I try to arrange it so the header file is as informative as possible.

Retval transmitter_reset (Transmitter*);
double sum (double[], size_t);
void clear_element_in_instance_table (int element_id);

though perhaps the last one is an argument for using a better type.

<snip>
 
N

Nick Keighley

No. EFOO is reserved for the implementation. Bad Seebs - no biscuit!

Yeah, too terse, bad me.  "T_E*" good, "ESUCCESS" bad, "T_ESUCCESS" still
bad because people will see "T_E*" and assume it's an [E]rorr.

only if they were trying *really* hard not to understand

"There are some ideas so wrong that only a very intelligent person
could believe in them." -- George Orwell


I was searching for a Dan Pop quote along these lines but I thought
George would do in a pinch
 
B

Ben Bacarisse

Mark said:
I've implemented a very simple and small binary-search tree library as
a training exercise. I'm posting it here together with a testing
program and expect to receive critics and useful recommendations :)

Here are a few thoughts:

I don't like interfaces where the library signals what the caller
knows already. Your insert function returns T_EBADPTR if and only if
node == NULL. The caller can test this as easily as the function can.
In fact, the caller can probably test for it more simply than it can
test for all the various return values.

I'd write insert so that it adds data rather a node. I.e. it
allocates the node when it will do the insert. In your design, the
caller will probably have to allocate the node, try to insert it and
then delete it if the insert did not happen.

A more serious issue: don't delete what you don't allocate! Your
freenode frees the data but it shouldn't. I may want to add pointers
to non-'malloc'ed things. Moreover, when I free a node, I may want to
keep the data.

You could consider making the whole tree freeing function a call to a
post-order walk function.

<snip code>
 
N

Nisse Engström

I was searching for a Dan Pop quote along these lines but I thought
George would do in a pinch

You may have been thinking of this [from memory, so might be slightly
wrong]:

"The expression isn't unclear /at all/, and only an expert could
possibly have any doubts about it."

"The expression isn't unclear *at all* and only an expert could actually
have doubts about it" - Dan Pop 2001-02-06


/Nisse
 
C

Chris M. Thomasson

Ben Bacarisse said:
Here are a few thoughts:

I don't like interfaces where the library signals what the caller
knows already. Your insert function returns T_EBADPTR if and only if
node == NULL. The caller can test this as easily as the function can.
In fact, the caller can probably test for it more simply than it can
test for all the various return values.

I'd write insert so that it adds data rather a node. I.e. it
allocates the node when it will do the insert. In your design, the
caller will probably have to allocate the node, try to insert it and
then delete it if the insert did not happen.

[...]

IMHO, intrusive data-structures can be more "efficient" than the more
flexible container based allocation schemes. I basically agree with the
first point you made. The caller can probably allocate the node more
efficiently/simply than the end library function call can... Perhaps the
end-user application has various objects that can be directly linked into a
tree without the need of any auxiliary nodes:
___________________________________________________
struct tree
{
struct tree* left;
struct tree* right;
struct tree** plink;
};


struct end_user_object
{
struct end_user_object* next;
struct tree tree1;
char x;
struct tree tree2;
double y;
struct tree tree3;
};


struct end_user_object_region
{
struct end_user_object buffer[1024];
struct end_user_object* flist;
size_t bump;
};


static struct end_user_object_region g_region;

static struct tree* g_trees[3];
___________________________________________________




The application should be able to store an `end_user_object' on three
separate trees at the same time without making a single function call to
allocate any memory.




IMVVHO, a generic container library should work on a free standing
implementation that does not have access to `malloc' and friends...

Intrusive data-structures to the rescue!!!

:^o
 
S

Seebs

It should just take item 1, item 2, aux. (Good name, aux - I might
steal that!)

It's hardly original to me, but I've been a big fan ever since I first
saw it used, because it's as close as I've ever seen to self-documenting
but still terse for sideband data.

-s
 
M

Mark

Thank you for commentaries. Hope my late response will be noticed. I have a
few more queries.

Seebs wrote:
[skip]
I'd probably do these two differently.

tree_node *tree_addnode(struct tree_node *root, void *data, int
(*cmp_fn))...

But even this is error-prone. In this case, I think it makes more
sense to distinguish betwen the tree and its nodes.

So I recommend:

struct tree {
int (*cmp_fn)(const void *d1, const void *d2);
struct tree_node *root;
}

Richard said:
I would take a pointer to a tree structure (which manages the root and
any housekeeping information needed by the tree), the data, the size,
and an optional void * for side-channel information. (Consequently, I
would add an extra void * to the comparison function.)

So as I see it, both Seebs and Richard recommend to represent the
abstraction of the BST in two structures - one for the tree's node and the
other one is for the tree itself. What is the profit of doing this? When is
it good to exploit such method and for what data structures?

PS. Now I remember I have seen such technique somewhere in a circular list
implementation.
 
S

Seebs

So as I see it, both Seebs and Richard recommend to represent the
abstraction of the BST in two structures - one for the tree's node and the
other one is for the tree itself. What is the profit of doing this? When is
it good to exploit such method and for what data structures?

I usually do something like that when I feel that the container is somehow
genuinely distinct from its "top" member. The tree seems to be the right
place to, for instance, stash a comparator function -- since presumably you
always want the same one for the tree (and some trees need a comparator to
decide where a new item goes). It is not obvious to me that individual
nodes are, in real use cases, likely to make sense to treat the same way
you treat the tree as a whole.

By contrast, I wouldn't usually do this for a singly-linked or doubly-linked
list...
PS. Now I remember I have seen such technique somewhere in a circular list
implementation.

That can make sense too.

So why for a doubly-linked list, but not a circular list? Because for a
doubly-linked list, you can always find the ends by iterating, but there
isn't necessarily an end for a circular list.

-s
 
U

user923005

I usually do something like that when I feel that the container is somehow
genuinely distinct from its "top" member.  The tree seems to be the right
place to, for instance, stash a comparator function -- since presumably you
always want the same one for the tree (and some trees need a comparator to
decide where a new item goes).  It is not obvious to me that individual
nodes are, in real use cases, likely to make sense to treat the same way
you treat the tree as a whole.

By contrast, I wouldn't usually do this for a singly-linked or doubly-linked
list...


That can make sense too.

So why for a doubly-linked list, but not a circular list?  Because for a
doubly-linked list, you can always find the ends by iterating, but there
isn't necessarily an end for a circular list.

Plus, the end of a ring buffer often rolls forward. So even after you
found a start or an end, chances are it has moved shortly thereafter.
Plus, ring buffers are either empty, partly full, or full. The
elements of the ring buffer have no idea which state the ring buffer
as a whole is in.
A ring buffer needs a secondary observer thing that understands the
ring state.
 
U

user923005

In

user923005 wrote:



Not necessarily! :)  They can be dynamic, expanding as demand
requires. (Mine tend to do that.)

<snip>

Do you mean to tell me that your ring buffers are neither empty, full,
nor partly full?

Now that's an interesting data structure.
 
K

Keith Thompson

user923005 said:
Do you mean to tell me that your ring buffers are neither empty, full,
nor partly full?

Now that's an interesting data structure.

If it can expand as demand requires, it needn't ever be full.
 
J

James Kuyper

Keith said:
If it can expand as demand requires, it needn't ever be full.

In that case, it should be either empty or partly full. To claim the
existence of a situation where none of three options listed of
user923005 apply you'll have to explain a lot more than just that.
 
U

user923005

If it can expand as demand requires, it needn't ever be full.

In that case, I want one. My data structures always fill up.

I do think I will give expandable ring buffers a bit more thought.

I have lots of ring buffers in my code base, but they tend to be fixed
size (determined by some parameter).

A little thought about ring buffers that size themselves might result
in better code.
 
U

user923005

In <[email protected]>,

user923005 wrote:



Don't you read the books you help write? :)

It's not the reading part that stumbles me. It's the thinking.

My data structures have to run by themselves in a darkened room, with
evil men throwing their worst at them (I write database software). So
I should expect the worst possible inputs to arrive billions at a
time. What will my data structure do? I need to consider that the
same evil perpetrators will be pounding the stuffings out of other
data structures on 100 other threads and I am simply not allowed to
run out of resources or fail to bring the calculation to completion in
a timely manner.

So (for me at least) these things are never as simple as they might
seem on the surface.
 
D

David Thompson

it's not uncommon (nor very wrong) to lump "success" in with the
"error" codes.

Especially with value 0, as here. Although I would write explicitly
the formally-redundant = 0 .
 

Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only take a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Members online

Forum statistics

Threads
473,744
Messages
2,569,482
Members
44,901
Latest member
Noble71S45

Latest Threads

Top