How to implement a Hash Table in C

U

user923005

/* Here is that cuckoo hash function converted to C */
/* Dann Corbit did the quick & dirty C conversion. */
/* */
/* K-ary cuckoo hashing (c) 2002-2003 by Peter Sanders */
/* this is a simple implementation for the purposes */
/* of measuring the performance of cuckoo hashing in abstract */
/* terms (number of probes per insert, storage efficiency) */
/* usage: compile using g++ or another ISO C++ compiler */
/* a.out <K> <n> <r> <repeat> <seed for rng> */
/* there is also a constant tMax that defines the maximum number */
/* of trials for an insertion */
/* allocates space for n elements in K subtables, and repeats */
/* the follwing measurements repeat time: */
/* - find a hash function by filling full lookup tables */
/* with pseudo-random numbers. */
/* - insert elements i=0..n-1 into the table until this fails */
/* for the first time. (The cost of these insertions is not */
/* measured.) */
/* Every n/r successfully inserted elements, the follwing */
/* measurement is repeated n/r times: */
/* * a random element i2 is deleted */
/* * the hash table entries for i2 are filled with new random values
*/
/* * i2 is reinserted. */
/* Note that this is equivalent to removing a random element */
/* inserting a new element that */
/* has never been in the table before. */
/* The output is a table that gives */
/* - x the number of elements in the table at each measuremnt interval
*/
/* - the average number of probes for an insertion during the
measruements */
/* for the given number of inserted elements */
/* - K */
/* - n */
/* - repeat */
/* - seed */
#define DEBUGLEVEL 1
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include "util.h"
#include "mt-real.c"

#define split 1


const int tMax = 10000; /* max number of random walk trials */
int K; /* number of probes */
int n; /* number of elements */
int m; /* number of elements per table */
int **hash; /* K times n array */
int **table; /* K times m array */


double limit(double x, double bound)
{
if (x > bound) {
return bound;
} else if (x < -bound) {
return -bound;
} else
return x;
}


double cpuTime()
{
return clock() * 1e-6;
}

/* generate random int in 0..x-1 */
int rand0K(int x)
{
return (int) (genrand() * x);
}

/* insert element i into table */
/* return value: */
/* -1 failure */
/* otherwise number of hash function evaluation */
int insert(int i)
{
int forbidden = -1;
int j = rand0K(K);
int t;
for (t = 1; t <= tMax; t++) {
int p = hash[j];
int newI = table[j][p];
table[j][p] = i; /* play the cuckoo */
if (newI == -1)
return t; /* done */
forbidden = j;
i = newI; /* find new home for cuckoo victim */
j = rand0K(K - 1);
if (j == forbidden)
j = K - 1;
}
return tMax + 1; /* insertion failed */
}

/* delete element i from the table */
void delete(int i)
{
int j;
for (j = 0; j < K; j++) {
int p = hash[j];
if (table[j][p] == i) {
table[j][p] = -1;
return;
}
}
}

int main(int argc, char **argv)
{
int i,
j;
int r;
int step;
int repeat;

int seed;
double *sumT;
int *cf;
int rep;

assert(argc == 6);
K = atoi(argv[1]); /* number of probes */
n = atoi(argv[2]); /* number of elements */
r = atoi(argv[3]); /* number of measured densities */
repeat = atoi(argv[4]); /* how often to start from scratch */
seed = atoi(argv[5]);
m = (int) (n / K + 0.5);
step = n / r;
sgenrand(seed);
puts("# x tAvg(x) K N repeat seed");

/* allocate hash function and table */
/* and an empty table */
hash = calloc(K, sizeof(int *));
table = calloc(K, sizeof(int *));
for (j = 0; j < K; j++) {
hash[j] = calloc(n, sizeof(int));
table[j] = calloc(m, sizeof(int));
}

/* initialize statistics */
/* sumT is the average time for size i*step */
sumT = calloc(r + 1, sizeof(double));
cf = calloc(r + 1, sizeof(int));
for (i = 0; i < r; i++) {
sumT = 0;
cf = 0;
}

/* main loop */
for (rep = 0; rep < repeat; rep++) {
/* init hash function and empty table */
for (j = 0; j < K; j++) {
for (i = 0; i < n; i++) {
hash[j] = rand0K(m);
}
for (i = 0; i < m; i++) {
table[j] = -1;
}
}

/* fill table and keep measuring from time to time */
for (i = 0; i < n; i++) {
if (insert(i) > tMax)
break; /* table is full */
/* measure in detail here */
if (((i + 1) % step) == 0) {
int i1;
for (i1 = 0; i1 < step; i1++) {
int t;
/* delete & reinsert a random element */
int i2 = rand0K(i);
delete(i2);
for (j = 0; j < K; j++)
hash[j][i2] = rand0K(m);
t = insert(i2);
cf[i / step] += (t > tMax);
/* cout << t << endl; */
sumT[i / step] += t;
}
}
}
}

for (rep = 0; rep < r; rep++) {
printf("%d %f %d %d %d %d %d\n",
rep * step + step,
sumT[rep] / step / repeat,
K,
n,
repeat,
seed,
cf[rep]);
}
return 0;
}

/*
mt-real.c follows:
*/
/* A C-program for MT19937B: Real number version */
/* genrand() generate one pseudorandom number with double precision
*/
/* which is uniformly distributed on [0,1]-interval for each call.
*/
/* sgenrand(seed) set initial values to the working area of 624
words.*/
/* sgenrand(seed) must be called once before calling genrand()
*/
/* (seed is any integer except 0).
*/

/*
LICENCE CONDITIONS:

Matsumoto and Nishimura consent to GNU General
Public Licence

NOTE:
When you use it in your program, please let Matsumoto
<[email protected]> know it.

Because of a machine-trouble, Matsumoto lost emails
which arrived during May 28-29.
*/


#include<stdio.h>

/* Period parameters */
#define N 624
#define M 397
#define MATRIX_A 0x9908b0df /* constant vector a */
#define UPPER_MASK 0x80000000 /* most significant w-r bits */
#define LOWER_MASK 0x7fffffff /* least significant r bits */

/* for tempering */
#define TEMPERING_MASK_B 0x9d2c5680
#define TEMPERING_MASK_C 0xefc60000
#define TEMPERING_SHIFT_U(y) (y >> 11)
#define TEMPERING_SHIFT_S(y) (y << 7)
#define TEMPERING_SHIFT_T(y) (y << 15)
#define TEMPERING_SHIFT_L(y) (y >> 18)

static unsigned long ptgfsr[N]; /* set initial seeds: N = 624 words */

void
sgenrand(unsigned long seed) /* seed should not be 0 */
{
int k;

/* setting initial seeds to ptgfsr[N] using */
/* the generator Line 25 of Table 1 in */
/* [KNUTH 1981, The Art of Computer Programming */
/* Vol. 2 (2nd Ed.), pp102] */

ptgfsr[0]= seed & 0xffffffff;
for (k=1; k<N; k++)
ptgfsr[k] = (69069 * ptgfsr[k-1]) & 0xffffffff;
}

double
genrand()
{
unsigned long y;
static int k = 1;
static unsigned long mag01[2]={0x0, MATRIX_A};
/* mag01[x] = x * MATRIX_A for x=0,1 */

if(k == N){ /* generate N words at one time */
int kk;
for (kk=0;kk<N-M;kk++) {
y = (ptgfsr[kk]&UPPER_MASK)|(ptgfsr[kk+1]&LOWER_MASK);
ptgfsr[kk] = ptgfsr[kk+M] ^ (y >> 1) ^ mag01[y & 0x1];
}
for (;kk<N-1;kk++) {
y = (ptgfsr[kk]&UPPER_MASK)|(ptgfsr[kk+1]&LOWER_MASK);
ptgfsr[kk] = ptgfsr[kk+(M-N)] ^ (y >> 1) ^ mag01[y & 0x1];
}
y = (ptgfsr[N-1]&UPPER_MASK)|(ptgfsr[0]&LOWER_MASK);
ptgfsr[N-1] = ptgfsr[M-1] ^ (y >> 1) ^ mag01[y & 0x1];

k = 0;
}

y = ptgfsr[k++];
y ^= TEMPERING_SHIFT_U(y);
y ^= TEMPERING_SHIFT_S(y) & TEMPERING_MASK_B;
y ^= TEMPERING_SHIFT_T(y) & TEMPERING_MASK_C;
y &= 0xffffffff; /* you may delete this line if word size = 32 */
y ^= TEMPERING_SHIFT_L(y);

return ( (double)y / (unsigned long)0xffffffff );
}

/*
util.h follows:
*/
/*
// this files contains all the application independent little
// functions and macros used for the optimizer.
// In particular Peters debug macros and Dags stuff
// from dbasic.h cdefs, random,...

//////////////// stuff originally from
debug.h ///////////////////////////////
// (c) 1997 Peter Sanders
// some little utilities for debugging adapted
// to the paros conventions
*/

#ifndef UTIL
#define UTIL


#ifndef Max
#define Max(x,y) ((x)>=(y)?(x):(y))
#endif

#ifndef Min
#define Min(x,y) ((x)<=(y)?(x):(y))
#endif

#ifndef Abs
#define Abs(x) ((x) < 0 ? -(x) : (x))
#endif

#ifndef PI
#define PI 3.1415926535
#endif

#endif
 
R

Richard Heathfield

James Dow Allen said:
Gak! You're not coding ala Falconer are you?

No, I don't think so. I don't actually use hash tables a great deal
(which is not to say that I never use them), but when I *do* use them,
I don't probe at all. I just use stretchy buckets instead (using trees
to handle the stretchy bit).

<snip>
 
E

Eric Sosman

James said:
[...]
Speaking of Falconer, his method (linear probing
with second hash value used for probe increment)
*requires* (even more so than quadratic probing,
see upthread) prime table size.

Well, not exactly: It requires only that the increment
between probes be relatively prime to the table size. A prime
table size makes this criterion easy to satisfy, but it's not
the only way. A table size equal to a power of two, for example,
works just fine if the probe increment is odd. (Degenerate
special case: pure linear probing with an increment of unity
works with any table size at all, prime or composite.)

Some people seem very interested in the relative speed
of AND vs MOD for this purpose, and on most machines AND
is faster or at the very least no slower. I think, though,
that the time savings must be viewed as just one component
of the search cost -- another piece (sometimes rather large)
is computing the key's hash value in the first place, not
to mention the cost of equality comparisons on the table
items encountered along the way.

(Slight digression): It's often suggested that those
equality comparisons can be made cheaper by storing hash
codes in the table entries along with the payload. Before
making a possibly expensive strcmp() or whatever, one can
first do a (fast) arithmetic comparison of the hashes: if
the hashes don't match, the objects mush be unequal and
it is unnecessary to make the expensive comparison. But this
stratagem doesn't seem to me to be a "Well, doh!" proposition;
at least two counter-arguments exist:

- If the table stores pointers to items rather than item
instances, storing hash values in the table roughly
doubles the amount of memory required. If the same
amount of memory were used for a pointers-only table,
the number of table slots would double and the load
factor would be halved. This should lead to shorter
searches -- and it is not at all clear _a priori_ that
more fast probes will outrace fewer slow probes.

- Storing the hash codes saves time whenever a probe hits
a slot occupied by the "wrong" key (it saves no time when
the probe finds the "right" key). If the time savings is
significant, it must mean that the key comparisons are
very expensive or that the probe sequences are very long.
If we've taken care to construct good hash functions and
have reason to believe that searches take two-point-mumble
probes, on the average, the savings is only one-point-mumble
comparisons per search.

As always when one is interested in speed, the gold standard
is the familiar "Measure, measure, measure!"
 
B

Ben Bacarisse

A lot of algorithm books make a big meal of the basic data
structures. I checked the Java queue classes before writing this post,
and they were pretty complicated. I takes two interfaces, the
Collection and the Iterable, and there are nine implementations -
linked list queues, delay queues, and so on. Then you've got add
methods, which throw execeptions if the queue is full, and offer
methods which return an error code, so that you can decide whether the
queue filling up is or is not part of normal operation. It's quite an
impressive interface.

I am not asking you to over-complicate it so I don't see how anyone
else's complex interface has anything to do with it.
However the structures chapter is onyl chapter two. I want to describe
the simplicity of the structure instead of its complexity. I could
have provided something similar to the Java classes, using void
pointers and access functions. But it is unlikely that anyone would
use such a function suite. If you want a trivial queue you have a
buffer, a circular buffer if you don't want the inefficiency of
shifting everything up on every remove operation. Just as you build
linked lists by incorporating a "next" pointer in your structure, and
trees built as you go.

Don't go there again. There is nothing wrong with using a fixed
circular buffer to implement a bounded queue. I never said there was
(and I have said it was fine before).

The whole chapter says "these are are basic data structures you will
need, here are some code fragments which show how to implement
them". The reader should be able to extend the expandable array to
produce a dynamically expanding queue, add functions to check for
length and spare capacity, and so on.

I think we will have to leave this argument. I am beginning to see
that we have different opinions about what makes a data structure and
what makes a good program. It seems you think it is fine that your
readers will have to re-invent one of the various methods to
distinguish between a full and an empty queue, and pepper their code
with references to the head and tail counters (like you do in your
program that uses a stack with no 'push', 'pop', 'empty' or 'full'
functions). They will also have to re-write the bits that you *did*
write, since you don't test for full or emptiness, in order to do
anything with it at all.

If you don't believe me, please just answer my original question and
show how to wrote a simple algorithm (any algorithm) using your queue
operations. If you do, you will see how you have move all sorts of
implementation details into the program the uses the queue. Of
course, you could also do it my ditching what you wrote in the book
and implementing a bounded queue properly. It is only a few lines
more code. There would be no need to make it complicated, just usable
and self-contained.
An "empty" function is pretty
patronising. The priority queue, implemented efficiently, is the red
black tree, but it is appropriate to hold that back. Actually it is
possible to speed up the deletion, I believe - a web seach on
"priority queue" and "efficiency" turns up a slew of algorithms. The
book is Basic Algorithms, covering a wide range of topics in outline,
not a massively detailed description of any one.

You miss the point. I don't want it to be "massively detailed", but
clear, usable and self-contained would be nice.
 
R

Richard Harter

<OT>
James said:
[...]
Speaking of Falconer, his method (linear probing
with second hash value used for probe increment)
*requires* (even more so than quadratic probing,
see upthread) prime table size.

Well, not exactly: It requires only that the increment
between probes be relatively prime to the table size. A prime
table size makes this criterion easy to satisfy, but it's not
the only way. A table size equal to a power of two, for example,
works just fine if the probe increment is odd. (Degenerate
special case: pure linear probing with an increment of unity
works with any table size at all, prime or composite.)

In a thread in comp.programming (IIRCC) Paul Hsieh pointed out
that always using the same probe increment in a power of two
table size also has bad properties. His solution is to use the
unused bits of the full hash to select an increment size from a
table of primes.

[snip]

re: counterargument to storing the hash code.
- If the table stores pointers to items rather than item
instances, storing hash values in the table roughly
doubles the amount of memory required. If the same
amount of memory were used for a pointers-only table,
the number of table slots would double and the load
factor would be halved. This should lead to shorter
searches -- and it is not at all clear _a priori_ that
more fast probes will outrace fewer slow probes.

This doesn't sound right - ordinarily a table has to hold a
key/value pair. Adding a hash would increase the load factor by
50% - for a given amount of memory.
- Storing the hash codes saves time whenever a probe hits
a slot occupied by the "wrong" key (it saves no time when
the probe finds the "right" key). If the time savings is
significant, it must mean that the key comparisons are
very expensive or that the probe sequences are very long.
If we've taken care to construct good hash functions and
have reason to believe that searches take two-point-mumble
probes, on the average, the savings is only one-point-mumble
comparisons per search.

As always when one is interested in speed, the gold standard
is the familiar "Measure, measure, measure!"

</OT>
 
R

Richard Harter

/* Here is that cuckoo hash function converted to C */
/* Dann Corbit did the quick & dirty C conversion. */

<snip code>

Thanks for posting the code - much appreciated.
 
C

CBFalconer

Richard said:
Eric Sosman said:
James said:
[...]
Speaking of Falconer, his method (linear probing with second
hash value used for probe increment) *requires* (even more so
than quadratic probing, see upthread) prime table size.

Well, not exactly: It requires only that the increment
between probes be relatively prime to the table size. A prime
table size makes this criterion easy to satisfy, but it's not
the only way. A table size equal to a power of two, for example,
works just fine if the probe increment is odd. (Degenerate
special case: pure linear probing with an increment of unity
works with any table size at all, prime or composite.)

In a thread in comp.programming (IIRCC) Paul Hsieh pointed out
that always using the same probe increment in a power of two
table size also has bad properties. His solution is to use the
unused bits of the full hash to select an increment size from a
table of primes.

That is dangerous. The purpose of the multiple hash is to have
different search patterns for different objects that have the same
primary hash pattern. This avoids clustering, and tends to keep
the search chain short. The very iffy thing in hashlib is the
reduction of the range of the secondary hash, which encourages
light clustering in nearly empty tables. The system attempts to
maintain tables at between 44 and 88% of capacity.

Also, I want the table action to be independent of the hash
functions (apart from speed), so the functions are computed every
time needed. The system functions with all the hash functions
replaced by the constant 1 (unity) (tested in the test suite) at a
considerable speed penalty. So the action is hash function
independent, but the speed is highly dependent.

The system is intended to isolate all these decisions in the hash
table code, so the user need not worry about them. Similarly the
algorithms for table expansion, etc. The result is a storage and
retrieval black box.
 
M

Malcolm McLean

CBFalconer said:
Makes no sense, lacking descriptions of struct deque,
deque_init_null, xnmalloc.
For it to be OK dequeue->capacity would have to be type guaranteed to be
wider than a size_t. There is no such type in standard C. So you can see
that the code is incorrect.
Actually calling with ~0 is a mistake a calling programmer is not too
unlikely to make. If the he knows he needs N-1 spaces in his queue for a
dental surgery, one patient on the chair and thus not in queue, then he
might pass in N -1 and forget for check for N == 0.
 
M

Malcolm McLean

Ben Bacarisse said:
I am not asking you to over-complicate it so I don't see how anyone
else's complex interface has anything to do with it.
The Java class is not written by idiots. It is basically what you need for a
production generic queue class. However my code is not production code. It
is there to illustrate how to implement the algorithm.
I think we will have to leave this argument. I am beginning to see
that we have different opinions about what makes a data structure and
what makes a good program. It seems you think it is fine that your
readers will have to re-invent one of the various methods to
distinguish between a full and an empty queue, and pepper their code
with references to the head and tail counters (like you do in your
program that uses a stack with no 'push', 'pop', 'empty' or 'full'
functions). They will also have to re-write the bits that you *did*
write, since you don't test for full or emptiness, in order to do
anything with it at all.
The alternative to to write generic structures. Whilst that would be a
workable approach, in fact C doesn't lend itself to this. Many such
libraries have been written, none has gained acceptance of been included in
the standard library. Readers are better off understanding that these
relatively simple structures are fundamental, and will be implemented in a
wide variety of functions.
If you don't believe me, please just answer my original question and
show how to wrote a simple algorithm (any algorithm) using your queue
operations.
That's not the intention. The queue takes only integers and has a hardwired
size. The code shows someone who doesn't know that queues are importnat data
structures how to implement one. It doesn't give nor is in intended to give
a generic "queue" class.
If you do, you will see how you have move all sorts of
implementation details into the program the uses the queue. Of
course, you could also do it my ditching what you wrote in the book
and implementing a bounded queue properly. It is only a few lines
more code. There would be no need to make it complicated, just usable
and self-contained.
Exactly. You go to the book, then you implement your own queue, as needed,
using the code in the book as a reference if necessary.
You miss the point. I don't want it to be "massively detailed", but
clear, usable and self-contained would be nice.
There are many generic queue libraries. None has gained wide acceptance.
Therefore the bar for "usable" is in fact very high. This is not true of
other languages, but it is true of C.
 
U

user923005

There are many generic queue libraries. None has gained wide acceptance.
Therefore the bar for "usable" is in fact very high. This is not true of
other languages, but it is true of C.

These queues have a large following:
http://www.microsoft.com/windowsserver2003/technologies/msmq/default.mspx
http://www-306.ibm.com/software/integration/wmq/

Here is a list of queues:
http://sourceforge.net/search/?type_of_search=soft&words=queue

I like this one:
http://sourceforge.net/projects/safmq/
It's C++ though, and is a lot more that most people probably need for
a simple fifo.

For those with simple needs, here is a thread-safe fifo:
http://fifoembed.sourceforge.net/
It's POSIX oriented, but it should be easy enough to convert to
generic use (POSIX threads for Windows could be used for that OS, for
instance).
 
M

Malcolm McLean

Richard Harter said:
This doesn't sound right - ordinarily a table has to hold a
key/value pair. Adding a hash would increase the load factor by
50% - for a given amount of memory.
Normally you can derive the keys from the data item. So in C you would pass
in a function pointer to extract a key from a datum, or if you want to be a
little faster but less flexible, pass in a length and offset.
Unfortunately it make the table harder to use.
 
B

Ben Bacarisse

Malcolm McLean said:
The Java class is not written by idiots. It is basically what you need
for a production generic queue class. However my code is not
production code. It is there to illustrate how to implement the
algorithm.

But it is not an excuse for your useless example.
The alternative to to write generic structures.

No. The alternative is to write a functional, useful, simple,
fixed-length queue of integers (since those are the very sensible
restrictions you put on your example). I am not asking you to make it
generic (in types), or flexible (in size), just usable as your text
claims it to be.

Write a short example that uses your add and remove functions do
anything at all. You'll have to put half of the code for add and
remove (the bits that checks if adding or removing is possible) into
the algorithm. You can't be suggesting that is how one should use a
simple queue -- by putting the slightly tricky full and empty tests
outside of any queue operations? Of course, it is quite reasonable to
leave them out of the add and remove functions for efficiency,
provided there is a user note: "don't call add() unless is_full()
returns false" (and the mirror for the awkwardly named remove()). In
an introductory text, I'd have add() and remove() do these tests, but
if you don't do that they need to be provided as queue operations.

If you'd said, "now you just need to define is_full() and is_empty()
functions and you can start using this queue" that would be a fine
exercise (but I'd include an answer later since it is slightly tricky)
but you seem to be suggesting that it can be used as you have left it.

Maybe this is at the heart of out differences. Maybe you are happy
that a significant implementation issue (how to distinguish between a
full and an empty circular buffer) be peppered about the algorithms
that use the buffer. If so, it partially explains why you left the
queue unfinished, but I don't think it is a good example to set, and it
is a terrible way to write maintainable programs. You do it in the
stack example, so maybe that is what you expect.

If this is not the reason, then I am just at a loss. A book called
Basic Algorithms includes code for a queue. This code can't be used
in any basic algorithm I know without first finishing the design by
deciding how to distinguish between full and empty queues and then either

(a) putting this code, which tests variables that belong to the queue
all over the algorithms that use the queue;
(b) rewriting add and remove so they can signal failure and have them do
these tests; or
(c) finish the interface by adding these two testing operations.

In all these cases I think you need to tell your readers that the queue
is not finished.

That's not the intention. The queue takes only integers and has a
hardwired size. The code shows someone who doesn't know that queues
are importnat data structures how to implement one. It doesn't give
nor is in intended to give a generic "queue" class.

I don't care that it not generic. Did I not say that enough yet? I
am curious that the intention was not that it be used. Given that
assumption, it is easier to write good books. Would it not have been
better to provide a simple but usable (allbeit limited) example? It
is only a few more lines of code. And in case you mistake my
meaning -- no need to make it generic or dynamic.
Exactly. You go to the book, then you implement your own queue, as
needed, using the code in the book as a reference if necessary.

Had you finished the implementation it would have been a more useful
reference. Is the "bug" in the code just the fact the you missed out
the sentence: "You need to make one further design decision and then
finish the implementation by writing is_full() and is_empty()
functions."?
There are many generic queue libraries. None has gained wide
acceptance. Therefore the bar for "usable" is in fact very high. This
is not true of other languages, but it is true of C.

Set the bar as low as you like. Yours is not usable without either
finishing it, rewriting it, or messing up the code that tries to use
it. I am talking about two functions of a line or two each. I can't
see why you think your "sketch" is more helpful without these or why
adding them is too complex for you to consider.
 
E

Eric Sosman

Richard said:
<OT>
James said:
[...]
Speaking of Falconer, his method (linear probing
with second hash value used for probe increment)
*requires* (even more so than quadratic probing,
see upthread) prime table size.
Well, not exactly: It requires only that the increment
between probes be relatively prime to the table size. A prime
table size makes this criterion easy to satisfy, but it's not
the only way. A table size equal to a power of two, for example,
works just fine if the probe increment is odd. (Degenerate
special case: pure linear probing with an increment of unity
works with any table size at all, prime or composite.)

In a thread in comp.programming (IIRCC) Paul Hsieh pointed out
that always using the same probe increment in a power of two
table size also has bad properties. His solution is to use the
unused bits of the full hash to select an increment size from a
table of primes.

I did not and do not imply that the probe increment must
be the same for all keys, nor even for all keys that probe
the same slot on the first attempt. This is one of the reasons
twin primes are popular as table sizes: you use the hash code
mod N for the first probe, and then if necessary you use the
hash mod (N-2) plus 1 as the increment for further probes. The
primality of N guarantees that the increment is relatively prime
to it; the primality of N-2 helps scramble groups of keys that
have simple relationships (sum1, sum2, sum3, ...).
[snip]

re: counterargument to storing the hash code.
- If the table stores pointers to items rather than item
instances, storing hash values in the table roughly
doubles the amount of memory required. If the same
amount of memory were used for a pointers-only table,
the number of table slots would double and the load
factor would be halved. This should lead to shorter
searches -- and it is not at all clear _a priori_ that
more fast probes will outrace fewer slow probes.

This doesn't sound right - ordinarily a table has to hold a
key/value pair. Adding a hash would increase the load factor by
50% - for a given amount of memory.

I think you said the same thing I did; perhaps we have a
confusion of terminology. (Not surprising, since "hash table"
itself is not "a" thing, but a wide and varied family of things.)

My point is that the choice between a pointer/hash pair and
a pointer alone is not clear-cut. Storing the pair allows one
to short-circuit the (possibly expensive) comparisons when the
search key's hash and the stored hash disagree. For the same
expenditure of memory, a pointer-only table could have twice as
many slots and a corresponding reduction in collision frequencies,
leading to fewer probes per search.

Hmmm: Maybe our disagreement is about where the key lives.
I've been assuming that the key is part of the item, so the
choice is between pointer/hash and pointer. But maybe you're
assuming the key is a separate entity, so the choice is between
key_pointer/data_pointer and key_pointer/data_pointer/hash. The
argument still holds with a different expansion factor: Storing
the hash now adds 50% to the table size, while leaving it out
allows 33% more slots and a load factor reduced by one-third.

If the table stores complete "records" rather than pointers
and if the records are significantly larger than hash values,
then storing the hashes exacts a proportionally smaller penalty
(flip side: declining to store them gains a proportionally smaller
advantage).

Measure, measure, measure!
 
R

Richard Heathfield

Ben Bacarisse said:

I'd have add() and remove() do these tests, but
if you don't do that they need to be provided as queue operations.

I pointed out the name clash with <stdio.h>'s remove() some days - or
possibly even weeks - ago. That should have been plenty of time to
correct the electronic version of the book (given that MM seems to have
plenty of time to discuss it, so I presume he has enough time to fix it
too - the more important task).

So I went to have a look, to see if it had been fixed yet. (This is the
first time I've looked at the chapter.)

MM's A123 XYZ / B123 CDE example is wrong, in that it suggests checking
Slot 134. If that's empty, MM says, B123 CDE goes goes in Slot 124. I
can see no justification whatsoever for checking slot (S + 10) before
deciding whether to use Slot S.

I was puzzled to find that hash table entries were not abstracted as the
table itself was. "Divide and conquer" is too useful a strategy to be
ignored lightly, especially in teaching.

Oddly, MM seems to be using int rather than size_t for recording lengths
and sizes (neither of which can be negative). Presumably this is a
typo, since no experienced C programmer would deliberately be that
daft.

His malloc is not ideal - answer = malloc(sizeof *answer); would be
preferable. He repeats this mistake later in the (poorly named)
hashtable() function.

Good functions perform a task. They are doers. And they need "doing
words" (which we sometimes call "verbs") in their names. For example,
stdio's "remove" and "rename" and "fopen" and "printf" - all reasonably
good names (given their creator's habit of terseness).

The indentation is appalling, and led me to believe at first that
hashtable() always returns 0. I had to double back to check. Blech.

There's a reference to a strdup call, but as far as I can see there is
neither a definition of a strdup function nor indeed a call to one.
There is a call to a mystrdup function, but that isn't the same thing.
(Nor is it defined, so we have no idea what it does.)

I wonder what "menas" means.

MM seems to be thinking that "quadratic probing" means "square the hash
value" (and that's a direct quote). Either he's misunderstood quadratic
probing completely, or I have. As far as I'm concerned, quadratic
probing has nothing whatsoever to do with the hash value except as a
place to start looking for a slot. My understanding is that, if the
hash value h has so far resulted in n collisions in a table of size s,
then the place to look next is slot (h + n*n) % s - and most certainly
NOT (h * h) % s!

At this point, I started to lose interest, since it seems evident that
the chapter is so badly broken as to be beyond recovery - and I started
to skim. At random, I discovered an invasion of implementation
namespace, a correct claim that the primality test was interesting (but
alas, it is interesting for all the wrong reasons), an incorrect claim
that trial division is the only guaranteed way to determine whether a
number is prime, and a prime number function that wrongly decides 2 is
not a prime number.

Overall, I found the chapter to be of disappointingly low quality, and I
could not possibly recommend it. If that chapter is the advertisement,
the product itself must be dire indeed.
 
M

Malcolm McLean

Richard Heathfield said:
Ben Bacarisse said:



I pointed out the name clash with <stdio.h>'s remove() some days - or
possibly even weeks - ago. That should have been plenty of time to
correct the electronic version of the book (given that MM seems to have
plenty of time to discuss it, so I presume he has enough time to fix it
too - the more important task).
Corrrections are being stored up for an edition 2, which will be released
shortly.
MM's A123 XYZ / B123 CDE example is wrong, in that it suggests
checking Slot 134. If that's empty, MM says, B123 CDE goes goes in Slot
124. I can see no justification whatsoever for checking slot (S + 10)
before
deciding whether to use Slot S.

I was puzzled to find that hash table entries were not abstracted as the
table itself was. "Divide and conquer" is too useful a strategy to be
ignored lightly, especially in teaching.
There are two tables. One stores data items of any size, one stores
pointers.
Oddly, MM seems to be using int rather than size_t for recording lengths
and sizes (neither of which can be negative). Presumably this is a
typo, since no experienced C programmer would deliberately be that
daft.
It's an algorithms book. The algorithms work on integers. The hash function
then has to to return a size_t, which is confusing to people using the book
to understand how to port to another language.
>
His malloc is not ideal - answer = malloc(sizeof *answer); would be
preferable. He repeats this mistake later in the (poorly named)
hashtable() function.
Too hard to read. malloc( sizeof *ptr) is a bad clc ism that you will seldom
see in another environment. The reader has to perform a mental dereference
which makes it harder to see if the size of correct.
Good functions perform a task. They are doers. And they need "doing
words" (which we sometimes call "verbs") in their names. For example,
stdio's "remove" and "rename" and "fopen" and "printf" - all reasonably
good names (given their creator's habit of terseness).
My convention is to have an opaque structure, and a constructor with the
name of the structure in lower case. This is analagous to C++. As you say,
create_xxx would have the advantage of being a verb.
The indentation is appalling, and led me to believe at first that
hashtable() always returns 0. I had to double back to check. Blech.
Read the actual book - preview is available. The html is dumped form the
Open Office book files, unfortunately it doesn't understand code formatting.
I really ought to be able to find a way around that, I know.
There's a reference to a strdup call, but as far as I can see there is
neither a definition of a strdup function nor indeed a call to one.
There is a call to a mystrdup function, but that isn't the same thing.
(Nor is it defined, so we have no idea what it does.)
strdup() is discussed in chapter 1.
I wonder what "menas" means.
typo.

MM seems to be thinking that "quadratic probing" means "square the hash
value" (and that's a direct quote). Either he's misunderstood quadratic
probing completely, or I have. As far as I'm concerned, quadratic
probing has nothing whatsoever to do with the hash value except as a
place to start looking for a slot. My understanding is that, if the
hash value h has so far resulted in n collisions in a table of size s,
then the place to look next is slot (h + n*n) % s - and most certainly
NOT (h * h) % s!
I'll have to check on this one.
At this point, I started to lose interest, since it seems evident that
the chapter is so badly broken as to be beyond recovery - and I started
to skim. At random, I discovered an invasion of implementation
namespace, a correct claim that the primality test was interesting (but
alas, it is interesting for all the wrong reasons), an incorrect claim
that trial division is the only guaranteed way to determine whether a
number is prime, and a prime number function that wrongly decides 2 is
not a prime number.

Overall, I found the chapter to be of disappointingly low quality, and I
could not possibly recommend it. If that chapter is the advertisement,
the product itself must be dire indeed.
You seem determined to pick faults. The problem it then becomes distorting,
like Ben Bacarisse's comments. It is then impossible to know whether the
criticisms actually represent anything legitimate or are just fussing.

abstracted - is an entry of arbitrary size abstract enough? The keys, yes, I
admit it might have been better to go for non-strings.

size_t - yes, there is an obvious case, but you don't seem to have
considered the implications.

malloc(size *ptr) - that's not the convention in most places. The argument
for it is based on the needs of the compiler rather than the human reader.

verbs for function names. There is a case, but it is not obvious that you
are right. C++ doesn't agree.

indentations. Yes, it's my fault for not getting a decent HTML dumper. If I
edit the html

strdup - discussed in chapter 1.

menas - unambiguously valid criticism.

quadratic probing - you might be right here.

prime stuff - yes, that needs a bit of fixing, I will agree.

So actually most of the criticism dissolves. I am not saying that there are
not things you can say against the book, or that it couldn't have been done
better. However can you point to a better introductory web page on hash
tables? That's the real test.
 
U

user923005

So actually most of the criticism dissolves. I am not saying that there are
not things you can say against the book, or that it couldn't have been done
better.

I don't understand how the criticism dissolves. I think that the
major issue is really the reaction to the criticism even more so than
the defects themselves.
The worst possible reaction is "The emperor's new clothes" reaction --
pretending that problems do not exist.
The best possible reaction is "Yes, I see the problem. I'll have it
fixed right away." -- and then actually fixing it right away.
Everyone makes mistakes, no matter how careful we are. But
recognizing our mistakes and correcting them is very, very important.
When creating a reference work, special effort should be made to
ensure correctness on the first pass. And diligent effort to improve
it is also highly important.
However can you point to a better introductory web page on hash
tables? That's the real test.

I suspect that something in this tangle will turn up worthwhile:
http://www.google.com/search?hl=en&q=introduction+to+hash+tables

Personally, I like this:
http://eternallyconfuzzled.com/tuts/datastructures/jsw_tut_hashtable.aspx
When the twisted minds of Julienne Walker and The EC Team get together
and write a book, I will buy a copy for sure.
 
M

Malcolm McLean

user923005 said:
I don't understand how the criticism dissolves. I think that the
major issue is really the reaction to the criticism even more so than
the defects themselves.
The worst possible reaction is "The emperor's new clothes" reaction --
pretending that problems do not exist.
The best possible reaction is "Yes, I see the problem. I'll have it
fixed right away." -- and then actually fixing it right away.
Everyone makes mistakes, no matter how careful we are. But
recognizing our mistakes and correcting them is very, very important.
When creating a reference work, special effort should be made to
ensure correctness on the first pass. And diligent effort to improve
it is also highly important.
I don't see how else I could react. Criticisms have been made, some of them
indisuptable bug reports, some of them very good points, some of them
defensible but not really good criticisms, some criticisms which fail to
understand the purpose of the code or the book, and some which are
objectively false.
That's what you would expect. However there is also a sort of assuption that
the book is no good. In fact I found a bug in Ben Pfaff's circular buffer
code within a couple of minutes, but no one was demanding that the code be
taken down. Nor did I report it in a silly way, like "this implementation
exhibits catastrophic and resource-hogging behaviour if size_t is used
beyond the range of a typical signed integer type". I just pointed it out,
as any sensible, experienced person does when making bug reports.
I suspect that something in this tangle will turn up worthwhile:
http://www.google.com/search?hl=en&q=introduction+to+hash+tables

Personally, I like this:
http://eternallyconfuzzled.com/tuts/datastructures/jsw_tut_hashtable.aspx
When the twisted minds of Julienne Walker and The EC Team get together
and write a book, I will buy a copy for sure.
It is a good description. The writing is rather dry and my car park metaphor
is more interesting. Also I describe how to do deletions with linear
probing. However it is a vey good explanation of hashing, it would be wrong
to say it is not.

The code works on global variables. Whilst this is acceptable for
demonstration code, it also tends to encourage bad programming habits.

This looks like a bug to me

(Quote recommended hash table website)

Insertion and deletion are equally simple relative to linear probing:

1void jsw_insert ( void *key, int len )
2...{
3 unsigned h = hash ( key, len ) % N;
4 unsigned step;
5
6 for ( step = 1; table[h] != NULL; ++step )
7 h = ( h + ( step * step - step ) / 2 ) % N;
8
9 table[h] = key;
10}
1void jsw_remove ( void *key, int len )
2...{
3 unsigned h = hash ( key, len ) % N;
4 unsigned step;
5
6 for ( step = 1; table[h] != NULL; ++step )
7 h = ( h + ( step * step - step ) / 2 ) % N;
8
9 table[h] = DELETED;
10}

You need to call the comparison function and break if equal. As it is the
code marks a null at the end of the cluster as deleted.

There are always problems. As you say, it depends how you react to them.
There are errors and weaknesses in Basic Algorithms, of course. However you
can tell by the tone that mostly these are used as a pretext for envy.
 
W

websnarf

Richard said:
Eric Sosman said:
James Dow Allen wrote:
[...]
Speaking of Falconer, his method (linear probing with
second hash value used for probe increment) *requires*
(even more so than quadratic probing, see upthread)
prime table size.

Well, not exactly: It requires only that the
increment between probes be relatively prime to the table
size. A prime table size makes this criterion easy to
satisfy, but it's not the only way. A table size equal
to a power of two, for example, works just fine if the
probe increment is odd. (Degenerate special case: pure
linear probing with an increment of unity works with any
table size at all, prime or composite.)
In a thread in comp.programming (IIRCC) Paul Hsieh pointed
out [...]

Yeah it was probably me, since I am unable to find a citation from
anyone else that seems to be aware of this. Its as if people think
there is no need to think past Knuth.
[...] that always using the same probe increment in a
power of two table size also has bad properties. His
solution is to use the unused bits of the full hash to
select an increment size from a table of primes.

That is dangerous. The purpose of the multiple hash is to
have different search patterns for different objects that
have the same primary hash pattern.

No, its to have a different search pattern for different object that
have the same primary hash *INDEX* (i.e., post masking). This is
obvious since, for any good hash function, index collisions will be
many many times more common than full hash function collisions.

Mathematically speaking, on average a single 32-bit (good) hash
function collision will occur roughly when 65536 entries appear, while
a hash index collision will occur roughly when the square root of the
table size number of entries occur (so for a 9 million entry table,
that's just 3000 entries).
This avoids clustering, and tends to keep
the search chain short. The very iffy thing in hashlib is
the reduction of the range of the secondary hash, which
encourages light clustering in nearly empty tables.

You are ignoring the fact that skip values of 2, 4 and 6, for example,
for indexes starting on even offsets near each other will tend to
collide with each other as the table fills up.

Same skip-value, and commensurate offset collisions must be minimized
by increasing the chance they miss each other. This happens most
easily if the skip values are predominantly co-prime, which is easiest
to construct when they are simply taken from a list of prime numbers.

Now if we imagine a rough worse case scenario of about a 17-sequence
longest chain, then that's at most 16 collisions from other chains. A
sample size of 16 from a random set will cause about 1 collision if
the random set is 256 entries in size. So a prime table of 256, 512
or 1024 entries should be fine (notice how this calculation is
independent of table size). And obviously a quick way of getting at
those tables is through the high bits of a full 32 bit hash function.
[...] The
system attempts to maintain tables at between 44 and 88% of
capacity.

Also, I want the table action to be independent of the hash
functions (apart from speed), so the functions are computed
every time needed. The system functions with all the hash
functions replaced by the constant 1 (unity) (tested in the
test suite) at a considerable speed penalty. So the action
is hash function independent, but the speed is highly
dependent.

I don't understand the point you are making. This is an interesting
robustness test that any well defined hash table should be able to
pass. Its a good idea, but independent of what's being discussed.
 
F

Flash Gordon

Malcolm McLean wrote, On 18/08/07 10:58:
I don't see how else I could react.

Had you started reacting by either fixing the valid criticisms or
publishing an errata for them the reaction to your book would not
steadily get worse.
> Criticisms have been made, some of
them indisuptable bug reports,

For which you have not published an errata so that your audience know
about them.
> some of them very good points,

For which you have not published an errata so that your audience knowa
about them.
> some of
them defensible but not really good criticisms,

If it is a defensible criticism then it still needs to be dealt with.
> some criticisms which
fail to understand the purpose of the code or the book,

It is to present and teach about basic algorithms. As far as I can see
everyone understands that, however as this group is about C rather than
algorithms a significant amount of the criticism has focused on the C
code, and this has been pointed out to you.
> and some which
are objectively false.

This is no reason to dismiss all the other criticisms and not publish an
errata for them.
That's what you would expect.

So why does it make you get defensive and start attacking those
criticising it rather than publishing an errata for the things you
accept are either wrong or could be improved?
> However there is also a sort of assuption
that the book is no good.

No, it is not an assumption, it is an opinion formed by first reading
the material you present and then seeing your reaction to criticism of it.
> In fact I found a bug in Ben Pfaff's circular
buffer code within a couple of minutes, but no one was demanding that
the code be taken down.

You noted in your criticism that it was a failure that you would be
unlikely to hit in the real world, since in the real world you would not
be requesting a buffer large enough to hit the problem, and people did
not attack you for criticising it.
> Nor did I report it in a silly way, like "this
implementation exhibits catastrophic and resource-hogging behaviour if
size_t is used beyond the range of a typical signed integer type". I
just pointed it out, as any sensible, experienced person does when
making bug reports.

In my opinion peoples reaction to you has become more extreme because
you refuse to accept any criticism of anything you post.
It is a good description. The writing is rather dry and my car park
metaphor is more interesting. Also I describe how to do deletions with
linear probing. However it is a vey good explanation of hashing, it
would be wrong to say it is not.

The code works on global variables. Whilst this is acceptable for
demonstration code, it also tends to encourage bad programming habits.

This looks like a bug to me

A valid criticism, so when will you fix this particular bug in your
stack and queue implementations? You are the one saying using globals is
a bug, so you cannot now claim it isn't.
(Quote recommended hash table website)

Insertion and deletion are equally simple relative to linear probing:

That is only one of several methods of doing insertion and deletion
presented in the text, and not even the first method of dealing with
clashes that is presented! Before mentioning the linear probing, it even
states there are several methods, and after the linear probing it goes
on to quadratic probing. So your criticism here is misleading at best.
1void jsw_insert ( void *key, int len )
2...{
3 unsigned h = hash ( key, len ) % N;
4 unsigned step;
5
6 for ( step = 1; table[h] != NULL; ++step )
7 h = ( h + ( step * step - step ) / 2 ) % N;
8
9 table[h] = key;
10}
1void jsw_remove ( void *key, int len )
2...{
3 unsigned h = hash ( key, len ) % N;
4 unsigned step;
5
6 for ( step = 1; table[h] != NULL; ++step )
7 h = ( h + ( step * step - step ) / 2 ) % N;
8
9 table[h] = DELETED;
10}

You need to call the comparison function and break if equal. As it is
the code marks a null at the end of the cluster as deleted.

Immediately after that code it says, "Keep in mind that the insertion
algorithm must also be modified..." so this is clearly presented as NOT
being a complete implementation of linear probing.
There are always problems.

Yes, but at least some of your criticisms of that text look like they
were deliberately trying to make it look bad by, for example, saying
that it uses linear probing whilst ignoring that it does this just after
saying there are several methods and when it goes on to quadratic
immediately after.
> As you say, it depends how you react to them.

Also how you react to the criticisms and whether they are valid. You
have just presented invalid criticisms in an apparant attempt to make
your work look better.
There are errors and weaknesses in Basic Algorithms, of course.

Yet you don't publish an errata for those you know know about and have
admitted.
> However
you can tell by the tone that mostly these are used as a pretext for envy.

That is your opinion of peoples motives, but it certainly is not my
motive and I don't believe it is the motive of the other criticisms. I
just don't like bad code being presented or bad advice.
 
P

pete

Flash said:
Malcolm McLean wrote, On 18/08/07 10:58:

You noted in your criticism that it was a failure that you would be
unlikely to hit in the real world,
since in the real world you would not
be requesting a buffer large enough to hit the problem, and people did
not attack you for criticising it.

I took a look at that,
but I don't hash and I didn't understand the code.

What was it that was being counted
by the size_t type object?

If it was the number of allocations,
then size_t is the wrong type.
The concept of what size_t is supposed to be,
is not related to how many allocated
objects can exist at the same time.
There's nothing in the standard which suggests
that there should be a type which is capable of holding
the number of allocated objects that can exist at one time.
The best type for counting allocations, is the largest unsigned type.

Or was it something else that was being counted
by the size_t type in Ben's circular buffer code?
 

Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only take a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Members online

No members online now.

Forum statistics

Threads
474,432
Messages
2,571,682
Members
48,796
Latest member
Greg L.

Latest Threads

Top