What do your benchmarks show for this approach vs. HashSet?
Well, that the precise data inputs matter a lot (see benchmark code
below). For the test as written, the custom logic approach is roughly
twice as fast. But.. I saw that the length of input Strings matter a
lot, which is to be expected since the longer the Strings, the deeper
the if nesting gets. It's a race between HashSet's O(1) logic vs my
O(n) logc.
I also had a quick look at the generated bytecode (using Eclipse's
compiler, javac fans YMMV).. and found some recurring sillyness which
would not feature in the hand generated logic, so the custom approach
would give a significant performance advantage over HashSet.contains,
but only for really short Strings. Mine are about 3-5x longer than
those in the benchmark.. so for me, HashSet.contains() and String's
clever hashCode() wins.
import java.util.HashSet;
import java.util.Set;
public class MatchStringSetBenchmark {
private static final int TIMES_TO_RUN_TEST = 10;
private static final int HAMMER_SIZE = 1000000;
private static final double TIMES_UNROLLED = 6;
public static void main(String[] args) {
String[] stringsToMatch = { "hello", "highe", "maybe" };
Set<String> realSet = fillHashSet(stringsToMatch);
for (int t = 0; t < TIMES_TO_RUN_TEST; t++) {
long then = System.nanoTime();
for (int i = 0; i < HAMMER_SIZE; i++) {
// three matching inputs and three failing inputs to
fairly distribute
// match/fail bias in algorithm
// assert realSet.contains("hello");
// assert realSet.contains("hello");
// assert realSet.contains("hello");
// assert !realSet.contains("hella");
// assert !realSet.contains("hella");
// assert !realSet.contains("hella");
realSet.contains("hello");
realSet.contains("hello");
realSet.contains("hello");
realSet.contains("hella");
realSet.contains("hella");
realSet.contains("hella");
}
long now = System.nanoTime();
long elapsed = now - then;
double costOfSearch = ((double) elapsed) / HAMMER_SIZE *
TIMES_UNROLLED;
System.out.println("ns per HashSet.contains(): " +
costOfSearch);
}
for (int t = 0; t < TIMES_TO_RUN_TEST; t++) {
long then = System.nanoTime();
for (int i = 0; i < HAMMER_SIZE; i++) {
// assert contains("hello");
// assert contains("hello");
// assert contains("hello");
// assert ! contains("hella");
// assert ! contains("hella");
// assert ! contains("hella");
contains("hello");
contains("hello");
contains("hello");
contains("hella");
contains("hella");
contains("hella");
}
long now = System.nanoTime();
long elapsed = now - then;
double costOfSearch = ((double) elapsed) / HAMMER_SIZE *
TIMES_UNROLLED;
System.out.println("ns per CUSTOM.contains(): " +
costOfSearch);
}
}
private static Set<String> fillHashSet(String[] stringsToMatch) {
Set<String> realSet = new HashSet<String>();
for (String string : stringsToMatch) {
realSet.add(string);
}
return realSet;
}
public static boolean contains(String input) {
char first = input.charAt(0);
if (first == 'h') {
char second = input.charAt(1);
if (second == 'e') {
char third = input.charAt(2);
if (third == 'l') {
char fourth = input.charAt(3);
if (fourth == 'l') {
char fifth = input.charAt(4);
return (fifth == 'o');
} else {
return false;
}
} else {
return false;
}
}
if (second == 'i') {
char third = input.charAt(2);
if (third == 'g') {
char fourth = input.charAt(3);
if (fourth == 'h') {
char fifth = input.charAt(4);
return (fifth == 'e');
} else {
return false;
}
} else {
return false;
}
}
return false;
}
if (first == 'm') {
char second = input.charAt(1);
if (second == 'a') {
char third = input.charAt(2);
if (third == 'y') {
char fourth = input.charAt(3);
if (fourth == 'b') {
char fifth = input.charAt(4);
return (fifth == 'e');
} else {
return false;
}
} else {
return false;
}
}
}
return false;
}
}