Patricia said:
Here's my attempt. The test cases shown in the main method
are ALL the testing I've done.
It works for the other 4294967292 inputs too. I've tried 'em all ;-)
int roundedBits = Float.floatToIntBits(rounded);
float truncated = Float.intBitsToFloat(roundedBits - 1);
return truncated;
That's clever. At least, I /hope/ it's clever because I don't understand why
it works. Do you care to expand a bit ? Please ?
I got interested in this myself and have tried out a number of approaches,
including an attempt to convert rounding into truncation by consideration of
ULPs (as I suggested to the OP earlier). The best, IMO, is to do the
truncation in the integer domain before converting into a float. Its a little
complicated to do, but it runs fast, and has the aesthetic advantage of not
requiring double-precision operations to produce a single-precision result
(which all the other things I tried do).
I've appended the code, including the test-harness, for anyone who's
interested.
-- chris
=========== Test.java ===============
/*
* contains several static implementations of truncating a 32-bit integer
* to a 32-bit float with rounding "towards zero" rather that to the nearest
* float (as in the Java language spec for casting an int to a float).
* Also contains an exhaustive test which checks all the implementations
* against the simplest one (presumed to be correct).
*
* Approximate times when run on a 1.5.0_06-b05 JVM, in "client" mode on
* a 1.5 GHz celeron WinXP Pro box. In each case the time is the time in
* nanoseconds to convert one value averaged over the full 32-bit range of
* integers.
*
* Slow: 130
* ULP: 63.5
* FastULP: 65
* Twiddle: 19
* Patricia: 74.5
*
* Approximate times for inputs in the resticted range [-10,000,000,
10,000,000),
* 83% of which is exactly representable as a float.
*
* Slow: 12
* ULP: 12
* FastULP: 20.5
* Twiddle: 18.5
* Patricia: 24
*/
public class Test
{
public static void
main(String[] args)
{
int i = 0;
int errors = 0;
for (;
{
if (i % 10000000 == 0)
{
// progress...
System.out.printf("%11d...\r", i);
System.out.flush();
}
float shouldBe = slowTruncate(i);
float is;
if ((is = ulpTruncate(i)) != shouldBe)
{
System.out.printf("Slow: %d: %f != %f%n", i, is, shouldBe);
errors++;
}
if ((is = fastTruncate(i)) != shouldBe)
{
System.out.printf("Fast: %d: %f != %f%n", i, is, shouldBe);
errors++;
}
if ((is = twiddleTruncate(i)) != shouldBe)
{
System.out.printf("Twiddle: %d: %f != %f%n", i, is, shouldBe);
errors++;
}
if ((is = patriciaTruncate(i)) != shouldBe)
{
System.out.printf("Patricia: %d: %f != %f%n", i, is, shouldBe);
errors++;
}
if (errors > 10)
break;
if (i == -1)
break; // we've wrapped all the way around
i++;
}
System.out.printf("%ndone, %d errors%n", errors);
}
/**
* Slow implementation of truncating an int to a 32-bit floating point.
* This implementation is intended to be the "so simple it can't possibly
* be wrong" benchmark against which other implementations can be compared
*/
public static float
slowTruncate(int i)
{
// since all +ve numbers have negations, but the reverse
// isn't true, we work with -ve numbers
if (i > 0)
return -slowTruncate(-i);
// loop up (towards zero) until we find the right answer
for (;
{
float f = (float)i;
if ((double)f >= (double)i)
return f;
i++;
}
}
/**
* Implementation of truncating an int to a 32-bit floating point based on
* ULP manipulation
*/
public static float
ulpTruncate(int i)
{
float f = (float)i;
double error = (double)f - (double)i;
// no error ?
if (error == 0)
return f;
// or rounded towards 0 anyway ?
if ((error < 0) == (i > 0))
return f;
// adjust by one ulp towards 0
// the double ulp() is because ulp is the distance to the next
// number away from 0, we need the distance to the next float
// nearest zero
if (i > 0)
{
float ulp = Math.ulp(f-Math.ulp(f));
f -= ulp;
}
else
{
float ulp = Math.ulp(f+Math.ulp(f));
f += ulp;
}
return f;
}
/**
* Faster implementation of truncating an int to a 32-bit floating point.
* This implementation uses a simple test to catch a useful range of
* "eaasy" cases. If the input is likely to be in that range then
* this provides a useful optimisation, otherwise it's marginally
* slower.
* NB1: this version uses ulpTruncate() for the difficult cases, but
* it could just as easily use a different one -- even slowTruncate().
* NB2: since this consists only of a simple test guarding simple
operations,
* it seems reasonable to hope that the JIT will inline it in callers
*/
public static float
fastTruncate(int i)
{
// these are all exact
if (i >= -8388608 && i <= 8388608)
return (float)i;
else
return ulpTruncate(i); // or any other xxxTruncate()
}
/**
* Faster implementation of truncating an int to a 32-bit floating point
based
* on bit-twiddling.
* The idea is to take the number, discover how many high-bits it has which
* don't fit into the 24-bit mantissa of a float, and to mask off that many
* low bits before converting to a float. I.e. we do the truncation in the
* integer domain before converting to a float, so the conversion is always
* is always exact.
* The actual implementation is complicated by sign issues, but that's the
* basic idea.
* Note that we don't actually mess with the physical layout of an IEEE
float,
* but only use the fact that it has exactly 24 bits in which to represent
the
* mantissa
*/
public static float
twiddleTruncate(int i)
{
if (i >= 0)
{
// because i is +ve the high bit is always zero,
// and so (i >>> 24) is always in the range [0, 128)
return (float)(i & MASKS[i >>> 24]);
}
// nasty special case
if (i == Integer.MIN_VALUE)
return -2147483648.0f;
return -twiddleTruncate(-i);
}
// helper data for twiddleTruncate(). A set of precomuted masks which
// will be indexed by bits [24, 30] (zero-based) of the input integer.
private static final int[] MASKS = new int[128];
static
{
for (int i = 0; i < 128; i++)
MASKS
= -1;
for (int bit = 0; bit < 7; bit++)
{
int threshold = 1 << bit;
int mask = 1 << bit;
for (int i = threshold; i < 128; i++)
MASKS ^= mask;
}
}
/**
* Clever implementation of truncating an int to a 32-bit floating point
posted
* to comp.lang.java.programmer by Patricia Shanahan on 13 May 2006
* (Message-ID: <[email protected]> )
*/
public static float
patriciaTruncate(int i)
{
float rounded = (float) i;
if (Math.abs((double) rounded) <= Math.abs((double) i))
return rounded;
int roundedBits = Float.floatToIntBits(rounded);
float truncated = Float.intBitsToFloat(roundedBits - 1);
return truncated;
}
}
==========================