I
Ingo Nolden
Dear Group,
I am a little confused by the result of a code that should give me some
information about CPU cache effects.
I wrote a function that performs some flops on a vector/array of doubles
or floats and of changing size . While playing around and trying
different things I compared the use of a standard c array with a
std::vector and the vector from the boost library.
The program was compiled with a VC++ 7.1 comiler with the default
release settings, and later with also whole program optimization and
global optimization activated ( which made no difference ).
On my laptop intel P4 2.6GHz and 512 Mbyte RAM the result was extremely
surprising as the raw c array performed as expected in a range between
300 and 350 MFlops ( if my Flops calculation is right ).
The other arrays however were about 80 times !!!! slower.
I had them expected to be probably some percentage slower.
Investigating the asm code ( as far as I can guess what it means ) seemd
to be doing the same thing however.
This made me think that it must be an issue about memory access. It can
not be due to main memory size because the difference occurs at any
array size, beginning from 1 Mbyte.
Now I wanted to prove that it is not an compiler/optimization dependant
issue. I ran the executable on a different machine, which is a AMD
Athlon 2400+ Desktop with 1Gb RAM. On this machine I got an even more
surprising result:
The std::vector and uBlas::vector performed well and even superceded the
plain c array.
I usually don't care so much about performance, but 8000% is worth
thinking about it.
Below is my source. If one has no uBlas at hand, he can comment the two
lines and it should work.
Also, to get back to my original intention, I want to change from
sequential access to arbitrary access of the vector items. Does anyone
know a good/standard way to do so? It should put additional effort on
the CPU.
So, here goes my code:
#include <iostream>
#include <fstream>
#include <vector>
#include <list>
#include <windows.h>
//#include <math.h>
//#include <float.h>
#include <boost/numeric/ublas/vector.hpp>
using namespace std;
ofstream trash( "trash.txt" );
namespace my
{
template< typename ValueT >
inline ValueT const& max( ValueT const& l, ValueT const& r )
{
return ( l > r ) ? l : r;
}
template< typename ValueT >
inline ValueT const& min( ValueT const& l, ValueT const& r )
{
return ( l < r ) ? l : r;
}
}
template< typename ValueT, typename ArrayT > inline
void InitializeArray( ArrayT array, unsigned &length, ValueT &initializer )
{
for( unsigned i = 0; i < length; ++i )
array[ i ] = initializer;
}
template< typename ValueT > inline
ValueT ProcessMixedOps( ValueT &value )
{
return static_cast<ValueT>( (1.0 + value) * (1.5 - value) / value );
}
template< typename ValueT, typename ArrayT > inline
ValueT ProcessMixedArray( ArrayT &array, unsigned &length, unsigned &loops )
{
ValueT result = 1;
for( unsigned j = 0; j < loops; ++j )
for( unsigned i = 0; i < length; ++i )
result *= ProcessMixedOps( array[ i ] );
return result;
}
template< typename ValueT >
class Memory
{
public:
template< typename ArrayT >
ArrayT Alloc( unsigned &length )
{
return ArrayT( length );
}
template<>
ValueT* Alloc< ValueT* >( unsigned &length )
{
return new ValueT[ length ];
}
template< typename ArrayT >
void Dealloc( ArrayT &array )
{
//array.clear( );
}
template<>
void Dealloc< ValueT* >( ValueT* &array )
{
delete array;
}
};
template< typename ValueT, typename ArrayT >
double Test( ValueT init, unsigned memLength )
{
unsigned length = memLength / sizeof( ValueT );
ArrayT Vector = Memory<ValueT>( ).Alloc<ArrayT>( length );
InitializeArray( Vector, length, init );
unsigned loops = my::max( 10000000 / length, (unsigned)1 );
unsigned tick = GetTickCount( );
double res = ProcessMixedArray<ValueT>( Vector, length, loops );
tick = GetTickCount( ) - tick;
Memory<ValueT>( ).Dealloc<ArrayT>( Vector );
double dSec = (double) tick / (double) 1000;
trash << res << endl; // output and forget result
double dFlops = (double) length * 5.0 * loops;
double dMFlops = dFlops / 1000000.0;
return dMFlops / dSec;
}
int main2( )
{
//unsigned min_size_p = 10; // 2 ^ 10 = 1.024
//unsigned min_size_p = 12; // 2 ^ 12 = 4.096
unsigned min_size_p = 1000000; // 2 ^ 3 = 16.384
unsigned max_size_p = 200000000; // 2 ^ 25 = 33.554.432
//unsigned max_size_p = 10; // 2 ^ 25 = 33.554.432
cout << "Max Vector memory length: ";
cout << (unsigned)pow( 2, max_size_p );
cout << endl;
DWORD dwNumber = GetTickCount( );
dwNumber = GetTickCount( ) / dwNumber;
short number = static_cast<short>( dwNumber );
cout << "number: " << number << endl;
//cout << "double \t float \t int\n";
cout << "\tdouble* 1\tvector<double>\tuBlas::vector<double>\n";
for( unsigned v_size_p = min_size_p ; v_size_p < max_size_p; v_size_p
+= 2000000 )
{
unsigned size = v_size_p;//(unsigned)pow( 2, v_size_p );
cout << fixed << size << "\t";
cout << Test<double, double*>( number, size ) << "\t";
cout << Test<double, vector<double> >( number, size ) << "\t";
cout << Test<double, boost::numeric::ublas::vector< double> >( number,
size ) << "\t";
//cout << Test<double, list<double> >( number, size ) << "\t";
//cout << Test<float, float*>( number, size ) << "\t";
//cout << Test<float, vector<float> >( number, size ) << "\t";
//cout << Test<double, double*>( number, size ) << "\t";
//cout << Test<double, double*>( number, size ) << "\t";
//cout << Test<double, double*>( number, size ) << "\t";
//cout << Test<double, double*>( number, size ) << "\t";
//cout << Test<float, float*>( number, size ) << "\t";
//cout << Test<int, int*>( number, size ) << "\t";
cout << "\n";
}
//cout << dMFlops << " / " << dSec << " = " << dMFlops / dSec;
cout << endl;
return 0;
}
int main( )
{
return main2( );
}
I am a little confused by the result of a code that should give me some
information about CPU cache effects.
I wrote a function that performs some flops on a vector/array of doubles
or floats and of changing size . While playing around and trying
different things I compared the use of a standard c array with a
std::vector and the vector from the boost library.
The program was compiled with a VC++ 7.1 comiler with the default
release settings, and later with also whole program optimization and
global optimization activated ( which made no difference ).
On my laptop intel P4 2.6GHz and 512 Mbyte RAM the result was extremely
surprising as the raw c array performed as expected in a range between
300 and 350 MFlops ( if my Flops calculation is right ).
The other arrays however were about 80 times !!!! slower.
I had them expected to be probably some percentage slower.
Investigating the asm code ( as far as I can guess what it means ) seemd
to be doing the same thing however.
This made me think that it must be an issue about memory access. It can
not be due to main memory size because the difference occurs at any
array size, beginning from 1 Mbyte.
Now I wanted to prove that it is not an compiler/optimization dependant
issue. I ran the executable on a different machine, which is a AMD
Athlon 2400+ Desktop with 1Gb RAM. On this machine I got an even more
surprising result:
The std::vector and uBlas::vector performed well and even superceded the
plain c array.
I usually don't care so much about performance, but 8000% is worth
thinking about it.
Below is my source. If one has no uBlas at hand, he can comment the two
lines and it should work.
Also, to get back to my original intention, I want to change from
sequential access to arbitrary access of the vector items. Does anyone
know a good/standard way to do so? It should put additional effort on
the CPU.
So, here goes my code:
#include <iostream>
#include <fstream>
#include <vector>
#include <list>
#include <windows.h>
//#include <math.h>
//#include <float.h>
#include <boost/numeric/ublas/vector.hpp>
using namespace std;
ofstream trash( "trash.txt" );
namespace my
{
template< typename ValueT >
inline ValueT const& max( ValueT const& l, ValueT const& r )
{
return ( l > r ) ? l : r;
}
template< typename ValueT >
inline ValueT const& min( ValueT const& l, ValueT const& r )
{
return ( l < r ) ? l : r;
}
}
template< typename ValueT, typename ArrayT > inline
void InitializeArray( ArrayT array, unsigned &length, ValueT &initializer )
{
for( unsigned i = 0; i < length; ++i )
array[ i ] = initializer;
}
template< typename ValueT > inline
ValueT ProcessMixedOps( ValueT &value )
{
return static_cast<ValueT>( (1.0 + value) * (1.5 - value) / value );
}
template< typename ValueT, typename ArrayT > inline
ValueT ProcessMixedArray( ArrayT &array, unsigned &length, unsigned &loops )
{
ValueT result = 1;
for( unsigned j = 0; j < loops; ++j )
for( unsigned i = 0; i < length; ++i )
result *= ProcessMixedOps( array[ i ] );
return result;
}
template< typename ValueT >
class Memory
{
public:
template< typename ArrayT >
ArrayT Alloc( unsigned &length )
{
return ArrayT( length );
}
template<>
ValueT* Alloc< ValueT* >( unsigned &length )
{
return new ValueT[ length ];
}
template< typename ArrayT >
void Dealloc( ArrayT &array )
{
//array.clear( );
}
template<>
void Dealloc< ValueT* >( ValueT* &array )
{
delete array;
}
};
template< typename ValueT, typename ArrayT >
double Test( ValueT init, unsigned memLength )
{
unsigned length = memLength / sizeof( ValueT );
ArrayT Vector = Memory<ValueT>( ).Alloc<ArrayT>( length );
InitializeArray( Vector, length, init );
unsigned loops = my::max( 10000000 / length, (unsigned)1 );
unsigned tick = GetTickCount( );
double res = ProcessMixedArray<ValueT>( Vector, length, loops );
tick = GetTickCount( ) - tick;
Memory<ValueT>( ).Dealloc<ArrayT>( Vector );
double dSec = (double) tick / (double) 1000;
trash << res << endl; // output and forget result
double dFlops = (double) length * 5.0 * loops;
double dMFlops = dFlops / 1000000.0;
return dMFlops / dSec;
}
int main2( )
{
//unsigned min_size_p = 10; // 2 ^ 10 = 1.024
//unsigned min_size_p = 12; // 2 ^ 12 = 4.096
unsigned min_size_p = 1000000; // 2 ^ 3 = 16.384
unsigned max_size_p = 200000000; // 2 ^ 25 = 33.554.432
//unsigned max_size_p = 10; // 2 ^ 25 = 33.554.432
cout << "Max Vector memory length: ";
cout << (unsigned)pow( 2, max_size_p );
cout << endl;
DWORD dwNumber = GetTickCount( );
dwNumber = GetTickCount( ) / dwNumber;
short number = static_cast<short>( dwNumber );
cout << "number: " << number << endl;
//cout << "double \t float \t int\n";
cout << "\tdouble* 1\tvector<double>\tuBlas::vector<double>\n";
for( unsigned v_size_p = min_size_p ; v_size_p < max_size_p; v_size_p
+= 2000000 )
{
unsigned size = v_size_p;//(unsigned)pow( 2, v_size_p );
cout << fixed << size << "\t";
cout << Test<double, double*>( number, size ) << "\t";
cout << Test<double, vector<double> >( number, size ) << "\t";
cout << Test<double, boost::numeric::ublas::vector< double> >( number,
size ) << "\t";
//cout << Test<double, list<double> >( number, size ) << "\t";
//cout << Test<float, float*>( number, size ) << "\t";
//cout << Test<float, vector<float> >( number, size ) << "\t";
//cout << Test<double, double*>( number, size ) << "\t";
//cout << Test<double, double*>( number, size ) << "\t";
//cout << Test<double, double*>( number, size ) << "\t";
//cout << Test<double, double*>( number, size ) << "\t";
//cout << Test<float, float*>( number, size ) << "\t";
//cout << Test<int, int*>( number, size ) << "\t";
cout << "\n";
}
//cout << dMFlops << " / " << dSec << " = " << dMFlops / dSec;
cout << endl;
return 0;
}
int main( )
{
return main2( );
}