Help optimize nbody bench program (c++ sse2 intrinsics)

Melzzzzz · Oct 12, 2012

I have succeed to optimize nbody program to be faster
than c++ what we have on shootout site currently
but is still slower than intel fortran

http://shootout.alioth.debian.org/u64q/performance.php?test=nbody

I have started from java implementation and converted it to c++,
but it seems somewhat clumsy.
If someone is interested to play with this I will be glad to
see improvement

On q6600 @ 2.4 GHz executes:

[bmaxa@bmaxa nbody]$ g++ -Wall -O3 -std=c++0x -msse2 nbodysse2.cpp -o nbodysse2cpp
[bmaxa@bmaxa nbody]$ time ./nbodysse2cpp 50000000
-0.169075164
-0.169059907

real 0m17.108s
user 0m17.076s
sys 0m0.022s

while program that is currently on site executes:

[bmaxa@bmaxa nbody]$ time ./nbodycpp 50000000
-0.169075164
-0.169059907

real 0m20.621s
user 0m20.554s
sys 0m0.039s

Program follows:

/* The Computer Language Benchmarks Game

http://shootout.alioth.debian.org/

contributed by Mark C. Lewis

modified slightly by Chad Whipkey

converted to c++ and added sse2 support by Branimir Maksimovic

*/
#include <cstdio>
#include <cmath>
#include <cstdlib>
#include <vector>
#include <immintrin.h>

static const double PI = 3.141592653589793;
static const double SOLAR_MASS = 4 * PI * PI;
static const double DAYS_PER_YEAR = 365.24;

class Body {

public: double x, y, z, filler, vx, vy, vz, mass;

public: Body(){}

static Body& jupiter(){
static Body p;
p.x = 4.84143144246472090e+00;
p.y = -1.16032004402742839e+00;
p.z = -1.03622044471123109e-01;
p.vx = 1.66007664274403694e-03 * DAYS_PER_YEAR;
p.vy = 7.69901118419740425e-03 * DAYS_PER_YEAR;
p.vz = -6.90460016972063023e-05 * DAYS_PER_YEAR;
p.mass = 9.54791938424326609e-04 * SOLAR_MASS;
return p;
}

static Body& saturn(){
static Body p;
p.x = 8.34336671824457987e+00;
p.y = 4.12479856412430479e+00;
p.z = -4.03523417114321381e-01;
p.vx = -2.76742510726862411e-03 * DAYS_PER_YEAR;
p.vy = 4.99852801234917238e-03 * DAYS_PER_YEAR;
p.vz = 2.30417297573763929e-05 * DAYS_PER_YEAR;
p.mass = 2.85885980666130812e-04 * SOLAR_MASS;
return p;
}

static Body& uranus(){
static Body p;
p.x = 1.28943695621391310e+01;
p.y = -1.51111514016986312e+01;
p.z = -2.23307578892655734e-01;
p.vx = 2.96460137564761618e-03 * DAYS_PER_YEAR;
p.vy = 2.37847173959480950e-03 * DAYS_PER_YEAR;
p.vz = -2.96589568540237556e-05 * DAYS_PER_YEAR;
p.mass = 4.36624404335156298e-05 * SOLAR_MASS;
return p;
}

static Body& neptune(){
static Body p;
p.x = 1.53796971148509165e+01;
p.y = -2.59193146099879641e+01;
p.z = 1.79258772950371181e-01;
p.vx = 2.68067772490389322e-03 * DAYS_PER_YEAR;
p.vy = 1.62824170038242295e-03 * DAYS_PER_YEAR;
p.vz = -9.51592254519715870e-05 * DAYS_PER_YEAR;
p.mass = 5.15138902046611451e-05 * SOLAR_MASS;
return p;
}

static Body& sun(){
static Body p;
p.mass = SOLAR_MASS;
return p;
}

Body& offsetMomentum(double px, double py, double pz){
vx = -px / SOLAR_MASS;
vy = -py / SOLAR_MASS;
vz = -pz / SOLAR_MASS;
return *this;
}
};

template <typename T, std::size_t N = 16>
class AlignmentAllocator {
public:
typedef T value_type;
typedef std::size_t size_type;
typedef std:

trdiff_t difference_type;

typedef T * pointer;
typedef const T * const_pointer;

typedef T & reference;
typedef const T & const_reference;

public:
inline AlignmentAllocator () throw () { }

template <typename T2>
inline AlignmentAllocator (const AlignmentAllocator<T2, N> &) throw () { }

inline ~AlignmentAllocator () throw () { }

inline pointer adress (reference r) {
return &r;
}

inline const_pointer adress (const_reference r) const {
return &r;
}

inline pointer allocate (size_type n) {
return (pointer)_mm_malloc(n*sizeof(value_type), N);
}

inline void deallocate (pointer p, size_type) {
_mm_free (p);
}

inline void construct (pointer p, const value_type & wert) {
new (p) value_type (wert);
}

inline void destroy (pointer p) {
p->~value_type ();
}

inline size_type max_size () const throw () {
return size_type (-1) / sizeof (value_type);
}

template <typename T2>
struct rebind {
typedef AlignmentAllocator<T2, N> other;
};

bool operator!=(const AlignmentAllocator<T,N>& other) const {
return !(*this == other);
}

// Returns true if and only if storage allocated from *this
// can be deallocated from other, and vice versa.
// Always returns true for stateless allocators.
bool operator==(const AlignmentAllocator<T,N>& other) const {
return true;
}
};

class NBodySystem {
private: std::vector<Body,AlignmentAllocator<Body,4096> > bodies;

public: NBodySystem()
: bodies {
Body::sun(),
Body::jupiter(),
Body::saturn(),
Body::uranus(),
Body::neptune()
}
{
double px = 0.0;
double py = 0.0;
double pz = 0.0;
for(unsigned i=0; i < bodies.size(); ++i) {
px += bodies.vx * bodies.mass;
py += bodies.vy * bodies.mass;
pz += bodies.vz * bodies.mass;
}
bodies[0].offsetMomentum(px,py,pz);
}

public: void advance(double dt) {
unsigned N = (bodies.size()-1)*bodies.size()/2;
struct R{
double dx,dy,dz,filler;
};
__m128d ddt = _mm_set1_pd(dt);

static __attribute__((aligned(16))) R r[1000];
static __attribute__((aligned(16))) double mag[1000];

for(unsigned i=0,k=0; i < bodies.size(); ++i) {
Body& iBody = bodies;
__m128d idx = _mm_load_pd(&iBody.x);
for(unsigned j=i+1; j < bodies.size(); ++j,++k) {
__m128d jdx = _mm_load_pd(&bodies[j].x);
_mm_store_pd(&r[k].dx,idx-jdx);
r[k].dz = iBody.z - bodies[j].z;
}
}

for(unsigned i=0; i < N; i+=2) {
__m128d dx,dy,dz;
dx = _mm_loadl_pd(dx,&r.dx);
dy = _mm_loadl_pd(dy,&r.dy);
dz = _mm_loadl_pd(dz,&r.dz);

dx = _mm_loadh_pd(dx,&r[i+1].dx);
dy = _mm_loadh_pd(dy,&r[i+1].dy);
dz = _mm_loadh_pd(dz,&r[i+1].dz);

__m128d dSquared = dx*dx + dy*dy + dz*dz;
__m128d distance = _mm_sqrt_pd(dSquared);
__m128d dmag = ddt/(dSquared * distance);
_mm_store_pd(&mag,dmag);
}

for(unsigned i=0,k=0; i < bodies.size(); ++i) {
Body& iBody = bodies;
for(unsigned j=i+1; j < bodies.size(); ++j,++k) {
__m128d jmass = _mm_set1_pd(bodies[j].mass);
__m128d kmag = _mm_set1_pd(mag[k]);
__m128d jkmm = jmass * kmag;

__m128d kdx = _mm_load_pd(&r[k].dx);

__m128d ivx = _mm_load_pd(&iBody.vx);

_mm_store_pd(&iBody.vx, ivx - kdx*jkmm);

iBody.vz -= r[k].dz * bodies[j].mass * mag[k];

__m128d imass = _mm_set1_pd(iBody.mass);

jkmm = imass * kmag;

__m128d jvx = _mm_load_pd(&bodies[j].vx);

_mm_store_pd(&bodies[j].vx, jvx + kdx * jkmm);

bodies[j].vz += r[k].dz * iBody.mass * mag[k];
}
}

for (unsigned i = 0; i < bodies.size(); ++i) {
__m128d ix = _mm_load_pd(&bodies.x);
__m128d ivx = _mm_load_pd(&bodies.vx);
_mm_store_pd(&bodies.x, ix + ddt * ivx);
bodies.z += dt * bodies.vz;
}
}

public: double energy(){
double dx, dy, dz, distance;
double e = 0.0;

for (unsigned i=0; i < bodies.size(); ++i) {
Body& iBody = bodies;
e += 0.5 * iBody.mass *
( iBody.vx * iBody.vx
+ iBody.vy * iBody.vy
+ iBody.vz * iBody.vz );

for (unsigned j=i+1; j < bodies.size(); ++j) {
Body& jBody = bodies[j];
dx = iBody.x - jBody.x;
dy = iBody.y - jBody.y;
dz = iBody.z - jBody.z;

distance = sqrt(dx*dx + dy*dy + dz*dz);
e -= (iBody.mass * jBody.mass) / distance;
}
}
return e;
}
};

int main(int argc, char** argv) {
int n = atoi(argv[1]);

NBodySystem bodies;
printf("%.9f\n", bodies.energy());
for (int i=0; i<n; ++i)
bodies.advance(0.01);
printf("%.9f\n", bodies.energy());
}

Carlo Milanesi · Oct 14, 2012

I have succeed to optimize nbody program to be faster
than c++ what we have on shootout site currently
but is still slower than intel fortran

#include <immintrin.h>
__m128d ddt = _mm_set1_pd(dt);

This statements are not standard C++, therefore I presume they are not
allowed in the shootout.
If they were, you could write a highly optimized library routine in
assembly language and then call that routine from any language,
resulting in all the languages having the same performance.

Melzzzzz · Oct 14, 2012

This statements are not standard C++, therefore I presume they are
not allowed in the shootout.

While not standard c++, they are standard on windows max and linux
compilers. Those are standard intrinsics for x86 processors.
They are allowed and used in the shootout.

If they were, you could write a highly optimized library routine in
assembly language and then call that routine from any language,
resulting in all the languages having the same performance.

Heh, I wrote both sse2 and avx versions in assembler and is very fast
(avx version is really fast but q6600 doesn;t have avx

.
Problem is that I don;t have gcc 4.6.3 (or newer) on computer with
q6600 processor so don't know real end result.
You are right about implementing lib in assembler, but these are
intrinsics which are used in real programming so that shows advantage
of c++ over , say Java.

Melzzzzz · Oct 14, 2012

On Fri, 12 Oct 2012 20:53:14 +0200

I have succeed in getting fastest speed by using
sqrt reciprocal instruction.

[bmaxa@bmaxa nbody]$ time taskset -c 0 ./nbodysse2cpp 50000000
-0.169075164
-0.169059907

real 0m12.743s
user 0m12.735s
sys 0m0.001s
[bmaxa@bmaxa nbody]$

Relevant code is:

__m128d dSquared = dx*dx + dy*dy + dz*dz;

__m128d distance =
_mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(dSquared)));
for(unsigned j=0;j<2;++j)
{
distance = distance * _mm_set1_pd(1.5) -
((_mm_set1_pd(0.5) * dSquared) * distance) *
(distance * distance);
}

__m128d dmag = ddt/(dSquared) * distance;

bench: binary trees	10	Jul 29, 2012
Drawing missing in bitmap in a pure C win32 program	4	Jun 3, 2023
help to convert this fortan program to c++	2	Apr 11, 2009
Need help with first C# console program	0	Sep 4, 2015
Tightening up Shootout code	2	Jan 15, 2006
Computing a distance matrix using SSE2	15	Aug 4, 2009
help-I'm new in c++	2	Jun 3, 2007
Old template class works in VC++ Not in C++ Builder 5	3	Nov 30, 2006

Help optimize nbody bench program (c++ sse2 intrinsics)

Melzzzzz

Carlo Milanesi

Melzzzzz

Melzzzzz

Ask a Question

Similar Threads

Members online

Forum statistics

Latest Threads