Help optimize nbody bench program (c++ sse2 intrinsics)

M

Melzzzzz

I have succeed to optimize nbody program to be faster
than c++ what we have on shootout site currently
but is still slower than intel fortran ;)

http://shootout.alioth.debian.org/u64q/performance.php?test=nbody

I have started from java implementation and converted it to c++,
but it seems somewhat clumsy.
If someone is interested to play with this I will be glad to
see improvement ;)

On q6600 @ 2.4 GHz executes:

[bmaxa@bmaxa nbody]$ g++ -Wall -O3 -std=c++0x -msse2 nbodysse2.cpp -o nbodysse2cpp
[bmaxa@bmaxa nbody]$ time ./nbodysse2cpp 50000000
-0.169075164
-0.169059907

real 0m17.108s
user 0m17.076s
sys 0m0.022s

while program that is currently on site executes:

[bmaxa@bmaxa nbody]$ time ./nbodycpp 50000000
-0.169075164
-0.169059907

real 0m20.621s
user 0m20.554s
sys 0m0.039s

Program follows:

/* The Computer Language Benchmarks Game

http://shootout.alioth.debian.org/



contributed by Mark C. Lewis

modified slightly by Chad Whipkey

converted to c++ and added sse2 support by Branimir Maksimovic


*/
#include <cstdio>
#include <cmath>
#include <cstdlib>
#include <vector>
#include <immintrin.h>

static const double PI = 3.141592653589793;
static const double SOLAR_MASS = 4 * PI * PI;
static const double DAYS_PER_YEAR = 365.24;


class Body {

public: double x, y, z, filler, vx, vy, vz, mass;

public: Body(){}

static Body& jupiter(){
static Body p;
p.x = 4.84143144246472090e+00;
p.y = -1.16032004402742839e+00;
p.z = -1.03622044471123109e-01;
p.vx = 1.66007664274403694e-03 * DAYS_PER_YEAR;
p.vy = 7.69901118419740425e-03 * DAYS_PER_YEAR;
p.vz = -6.90460016972063023e-05 * DAYS_PER_YEAR;
p.mass = 9.54791938424326609e-04 * SOLAR_MASS;
return p;
}

static Body& saturn(){
static Body p;
p.x = 8.34336671824457987e+00;
p.y = 4.12479856412430479e+00;
p.z = -4.03523417114321381e-01;
p.vx = -2.76742510726862411e-03 * DAYS_PER_YEAR;
p.vy = 4.99852801234917238e-03 * DAYS_PER_YEAR;
p.vz = 2.30417297573763929e-05 * DAYS_PER_YEAR;
p.mass = 2.85885980666130812e-04 * SOLAR_MASS;
return p;
}

static Body& uranus(){
static Body p;
p.x = 1.28943695621391310e+01;
p.y = -1.51111514016986312e+01;
p.z = -2.23307578892655734e-01;
p.vx = 2.96460137564761618e-03 * DAYS_PER_YEAR;
p.vy = 2.37847173959480950e-03 * DAYS_PER_YEAR;
p.vz = -2.96589568540237556e-05 * DAYS_PER_YEAR;
p.mass = 4.36624404335156298e-05 * SOLAR_MASS;
return p;
}

static Body& neptune(){
static Body p;
p.x = 1.53796971148509165e+01;
p.y = -2.59193146099879641e+01;
p.z = 1.79258772950371181e-01;
p.vx = 2.68067772490389322e-03 * DAYS_PER_YEAR;
p.vy = 1.62824170038242295e-03 * DAYS_PER_YEAR;
p.vz = -9.51592254519715870e-05 * DAYS_PER_YEAR;
p.mass = 5.15138902046611451e-05 * SOLAR_MASS;
return p;
}

static Body& sun(){
static Body p;
p.mass = SOLAR_MASS;
return p;
}

Body& offsetMomentum(double px, double py, double pz){
vx = -px / SOLAR_MASS;
vy = -py / SOLAR_MASS;
vz = -pz / SOLAR_MASS;
return *this;
}
};

template <typename T, std::size_t N = 16>
class AlignmentAllocator {
public:
typedef T value_type;
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;

typedef T * pointer;
typedef const T * const_pointer;

typedef T & reference;
typedef const T & const_reference;

public:
inline AlignmentAllocator () throw () { }

template <typename T2>
inline AlignmentAllocator (const AlignmentAllocator<T2, N> &) throw () { }

inline ~AlignmentAllocator () throw () { }

inline pointer adress (reference r) {
return &r;
}

inline const_pointer adress (const_reference r) const {
return &r;
}

inline pointer allocate (size_type n) {
return (pointer)_mm_malloc(n*sizeof(value_type), N);
}

inline void deallocate (pointer p, size_type) {
_mm_free (p);
}

inline void construct (pointer p, const value_type & wert) {
new (p) value_type (wert);
}

inline void destroy (pointer p) {
p->~value_type ();
}

inline size_type max_size () const throw () {
return size_type (-1) / sizeof (value_type);
}

template <typename T2>
struct rebind {
typedef AlignmentAllocator<T2, N> other;
};

bool operator!=(const AlignmentAllocator<T,N>& other) const {
return !(*this == other);
}

// Returns true if and only if storage allocated from *this
// can be deallocated from other, and vice versa.
// Always returns true for stateless allocators.
bool operator==(const AlignmentAllocator<T,N>& other) const {
return true;
}
};

class NBodySystem {
private: std::vector<Body,AlignmentAllocator<Body,4096> > bodies;

public: NBodySystem()
: bodies {
Body::sun(),
Body::jupiter(),
Body::saturn(),
Body::uranus(),
Body::neptune()
}
{
double px = 0.0;
double py = 0.0;
double pz = 0.0;
for(unsigned i=0; i < bodies.size(); ++i) {
px += bodies.vx * bodies.mass;
py += bodies.vy * bodies.mass;
pz += bodies.vz * bodies.mass;
}
bodies[0].offsetMomentum(px,py,pz);
}

public: void advance(double dt) {
unsigned N = (bodies.size()-1)*bodies.size()/2;
struct R{
double dx,dy,dz,filler;
};
__m128d ddt = _mm_set1_pd(dt);

static __attribute__((aligned(16))) R r[1000];
static __attribute__((aligned(16))) double mag[1000];

for(unsigned i=0,k=0; i < bodies.size(); ++i) {
Body& iBody = bodies;
__m128d idx = _mm_load_pd(&iBody.x);
for(unsigned j=i+1; j < bodies.size(); ++j,++k) {
__m128d jdx = _mm_load_pd(&bodies[j].x);
_mm_store_pd(&r[k].dx,idx-jdx);
r[k].dz = iBody.z - bodies[j].z;
}
}

for(unsigned i=0; i < N; i+=2) {
__m128d dx,dy,dz;
dx = _mm_loadl_pd(dx,&r.dx);
dy = _mm_loadl_pd(dy,&r.dy);
dz = _mm_loadl_pd(dz,&r.dz);

dx = _mm_loadh_pd(dx,&r[i+1].dx);
dy = _mm_loadh_pd(dy,&r[i+1].dy);
dz = _mm_loadh_pd(dz,&r[i+1].dz);


__m128d dSquared = dx*dx + dy*dy + dz*dz;
__m128d distance = _mm_sqrt_pd(dSquared);
__m128d dmag = ddt/(dSquared * distance);
_mm_store_pd(&mag,dmag);
}

for(unsigned i=0,k=0; i < bodies.size(); ++i) {
Body& iBody = bodies;
for(unsigned j=i+1; j < bodies.size(); ++j,++k) {
__m128d jmass = _mm_set1_pd(bodies[j].mass);
__m128d kmag = _mm_set1_pd(mag[k]);
__m128d jkmm = jmass * kmag;

__m128d kdx = _mm_load_pd(&r[k].dx);

__m128d ivx = _mm_load_pd(&iBody.vx);

_mm_store_pd(&iBody.vx, ivx - kdx*jkmm);

iBody.vz -= r[k].dz * bodies[j].mass * mag[k];

__m128d imass = _mm_set1_pd(iBody.mass);

jkmm = imass * kmag;

__m128d jvx = _mm_load_pd(&bodies[j].vx);

_mm_store_pd(&bodies[j].vx, jvx + kdx * jkmm);

bodies[j].vz += r[k].dz * iBody.mass * mag[k];
}
}

for (unsigned i = 0; i < bodies.size(); ++i) {
__m128d ix = _mm_load_pd(&bodies.x);
__m128d ivx = _mm_load_pd(&bodies.vx);
_mm_store_pd(&bodies.x, ix + ddt * ivx);
bodies.z += dt * bodies.vz;
}
}

public: double energy(){
double dx, dy, dz, distance;
double e = 0.0;

for (unsigned i=0; i < bodies.size(); ++i) {
Body& iBody = bodies;
e += 0.5 * iBody.mass *
( iBody.vx * iBody.vx
+ iBody.vy * iBody.vy
+ iBody.vz * iBody.vz );

for (unsigned j=i+1; j < bodies.size(); ++j) {
Body& jBody = bodies[j];
dx = iBody.x - jBody.x;
dy = iBody.y - jBody.y;
dz = iBody.z - jBody.z;

distance = sqrt(dx*dx + dy*dy + dz*dz);
e -= (iBody.mass * jBody.mass) / distance;
}
}
return e;
}
};


int main(int argc, char** argv) {
int n = atoi(argv[1]);

NBodySystem bodies;
printf("%.9f\n", bodies.energy());
for (int i=0; i<n; ++i)
bodies.advance(0.01);
printf("%.9f\n", bodies.energy());
}
 
C

Carlo Milanesi

I have succeed to optimize nbody program to be faster
than c++ what we have on shootout site currently
but is still slower than intel fortran

#include <immintrin.h>
__m128d ddt = _mm_set1_pd(dt);

This statements are not standard C++, therefore I presume they are not
allowed in the shootout.
If they were, you could write a highly optimized library routine in
assembly language and then call that routine from any language,
resulting in all the languages having the same performance.
 
M

Melzzzzz

This statements are not standard C++, therefore I presume they are
not allowed in the shootout.

While not standard c++, they are standard on windows max and linux
compilers. Those are standard intrinsics for x86 processors.
They are allowed and used in the shootout.
If they were, you could write a highly optimized library routine in
assembly language and then call that routine from any language,
resulting in all the languages having the same performance.

Heh, I wrote both sse2 and avx versions in assembler and is very fast
(avx version is really fast but q6600 doesn;t have avx ;).
Problem is that I don;t have gcc 4.6.3 (or newer) on computer with
q6600 processor so don't know real end result.
You are right about implementing lib in assembler, but these are
intrinsics which are used in real programming so that shows advantage
of c++ over , say Java.
 
M

Melzzzzz

On Fri, 12 Oct 2012 20:53:14 +0200

I have succeed in getting fastest speed by using
sqrt reciprocal instruction.

[bmaxa@bmaxa nbody]$ time taskset -c 0 ./nbodysse2cpp 50000000
-0.169075164
-0.169059907

real 0m12.743s
user 0m12.735s
sys 0m0.001s
[bmaxa@bmaxa nbody]$

Relevant code is:

__m128d dSquared = dx*dx + dy*dy + dz*dz;

__m128d distance =
_mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(dSquared)));
for(unsigned j=0;j<2;++j)
{
distance = distance * _mm_set1_pd(1.5) -
((_mm_set1_pd(0.5) * dSquared) * distance) *
(distance * distance);
}

__m128d dmag = ddt/(dSquared) * distance;
 

Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only take a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Members online

No members online now.

Forum statistics

Threads
473,769
Messages
2,569,580
Members
45,055
Latest member
SlimSparkKetoACVReview

Latest Threads

Top