Performance problem with Pthread code


T

tutul

Hi,

I am running the following code on a 4-processor/8-core IBM Blade with
Red Hat Enterprise Linux Server release 5.4 (Tikanga) and GCC 4.1.2.
I am not getting any performance gain due to pthread multi-threading.
I get the best performance with 1 thread and it gets worse with 2, 4,
8, .. threads, essentially indicating that the threads are actually
running serially. Although I have C++ class definitions, I am not
actually using any C++ features e.g. std::cout or anything in
associated classes.

Any suggestion will be much appreciated.

-----------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <unistd.h>
#include "pthread.h"

#include "Particle.H"
#include "Space.H"

//---------------------------------------------------------------------------

// Simulation parameters.
// NUM_BOXES: size of the global box[] array; main() finishes once this many
// boxes have been counted as complete.
#define NUM_BOXES 64
// NUM_STEPS: a box is treated as unfinished while its 'step' member is below
// this value.
#define NUM_STEPS 10
// BOX_X_SIZE / BOX_Y_SIZE: passed to Space::initialize() for every box
// (presumably the box extents -- confirm against Space.H).
#define BOX_X_SIZE 100
#define BOX_Y_SIZE 100
// RADIUS: second argument of Space::initParticles() (meaning defined in Space.H).
#define RADIUS 0.0
// DT: argument of Space::moveParticles() (presumably the time step -- confirm).
#define DT 0.01
// Last argument of Space::initialize(). NOTE(review): the name is misspelled
// ("PARTTICLES") but main() refers to it by this spelling.
#define MAX_PARTTICLES_PER_BOX 100000

// Per-thread record: main() allocates one per worker and passes its address
// to pthread_create(); threadFunction() receives it as its void* argument.
typedef struct // Info needed by a worker thread.
{
// Worker index assigned by main(); only used in log messages.
int id;
// Running total of the velocity-square sums of the boxes this worker finished.
double mySum;

} ThreadData_t;

// State shared between main() (the master) and the worker threads.

// The simulation boxes; workers advance them one step at a time.
Space box[NUM_BOXES];
// Sum of getVelocitySquare() over finished boxes; workers add to it while
// holding sum_mutex and main() prints it at the end.
double globalSum = 0.0;
// Number of boxes main() has counted as finished; both the master loop and
// the worker loops stop once it reaches NUM_BOXES.
int eggCount = 0;
// Verbosity taken from the optional second command-line argument; higher
// values enable more printf tracing.
int loglevel = 0;

// Serializes the workers' updates of globalSum.
pthread_mutex_t sum_mutex = PTHREAD_MUTEX_INITIALIZER;

// One-slot mailbox used by main() to hand a box index to the worker pool.
typedef struct {
pthread_mutex_t cond_mutex; // the mutex
pthread_cond_t cond_var; // the condition variable
// Index of the box waiting to be picked up, or -1 when the slot is empty.
int data; // the data item used as a flag.
} flag;

// The single mailbox instance; starts empty (data == -1).
flag ourFlag = { // default initialization
PTHREAD_MUTEX_INITIALIZER,
PTHREAD_COND_INITIALIZER,
-1 };

// Worker entry point handed to pthread_create() by main(). The argument is a
// pointer to the worker's ThreadData_t; the same pointer is returned on exit.
void *threadFunction( void *threadData_ );

//---------------------------------------------------------------------------

int main( int argc, char* argv[] )
{
if( argc < 2 )
{
printf("\nUsage: simulator <num_threads> [loglevel]\n\n");
return( 0 );
}

if( argc > 2 ) loglevel = atoi( argv[2] );

struct timeval time1, time2;
int numThreads = atoi( argv[1] );
printf("Number of threads: %d", numThreads );

if( argc > 2 ) loglevel = atoi( argv[2] );

//
// Initialize each box and particles inside it.
//
for( int i=0; i<NUM_BOXES; ++i )
{
box.initialize( 0, 0, BOX_X_SIZE, BOX_Y_SIZE,
MAX_PARTTICLES_PER_BOX );
int n = box.initParticles( (u_int)i, RADIUS );

printf("\nNumber of particles in box %d: %d", i, n );
}

//
// Create specified number of threads and assign NUM_BOXES/
num_threads
// boxes to each thread. The last thread may have less boxes than
others.
//
pthread_t *threads = (pthread_t
*)malloc( sizeof(pthread_t)*numThreads );
ThreadData_t *threadData =
(ThreadData_t *)malloc( sizeof(ThreadData_t)*numThreads );

//
// Initialize individual thread data.
//
for( int i=0; i<numThreads; ++i )
{
threadData.id = i;
threadData.mySum = 0.0;
}

for( int i=0; i<numThreads; ++i )
{
int rc = pthread_create( &threads, NULL, threadFunction,
(void*)&threadData );
if( rc != 0 )
printf("\nERROR: Failed to launch thread %d\n", i );
}

sleep(5);

gettimeofday( &time1, NULL );

//
// Master distributes work to the thread pool here.
//
for( int i=0; eggCount<NUM_BOXES; i=(i+1)%NUM_BOXES )
{
if( box.step < NUM_STEPS ) // This box is not done yet.
{
int status = pthread_mutex_lock( &ourFlag.cond_mutex );
if( status != 0 )
{
printf("\nERROR: lock failed on cond_mutex.\n");
exit( -1 );
}

ourFlag.data = i; // Send box i to the thread pool.

status = pthread_cond_broadcast( &ourFlag.cond_var );
//status = pthread_cond_signal( &ourFlag.cond_var );
if( status != 0 )
{
printf("\nERROR: signal failed on cond_var.\n");
exit( -1 );
}

status = pthread_mutex_unlock( &ourFlag.cond_mutex );
if( status != 0 )
{
printf("\nERROR: unlock failed on cond_mutex.\n");
exit( -1 );
}

if( loglevel > 2 )
printf("\nWaiting on thread pool for box %d", i );

while( ourFlag.data != -1 ) // Wait until a worker picks this
box up.
{
if( eggCount >= NUM_BOXES ) break; // This should not happen!
}

if( loglevel > 2 ) printf("\nBox %d taken.", i );
}
else if( box.step == NUM_STEPS )
{
if( loglevel > 0 ) printf("\nBox %d just completed.", i );
++eggCount;
box.step++; // increment beyond NUM_STEP to discard this
box.
}
else // This box is already done, move to the next one.
{
if( loglevel > 1 ) printf("\nBox %d already completed.", i );
}

if( eggCount >= NUM_BOXES ) // Check if all boxes are already
done.
{
printf("\nAll boxes completed.");
printf("\n\t***Global sum of velocity squares: %.5f\n",
globalSum );
}
}

gettimeofday( &time2, NULL );

double etime = time2.tv_sec - time1.tv_sec +
( time2.tv_usec - time1.tv_usec )/1000000.0;

printf("\n\t***Elapsed time: %.5f seconds\n\n", etime );

for( int i=0; i<numThreads; ++i )
pthread_join( threads, NULL );

pthread_mutex_destroy( &sum_mutex );
free( threads );
free( threadData );
pthread_exit( NULL );
}

//---------------------------------------------------------------------------

void *threadFunction( void *threadData_ )
{
ThreadData_t *threadData = (ThreadData_t*)threadData_;

while( eggCount < NUM_BOXES )
{
int status = pthread_mutex_lock( &ourFlag.cond_mutex );
if( status != 0 )
{
printf("ERROR: lock failed on cond_mutex.\n");
exit( -1 );
}

while( ourFlag.data == -1 && eggCount < NUM_BOXES )
{
if( loglevel > 0 )
printf("\nThread blocking: %d", threadData->id );

status = pthread_cond_wait( &ourFlag.cond_var,
&ourFlag.cond_mutex );
if( status != 0 )
{
printf("ERROR: wait failed on condition variable.\n");
exit( -1 );
}
}

//
// Get the the box id the Master has given.
//
int boxId = ourFlag.data;
ourFlag.data = -1; // Let the Master know the given box is
taken.

if( loglevel > 0 )
printf("\nThread %d processing box %d", threadData->id, boxId );

status = pthread_mutex_unlock( &ourFlag.cond_mutex );
if( status != 0 )
{
printf("ERROR: unlock failed on cond_mutex.\n");
exit( -1 );
}

//
// Perform just 1 step on the given box here.
//
if( box[boxId].step < NUM_STEPS )
box[boxId].moveParticles( DT );

if( box[boxId].step >= NUM_STEPS ) // All steps done on this box.
{
//
// Sum up velocity squares in this box.
//
double boxSum = box[boxId].getVelocitySquare();

if( loglevel > 0 )
printf("\n\t***Total velocity square in box %d: %.5f",
boxId, boxSum );
//
// Accumulate velocity squares of boxes done by this thread.
//
threadData->mySum += boxSum ;

//
// Protect the shared data using mutex.
//
status = pthread_mutex_lock( &sum_mutex );
if( status != 0 )
{
printf("ERROR: lock failed on sum_mutex.\n");
exit( -1 );
}

globalSum += boxSum;

status = pthread_mutex_unlock( &sum_mutex );
if( status != 0 )
{
printf("ERROR: unlock failed on sum_mutex.\n");
exit( -1 );
}
}
} // end while( eggCount < NUM_BOXES )

return( threadData_ );
}

//---------------------------------------------------------------------------
 
Ad

Advertisements

V

Victor Bazarov

tutul said:
I am running the following code on a 4-processor/8-core IBM Blade with
Red Hat Enterprise Linux Server release 5.4 (Tikanga) and GCC 4.1.2.
I am not getting any performance gain due to pthread multi-threading.
I get the best performance with 1 thread and it gets worse with 2, 4,
8, .. threads, essentially indicating that the threads are actually
running serially. Although I have C++ class definitions, I am not
actually using any C++ features e.g. std::cout or anything in
associated classes.

Any suggestion will be much appreciated.

I know I may not sound helpful, but have you tried a newsgroup that
actually deals with threads, like 'comp.programming.threads'? Here we
discuss C++ _language_ issues, not pthreads library...

V
 

Ask a Question

Want to reply to this thread or ask your own question?

You'll need to choose a username for the site, which only takes a couple of moments. After that, you can post your question and our members will help you out.

Ask a Question

Top