S
sunilkher
Here is a small sample program that I have.
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <string>
using namespace std;
// Handles of the worker threads started by main().
pthread_t threads[10];
// Attributes used when creating every worker thread.
pthread_attr_t thr_attr;
// Per-thread argument values; main() passes the address of one element to
// each worker, which reads it as its thread id.
int thr_in[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
// Total number of loop iterations (taken from argv[2]).
int totalIter = 0;
// Number of worker threads (taken from argv[1]).
int thr_cnt = 0;
// When true, each worker writes a trace to its own "<id>.out" file.
bool debug = false;
// Thread entry point; declared with C linkage so it can be handed to
// pthread_create().
extern "C" void *do_something(void *tid);
int main( int argc, const char* argv[] )
{
int thr_var = 0;
//------------------
// how many threads?
//------------------
thr_cnt = atoi(argv[1]);
if (thr_cnt > 8)
{
cout << "WARNING: Limiting the thread count to 8" <<
endl;
thr_cnt = 8;
}
//--------------------------
// how much work to be done?
//--------------------------
totalIter = atoi(argv[2]);
if (totalIter > 5000000)
{
cout << "WARNING: Limiting the iteration count to
5000000" << endl;
totalIter = 5000000;
}
//-------------------------------
// do you want to check up on me?
//-------------------------------
if (argv[3] != NULL) debug = true;
//--------
// threads
//--------
pthread_attr_init(&thr_attr);
pthread_attr_setdetachstate(&thr_attr,
PTHREAD_CREATE_JOINABLE);
for (thr_var = 1; thr_var<=thr_cnt; thr_var++)
pthread_create(&threads[thr_var], &thr_attr,
do_something, (void *)
&(thr_in[thr_var]));
for (thr_var=0; thr_var<thr_cnt; thr_var++)
pthread_join(threads[thr_var], NULL);
pthread_attr_destroy(&thr_attr);
return 0;
}
/*
 * Worker thread body.
 *
 * tid points to an int holding this worker's 1-based id. The worker walks
 * i = 1..totalIter and handles every index with i % thr_cnt == id - 1; for
 * each handled index it constructs and copy-assigns a std::string, which is
 * the allocation being measured. When the global `debug` flag is set, a
 * trace is written to "<id>.out".
 *
 * Always returns NULL.
 */
void *do_something(void *tid)
{
    int myThreadId = *((int *)tid);
    FILE *fp = NULL;

    // thr_cnt is the divisor of the modulo below; never divide by zero.
    if (thr_cnt < 1)
        return NULL;

    if (debug)
    {
        char filename[50] = "";
        snprintf(filename, sizeof filename, "%d.out", myThreadId);
        fp = fopen(filename, "w");
        if (fp == NULL)
            perror(filename);   // keep working, just without a trace file
        else
            fprintf(fp, "thread #%d processing starts\n", myThreadId);
    }

    for (int i=1; i<=totalIter; i++)
    {
        if (i%thr_cnt == myThreadId-1)
        {
            // fp is non-NULL only when debug is on and the file opened.
            if (fp != NULL)
            {
                fprintf(fp, "thread #%d processing index %d\n",
                        myThreadId, i);
            }
            string a("abc"), b;
            b = a;
        }
    }

    if (fp != NULL)
    {
        fprintf(fp, "thread #%d processing finish\n", myThreadId);
        fclose(fp);             // fclose flushes; no separate fflush needed
    }

    // Returning from the start routine ends the thread just like
    // pthread_exit(NULL) would, without leaving an unreachable statement.
    return NULL;
}
Now when I run this with 1 thread, here is the time taken.
/home/skher/testIPC/testThr> time $BIN/testThr 1 5000000
real 0m0.65s
user 0m0.47s
sys 0m0.14s
/home/skher/testIPC/testThr>
Impressive, considering I am doing 5 million iterations. So I thought that
when I run with 2 or more threads, it should finish in even less time.
But here is what I found.
/home/skher/testIPC/testThr> time $BIN/testThr 2 5000000
real 0m34.67s
user 0m58.48s
sys 0m5.20s
/home/skher/testIPC/testThr>
Why is this? My guess is that whenever I allocate any STL object, the
_node_alloc template defined in _alloc.c goes through a lock and unlock
mechanism implemented by the class _Node_Alloc_Lock, which has a static
mutex member variable.
Part of that class code is shown here.
// Scoped lock guard (excerpt): the constructor acquires the static mutex
// _S_lock and the destructor releases it, so the lock is held for the
// lifetime of a _Node_Alloc_Lock object.
// Locking is skipped when the template argument __threads is false.
// Being a static data member of a class template, _S_lock exists once per
// <__threads, __inst> instantiation, not once per object.
template <bool __threads, int __inst>
class _Node_Alloc_Lock {
public:
// Acquire the shared mutex when threading is enabled.
_Node_Alloc_Lock() {
# ifdef _STLP_SGI_THREADS
// With _STLP_SGI_THREADS defined, __us_rsthread_malloc must also be set.
if (__threads && __us_rsthread_malloc)
# else /* !_STLP_SGI_THREADS */
if (__threads)
# endif
_S_lock._M_acquire_lock();
}
// Release the mutex under the same condition used to acquire it.
~_Node_Alloc_Lock() {
# ifdef _STLP_SGI_THREADS
if (__threads && __us_rsthread_malloc)
# else /* !_STLP_SGI_THREADS */
if (__threads)
# endif
_S_lock._M_release_lock();
}
// The mutex taken and released by every guard of this instantiation.
static _STLP_STATIC_MUTEX _S_lock;
};
OK. Now my question (worth a million dollars, if only to me):
How do I get around this? How do I make my program run faster with more
threads? As you can see, the threads are really independent of each other,
since they are working on different indices, but they still compete with
each other for a resource, viz. the lock taken while creating an STL object.
How do I make this contention go away, so that my program runs faster with
more threads?
Any help will be appreciated. Thanks, Sunil.
#include <stdlib.h>
#include <pthread.h>
#include <string>
using namespace std;
// Handles of the worker threads started by main().
pthread_t threads[10];
// Attributes used when creating every worker thread.
pthread_attr_t thr_attr;
// Per-thread argument values; main() passes the address of one element to
// each worker, which reads it as its thread id.
int thr_in[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
// Total number of loop iterations (taken from argv[2]).
int totalIter = 0;
// Number of worker threads (taken from argv[1]).
int thr_cnt = 0;
// When true, each worker writes a trace to its own "<id>.out" file.
bool debug = false;
// Thread entry point; declared with C linkage so it can be handed to
// pthread_create().
extern "C" void *do_something(void *tid);
int main( int argc, const char* argv[] )
{
int thr_var = 0;
//------------------
// how many threads?
//------------------
thr_cnt = atoi(argv[1]);
if (thr_cnt > 8)
{
cout << "WARNING: Limiting the thread count to 8" <<
endl;
thr_cnt = 8;
}
//--------------------------
// how much work to be done?
//--------------------------
totalIter = atoi(argv[2]);
if (totalIter > 5000000)
{
cout << "WARNING: Limiting the iteration count to
5000000" << endl;
totalIter = 5000000;
}
//-------------------------------
// do you want to check up on me?
//-------------------------------
if (argv[3] != NULL) debug = true;
//--------
// threads
//--------
pthread_attr_init(&thr_attr);
pthread_attr_setdetachstate(&thr_attr,
PTHREAD_CREATE_JOINABLE);
for (thr_var = 1; thr_var<=thr_cnt; thr_var++)
pthread_create(&threads[thr_var], &thr_attr,
do_something, (void *)
&(thr_in[thr_var]));
for (thr_var=0; thr_var<thr_cnt; thr_var++)
pthread_join(threads[thr_var], NULL);
pthread_attr_destroy(&thr_attr);
return 0;
}
/*
 * Worker thread body.
 *
 * tid points to an int holding this worker's 1-based id. The worker walks
 * i = 1..totalIter and handles every index with i % thr_cnt == id - 1; for
 * each handled index it constructs and copy-assigns a std::string, which is
 * the allocation being measured. When the global `debug` flag is set, a
 * trace is written to "<id>.out".
 *
 * Always returns NULL.
 */
void *do_something(void *tid)
{
    int myThreadId = *((int *)tid);
    FILE *fp = NULL;

    // thr_cnt is the divisor of the modulo below; never divide by zero.
    if (thr_cnt < 1)
        return NULL;

    if (debug)
    {
        char filename[50] = "";
        snprintf(filename, sizeof filename, "%d.out", myThreadId);
        fp = fopen(filename, "w");
        if (fp == NULL)
            perror(filename);   // keep working, just without a trace file
        else
            fprintf(fp, "thread #%d processing starts\n", myThreadId);
    }

    for (int i=1; i<=totalIter; i++)
    {
        if (i%thr_cnt == myThreadId-1)
        {
            // fp is non-NULL only when debug is on and the file opened.
            if (fp != NULL)
            {
                fprintf(fp, "thread #%d processing index %d\n",
                        myThreadId, i);
            }
            string a("abc"), b;
            b = a;
        }
    }

    if (fp != NULL)
    {
        fprintf(fp, "thread #%d processing finish\n", myThreadId);
        fclose(fp);             // fclose flushes; no separate fflush needed
    }

    // Returning from the start routine ends the thread just like
    // pthread_exit(NULL) would, without leaving an unreachable statement.
    return NULL;
}
Now when I run this with 1 thread, here is the time taken.
/home/skher/testIPC/testThr> time $BIN/testThr 1 5000000
real 0m0.65s
user 0m0.47s
sys 0m0.14s
/home/skher/testIPC/testThr>
Impressive, considering I am doing 5 million iterations. So I thought that
when I run with 2 or more threads, it should finish in even less time.
But here is what I found.
/home/skher/testIPC/testThr> time $BIN/testThr 2 5000000
real 0m34.67s
user 0m58.48s
sys 0m5.20s
/home/skher/testIPC/testThr>
Why is this? My guess is that whenever I allocate any STL object, the
_node_alloc template defined in _alloc.c goes through a lock and unlock
mechanism implemented by the class _Node_Alloc_Lock, which has a static
mutex member variable.
Part of that class code is shown here.
// Scoped lock guard (excerpt): the constructor acquires the static mutex
// _S_lock and the destructor releases it, so the lock is held for the
// lifetime of a _Node_Alloc_Lock object.
// Locking is skipped when the template argument __threads is false.
// Being a static data member of a class template, _S_lock exists once per
// <__threads, __inst> instantiation, not once per object.
template <bool __threads, int __inst>
class _Node_Alloc_Lock {
public:
// Acquire the shared mutex when threading is enabled.
_Node_Alloc_Lock() {
# ifdef _STLP_SGI_THREADS
// With _STLP_SGI_THREADS defined, __us_rsthread_malloc must also be set.
if (__threads && __us_rsthread_malloc)
# else /* !_STLP_SGI_THREADS */
if (__threads)
# endif
_S_lock._M_acquire_lock();
}
// Release the mutex under the same condition used to acquire it.
~_Node_Alloc_Lock() {
# ifdef _STLP_SGI_THREADS
if (__threads && __us_rsthread_malloc)
# else /* !_STLP_SGI_THREADS */
if (__threads)
# endif
_S_lock._M_release_lock();
}
// The mutex taken and released by every guard of this instantiation.
static _STLP_STATIC_MUTEX _S_lock;
};
OK. Now my question (worth a million dollars, if only to me):
How do I get around this? How do I make my program run faster with more
threads? As you can see, the threads are really independent of each other,
since they are working on different indices, but they still compete with
each other for a resource, viz. the lock taken while creating an STL object.
How do I make this contention go away, so that my program runs faster with
more threads?
Any help will be appreciated. Thanks, Sunil.