I am trying to use OpenMP to parallelize a simple c++ code that solves a system of linear equations by Gauss elimination.
The relevant part of my code is:
#include <iostream>
#include <time.h>
using namespace std;
#define nl "\n"
void LinearSolve(double **& M, double *& V, const int N, bool parallel, int threads){
//...
for (int i=0;i<N;i++){
#pragma omp parallel for num_threads(threads) if(parallel)
for (int j=i+1;j<N;j++){
double aux, * Mi=M[i], * Mj=M[j];
aux=Mj[i]/Mi[i];
Mj[i]=0;
for (int k=i+1;k<N;k++) {
Mj[k]-=Mi[k]*aux;
};
V[j]-=V[i]*aux;
};
};
//...
};
class Time {
clock_t startC, endC;
time_t startT, endT;
public:
void start() {startC=clock(); time (&startT);};
void end() {endC=clock(); time (&endT);};
double timedifCPU() {return(double(endC-startC)/CLOCKS_PER_SEC);};
int timedif() {return(int(difftime (endT,startT)));};
};
int main (){
Time t;
double ** M, * V;
int N=5000;
cout<<"number of equations "<<N<<nl<<nl;
M= new double * [N];
V=new double [N];
for (int i=0;i<N;i++){
M[i]=new double [N];
};
for (int m=1;m<=16;m=2*m){
cout<<m<<" threads"<<nl;
for (int i=0;i<N;i++){
V[i]=i+1.5*i*i;
for (int j=0;j<N;j++){
M[i][j]=(j+2.3)/(i-0.2)+(i+2)/(j+3); //some function to get regular matrix
};
};
t.start();
LinearSolve(M,V,N,m!=1,m);
t.end();
cout<<"time "<<t.timedif()<<", CPU time "<<t.timedifCPU()<<nl<<nl;
};
}
Since the code is extremely simple I would expect that the time would be inversely proportional to the number of threads. However the typical result I get is (the code is compiled with gcc on Linux)
number of equations 5000
1 threads
time 217, CPU time 215.89
2 threads
time 125, CPU time 245.18
4 threads
time 80, CPU time 302.72
8 threads
time 67, CPU time 458.55
16 threads
time 55, CPU time 634.41
There is a decrease in time, but much less that I would like to and the CPU time mysteriously grows.
I suspect the problem is in memory sharing, but I have been unable to identify it. Access to row M[j] should not be a problem, since each thread writes to the a different row of the matrix. There could be a problem in reading from row M[i], so I also tried to make a separate copy of this row for each thread by replacing the parallel loop by
#pragma omp parallel num_threads(threads) if(parallel)
{
double Mi[N];
for (int j=i;j<N;j++) Mi[j]=M[i][j];
#pragma omp for
for (int j=i+1;j<N;j++){
double aux, * Mj=M[j];
aux=Mj[i]/Mi[i];
Mj[i]=0;
for (int k=i+1;k<N;k++) {
Mj[k]-=Mi[k]*aux;
};
V[j]-=V[i]*aux;
};
};
Unfortunately it does not help at all.
I would very much appreciate any help.
Aucun commentaire:
Enregistrer un commentaire