I am intending to compute in parallel fashion a lot of numerical quadratures that at the end of the day use a common set of data for all the computations ( a quite big arrays of roots and weights ocupying about 25 Kb of memory). The Gauss-Legendre quadrature method is simple enought to start with. I want to make available to all the threads in the device, the roots and weights, through the declaration device double *d_droot, *d_dweight. But I am missing something because I have to pass explictly the pointers to the arrays to make my kernel to work well. How can I do it properly? Even more, aiming to have available more free memory on the device, is it possible to burn the roots and weights to some constant portion of the memory of the device?
The code is attached
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
__device__ double *d_droot, *d_dweight;
__device__ __host__
double f(double alpha,double x)
{
/*function to be integrated via gauss-legendre quadrature. */
return exp(alpha*x);
}
__global__
void lege_inte2(int n, double alpha, double a, double b, double *lroots, double *weight, double *result)
{
/*
Parameters:
n: Total number of quadratures
a: Upper integration limit
b: Lower integration limit
lroots[]: roots for the quadrature
weight[]: weights for the quadrature
result[]: allocate the results for N quadratures.
*/
double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0;
int dummy;
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
result[i] = 0.0;
for (dummy = 0; dummy < 5; dummy++)
result[i] += weight[dummy] * f(alpha,c1 * lroots[dummy] + c2)*c1;
}
}
__global__
void lege_inte2_shared(int n,double alpha, double a, double b, double *result)
{
extern __shared__ double *d_droot;
extern __shared__ double *d_dweight;
/*
Parameters:
n: Total number of quadratures
a: Upper integration limit
b: Lower integration limit
d_root[]: roots for the quadrature
d_weight[]: weights for the quadrature
result[]: allocate the results for N quadratures.
*/
double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0;
int dummy;
int i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
{
result[i] = 0.0;
for (dummy = 0; dummy < 5; dummy++)
{
result[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*c1;
printf(" Vale: %f \n", d_dweight[dummy]);
}
}
}
int main(void)
{
int N = 1<<23;
int N_nodes = 5;
double *droot, *dweight, *dresult, *d_dresult;
/*double version in host*/
droot =(double*)malloc(N_nodes*sizeof(double));
dweight =(double*)malloc(N_nodes*sizeof(double));
dresult =(double*)malloc(N*sizeof(double)); /*will recibe the results of N quadratures!*/
/*double version in device*/
cudaMalloc(&d_droot, N_nodes*sizeof(double));
cudaMalloc(&d_dweight, N_nodes*sizeof(double));
cudaMalloc(&d_dresult, N*sizeof(double)); /*results for N quadratures will be contained here*/
/*double version of the roots and weights*/
droot[0] = 0.90618;
droot[1] = 0.538469;
droot[2] = 0.0;
droot[3] = -0.538469;
droot[4] = -0.90618;
dweight[0] = 0.236927;
dweight[1] = 0.478629;
dweight[2] = 0.568889;
dweight[3] = 0.478629;
dweight[4] = 0.236927;
/*double copy host-> device*/
cudaMemcpy(d_droot, droot, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_dweight, dweight, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
// Perform SAXPY on 1M element
lege_inte2<<<(N+255)/256, 256>>>(N,1.0, -3.0, 3.0, d_droot, d_dweight, d_dresult); /*This kerlnel works OK*/
//lege_inte2_shared<<<(N+255)/256, 256>>>(N, -3.0, 3.0, d_dresult); /*why this one does not work? */
cudaMemcpy(dresult, d_dresult, N*sizeof(double), cudaMemcpyDeviceToHost);
double maxError = 0.0f;
for (int i = 0; i < N; i++)
maxError = max(maxError, abs(dresult[i]-20.03574985));
printf("Max error: %f in %i quadratures \n", maxError, N);
printf("integral: %f \n" ,dresult[0]);
cudaFree(dresult);
cudaFree(d_droot);
cudaFree(d_dweight);
}
and a makefile to compile it:
objects = main.o
all: $(objects)
nvcc -Xcompiler -std=c99 -arch=sm_20 $(objects) -o gauss
%.o: %.cpp
nvcc -x cu -arch=sm_20 -I. -dc $< -o $@
clean:
rm -f *.o gauss
Thanks in advance for any suggestion
Aucun commentaire:
Enregistrer un commentaire