c++: Sharing roots and weights for many Gauss-Legendre Quadrature in GPUs

lundi 2 mars 2015

Sharing roots and weights for many Gauss-Legendre Quadrature in GPUs

I am intending to compute in parallel fashion a lot of numerical quadratures that at the end of the day use a common set of data for all the computations ( a quite big arrays of roots and weights ocupying about 25 Kb of memory). The Gauss-Legendre quadrature method is simple enought to start with. I want to make available to all the threads in the device, the roots and weights, through the declaration device double *d_droot, *d_dweight. But I am missing something because I have to pass explictly the pointers to the arrays to make my kernel to work well. How can I do it properly? Even more, aiming to have available more free memory on the device, is it possible to burn the roots and weights to some constant portion of the memory of the device?

The code is attached


#include <math.h>
#include <stdlib.h>
#include <stdio.h>


__device__  double *d_droot, *d_dweight;


__device__ __host__
double f(double alpha,double x)
{
  /*function to be integrated via gauss-legendre quadrature. */
  return exp(alpha*x);
}

__global__
void lege_inte2(int n, double alpha, double a, double b, double *lroots, double *weight, double *result)
{
  /*
    Parameters:
    n: Total number of quadratures
    a: Upper integration limit
    b: Lower integration limit
    lroots[]: roots for the quadrature
    weight[]: weights for the quadrature
    result[]: allocate the results for N quadratures.
   */
  double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0;
  int dummy;

  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n)
    {
      result[i] = 0.0;
      for (dummy = 0; dummy < 5; dummy++)
    result[i] += weight[dummy] * f(alpha,c1 * lroots[dummy] + c2)*c1;
    }
}

__global__
void lege_inte2_shared(int n,double alpha, double a, double b,  double *result)
{
  extern __shared__ double *d_droot;
  extern __shared__ double *d_dweight;
  /*
    Parameters:
    n: Total number of quadratures
    a: Upper integration limit
    b: Lower integration limit
    d_root[]: roots for the quadrature
    d_weight[]: weights for the quadrature
    result[]: allocate the results for N quadratures.
   */
  double c1 = (b - a) / 2, c2 = (b + a) / 2, sum = 0;
  int dummy;

  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n)
    {
      result[i] = 0.0;
      for (dummy = 0; dummy < 5; dummy++)
    {
      result[i] += d_dweight[dummy] * f(alpha,c1 * d_droot[dummy] + c2)*c1;
      printf(" Vale: %f \n", d_dweight[dummy]);
    }
    }
}


int main(void)
{
  int N = 1<<23;
  int N_nodes = 5;


  double *droot, *dweight, *dresult, *d_dresult;


  /*double version in host*/
  droot =(double*)malloc(N_nodes*sizeof(double));
  dweight =(double*)malloc(N_nodes*sizeof(double));
  dresult =(double*)malloc(N*sizeof(double)); /*will recibe the results of N quadratures!*/


  /*double version in device*/
  cudaMalloc(&d_droot, N_nodes*sizeof(double));
  cudaMalloc(&d_dweight, N_nodes*sizeof(double));
  cudaMalloc(&d_dresult, N*sizeof(double)); /*results for N quadratures will be contained here*/


  /*double version of the roots and weights*/
  droot[0] = 0.90618;
  droot[1] = 0.538469;
  droot[2] = 0.0;
  droot[3] = -0.538469;
  droot[4] = -0.90618;


  dweight[0] = 0.236927;
  dweight[1] = 0.478629;
  dweight[2] = 0.568889;
  dweight[3] = 0.478629;
  dweight[4] = 0.236927;



  /*double copy host-> device*/
  cudaMemcpy(d_droot, droot, N_nodes*sizeof(double), cudaMemcpyHostToDevice);
  cudaMemcpy(d_dweight, dweight, N_nodes*sizeof(double), cudaMemcpyHostToDevice);


  // Perform SAXPY on 1M element

  lege_inte2<<<(N+255)/256, 256>>>(N,1.0,  -3.0, 3.0, d_droot, d_dweight, d_dresult); /*This kerlnel works OK*/
  //lege_inte2_shared<<<(N+255)/256, 256>>>(N,  -3.0, 3.0,  d_dresult); /*why this one does not work? */





  cudaMemcpy(dresult, d_dresult, N*sizeof(double), cudaMemcpyDeviceToHost); 

  double maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = max(maxError, abs(dresult[i]-20.03574985));
  printf("Max error: %f in %i quadratures \n", maxError, N);
  printf("integral: %f  \n" ,dresult[0]);



  cudaFree(dresult);
  cudaFree(d_droot);
  cudaFree(d_dweight);

}

and a makefile to compile it:


objects = main.o 

all: $(objects)
        nvcc   -Xcompiler -std=c99 -arch=sm_20 $(objects) -o gauss

%.o: %.cpp
        nvcc -x cu -arch=sm_20  -I. -dc $< -o $@

clean:
        rm -f *.o gauss

Thanks in advance for any suggestion

c++

lundi 2 mars 2015

Sharing roots and weights for many Gauss-Legendre Quadrature in GPUs

Aucun commentaire:

Enregistrer un commentaire