Actual source code: bvorthogcuda.cu

slepc-3.8.0 2017-10-20
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-2017, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   BV orthogonalization routines (CUDA)
*/

#include <slepc/private/bvimpl.h>          /*I   "slepcbv.h"   I*/
#include <slepcblaslapack.h>

/*
   BV_CleanCoefficients_CUDA - Sets to zero all entries of column j of the bv buffer
*/
PetscErrorCode BV_CleanCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h)
{
  PetscErrorCode ierr;
  PetscScalar    *d_hh,*d_a;
  PetscInt       i;
  cudaError_t    cerr;

  if (!h) {
    ierr = VecCUDAGetArrayReadWrite(bv->buffer,&d_a);CHKERRQ(ierr);
    d_hh = d_a + j*(bv->nc+bv->m);
    cerr = cudaMemset(d_hh,0,(bv->nc+j)*sizeof(PetscScalar));CHKERRCUDA(cerr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = VecCUDARestoreArrayReadWrite(bv->buffer,&d_a);CHKERRQ(ierr);
  } else { /* cpu memory */
    for (i=0;i<bv->nc+j;i++) h[i] = 0.0;
  }
  return(0);
}
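
/*
   Layout note (inferred from the indexing above; not stated explicitly in the
   original file): the bv buffer appears to hold one coefficient column per BV
   column, each of length bv->nc+bv->m, with the bv->nc constraint entries
   stored first:

     buffer = [ col 0 | col 1 | ... ],  col j starts at offset j*(bv->nc+bv->m)
     col j  = [ c_0 ... c_{nc-1} | h_0 h_1 ... ]

   Hence "column j up to position j" means the nc constraint entries plus the
   first j coefficients, i.e. bv->nc+j scalars from the start of the column.
*/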

/*
   BV_AddCoefficients_CUDA - Adds the contents of the scratch (0-th column) of the bv buffer
   into column j of the bv buffer
*/
PetscErrorCode BV_AddCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *c)
{
  PetscErrorCode ierr;
  PetscScalar    *d_h,*d_c,sone=1.0;
  PetscInt       i;
  PetscBLASInt   one=1;
  cublasStatus_t cberr;

  if (!h) {
    ierr = VecCUDAGetArrayReadWrite(bv->buffer,&d_c);CHKERRQ(ierr);
    d_h = d_c + j*(bv->nc+bv->m);
    cberr = cublasXaxpy(cublasv2handle,bv->nc+j,&sone,d_c,one,d_h,one);CHKERRCUBLAS(cberr);  /* d_h += 1.0*d_c */
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = VecCUDARestoreArrayReadWrite(bv->buffer,&d_c);CHKERRQ(ierr);
  } else { /* cpu memory */
    for (i=0;i<bv->nc+j;i++) h[i] += c[i];
  }
  return(0);
}

/*
   BV_SetValue_CUDA - Sets value in row j (counted after the constraints) of column k
   of the coefficients array
*/
PetscErrorCode BV_SetValue_CUDA(BV bv,PetscInt j,PetscInt k,PetscScalar *h,PetscScalar value)
{
  PetscErrorCode ierr;
  PetscScalar    *d_h,*a;
  cudaError_t    cerr;

  if (!h) {
    ierr = VecCUDAGetArrayReadWrite(bv->buffer,&a);CHKERRQ(ierr);
    d_h = a + k*(bv->nc+bv->m) + bv->nc+j;
    cerr = cudaMemcpy(d_h,&value,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = VecCUDARestoreArrayReadWrite(bv->buffer,&a);CHKERRQ(ierr);
  } else { /* cpu memory */
    h[bv->nc+j] = value;
  }
  return(0);
}

/*
   BV_SquareSum_CUDA - Returns the value h'*h, where h represents the contents of the
   coefficients array (up to position j)
*/
PetscErrorCode BV_SquareSum_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *sum)
{
  PetscErrorCode    ierr;
  const PetscScalar *d_h;
  PetscScalar       dot;
  PetscInt          i;
  PetscBLASInt      one=1;
  cublasStatus_t    cberr;

  if (!h) {
    ierr = VecCUDAGetArrayRead(bv->buffer,&d_h);CHKERRQ(ierr);
    cberr = cublasXdotc(cublasv2handle,bv->nc+j,d_h,one,d_h,one,&dot);CHKERRCUBLAS(cberr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    *sum = PetscRealPart(dot);
    ierr = VecCUDARestoreArrayRead(bv->buffer,&d_h);CHKERRQ(ierr);
  } else { /* cpu memory */
    *sum = 0.0;
    for (i=0;i<bv->nc+j;i++) *sum += PetscRealPart(h[i]*PetscConj(h[i]));
  }
  return(0);
}
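
/*
   Worked example for BV_SquareSum_CUDA (complex case, assuming bv->nc+j = 2):
   for h = [1+i, 2] the conjugated dot product gives
     h'*h = conj(1+i)*(1+i) + conj(2)*2 = 2 + 4 = 6,
   so *sum = 6.  cublasXdotc conjugates its first argument, which is what makes
   the GPU branch agree with the h[i]*PetscConj(h[i]) loop of the CPU branch.
*/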

#define X_AXIS        0
#define BLOCK_SIZE_X 64
#define TILE_SIZE_X  16 /* number of entries handled by each thread on axis x */

/*
   SetGrid1D - Sets the kernel grid dimensions
   xcount: number of kernel calls needed for the requested size
*/
PetscErrorCode SetGrid1D(PetscInt n,dim3 *dimGrid,dim3 *dimBlock,PetscInt *xcount)
{
  PetscInt              one=1,card;
  struct cudaDeviceProp devprop;
  cudaError_t           cerr;

  *xcount = 1;
  if (n>BLOCK_SIZE_X) {
    dimBlock->x = BLOCK_SIZE_X;
    dimGrid->x = (n+BLOCK_SIZE_X*TILE_SIZE_X-one)/(BLOCK_SIZE_X*TILE_SIZE_X);
  } else {
    dimBlock->x = (n+TILE_SIZE_X-one)/TILE_SIZE_X;
    dimGrid->x = one;
  }
  cerr = cudaGetDevice(&card);CHKERRCUDA(cerr);
  cerr = cudaGetDeviceProperties(&devprop,card);CHKERRCUDA(cerr);
  if (dimGrid->x>(unsigned)devprop.maxGridSize[X_AXIS]) {
    *xcount = (dimGrid->x+devprop.maxGridSize[X_AXIS]-one)/devprop.maxGridSize[X_AXIS];
    dimGrid->x = devprop.maxGridSize[X_AXIS];
  }
  return(0);
}
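
/*
   Usage sketch (hypothetical caller; the real pattern appears in
   BV_ApplySignature_CUDA below): when n is so large that the grid would
   exceed devprop.maxGridSize[X_AXIS], SetGrid1D caps the grid and returns
   xcount>1, and the caller launches the same kernel xcount times, passing
   the launch index so that each launch works on a disjoint slice:

     PetscErrorCode ierr;
     PetscInt       i,xcount;
     dim3           blocks3d,threads3d;
     cudaError_t    cerr;

     ierr = SetGrid1D(n,&blocks3d,&threads3d,&xcount);CHKERRQ(ierr);
     for (i=0;i<xcount;i++) {
       PointwiseMult_kernel<<<blocks3d,threads3d>>>(i,d_a,d_b,n);
     }
     cerr = cudaGetLastError();CHKERRCUDA(cerr);
     ierr = WaitForGPU();CHKERRCUDA(ierr);
*/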

/* pointwise multiplication */
__global__ void PointwiseMult_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
{
  PetscInt i,x;

  x = (xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x)*TILE_SIZE_X;
  for (i=x;i<x+TILE_SIZE_X&&i<n;i++) {
    a[i] *= PetscRealPart(b[i]);
  }
}

/* pointwise division */
__global__ void PointwiseDiv_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
{
  PetscInt i,x;

  x = (xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x)*TILE_SIZE_X;
  for (i=x;i<x+TILE_SIZE_X&&i<n;i++) {
    a[i] /= PetscRealPart(b[i]);
  }
}
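
/*
   Index arithmetic, worked through: each thread handles TILE_SIZE_X
   consecutive entries, so a block covers BLOCK_SIZE_X*TILE_SIZE_X = 64*16 =
   1024 entries and launch k starts at offset k*gridDim.x*blockDim.x*TILE_SIZE_X.
   For launch 0, block 1, thread 2 the first entry is
     x = (0 + 1*64 + 2)*16 = 1056,
   and the i<n guard trims the final partial tile.
*/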

/*
   BV_ApplySignature_CUDA - Computes the pointwise product h*omega, where h represents
   the contents of the coefficients array (up to position j) and omega is the signature;
   if inverse=TRUE then the operation is h/omega
*/
PetscErrorCode BV_ApplySignature_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscBool inverse)
{
  PetscErrorCode    ierr;
  PetscScalar       *d_h;
  const PetscScalar *d_omega,*omega;
  PetscInt          i,xcount;
  dim3              blocks3d,threads3d;
  cudaError_t       cerr;

  if (!(bv->nc+j)) return(0);
  if (!h) {
    ierr = VecCUDAGetArrayReadWrite(bv->buffer,&d_h);CHKERRQ(ierr);
    ierr = VecCUDAGetArrayRead(bv->omega,&d_omega);CHKERRQ(ierr);
    ierr = SetGrid1D(bv->nc+j,&blocks3d,&threads3d,&xcount);CHKERRQ(ierr);
    if (inverse) {
      for (i=0;i<xcount;i++) {
        PointwiseDiv_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
      }
    } else {
      for (i=0;i<xcount;i++) {
        PointwiseMult_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
      }
    }
    cerr = cudaGetLastError();CHKERRCUDA(cerr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = VecCUDARestoreArrayRead(bv->omega,&d_omega);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayReadWrite(bv->buffer,&d_h);CHKERRQ(ierr);
  } else {
    ierr = VecGetArrayRead(bv->omega,&omega);CHKERRQ(ierr);
    if (inverse) for (i=0;i<bv->nc+j;i++) h[i] /= PetscRealPart(omega[i]);
    else for (i=0;i<bv->nc+j;i++) h[i] *= PetscRealPart(omega[i]);
    ierr = VecRestoreArrayRead(bv->omega,&omega);CHKERRQ(ierr);
  }
  return(0);
}

/*
   BV_SquareRoot_CUDA - Returns the square root of position j (counted after the constraints)
   of the coefficients array
*/
PetscErrorCode BV_SquareRoot_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *beta)
{
  PetscErrorCode    ierr;
  const PetscScalar *d_h;
  PetscScalar       hh;
  cudaError_t       cerr;

  if (!h) {
    ierr = VecCUDAGetArrayRead(bv->buffer,&d_h);CHKERRQ(ierr);
    cerr = cudaMemcpy(&hh,d_h+bv->nc+j,sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = BV_SafeSqrt(bv,hh,beta);CHKERRQ(ierr);
    ierr = VecCUDARestoreArrayRead(bv->buffer,&d_h);CHKERRQ(ierr);
  } else {
    ierr = BV_SafeSqrt(bv,h[bv->nc+j],beta);CHKERRQ(ierr);
  }
  return(0);
}

/*
   BV_StoreCoefficients_CUDA - Copies the contents of the coefficients array to an array dest
   provided by the caller (only values from l to j are copied)
*/
PetscErrorCode BV_StoreCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *dest)
{
  PetscErrorCode    ierr;
  const PetscScalar *d_h,*d_a;
  PetscInt          i;
  cudaError_t       cerr;

  if (!h) {
    ierr = VecCUDAGetArrayRead(bv->buffer,&d_a);CHKERRQ(ierr);
    d_h = d_a + j*(bv->nc+bv->m)+bv->nc;
    cerr = cudaMemcpy(dest,d_h+bv->l,(j-bv->l)*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    ierr = WaitForGPU();CHKERRCUDA(ierr);
    ierr = VecCUDARestoreArrayRead(bv->buffer,&d_a);CHKERRQ(ierr);
  } else {
    for (i=bv->l;i<j;i++) dest[i-bv->l] = h[bv->nc+i];
  }
  return(0);
}
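
/*
   Equivalence check for the two branches above: with bv->l=2 and j=5 the
   device branch copies the 3 scalars at column offsets nc+2..nc+4 into
   dest[0..2], exactly what the host loop dest[i-l] = h[nc+i], i=2..4, does.
*/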