Actual source code: bvblas.c

slepc-3.8.0 2017-10-20
Report Typos and Errors
  1: /*
  2:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  3:    SLEPc - Scalable Library for Eigenvalue Problem Computations
  4:    Copyright (c) 2002-2017, Universitat Politecnica de Valencia, Spain

  6:    This file is part of SLEPc.
  7:    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
  8:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  9: */
 10: /*
 11:    BV private kernels that use the BLAS
 12: */

 14: #include <slepc/private/bvimpl.h>
 15: #include <slepcblaslapack.h>

 17: #define BLOCKSIZE 64

 19: /*
 20:     C := alpha*A*B + beta*C

 22:     A is mxk (ld=m), B is kxn (ld=ldb), C is mxn (ld=m)
 23: */
 24: PetscErrorCode BVMult_BLAS_Private(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,PetscInt ldb_,PetscScalar alpha,const PetscScalar *A,const PetscScalar *B,PetscScalar beta,PetscScalar *C)
 25: {
 27:   PetscBLASInt   m,n,k,ldb;
 28: #if defined(PETSC_HAVE_FBLASLAPACK) || defined(PETSC_HAVE_F2CBLASLAPACK)
 29:   PetscBLASInt   l,bs=BLOCKSIZE;
 30: #endif

 33:   PetscBLASIntCast(m_,&m);
 34:   PetscBLASIntCast(n_,&n);
 35:   PetscBLASIntCast(k_,&k);
 36:   PetscBLASIntCast(ldb_,&ldb);
 37: #if defined(PETSC_HAVE_FBLASLAPACK) || defined(PETSC_HAVE_F2CBLASLAPACK)
 38:   l = m % bs;
 39:   if (l) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&k,&alpha,(PetscScalar*)A,&m,(PetscScalar*)B,&ldb,&beta,C,&m));
 40:   for (;l<m;l+=bs) {
 41:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bs,&n,&k,&alpha,(PetscScalar*)A+l,&m,(PetscScalar*)B,&ldb,&beta,C+l,&m));
 42:   }
 43: #else
 44:   if (m) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&m,&n,&k,&alpha,(PetscScalar*)A,&m,(PetscScalar*)B,&ldb,&beta,C,&m));
 45: #endif
 46:   PetscLogFlops(2.0*m*n*k);
 47:   return(0);
 48: }

 50: /*
 51:     y := alpha*A*x + beta*y

 53:     A is nxk (ld=n)
 54: */
 55: PetscErrorCode BVMultVec_BLAS_Private(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *A,const PetscScalar *x,PetscScalar beta,PetscScalar *y)
 56: {
 58:   PetscBLASInt   n,k,one=1;

 61:   PetscBLASIntCast(n_,&n);
 62:   PetscBLASIntCast(k_,&k);
 63:   if (n) PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&n,&k,&alpha,A,&n,x,&one,&beta,y,&one));
 64:   PetscLogFlops(2.0*n*k);
 65:   return(0);
 66: }

 68: /*
 69:     A(:,s:e-1) := A*B(:,s:e-1)

 71:     A is mxk (ld=m), B is kxn (ld=ldb)  n=e-s
 72: */
 73: PetscErrorCode BVMultInPlace_BLAS_Private(BV bv,PetscInt m_,PetscInt k_,PetscInt ldb_,PetscInt s,PetscInt e,PetscScalar *A,const PetscScalar *B,PetscBool btrans)
 74: {
 76:   PetscScalar    *pb,zero=0.0,one=1.0;
 77:   PetscBLASInt   m,n,k,l,ldb,bs=BLOCKSIZE;
 78:   PetscInt       j,n_=e-s;
 79:   const char     *bt;

 82:   PetscBLASIntCast(m_,&m);
 83:   PetscBLASIntCast(n_,&n);
 84:   PetscBLASIntCast(k_,&k);
 85:   PetscBLASIntCast(ldb_,&ldb);
 86:   BVAllocateWork_Private(bv,BLOCKSIZE*n_);
 87:   if (btrans) {
 88:     pb = (PetscScalar*)B+s;
 89:     bt = "C";
 90:   } else {
 91:     pb = (PetscScalar*)B+s*ldb;
 92:     bt = "N";
 93:   }
 94:   l = m % bs;
 95:   if (l) {
 96:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N",bt,&l,&n,&k,&one,A,&m,pb,&ldb,&zero,bv->work,&l));
 97:     for (j=0;j<n;j++) {
 98:       PetscMemcpy(A+(s+j)*m,bv->work+j*l,l*sizeof(PetscScalar));
 99:     }
100:   }
101:   for (;l<m;l+=bs) {
102:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N",bt,&bs,&n,&k,&one,A+l,&m,pb,&ldb,&zero,bv->work,&bs));
103:     for (j=0;j<n;j++) {
104:       PetscMemcpy(A+(s+j)*m+l,bv->work+j*bs,bs*sizeof(PetscScalar));
105:     }
106:   }
107:   PetscLogFlops(2.0*m*n*k);
108:   return(0);
109: }

111: /*
112:     V := V*B

114:     V is mxn (ld=m), B is nxn (ld=k)
115: */
116: PetscErrorCode BVMultInPlace_Vecs_Private(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,Vec *V,const PetscScalar *B,PetscBool btrans)
117: {
118:   PetscErrorCode    ierr;
119:   PetscScalar       zero=0.0,one=1.0,*out,*pout;
120:   const PetscScalar *pin;
121:   PetscBLASInt      m,n,k,l,bs=BLOCKSIZE;
122:   PetscInt          j;
123:   const char        *bt;

126:   PetscBLASIntCast(m_,&m);
127:   PetscBLASIntCast(n_,&n);
128:   PetscBLASIntCast(k_,&k);
129:   BVAllocateWork_Private(bv,2*BLOCKSIZE*n_);
130:   out = bv->work+BLOCKSIZE*n_;
131:   if (btrans) bt = "C";
132:   else bt = "N";
133:   l = m % bs;
134:   if (l) {
135:     for (j=0;j<n;j++) {
136:       VecGetArrayRead(V[j],&pin);
137:       PetscMemcpy(bv->work+j*l,pin,l*sizeof(PetscScalar));
138:       VecRestoreArrayRead(V[j],&pin);
139:     }
140:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N",bt,&l,&n,&n,&one,bv->work,&l,(PetscScalar*)B,&k,&zero,out,&l));
141:     for (j=0;j<n;j++) {
142:       VecGetArray(V[j],&pout);
143:       PetscMemcpy(pout,out+j*l,l*sizeof(PetscScalar));
144:       VecRestoreArray(V[j],&pout);
145:     }
146:   }
147:   for (;l<m;l+=bs) {
148:     for (j=0;j<n;j++) {
149:       VecGetArrayRead(V[j],&pin);
150:       PetscMemcpy(bv->work+j*bs,pin+l,bs*sizeof(PetscScalar));
151:       VecRestoreArrayRead(V[j],&pin);
152:     }
153:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N",bt,&bs,&n,&n,&one,bv->work,&bs,(PetscScalar*)B,&k,&zero,out,&bs));
154:     for (j=0;j<n;j++) {
155:       VecGetArray(V[j],&pout);
156:       PetscMemcpy(pout+l,out+j*bs,bs*sizeof(PetscScalar));
157:       VecRestoreArray(V[j],&pout);
158:     }
159:   }
160:   PetscLogFlops(2.0*n*n*k);
161:   return(0);
162: }

164: /*
165:     B := alpha*A + beta*B

167:     A,B are nxk (ld=n)
168: */
169: PetscErrorCode BVAXPY_BLAS_Private(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *A,PetscScalar beta,PetscScalar *B)
170: {
172:   PetscBLASInt   m,one=1;

175:   PetscBLASIntCast(n_*k_,&m);
176:   if (beta!=(PetscScalar)1.0) {
177:     PetscStackCallBLAS("BLASscal",BLASscal_(&m,&beta,B,&one));
178:     PetscLogFlops(m);
179:   }
180:   PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&m,&alpha,A,&one,B,&one));
181:   PetscLogFlops(2.0*m);
182:   return(0);
183: }

185: /*
186:     C := A'*B

188:     A' is mxk (ld=k), B is kxn (ld=k), C is mxn (ld=ldc)
189: */
190: PetscErrorCode BVDot_BLAS_Private(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,PetscInt ldc_,const PetscScalar *A,const PetscScalar *B,PetscScalar *C,PetscBool mpi)
191: {
193:   PetscScalar    zero=0.0,one=1.0,*CC;
194:   PetscBLASInt   m,n,k,ldc,j;
195:   PetscMPIInt    len;

198:   PetscBLASIntCast(m_,&m);
199:   PetscBLASIntCast(n_,&n);
200:   PetscBLASIntCast(k_,&k);
201:   PetscBLASIntCast(ldc_,&ldc);
202:   if (mpi) {
203:     if (ldc==m) {
204:       BVAllocateWork_Private(bv,m*n);
205:       if (k) PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&m,&n,&k,&one,(PetscScalar*)A,&k,(PetscScalar*)B,&k,&zero,bv->work,&ldc));
206:       else { PetscMemzero(bv->work,m*n*sizeof(PetscScalar)); }
207:       PetscMPIIntCast(m*n,&len);
208:       MPI_Allreduce(bv->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv));
209:     } else {
210:       BVAllocateWork_Private(bv,2*m*n);
211:       CC = bv->work+m*n;
212:       if (k) PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&m,&n,&k,&one,(PetscScalar*)A,&k,(PetscScalar*)B,&k,&zero,bv->work,&m));
213:       else { PetscMemzero(bv->work,m*n*sizeof(PetscScalar)); }
214:       PetscMPIIntCast(m*n,&len);
215:       MPI_Allreduce(bv->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv));
216:       for (j=0;j<n;j++) {
217:         PetscMemcpy(C+j*ldc,CC+j*m,m*sizeof(PetscScalar));
218:       }
219:     }
220:   } else {
221:     if (k) PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&m,&n,&k,&one,(PetscScalar*)A,&k,(PetscScalar*)B,&k,&zero,C,&ldc));
222:   }
223:   PetscLogFlops(2.0*m*n*k);
224:   return(0);
225: }

227: /*
228:     y := A'*x

230:     A is nxk (ld=n)
231: */
232: PetscErrorCode BVDotVec_BLAS_Private(BV bv,PetscInt n_,PetscInt k_,const PetscScalar *A,const PetscScalar *x,PetscScalar *y,PetscBool mpi)
233: {
235:   PetscScalar    zero=0.0,done=1.0;
236:   PetscBLASInt   n,k,one=1;
237:   PetscMPIInt    len;

240:   PetscBLASIntCast(n_,&n);
241:   PetscBLASIntCast(k_,&k);
242:   if (mpi) {
243:     BVAllocateWork_Private(bv,k);
244:     if (n) {
245:       PetscStackCallBLAS("BLASgemv",BLASgemv_("C",&n,&k,&done,A,&n,x,&one,&zero,bv->work,&one));
246:     } else {
247:       PetscMemzero(bv->work,k*sizeof(PetscScalar));
248:     }
249:     PetscMPIIntCast(k,&len);
250:     MPI_Allreduce(bv->work,y,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv));
251:   } else {
252:     if (n) PetscStackCallBLAS("BLASgemv",BLASgemv_("C",&n,&k,&done,A,&n,x,&one,&zero,y,&one));
253:   }
254:   PetscLogFlops(2.0*n*k);
255:   return(0);
256: }

258: /*
259:     Scale n scalars
260: */
261: PetscErrorCode BVScale_BLAS_Private(BV bv,PetscInt n_,PetscScalar *A,PetscScalar alpha)
262: {
264:   PetscBLASInt   n,one=1;

267:   if (alpha == (PetscScalar)0.0) {
268:     PetscMemzero(A,n_*sizeof(PetscScalar));
269:   } else if (alpha!=(PetscScalar)1.0) {
270:     PetscBLASIntCast(n_,&n);
271:     PetscStackCallBLAS("BLASscal",BLASscal_(&n,&alpha,A,&one));
272:     PetscLogFlops(n);
273:   }
274:   return(0);
275: }

277: /*
278:     Compute ||A|| for an mxn matrix
279: */
280: PetscErrorCode BVNorm_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,NormType type,PetscReal *nrm,PetscBool mpi)
281: {
283:   PetscBLASInt   m,n,i,j;
284:   PetscMPIInt    len;
285:   PetscReal      lnrm,*rwork=NULL,*rwork2=NULL;

288:   PetscFPTrapPush(PETSC_FP_TRAP_OFF);
289:   PetscBLASIntCast(m_,&m);
290:   PetscBLASIntCast(n_,&n);
291:   if (type==NORM_FROBENIUS || type==NORM_2) {
292:     lnrm = LAPACKlange_("F",&m,&n,(PetscScalar*)A,&m,rwork);
293:     if (mpi) {
294:       lnrm = lnrm*lnrm;
295:       MPI_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)bv));
296:       *nrm = PetscSqrtReal(*nrm);
297:     } else *nrm = lnrm;
298:     PetscLogFlops(2.0*m*n);
299:   } else if (type==NORM_1) {
300:     if (mpi) {
301:       BVAllocateWork_Private(bv,2*n_);
302:       rwork = (PetscReal*)bv->work;
303:       rwork2 = rwork+n_;
304:       PetscMemzero(rwork,n_*sizeof(PetscReal));
305:       PetscMemzero(rwork2,n_*sizeof(PetscReal));
306:       for (j=0;j<n_;j++) {
307:         for (i=0;i<m_;i++) {
308:           rwork[j] += PetscAbsScalar(A[i+j*m_]);
309:         }
310:       }
311:       PetscMPIIntCast(n_,&len);
312:       MPI_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)bv));
313:       *nrm = 0.0;
314:       for (j=0;j<n_;j++) if (rwork2[j] > *nrm) *nrm = rwork2[j];
315:     } else {
316:       *nrm = LAPACKlange_("O",&m,&n,(PetscScalar*)A,&m,rwork);
317:     }
318:     PetscLogFlops(1.0*m*n);
319:   } else if (type==NORM_INFINITY) {
320:     BVAllocateWork_Private(bv,m_);
321:     rwork = (PetscReal*)bv->work;
322:     lnrm = LAPACKlange_("I",&m,&n,(PetscScalar*)A,&m,rwork);
323:     if (mpi) {
324:       MPI_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)bv));
325:     } else *nrm = lnrm;
326:     PetscLogFlops(1.0*m*n);
327:   }
328:   PetscFPTrapPop();
329:   return(0);
330: }

332: /*
333:     QR factorization of an mxn matrix
334: */
335: PetscErrorCode BVOrthogonalize_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscScalar *R)
336: {
337: #if defined(PETSC_MISSING_LAPACK_GEQRF) || defined(PETSC_MISSING_LAPACK_ORGQR)
339:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"GEQRF/ORGQR - Lapack routines are unavailable");
340: #else
342:   PetscBLASInt   m,n,i,j,k,l,nb,lwork,info;
343:   PetscScalar    *tau,*work,*Rl=NULL,*A=NULL,*C=NULL,one=1.0,zero=0.0;
344:   PetscMPIInt    rank,size,len;
345:   PetscBool      mpi;

348:   PetscFPTrapPush(PETSC_FP_TRAP_OFF);
349:   PetscBLASIntCast(m_,&m);
350:   PetscBLASIntCast(n_,&n);
351:   k = PetscMin(m,n);
352:   nb = 16;
353:   MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size);
354:   if (size>2) SETERRQ(PetscObjectComm((PetscObject)bv),1,"Not implemented yet for more than two MPI processes");
355:   mpi = size>1? PETSC_TRUE: PETSC_FALSE;
356:   if (mpi) {
357:     MPI_Comm_rank(PetscObjectComm((PetscObject)bv),&rank);
358:     BVAllocateWork_Private(bv,k+n*nb+n*n+n*n*size+m*n);
359:   } else {
360:     BVAllocateWork_Private(bv,k+n*nb);
361:    }
362:   tau = bv->work;
363:   work = bv->work+k;
364:   PetscBLASIntCast(n*nb,&lwork);
365:   if (mpi) {
366:     Rl = bv->work+k+n*nb;
367:     A  = bv->work+k+n*nb+n*n;
368:     C  = bv->work+k+n*nb+n*n+n*n*size;
369:   }

371:   /* Compute QR */
372:   PetscStackCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,Q,&m,tau,work,&lwork,&info));
373:   SlepcCheckLapackInfo("geqrf",info);

375:   /* Extract R */
376:   if (R || mpi) {
377:     PetscMemzero(mpi? Rl: R,n*n*sizeof(PetscScalar));
378:     for (j=0;j<n;j++) {
379:       for (i=0;i<=j;i++) {
380:         if (mpi) Rl[i+j*n] = Q[i+j*m];
381:         else R[i+j*n] = Q[i+j*m];
382:       }
383:     }
384:   }

386:   /* Compute orthogonal matrix in Q */
387:   PetscStackCallBLAS("LAPACKungqr",LAPACKungqr_(&m,&n,&k,Q,&m,tau,work,&lwork,&info));
388:   SlepcCheckLapackInfo("ungqr",info);

390:   if (mpi) {

392:     /* Stack triangular matrices */
393:     PetscBLASIntCast(n*size,&l);
394:     PetscMPIIntCast(n,&len);
395:     for (j=0;j<n;j++) {
396:       MPI_Allgather(Rl+j*n,len,MPIU_SCALAR,A+j*l,len,MPIU_SCALAR,PetscObjectComm((PetscObject)bv));
397:     }

399:     /* Compute QR */
400:     PetscStackCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&l,&n,A,&l,tau,work,&lwork,&info));
401:     SlepcCheckLapackInfo("geqrf",info);

403:     /* Extract R */
404:     if (R) {
405:       PetscMemzero(R,n*n*sizeof(PetscScalar));
406:       for (j=0;j<n;j++)
407:         for (i=0;i<=j;i++)
408:           R[i+j*n] = A[i+j*l];
409:     }

411:     /* Accumulate orthogonal matrix */
412:     PetscStackCallBLAS("LAPACKungqr",LAPACKungqr_(&l,&n,&n,A,&l,tau,work,&lwork,&info));
413:     SlepcCheckLapackInfo("ungqr",info);
414:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&m,&n,&n,&one,Q,&m,A+rank*n,&l,&zero,C,&m));
415:     PetscMemcpy(Q,C,m*n*sizeof(PetscScalar));
416:   }

418:   PetscLogFlops(3.0*m*n*n);
419:   PetscFPTrapPop();
420:   return(0);
421: #endif
422: }