Actual source code: fnexp.c

slepc-3.8.0 2017-10-20
Report Typos and Errors
  1: /*
  2:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  3:    SLEPc - Scalable Library for Eigenvalue Problem Computations
  4:    Copyright (c) 2002-2017, Universitat Politecnica de Valencia, Spain

  6:    This file is part of SLEPc.
  7:    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
  8:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  9: */
 10: /*
 11:    Exponential function  exp(x)
 12: */

 14: #include <slepc/private/fnimpl.h>      /*I "slepcfn.h" I*/
 15: #include <slepcblaslapack.h>

 17: PetscErrorCode FNEvaluateFunction_Exp(FN fn,PetscScalar x,PetscScalar *y)
 18: {
 20:   *y = PetscExpScalar(x);
 21:   return(0);
 22: }

 24: PetscErrorCode FNEvaluateDerivative_Exp(FN fn,PetscScalar x,PetscScalar *y)
 25: {
 27:   *y = PetscExpScalar(x);
 28:   return(0);
 29: }

 31: #define MAX_PADE 6
 32: #define SWAP(a,b,t) {t=a;a=b;b=t;}

 34: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade(FN fn,Mat A,Mat B)
 35: {
 36: #if defined(PETSC_MISSING_LAPACK_GESV) || defined(SLEPC_MISSING_LAPACK_LANGE)
 38:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"GESV/LANGE - Lapack routines are unavailable");
 39: #else
 41:   PetscBLASInt   n,ld,ld2,*ipiv,info,inc=1;
 42:   PetscInt       m,j,k,sexp;
 43:   PetscBool      odd;
 44:   const PetscInt p=MAX_PADE;
 45:   PetscReal      c[MAX_PADE+1],s,*rwork;
 46:   PetscScalar    scale,mone=-1.0,one=1.0,two=2.0,zero=0.0;
 47:   PetscScalar    *Aa,*Ba,*As,*A2,*Q,*P,*W,*aux;

 50:   MatDenseGetArray(A,&Aa);
 51:   MatDenseGetArray(B,&Ba);
 52:   MatGetSize(A,&m,NULL);
 53:   PetscBLASIntCast(m,&n);
 54:   ld  = n;
 55:   ld2 = ld*ld;
 56:   P   = Ba;
 57:   PetscMalloc6(m*m,&Q,m*m,&W,m*m,&As,m*m,&A2,ld,&rwork,ld,&ipiv);
 58:   PetscMemcpy(As,Aa,ld2*sizeof(PetscScalar));

 60:   /* Pade' coefficients */
 61:   c[0] = 1.0;
 62:   for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));

 64:   /* Scaling */
 65:   s = LAPACKlange_("I",&n,&n,As,&ld,rwork);
 66:   PetscLogFlops(1.0*n*n);
 67:   if (s>0.5) {
 68:     sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
 69:     scale = PetscPowRealInt(2.0,-sexp);
 70:     PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&scale,As,&inc));
 71:     PetscLogFlops(1.0*n*n);
 72:   } else sexp = 0;

 74:   /* Horner evaluation */
 75:   PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,As,&ld,As,&ld,&zero,A2,&ld));
 76:   PetscLogFlops(2.0*n*n*n);
 77:   PetscMemzero(Q,ld2*sizeof(PetscScalar));
 78:   PetscMemzero(P,ld2*sizeof(PetscScalar));
 79:   for (j=0;j<n;j++) {
 80:     Q[j+j*ld] = c[p];
 81:     P[j+j*ld] = c[p-1];
 82:   }

 84:   odd = PETSC_TRUE;
 85:   for (k=p-1;k>0;k--) {
 86:     if (odd) {
 87:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,A2,&ld,&zero,W,&ld));
 88:       SWAP(Q,W,aux);
 89:       for (j=0;j<n;j++) Q[j+j*ld] += c[k-1];
 90:       odd = PETSC_FALSE;
 91:     } else {
 92:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,A2,&ld,&zero,W,&ld));
 93:       SWAP(P,W,aux);
 94:       for (j=0;j<n;j++) P[j+j*ld] += c[k-1];
 95:       odd = PETSC_TRUE;
 96:     }
 97:     PetscLogFlops(2.0*n*n*n);
 98:   }
 99:   if (odd) {
100:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,As,&ld,&zero,W,&ld));
101:     SWAP(Q,W,aux);
102:     PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
103:     PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
104:     SlepcCheckLapackInfo("gesv",info);
105:     PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
106:     for (j=0;j<n;j++) P[j+j*ld] += 1.0;
107:     PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&mone,P,&inc));
108:   } else {
109:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,As,&ld,&zero,W,&ld));
110:     SWAP(P,W,aux);
111:     PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
112:     PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
113:     SlepcCheckLapackInfo("gesv",info);
114:     PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
115:     for (j=0;j<n;j++) P[j+j*ld] += 1.0;
116:   }
117:   PetscLogFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n);

119:   for (k=1;k<=sexp;k++) {
120:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,P,&ld,&zero,W,&ld));
121:     PetscMemcpy(P,W,ld2*sizeof(PetscScalar));
122:   }
123:   if (P!=Ba) { PetscMemcpy(Ba,P,ld2*sizeof(PetscScalar)); }
124:   PetscLogFlops(2.0*n*n*n*sexp);

126:   PetscFree6(Q,W,As,A2,rwork,ipiv);
127:   MatDenseRestoreArray(A,&Aa);
128:   MatDenseRestoreArray(B,&Ba);
129:   return(0);
130: #endif
131: }

133: #define ITMAX 5

135: /*
136:  * Estimate norm(A^m,1) by block 1-norm power method (required workspace is 11*n)
137:  */
138: static PetscReal normest1(PetscBLASInt n,PetscScalar *A,PetscInt m,PetscScalar *work,PetscRandom rand)
139: {
140:   PetscScalar  *X,*Y,*Z,*S,*S_old,*aux,val,sone=1.0,szero=0.0;
141:   PetscReal    est=0.0,est_old,vals[2],*zvals,maxzval[2],raux;
142:   PetscBLASInt i,j,t=2,it=0,ind[2],est_j=0,m1;

145:   X = work;
146:   Y = work + 2*n;
147:   Z = work + 4*n;
148:   S = work + 6*n;
149:   S_old = work + 8*n;
150:   zvals = (PetscReal*)(work + 10*n);

152:   for (i=0;i<n;i++) {  /* X has columns of unit 1-norm */
153:     X[i] = 1.0/n;
154:     PetscRandomGetValue(rand,&val);
155:     if (PetscRealPart(val) < 0.5) X[i+n] = -1.0/n;
156:     else X[i+n] = 1.0/n;
157:   }
158:   for (i=0;i<t*n;i++) S[i] = 0.0;
159:   ind[0] = 0; ind[1] = 0;
160:   est_old = 0;
161:   while (1) {
162:     it++;
163:     for (j=0;j<m;j++) {  /* Y = A^m*X */
164:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&t,&n,&sone,A,&n,X,&n,&szero,Y,&n));
165:       if (j<m-1) SWAP(X,Y,aux);
166:     }
167:     for (j=0;j<t;j++) {  /* vals[j] = norm(Y(:,j),1) */
168:       vals[j] = 0.0;
169:       for (i=0;i<n;i++) vals[j] += PetscAbsScalar(Y[i+j*n]);
170:     }
171:     if (vals[0]<vals[1]) {
172:       SWAP(vals[0],vals[1],raux);
173:       m1 = 1;
174:     } else m1 = 0;
175:     est = vals[0];
176:     if (est>est_old || it==2) est_j = ind[m1];
177:     if (it>=2 && est<=est_old) {
178:       est = est_old;
179:       break;
180:     }
181:     est_old = est;
182:     if (it>ITMAX) break;
183:     SWAP(S,S_old,aux);
184:     for (i=0;i<t*n;i++) {  /* S = sign(Y) */
185:       S[i] = (PetscRealPart(Y[i]) < 0.0)? -1.0: 1.0;
186:     }
187:     for (j=0;j<m;j++) {  /* Z = (A^T)^m*S */
188:       PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&n,&t,&n,&sone,A,&n,S,&n,&szero,Z,&n));
189:       if (j<m-1) SWAP(S,Z,aux);
190:     }
191:     maxzval[0] = -1; maxzval[1] = -1;
192:     ind[0] = 0; ind[1] = 0;
193:     for (i=0;i<n;i++) {  /* zvals[i] = norm(Z(i,:),inf) */
194:       zvals[i] = PetscMax(PetscAbsScalar(Z[i+0*n]),PetscAbsScalar(Z[i+1*n]));
195:       if (zvals[i]>maxzval[0]) {
196:         maxzval[0] = zvals[i];
197:         ind[0] = i;
198:       } else if (zvals[i]>maxzval[1]) {
199:         maxzval[1] = zvals[i];
200:         ind[1] = i;
201:       }
202:     }
203:     if (it>=2 && maxzval[0]==zvals[est_j]) break;
204:     for (i=0;i<t*n;i++) X[i] = 0.0;
205:     for (j=0;j<t;j++) X[ind[j]+j*n] = 1.0;
206:   }
207:   /* Flop count is roughly (it * 2*m * t*gemv) = 4*its*m*t*n*n */
208:   PetscLogFlops(4.0*it*m*t*n*n);
209:   PetscFunctionReturn(est);
210: }

212: #define SMALLN 100

214: /*
215:  * Estimate norm(A^m,1) (required workspace is 2*n*n)
216:  */
217: static PetscReal normAm(PetscBLASInt n,PetscScalar *A,PetscInt m,PetscScalar *work,PetscRandom rand)
218: {
219:   PetscScalar  *v=work,*w=work+n*n,*aux,sone=1.0,szero=0.0;
220:   PetscReal    nrm,rwork[1],tmp;
221:   PetscBLASInt i,j,one=1;
222:   PetscBool    isrealpos=PETSC_TRUE;

225:   if (n<SMALLN) {   /* compute matrix power explicitly */
226:     if (m==1) {
227:       nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
228:       PetscLogFlops(1.0*n*n);
229:     } else {  /* m>=2 */
230:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,A,&n,A,&n,&szero,v,&n));
231:       for (j=0;j<m-2;j++) {
232:         PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,A,&n,v,&n,&szero,w,&n));
233:         SWAP(v,w,aux);
234:       }
235:       nrm = LAPACKlange_("O",&n,&n,v,&n,rwork);
236:       PetscLogFlops(2.0*n*n*n*(m-1)+1.0*n*n);
237:     }
238:   } else {
239:     for (i=0;i<n;i++)
240:       for (j=0;j<n;j++) 
241: #if defined(PETSC_USE_COMPLEX)
242:         if (PetscRealPart(A[i+j*n])<0.0 || PetscImaginaryPart(A[i+j*n])!=0.0) { isrealpos = PETSC_FALSE; break; }
243: #else
244:         if (A[i+j*n]<0.0) { isrealpos = PETSC_FALSE; break; }
245: #endif
246:     if (isrealpos) {   /* for positive matrices only */
247:       for (i=0;i<n;i++) v[i] = 1.0;
248:       for (j=0;j<m;j++) {  /* w = A'*v */
249:         PetscStackCallBLAS("BLASgemv",BLASgemv_("C",&n,&n,&sone,A,&n,v,&one,&szero,w,&one));
250:         SWAP(v,w,aux);
251:       }
252:       PetscLogFlops(2.0*n*n*m);
253:       nrm = 0.0;
254:       for (i=0;i<n;i++) if ((tmp = PetscAbsScalar(v[i])) > nrm) nrm = tmp;   /* norm(v,inf) */
255:     } else {
256:       nrm = normest1(n,A,m,work,rand);
257:     }
258:   }
259:   PetscFunctionReturn(nrm);
260: }

262: /*
263:  * Function needed to compute optimal parameters (required workspace is 3*n*n)
264:  */
265: static PetscInt ell(PetscBLASInt n,PetscScalar *A,PetscReal coeff,PetscInt m,PetscScalar *work,PetscRandom rand)
266: {
267:   PetscScalar  *Ascaled=work;
268:   PetscReal    nrm,alpha,beta,rwork[1];
269:   PetscInt     t;
270:   PetscBLASInt i,j;

273:   beta = PetscPowReal(coeff,1.0/(2*m+1));
274:   for (i=0;i<n;i++)
275:     for (j=0;j<n;j++) 
276:       Ascaled[i+j*n] = beta*PetscAbsScalar(A[i+j*n]);
277:   nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
278:   PetscLogFlops(2.0*n*n);
279:   alpha = normAm(n,Ascaled,2*m+1,work+n*n,rand)/nrm;
280:   t = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(2.0*alpha/PETSC_MACHINE_EPSILON)/PetscLogReal(2.0)/(2*m)),0);
281:   PetscFunctionReturn(t);
282: }

284: /*
285:  * Compute scaling parameter (s) and order of Pade approximant (m)  (required workspace is 4*n*n)
286:  */
287: static PetscErrorCode expm_params(PetscInt n,PetscScalar **Apowers,PetscInt *s,PetscInt *m,PetscScalar *work)
288: {
289:   PetscErrorCode  ierr;
290:   PetscScalar     sfactor,sone=1.0,szero=0.0,*A=Apowers[0],*Ascaled;
291:   PetscReal       d4,d6,d8,d10,eta1,eta3,eta4,eta5,rwork[1];
292:   PetscBLASInt    n_,n2,one=1;
293:   PetscRandom     rand;
294:   const PetscReal coeff[5] = { 9.92063492063492e-06, 9.94131285136576e-11,  /* backward error function */
295:                                2.22819456055356e-16, 1.69079293431187e-22, 8.82996160201868e-36 };
296:   const PetscReal theta[5] = { 1.495585217958292e-002,    /* m = 3  */
297:                                2.539398330063230e-001,    /* m = 5  */
298:                                9.504178996162932e-001,    /* m = 7  */
299:                                2.097847961257068e+000,    /* m = 9  */
300:                                5.371920351148152e+000 };  /* m = 13 */

303:   *s = 0;
304:   *m = 13;
305:   PetscBLASIntCast(n,&n_);
306:   PetscRandomCreate(PETSC_COMM_SELF,&rand);
307:   d4 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[2],&n_,rwork),1.0/4.0);
308:   if (d4==0.0) { /* safeguard for the case A = 0 */
309:     *m = 3;
310:     goto done;
311:   }
312:   d6 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[3],&n_,rwork),1.0/6.0);
313:   PetscLogFlops(2.0*n*n);
314:   eta1 = PetscMax(d4,d6);
315:   if (eta1<=theta[0] && !ell(n_,A,coeff[0],3,work,rand)) {
316:     *m = 3;
317:     goto done;
318:   }
319:   if (eta1<=theta[1] && !ell(n_,A,coeff[1],5,work,rand)) {
320:     *m = 5;
321:     goto done;
322:   }
323:   if (n<SMALLN) {
324:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[2],&n_,&szero,work,&n_));
325:     d8 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/8.0);
326:     PetscLogFlops(2.0*n*n*n+1.0*n*n);
327:   } else {
328:     d8 = PetscPowReal(normAm(n_,Apowers[2],2,work,rand),1.0/8.0);
329:   }
330:   eta3 = PetscMax(d6,d8);
331:   if (eta3<=theta[2] && !ell(n_,A,coeff[2],7,work,rand)) {
332:     *m = 7;
333:     goto done;
334:   }
335:   if (eta3<=theta[3] && !ell(n_,A,coeff[3],9,work,rand)) {
336:     *m = 9;
337:     goto done;
338:   }
339:   if (n<SMALLN) {
340:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[3],&n_,&szero,work,&n_));
341:     d10 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/10.0);
342:     PetscLogFlops(2.0*n*n*n+1.0*n*n);
343:   } else {
344:     d10 = PetscPowReal(normAm(n_,Apowers[1],5,work,rand),1.0/10.0);
345:   }
346:   eta4 = PetscMax(d8,d10);
347:   eta5 = PetscMin(eta3,eta4);
348:   *s = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(eta5/theta[4])/PetscLogReal(2.0)),0);
349:   if (*s) {
350:     Ascaled = work+3*n*n;
351:     n2 = n_*n_;
352:     PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,A,&one,Ascaled,&one));
353:     sfactor = PetscPowRealInt(2.0,-(*s));
354:     PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&sfactor,Ascaled,&one));
355:     PetscLogFlops(1.0*n*n);
356:   } else Ascaled = A;
357:   *s += ell(n_,Ascaled,coeff[4],13,work,rand);
358: done:
359:   PetscRandomDestroy(&rand);
360:   return(0);
361: }

363: /*
364:  * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
365:  *
366:  *     N. J. Higham, "The scaling and squaring method for the matrix exponential 
367:  *     revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
368:  */
369: PetscErrorCode FNEvaluateFunctionMat_Exp_Higham(FN fn,Mat A,Mat B)
370: {
371: #if defined(PETSC_MISSING_LAPACK_GESV) || defined(SLEPC_MISSING_LAPACK_LANGE)
373:   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"GESV/LANGE - Lapack routines are unavailable");
374: #else
375:   PetscErrorCode    ierr;
376:   PetscBLASInt      n_,n2,*ipiv,info,one=1;
377:   PetscInt          n,m,j,s;
378:   PetscScalar       scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
379:   PetscScalar       *Aa,*Ba,*Apowers[5],*Q,*P,*W,*work,*aux;
380:   const PetscScalar *c;
381:   const PetscScalar c3[4]   = { 120, 60, 12, 1 };
382:   const PetscScalar c5[6]   = { 30240, 15120, 3360, 420, 30, 1 };
383:   const PetscScalar c7[8]   = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
384:   const PetscScalar c9[10]  = { 17643225600, 8821612800, 2075673600, 302702400, 30270240,
385:                                 2162160, 110880, 3960, 90, 1 };
386:   const PetscScalar c13[14] = { 64764752532480000, 32382376266240000, 7771770303897600,
387:                                 1187353796428800,  129060195264000,   10559470521600,
388:                                 670442572800,      33522128640,       1323241920,
389:                                 40840800,          960960,            16380,  182,  1 };

392:   MatDenseGetArray(A,&Aa);
393:   MatDenseGetArray(B,&Ba);
394:   MatGetSize(A,&n,NULL);
395:   PetscBLASIntCast(n,&n_);
396:   n2 = n_*n_;
397:   PetscMalloc2(8*n*n,&work,n,&ipiv);

399:   /* Matrix powers */
400:   Apowers[0] = work;                  /* Apowers[0] = A   */
401:   Apowers[1] = Apowers[0] + n*n;      /* Apowers[1] = A^2 */
402:   Apowers[2] = Apowers[1] + n*n;      /* Apowers[2] = A^4 */
403:   Apowers[3] = Apowers[2] + n*n;      /* Apowers[3] = A^6 */
404:   Apowers[4] = Apowers[3] + n*n;      /* Apowers[4] = A^8 */

406:   PetscMemcpy(Apowers[0],Aa,n2*sizeof(PetscScalar));
407:   PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,Apowers[0],&n_,&szero,Apowers[1],&n_));
408:   PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[1],&n_,&szero,Apowers[2],&n_));
409:   PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[2],&n_,&szero,Apowers[3],&n_));
410:   PetscLogFlops(6.0*n*n*n);

412:   /* Compute scaling parameter and order of Pade approximant */
413:   expm_params(n,Apowers,&s,&m,Apowers[4]);

415:   if (s) { /* rescale */
416:     for (j=0;j<4;j++) {
417:       scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
418:       PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&scale,Apowers[j],&one));
419:     }
420:     PetscLogFlops(4.0*n*n);
421:   }

423:   /* Evaluate the Pade approximant */
424:   switch (m) {
425:     case 3:  c = c3;  break;
426:     case 5:  c = c5;  break;
427:     case 7:  c = c7;  break;
428:     case 9:  c = c9;  break;
429:     case 13: c = c13; break;
430:     default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
431:   }
432:   P = Ba;
433:   Q = Apowers[4] + n*n;
434:   W = Q + n*n;
435:   switch (m) {
436:     case 3:
437:     case 5:
438:     case 7:
439:     case 9:
440:       if (m==9) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[3],&n_,&szero,Apowers[4],&n_));
441:       PetscMemzero(P,n2*sizeof(PetscScalar));
442:       PetscMemzero(Q,n2*sizeof(PetscScalar));
443:       for (j=0;j<n;j++) {
444:         P[j+j*n] = c[1];
445:         Q[j+j*n] = c[0];
446:       }
447:       for (j=m;j>=3;j-=2) {
448:         PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j],Apowers[(j+1)/2-1],&one,P,&one));
449:         PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j-1],Apowers[(j+1)/2-1],&one,Q,&one));
450:         PetscLogFlops(4.0*n*n);
451:       }
452:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,P,&n_,&szero,W,&n_));
453:       PetscLogFlops(2.0*n*n*n);
454:       SWAP(P,W,aux);
455:       break;
456:     case 13:
457:       /*  P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1]) 
458:               + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I)       */
459:       PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,P,&one));
460:       PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&c[13],P,&one));
461:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[11],Apowers[2],&one,P,&one));
462:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[9],Apowers[1],&one,P,&one));
463:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,P,&n_,&szero,W,&n_));
464:       PetscLogFlops(5.0*n*n+2.0*n*n*n);
465:       PetscMemzero(P,n2*sizeof(PetscScalar));
466:       for (j=0;j<n;j++) P[j+j*n] = c[1];
467:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[7],Apowers[3],&one,P,&one));
468:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[5],Apowers[2],&one,P,&one));
469:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[3],Apowers[1],&one,P,&one));
470:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,P,&one,W,&one));
471:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,W,&n_,&szero,P,&n_));
472:       PetscLogFlops(7.0*n*n+2.0*n*n*n);
473:       /*  Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
474:               + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I        */
475:       PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,Q,&one));
476:       PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&c[12],Q,&one));
477:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[10],Apowers[2],&one,Q,&one));
478:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[8],Apowers[1],&one,Q,&one));
479:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,Q,&n_,&szero,W,&n_));
480:       PetscLogFlops(5.0*n*n+2.0*n*n*n);
481:       PetscMemzero(Q,n2*sizeof(PetscScalar));
482:       for (j=0;j<n;j++) Q[j+j*n] = c[0];
483:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[6],Apowers[3],&one,Q,&one));
484:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[4],Apowers[2],&one,Q,&one));
485:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[2],Apowers[1],&one,Q,&one));
486:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,W,&one,Q,&one));
487:       PetscLogFlops(7.0*n*n);
488:       break;
489:     default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
490:   }
491:   PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&smone,P,&one,Q,&one));
492:   PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n_,&n_,Q,&n_,ipiv,P,&n_,&info));
493:   SlepcCheckLapackInfo("gesv",info);
494:   PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&stwo,P,&one));
495:   for (j=0;j<n;j++) P[j+j*n] += 1.0;
496:   PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n);

498:   /* Squaring */
499:   for (j=1;j<=s;j++) {
500:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,P,&n_,P,&n_,&szero,W,&n_));
501:     SWAP(P,W,aux);
502:   }
503:   if (P!=Ba) { PetscMemcpy(Ba,P,n2*sizeof(PetscScalar)); }
504:   PetscLogFlops(2.0*n*n*n*s);

506:   PetscFree2(work,ipiv);
507:   MatDenseRestoreArray(A,&Aa);
508:   MatDenseRestoreArray(B,&Ba);
509:   return(0);
510: #endif
511: }

513: PetscErrorCode FNView_Exp(FN fn,PetscViewer viewer)
514: {
516:   PetscBool      isascii;
517:   char           str[50];
518:   const char     *methodname[] = {
519:                   "scaling & squaring, [m/m] Pade approximant (Higham)",
520:                   "scaling & squaring, [6/6] Pade approximant"
521:   };
522:   const int      nmeth=sizeof(methodname)/sizeof(methodname[0]);

525:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&isascii);
526:   if (isascii) {
527:     if (fn->beta==(PetscScalar)1.0) {
528:       if (fn->alpha==(PetscScalar)1.0) {
529:         PetscViewerASCIIPrintf(viewer,"  Exponential: exp(x)\n");
530:       } else {
531:         SlepcSNPrintfScalar(str,50,fn->alpha,PETSC_TRUE);
532:         PetscViewerASCIIPrintf(viewer,"  Exponential: exp(%s*x)\n",str);
533:       }
534:     } else {
535:       SlepcSNPrintfScalar(str,50,fn->beta,PETSC_TRUE);
536:       if (fn->alpha==(PetscScalar)1.0) {
537:         PetscViewerASCIIPrintf(viewer,"  Exponential: %s*exp(x)\n",str);
538:       } else {
539:         PetscViewerASCIIPrintf(viewer,"  Exponential: %s",str);
540:         PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
541:         SlepcSNPrintfScalar(str,50,fn->alpha,PETSC_TRUE);
542:         PetscViewerASCIIPrintf(viewer,"*exp(%s*x)\n",str);
543:         PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
544:       }
545:     }
546:     if (fn->method<nmeth) {
547:       PetscViewerASCIIPrintf(viewer,"  computing matrix functions with: %s\n",methodname[fn->method]);
548:     }
549:   }
550:   return(0);
551: }

553: PETSC_EXTERN PetscErrorCode FNCreate_Exp(FN fn)
554: {
556:   fn->ops->evaluatefunction       = FNEvaluateFunction_Exp;
557:   fn->ops->evaluatederivative     = FNEvaluateDerivative_Exp;
558:   fn->ops->evaluatefunctionmat[0] = FNEvaluateFunctionMat_Exp_Higham;
559:   fn->ops->evaluatefunctionmat[1] = FNEvaluateFunctionMat_Exp_Pade;
560:   fn->ops->view                   = FNView_Exp;
561:   return(0);
562: }