Actual source code: fnexp.c
slepc-3.8.0 2017-10-20
1: /*
2: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3: SLEPc - Scalable Library for Eigenvalue Problem Computations
4: Copyright (c) 2002-2017, Universitat Politecnica de Valencia, Spain
6: This file is part of SLEPc.
7: SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9: */
10: /*
11: Exponential function exp(x)
12: */
14: #include <slepc/private/fnimpl.h> /*I "slepcfn.h" I*/
15: #include <slepcblaslapack.h>
17: PetscErrorCode FNEvaluateFunction_Exp(FN fn,PetscScalar x,PetscScalar *y)
18: {
20: *y = PetscExpScalar(x);
21: return(0);
22: }
24: PetscErrorCode FNEvaluateDerivative_Exp(FN fn,PetscScalar x,PetscScalar *y)
25: {
27: *y = PetscExpScalar(x);
28: return(0);
29: }
31: #define MAX_PADE 6
32: #define SWAP(a,b,t) {t=a;a=b;b=t;}
34: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade(FN fn,Mat A,Mat B)
35: {
36: #if defined(PETSC_MISSING_LAPACK_GESV) || defined(SLEPC_MISSING_LAPACK_LANGE)
38: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"GESV/LANGE - Lapack routines are unavailable");
39: #else
41: PetscBLASInt n,ld,ld2,*ipiv,info,inc=1;
42: PetscInt m,j,k,sexp;
43: PetscBool odd;
44: const PetscInt p=MAX_PADE;
45: PetscReal c[MAX_PADE+1],s,*rwork;
46: PetscScalar scale,mone=-1.0,one=1.0,two=2.0,zero=0.0;
47: PetscScalar *Aa,*Ba,*As,*A2,*Q,*P,*W,*aux;
50: MatDenseGetArray(A,&Aa);
51: MatDenseGetArray(B,&Ba);
52: MatGetSize(A,&m,NULL);
53: PetscBLASIntCast(m,&n);
54: ld = n;
55: ld2 = ld*ld;
56: P = Ba;
57: PetscMalloc6(m*m,&Q,m*m,&W,m*m,&As,m*m,&A2,ld,&rwork,ld,&ipiv);
58: PetscMemcpy(As,Aa,ld2*sizeof(PetscScalar));
60: /* Pade' coefficients */
61: c[0] = 1.0;
62: for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
64: /* Scaling */
65: s = LAPACKlange_("I",&n,&n,As,&ld,rwork);
66: PetscLogFlops(1.0*n*n);
67: if (s>0.5) {
68: sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
69: scale = PetscPowRealInt(2.0,-sexp);
70: PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&scale,As,&inc));
71: PetscLogFlops(1.0*n*n);
72: } else sexp = 0;
74: /* Horner evaluation */
75: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,As,&ld,As,&ld,&zero,A2,&ld));
76: PetscLogFlops(2.0*n*n*n);
77: PetscMemzero(Q,ld2*sizeof(PetscScalar));
78: PetscMemzero(P,ld2*sizeof(PetscScalar));
79: for (j=0;j<n;j++) {
80: Q[j+j*ld] = c[p];
81: P[j+j*ld] = c[p-1];
82: }
84: odd = PETSC_TRUE;
85: for (k=p-1;k>0;k--) {
86: if (odd) {
87: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,A2,&ld,&zero,W,&ld));
88: SWAP(Q,W,aux);
89: for (j=0;j<n;j++) Q[j+j*ld] += c[k-1];
90: odd = PETSC_FALSE;
91: } else {
92: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,A2,&ld,&zero,W,&ld));
93: SWAP(P,W,aux);
94: for (j=0;j<n;j++) P[j+j*ld] += c[k-1];
95: odd = PETSC_TRUE;
96: }
97: PetscLogFlops(2.0*n*n*n);
98: }
99: if (odd) {
100: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,As,&ld,&zero,W,&ld));
101: SWAP(Q,W,aux);
102: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
103: PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
104: SlepcCheckLapackInfo("gesv",info);
105: PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
106: for (j=0;j<n;j++) P[j+j*ld] += 1.0;
107: PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&mone,P,&inc));
108: } else {
109: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,As,&ld,&zero,W,&ld));
110: SWAP(P,W,aux);
111: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
112: PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
113: SlepcCheckLapackInfo("gesv",info);
114: PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
115: for (j=0;j<n;j++) P[j+j*ld] += 1.0;
116: }
117: PetscLogFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n);
119: for (k=1;k<=sexp;k++) {
120: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,P,&ld,&zero,W,&ld));
121: PetscMemcpy(P,W,ld2*sizeof(PetscScalar));
122: }
123: if (P!=Ba) { PetscMemcpy(Ba,P,ld2*sizeof(PetscScalar)); }
124: PetscLogFlops(2.0*n*n*n*sexp);
126: PetscFree6(Q,W,As,A2,rwork,ipiv);
127: MatDenseRestoreArray(A,&Aa);
128: MatDenseRestoreArray(B,&Ba);
129: return(0);
130: #endif
131: }
133: #define ITMAX 5
135: /*
136: * Estimate norm(A^m,1) by block 1-norm power method (required workspace is 11*n)
137: */
138: static PetscReal normest1(PetscBLASInt n,PetscScalar *A,PetscInt m,PetscScalar *work,PetscRandom rand)
139: {
140: PetscScalar *X,*Y,*Z,*S,*S_old,*aux,val,sone=1.0,szero=0.0;
141: PetscReal est=0.0,est_old,vals[2],*zvals,maxzval[2],raux;
142: PetscBLASInt i,j,t=2,it=0,ind[2],est_j=0,m1;
145: X = work;
146: Y = work + 2*n;
147: Z = work + 4*n;
148: S = work + 6*n;
149: S_old = work + 8*n;
150: zvals = (PetscReal*)(work + 10*n);
152: for (i=0;i<n;i++) { /* X has columns of unit 1-norm */
153: X[i] = 1.0/n;
154: PetscRandomGetValue(rand,&val);
155: if (PetscRealPart(val) < 0.5) X[i+n] = -1.0/n;
156: else X[i+n] = 1.0/n;
157: }
158: for (i=0;i<t*n;i++) S[i] = 0.0;
159: ind[0] = 0; ind[1] = 0;
160: est_old = 0;
161: while (1) {
162: it++;
163: for (j=0;j<m;j++) { /* Y = A^m*X */
164: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&t,&n,&sone,A,&n,X,&n,&szero,Y,&n));
165: if (j<m-1) SWAP(X,Y,aux);
166: }
167: for (j=0;j<t;j++) { /* vals[j] = norm(Y(:,j),1) */
168: vals[j] = 0.0;
169: for (i=0;i<n;i++) vals[j] += PetscAbsScalar(Y[i+j*n]);
170: }
171: if (vals[0]<vals[1]) {
172: SWAP(vals[0],vals[1],raux);
173: m1 = 1;
174: } else m1 = 0;
175: est = vals[0];
176: if (est>est_old || it==2) est_j = ind[m1];
177: if (it>=2 && est<=est_old) {
178: est = est_old;
179: break;
180: }
181: est_old = est;
182: if (it>ITMAX) break;
183: SWAP(S,S_old,aux);
184: for (i=0;i<t*n;i++) { /* S = sign(Y) */
185: S[i] = (PetscRealPart(Y[i]) < 0.0)? -1.0: 1.0;
186: }
187: for (j=0;j<m;j++) { /* Z = (A^T)^m*S */
188: PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&n,&t,&n,&sone,A,&n,S,&n,&szero,Z,&n));
189: if (j<m-1) SWAP(S,Z,aux);
190: }
191: maxzval[0] = -1; maxzval[1] = -1;
192: ind[0] = 0; ind[1] = 0;
193: for (i=0;i<n;i++) { /* zvals[i] = norm(Z(i,:),inf) */
194: zvals[i] = PetscMax(PetscAbsScalar(Z[i+0*n]),PetscAbsScalar(Z[i+1*n]));
195: if (zvals[i]>maxzval[0]) {
196: maxzval[0] = zvals[i];
197: ind[0] = i;
198: } else if (zvals[i]>maxzval[1]) {
199: maxzval[1] = zvals[i];
200: ind[1] = i;
201: }
202: }
203: if (it>=2 && maxzval[0]==zvals[est_j]) break;
204: for (i=0;i<t*n;i++) X[i] = 0.0;
205: for (j=0;j<t;j++) X[ind[j]+j*n] = 1.0;
206: }
207: /* Flop count is roughly (it * 2*m * t*gemv) = 4*its*m*t*n*n */
208: PetscLogFlops(4.0*it*m*t*n*n);
209: PetscFunctionReturn(est);
210: }
212: #define SMALLN 100
214: /*
215: * Estimate norm(A^m,1) (required workspace is 2*n*n)
216: */
217: static PetscReal normAm(PetscBLASInt n,PetscScalar *A,PetscInt m,PetscScalar *work,PetscRandom rand)
218: {
219: PetscScalar *v=work,*w=work+n*n,*aux,sone=1.0,szero=0.0;
220: PetscReal nrm,rwork[1],tmp;
221: PetscBLASInt i,j,one=1;
222: PetscBool isrealpos=PETSC_TRUE;
225: if (n<SMALLN) { /* compute matrix power explicitly */
226: if (m==1) {
227: nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
228: PetscLogFlops(1.0*n*n);
229: } else { /* m>=2 */
230: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,A,&n,A,&n,&szero,v,&n));
231: for (j=0;j<m-2;j++) {
232: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,A,&n,v,&n,&szero,w,&n));
233: SWAP(v,w,aux);
234: }
235: nrm = LAPACKlange_("O",&n,&n,v,&n,rwork);
236: PetscLogFlops(2.0*n*n*n*(m-1)+1.0*n*n);
237: }
238: } else {
239: for (i=0;i<n;i++)
240: for (j=0;j<n;j++)
241: #if defined(PETSC_USE_COMPLEX)
242: if (PetscRealPart(A[i+j*n])<0.0 || PetscImaginaryPart(A[i+j*n])!=0.0) { isrealpos = PETSC_FALSE; break; }
243: #else
244: if (A[i+j*n]<0.0) { isrealpos = PETSC_FALSE; break; }
245: #endif
246: if (isrealpos) { /* for positive matrices only */
247: for (i=0;i<n;i++) v[i] = 1.0;
248: for (j=0;j<m;j++) { /* w = A'*v */
249: PetscStackCallBLAS("BLASgemv",BLASgemv_("C",&n,&n,&sone,A,&n,v,&one,&szero,w,&one));
250: SWAP(v,w,aux);
251: }
252: PetscLogFlops(2.0*n*n*m);
253: nrm = 0.0;
254: for (i=0;i<n;i++) if ((tmp = PetscAbsScalar(v[i])) > nrm) nrm = tmp; /* norm(v,inf) */
255: } else {
256: nrm = normest1(n,A,m,work,rand);
257: }
258: }
259: PetscFunctionReturn(nrm);
260: }
262: /*
263: * Function needed to compute optimal parameters (required workspace is 3*n*n)
264: */
265: static PetscInt ell(PetscBLASInt n,PetscScalar *A,PetscReal coeff,PetscInt m,PetscScalar *work,PetscRandom rand)
266: {
267: PetscScalar *Ascaled=work;
268: PetscReal nrm,alpha,beta,rwork[1];
269: PetscInt t;
270: PetscBLASInt i,j;
273: beta = PetscPowReal(coeff,1.0/(2*m+1));
274: for (i=0;i<n;i++)
275: for (j=0;j<n;j++)
276: Ascaled[i+j*n] = beta*PetscAbsScalar(A[i+j*n]);
277: nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
278: PetscLogFlops(2.0*n*n);
279: alpha = normAm(n,Ascaled,2*m+1,work+n*n,rand)/nrm;
280: t = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(2.0*alpha/PETSC_MACHINE_EPSILON)/PetscLogReal(2.0)/(2*m)),0);
281: PetscFunctionReturn(t);
282: }
284: /*
285: * Compute scaling parameter (s) and order of Pade approximant (m) (required workspace is 4*n*n)
286: */
287: static PetscErrorCode expm_params(PetscInt n,PetscScalar **Apowers,PetscInt *s,PetscInt *m,PetscScalar *work)
288: {
289: PetscErrorCode ierr;
290: PetscScalar sfactor,sone=1.0,szero=0.0,*A=Apowers[0],*Ascaled;
291: PetscReal d4,d6,d8,d10,eta1,eta3,eta4,eta5,rwork[1];
292: PetscBLASInt n_,n2,one=1;
293: PetscRandom rand;
294: const PetscReal coeff[5] = { 9.92063492063492e-06, 9.94131285136576e-11, /* backward error function */
295: 2.22819456055356e-16, 1.69079293431187e-22, 8.82996160201868e-36 };
296: const PetscReal theta[5] = { 1.495585217958292e-002, /* m = 3 */
297: 2.539398330063230e-001, /* m = 5 */
298: 9.504178996162932e-001, /* m = 7 */
299: 2.097847961257068e+000, /* m = 9 */
300: 5.371920351148152e+000 }; /* m = 13 */
303: *s = 0;
304: *m = 13;
305: PetscBLASIntCast(n,&n_);
306: PetscRandomCreate(PETSC_COMM_SELF,&rand);
307: d4 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[2],&n_,rwork),1.0/4.0);
308: if (d4==0.0) { /* safeguard for the case A = 0 */
309: *m = 3;
310: goto done;
311: }
312: d6 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[3],&n_,rwork),1.0/6.0);
313: PetscLogFlops(2.0*n*n);
314: eta1 = PetscMax(d4,d6);
315: if (eta1<=theta[0] && !ell(n_,A,coeff[0],3,work,rand)) {
316: *m = 3;
317: goto done;
318: }
319: if (eta1<=theta[1] && !ell(n_,A,coeff[1],5,work,rand)) {
320: *m = 5;
321: goto done;
322: }
323: if (n<SMALLN) {
324: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[2],&n_,&szero,work,&n_));
325: d8 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/8.0);
326: PetscLogFlops(2.0*n*n*n+1.0*n*n);
327: } else {
328: d8 = PetscPowReal(normAm(n_,Apowers[2],2,work,rand),1.0/8.0);
329: }
330: eta3 = PetscMax(d6,d8);
331: if (eta3<=theta[2] && !ell(n_,A,coeff[2],7,work,rand)) {
332: *m = 7;
333: goto done;
334: }
335: if (eta3<=theta[3] && !ell(n_,A,coeff[3],9,work,rand)) {
336: *m = 9;
337: goto done;
338: }
339: if (n<SMALLN) {
340: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[3],&n_,&szero,work,&n_));
341: d10 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/10.0);
342: PetscLogFlops(2.0*n*n*n+1.0*n*n);
343: } else {
344: d10 = PetscPowReal(normAm(n_,Apowers[1],5,work,rand),1.0/10.0);
345: }
346: eta4 = PetscMax(d8,d10);
347: eta5 = PetscMin(eta3,eta4);
348: *s = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(eta5/theta[4])/PetscLogReal(2.0)),0);
349: if (*s) {
350: Ascaled = work+3*n*n;
351: n2 = n_*n_;
352: PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,A,&one,Ascaled,&one));
353: sfactor = PetscPowRealInt(2.0,-(*s));
354: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&sfactor,Ascaled,&one));
355: PetscLogFlops(1.0*n*n);
356: } else Ascaled = A;
357: *s += ell(n_,Ascaled,coeff[4],13,work,rand);
358: done:
359: PetscRandomDestroy(&rand);
360: return(0);
361: }
363: /*
364: * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
365: *
366: * N. J. Higham, "The scaling and squaring method for the matrix exponential
367: * revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
368: */
369: PetscErrorCode FNEvaluateFunctionMat_Exp_Higham(FN fn,Mat A,Mat B)
370: {
371: #if defined(PETSC_MISSING_LAPACK_GESV) || defined(SLEPC_MISSING_LAPACK_LANGE)
373: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"GESV/LANGE - Lapack routines are unavailable");
374: #else
375: PetscErrorCode ierr;
376: PetscBLASInt n_,n2,*ipiv,info,one=1;
377: PetscInt n,m,j,s;
378: PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
379: PetscScalar *Aa,*Ba,*Apowers[5],*Q,*P,*W,*work,*aux;
380: const PetscScalar *c;
381: const PetscScalar c3[4] = { 120, 60, 12, 1 };
382: const PetscScalar c5[6] = { 30240, 15120, 3360, 420, 30, 1 };
383: const PetscScalar c7[8] = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
384: const PetscScalar c9[10] = { 17643225600, 8821612800, 2075673600, 302702400, 30270240,
385: 2162160, 110880, 3960, 90, 1 };
386: const PetscScalar c13[14] = { 64764752532480000, 32382376266240000, 7771770303897600,
387: 1187353796428800, 129060195264000, 10559470521600,
388: 670442572800, 33522128640, 1323241920,
389: 40840800, 960960, 16380, 182, 1 };
392: MatDenseGetArray(A,&Aa);
393: MatDenseGetArray(B,&Ba);
394: MatGetSize(A,&n,NULL);
395: PetscBLASIntCast(n,&n_);
396: n2 = n_*n_;
397: PetscMalloc2(8*n*n,&work,n,&ipiv);
399: /* Matrix powers */
400: Apowers[0] = work; /* Apowers[0] = A */
401: Apowers[1] = Apowers[0] + n*n; /* Apowers[1] = A^2 */
402: Apowers[2] = Apowers[1] + n*n; /* Apowers[2] = A^4 */
403: Apowers[3] = Apowers[2] + n*n; /* Apowers[3] = A^6 */
404: Apowers[4] = Apowers[3] + n*n; /* Apowers[4] = A^8 */
406: PetscMemcpy(Apowers[0],Aa,n2*sizeof(PetscScalar));
407: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,Apowers[0],&n_,&szero,Apowers[1],&n_));
408: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[1],&n_,&szero,Apowers[2],&n_));
409: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[2],&n_,&szero,Apowers[3],&n_));
410: PetscLogFlops(6.0*n*n*n);
412: /* Compute scaling parameter and order of Pade approximant */
413: expm_params(n,Apowers,&s,&m,Apowers[4]);
415: if (s) { /* rescale */
416: for (j=0;j<4;j++) {
417: scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
418: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&scale,Apowers[j],&one));
419: }
420: PetscLogFlops(4.0*n*n);
421: }
423: /* Evaluate the Pade approximant */
424: switch (m) {
425: case 3: c = c3; break;
426: case 5: c = c5; break;
427: case 7: c = c7; break;
428: case 9: c = c9; break;
429: case 13: c = c13; break;
430: default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
431: }
432: P = Ba;
433: Q = Apowers[4] + n*n;
434: W = Q + n*n;
435: switch (m) {
436: case 3:
437: case 5:
438: case 7:
439: case 9:
440: if (m==9) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[3],&n_,&szero,Apowers[4],&n_));
441: PetscMemzero(P,n2*sizeof(PetscScalar));
442: PetscMemzero(Q,n2*sizeof(PetscScalar));
443: for (j=0;j<n;j++) {
444: P[j+j*n] = c[1];
445: Q[j+j*n] = c[0];
446: }
447: for (j=m;j>=3;j-=2) {
448: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j],Apowers[(j+1)/2-1],&one,P,&one));
449: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j-1],Apowers[(j+1)/2-1],&one,Q,&one));
450: PetscLogFlops(4.0*n*n);
451: }
452: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,P,&n_,&szero,W,&n_));
453: PetscLogFlops(2.0*n*n*n);
454: SWAP(P,W,aux);
455: break;
456: case 13:
457: /* P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
458: + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I) */
459: PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,P,&one));
460: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&c[13],P,&one));
461: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[11],Apowers[2],&one,P,&one));
462: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[9],Apowers[1],&one,P,&one));
463: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,P,&n_,&szero,W,&n_));
464: PetscLogFlops(5.0*n*n+2.0*n*n*n);
465: PetscMemzero(P,n2*sizeof(PetscScalar));
466: for (j=0;j<n;j++) P[j+j*n] = c[1];
467: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[7],Apowers[3],&one,P,&one));
468: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[5],Apowers[2],&one,P,&one));
469: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[3],Apowers[1],&one,P,&one));
470: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,P,&one,W,&one));
471: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,W,&n_,&szero,P,&n_));
472: PetscLogFlops(7.0*n*n+2.0*n*n*n);
473: /* Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
474: + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I */
475: PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,Q,&one));
476: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&c[12],Q,&one));
477: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[10],Apowers[2],&one,Q,&one));
478: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[8],Apowers[1],&one,Q,&one));
479: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,Q,&n_,&szero,W,&n_));
480: PetscLogFlops(5.0*n*n+2.0*n*n*n);
481: PetscMemzero(Q,n2*sizeof(PetscScalar));
482: for (j=0;j<n;j++) Q[j+j*n] = c[0];
483: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[6],Apowers[3],&one,Q,&one));
484: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[4],Apowers[2],&one,Q,&one));
485: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[2],Apowers[1],&one,Q,&one));
486: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,W,&one,Q,&one));
487: PetscLogFlops(7.0*n*n);
488: break;
489: default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
490: }
491: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&smone,P,&one,Q,&one));
492: PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n_,&n_,Q,&n_,ipiv,P,&n_,&info));
493: SlepcCheckLapackInfo("gesv",info);
494: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&stwo,P,&one));
495: for (j=0;j<n;j++) P[j+j*n] += 1.0;
496: PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n);
498: /* Squaring */
499: for (j=1;j<=s;j++) {
500: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,P,&n_,P,&n_,&szero,W,&n_));
501: SWAP(P,W,aux);
502: }
503: if (P!=Ba) { PetscMemcpy(Ba,P,n2*sizeof(PetscScalar)); }
504: PetscLogFlops(2.0*n*n*n*s);
506: PetscFree2(work,ipiv);
507: MatDenseRestoreArray(A,&Aa);
508: MatDenseRestoreArray(B,&Ba);
509: return(0);
510: #endif
511: }
513: PetscErrorCode FNView_Exp(FN fn,PetscViewer viewer)
514: {
516: PetscBool isascii;
517: char str[50];
518: const char *methodname[] = {
519: "scaling & squaring, [m/m] Pade approximant (Higham)",
520: "scaling & squaring, [6/6] Pade approximant"
521: };
522: const int nmeth=sizeof(methodname)/sizeof(methodname[0]);
525: PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&isascii);
526: if (isascii) {
527: if (fn->beta==(PetscScalar)1.0) {
528: if (fn->alpha==(PetscScalar)1.0) {
529: PetscViewerASCIIPrintf(viewer," Exponential: exp(x)\n");
530: } else {
531: SlepcSNPrintfScalar(str,50,fn->alpha,PETSC_TRUE);
532: PetscViewerASCIIPrintf(viewer," Exponential: exp(%s*x)\n",str);
533: }
534: } else {
535: SlepcSNPrintfScalar(str,50,fn->beta,PETSC_TRUE);
536: if (fn->alpha==(PetscScalar)1.0) {
537: PetscViewerASCIIPrintf(viewer," Exponential: %s*exp(x)\n",str);
538: } else {
539: PetscViewerASCIIPrintf(viewer," Exponential: %s",str);
540: PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
541: SlepcSNPrintfScalar(str,50,fn->alpha,PETSC_TRUE);
542: PetscViewerASCIIPrintf(viewer,"*exp(%s*x)\n",str);
543: PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
544: }
545: }
546: if (fn->method<nmeth) {
547: PetscViewerASCIIPrintf(viewer," computing matrix functions with: %s\n",methodname[fn->method]);
548: }
549: }
550: return(0);
551: }
553: PETSC_EXTERN PetscErrorCode FNCreate_Exp(FN fn)
554: {
556: fn->ops->evaluatefunction = FNEvaluateFunction_Exp;
557: fn->ops->evaluatederivative = FNEvaluateDerivative_Exp;
558: fn->ops->evaluatefunctionmat[0] = FNEvaluateFunctionMat_Exp_Higham;
559: fn->ops->evaluatefunctionmat[1] = FNEvaluateFunctionMat_Exp_Pade;
560: fn->ops->view = FNView_Exp;
561: return(0);
562: }