Actual source code: fnexp.c

slepc-3.13.2 2020-05-12
Report Typos and Errors
  1: /*
  2:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  3:    SLEPc - Scalable Library for Eigenvalue Problem Computations
  4:    Copyright (c) 2002-2020, Universitat Politecnica de Valencia, Spain

  6:    This file is part of SLEPc.
  7:    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
  8:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  9: */
 10: /*
 11:    Exponential function  exp(x)
 12: */

 14:  #include <slepc/private/fnimpl.h>
 15:  #include <slepcblaslapack.h>

 17: PetscErrorCode FNEvaluateFunction_Exp(FN fn,PetscScalar x,PetscScalar *y)
 18: {
 20:   *y = PetscExpScalar(x);
 21:   return(0);
 22: }

 24: PetscErrorCode FNEvaluateDerivative_Exp(FN fn,PetscScalar x,PetscScalar *y)
 25: {
 27:   *y = PetscExpScalar(x);
 28:   return(0);
 29: }

 31: #define MAX_PADE 6
 32: #define SWAP(a,b,t) {t=a;a=b;b=t;}

 34: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade(FN fn,Mat A,Mat B)
 35: {
 37:   PetscBLASInt   n,ld,ld2,*ipiv,info,inc=1;
 38:   PetscInt       m,j,k,sexp;
 39:   PetscBool      odd;
 40:   const PetscInt p=MAX_PADE;
 41:   PetscReal      c[MAX_PADE+1],s,*rwork;
 42:   PetscScalar    scale,mone=-1.0,one=1.0,two=2.0,zero=0.0;
 43:   PetscScalar    *Aa,*Ba,*As,*A2,*Q,*P,*W,*aux;

 46:   MatDenseGetArray(A,&Aa);
 47:   MatDenseGetArray(B,&Ba);
 48:   MatGetSize(A,&m,NULL);
 49:   PetscBLASIntCast(m,&n);
 50:   ld  = n;
 51:   ld2 = ld*ld;
 52:   P   = Ba;
 53:   PetscMalloc6(m*m,&Q,m*m,&W,m*m,&As,m*m,&A2,ld,&rwork,ld,&ipiv);
 54:   PetscArraycpy(As,Aa,ld2);

 56:   /* Pade' coefficients */
 57:   c[0] = 1.0;
 58:   for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));

 60:   /* Scaling */
 61:   s = LAPACKlange_("I",&n,&n,As,&ld,rwork);
 62:   PetscLogFlops(1.0*n*n);
 63:   if (s>0.5) {
 64:     sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
 65:     scale = PetscPowRealInt(2.0,-sexp);
 66:     PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&scale,As,&inc));
 67:     PetscLogFlops(1.0*n*n);
 68:   } else sexp = 0;

 70:   /* Horner evaluation */
 71:   PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,As,&ld,As,&ld,&zero,A2,&ld));
 72:   PetscLogFlops(2.0*n*n*n);
 73:   PetscArrayzero(Q,ld2);
 74:   PetscArrayzero(P,ld2);
 75:   for (j=0;j<n;j++) {
 76:     Q[j+j*ld] = c[p];
 77:     P[j+j*ld] = c[p-1];
 78:   }

 80:   odd = PETSC_TRUE;
 81:   for (k=p-1;k>0;k--) {
 82:     if (odd) {
 83:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,A2,&ld,&zero,W,&ld));
 84:       SWAP(Q,W,aux);
 85:       for (j=0;j<n;j++) Q[j+j*ld] += c[k-1];
 86:       odd = PETSC_FALSE;
 87:     } else {
 88:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,A2,&ld,&zero,W,&ld));
 89:       SWAP(P,W,aux);
 90:       for (j=0;j<n;j++) P[j+j*ld] += c[k-1];
 91:       odd = PETSC_TRUE;
 92:     }
 93:     PetscLogFlops(2.0*n*n*n);
 94:   }
 95:   /*if (odd) {
 96:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,As,&ld,&zero,W,&ld));
 97:     SWAP(Q,W,aux);
 98:     PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
 99:     PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
100:     SlepcCheckLapackInfo("gesv",info);
101:     PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
102:     for (j=0;j<n;j++) P[j+j*ld] += 1.0;
103:     PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&mone,P,&inc));
104:   } else {*/
105:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,As,&ld,&zero,W,&ld));
106:     SWAP(P,W,aux);
107:     PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
108:     PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
109:     SlepcCheckLapackInfo("gesv",info);
110:     PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
111:     for (j=0;j<n;j++) P[j+j*ld] += 1.0;
112:   /*}*/
113:   PetscLogFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n);

115:   for (k=1;k<=sexp;k++) {
116:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,P,&ld,&zero,W,&ld));
117:     PetscArraycpy(P,W,ld2);
118:   }
119:   if (P!=Ba) { PetscArraycpy(Ba,P,ld2); }
120:   PetscLogFlops(2.0*n*n*n*sexp);

122:   PetscFree6(Q,W,As,A2,rwork,ipiv);
123:   MatDenseRestoreArray(A,&Aa);
124:   MatDenseRestoreArray(B,&Ba);
125:   return(0);
126: }

128: /*
129:  * Set scaling factor (s) and Pade degree (k,m)
130:  */
131: static PetscErrorCode sexpm_params(PetscReal nrm,PetscInt *s,PetscInt *k,PetscInt *m)
132: {
134:   if (nrm>1) {
135:     if      (nrm<200)  {*s = 4; *k = 5; *m = *k-1;}
136:     else if (nrm<1e4)  {*s = 4; *k = 4; *m = *k+1;}
137:     else if (nrm<1e6)  {*s = 4; *k = 3; *m = *k+1;}
138:     else if (nrm<1e9)  {*s = 3; *k = 3; *m = *k+1;}
139:     else if (nrm<1e11) {*s = 2; *k = 3; *m = *k+1;}
140:     else if (nrm<1e12) {*s = 2; *k = 2; *m = *k+1;}
141:     else if (nrm<1e14) {*s = 2; *k = 1; *m = *k+1;}
142:     else               {*s = 1; *k = 1; *m = *k+1;}
143:   } else { /* nrm<1 */
144:     if       (nrm>0.5)  {*s = 4; *k = 4; *m = *k-1;}
145:     else  if (nrm>0.3)  {*s = 3; *k = 4; *m = *k-1;}
146:     else  if (nrm>0.15) {*s = 2; *k = 4; *m = *k-1;}
147:     else  if (nrm>0.07) {*s = 1; *k = 4; *m = *k-1;}
148:     else  if (nrm>0.01) {*s = 0; *k = 4; *m = *k-1;}
149:     else  if (nrm>3e-4) {*s = 0; *k = 3; *m = *k-1;}
150:     else  if (nrm>1e-5) {*s = 0; *k = 3; *m = 0;}
151:     else  if (nrm>1e-8) {*s = 0; *k = 2; *m = 0;}
152:     else                {*s = 0; *k = 1; *m = 0;}
153:   }
154:   return(0);
155: }

157: #if defined(PETSC_HAVE_COMPLEX)
158: /*
159:  * Partial fraction form coefficients.
160:  * If query, the function returns the size necessary to store the coefficients.
161:  */
162: static PetscErrorCode getcoeffs(PetscInt k,PetscInt m,PetscComplex *r,PetscComplex *q,PetscComplex *remain,PetscBool query)
163: {
164:   PetscInt i;
165:   const PetscComplex /* m == k+1 */
166:     p1r4[5] = {-1.582680186458572e+01 - 2.412564578224361e+01*PETSC_i,
167:                -1.582680186458572e+01 + 2.412564578224361e+01*PETSC_i,
168:                 1.499984465975511e+02 + 6.804227952202417e+01*PETSC_i,
169:                 1.499984465975511e+02 - 6.804227952202417e+01*PETSC_i,
170:                -2.733432894659307e+02                                },
171:     p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
172:                 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
173:                 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
174:                 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i,
175:                 6.286704751729261e+00                               },
176:     p1r3[4] = {-1.130153999597152e+01 + 1.247167585025031e+01*PETSC_i,
177:                -1.130153999597152e+01 - 1.247167585025031e+01*PETSC_i,
178:                 1.330153999597152e+01 - 6.007173273704750e+01*PETSC_i,
179:                 1.330153999597152e+01 + 6.007173273704750e+01*PETSC_i},
180:     p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
181:                 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
182:                 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
183:                 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
184:     p1r2[3] = { 7.648749087422928e+00 + 4.171640244747463e+00*PETSC_i,
185:                 7.648749087422928e+00 - 4.171640244747463e+00*PETSC_i,
186:                -1.829749817484586e+01                                },
187:     p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
188:                 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
189:                 3.637834252744491e+00                                },
190:     p1r1[2] = { 1.000000000000000e+00 - 3.535533905932738e+00*PETSC_i,
191:                 1.000000000000000e+00 + 3.535533905932738e+00*PETSC_i},
192:     p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
193:                 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
194:   const PetscComplex /* m == k-1 */
195:     m1r5[4] = {-1.423367961376821e+02 - 1.385465094833037e+01*PETSC_i,
196:                -1.423367961376821e+02 + 1.385465094833037e+01*PETSC_i,
197:                 2.647367961376822e+02 - 4.814394493714596e+02*PETSC_i,
198:                 2.647367961376822e+02 + 4.814394493714596e+02*PETSC_i},
199:     m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
200:                 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
201:                 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
202:                 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
203:     m1r4[3] = { 2.484269593165883e+01 + 7.460342395992306e+01*PETSC_i,
204:                 2.484269593165883e+01 - 7.460342395992306e+01*PETSC_i,
205:                -1.734353918633177e+02                                },
206:     m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
207:                 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
208:                 5.648485971016893e+00                                },
209:     m1r3[2] = { 2.533333333333333e+01 - 2.733333333333333e+01*PETSC_i,
210:                 2.533333333333333e+01 + 2.733333333333333e+01*PETSC_i},
211:     m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
212:                 4.000000000000000e+00 - 2.000000000000000e+00*PETSC_i};
213:   const PetscScalar /* m == k-1 */
214:     m1remain5[2] = { 2.000000000000000e-01,  9.800000000000000e+00},
215:     m1remain4[2] = {-2.500000000000000e-01, -7.750000000000000e+00},
216:     m1remain3[2] = { 3.333333333333333e-01,  5.666666666666667e+00},
217:     m1remain2[2] = {-0.5,                   -3.5},
218:     remain3[4] = {1.0/6.0, 1.0/2.0, 1, 1},
219:     remain2[3] = {1.0/2.0, 1, 1};

222:   if (query) { /* query about buffer's size */
223:     if (m==k+1) {
224:       *remain = 0;
225:       *r = *q = k+1;
226:       return(0); /* quick return */
227:     }
228:     if (m==k-1) {
229:       *remain = 2;
230:       if (k==5) *r = *q = 4;
231:       else if (k==4) *r = *q = 3;
232:       else if (k==3) *r = *q = 2;
233:       else if (k==2) *r = *q = 1;
234:     }
235:     if (m==0) {
236:       *r = *q = 0;
237:       *remain = k+1;
238:     }
239:   } else {
240:     if (m==k+1) {
241:       if (k==4) {
242:         for (i=0;i<5;i++) { r[i] = p1r4[i]; q[i] = p1q4[i]; }
243:       } else if (k==3) {
244:         for (i=0;i<4;i++) { r[i] = p1r3[i]; q[i] = p1q3[i]; }
245:       } else if (k==2) {
246:         for (i=0;i<3;i++) { r[i] = p1r2[i]; q[i] = p1q2[i]; }
247:       } else if (k==1) {
248:         for (i=0;i<2;i++) { r[i] = p1r1[i]; q[i] = p1q1[i]; }
249:       }
250:       return(0); /* quick return */
251:     }
252:     if (m==k-1) {
253:       if (k==5) {
254:         for (i=0;i<4;i++) { r[i] = m1r5[i]; q[i] = m1q5[i]; }
255:         for (i=0;i<2;i++) remain[i] = m1remain5[i];
256:       } else if (k==4) {
257:         for (i=0;i<3;i++) { r[i] = m1r4[i]; q[i] = m1q4[i]; }
258:         for (i=0;i<2;i++) remain[i] = m1remain4[i];
259:       } else if (k==3) {
260:         for (i=0;i<2;i++) { r[i] = m1r3[i]; q[i] = m1q3[i]; remain[i] = m1remain3[i]; }
261:       } else if (k==2) {
262:         r[0] = -13.5; q[0] = 3;
263:         for (i=0;i<2;i++) remain[i] = m1remain2[i];
264:       }
265:     }
266:     if (m==0) {
267:       r = q = 0;
268:       if (k==3) {
269:         for (i=0;i<4;i++) remain[i] = remain3[i];
270:       } else if (k==2) {
271:         for (i=0;i<3;i++) remain[i] = remain2[i];
272:       }
273:     }
274:   }
275:   return(0);
276: }

278: /*
279:  * Product form coefficients.
280:  * If query, the function returns the size necessary to store the coefficients.
281:  */
282: static PetscErrorCode getcoeffsproduct(PetscInt k,PetscInt m,PetscComplex *p,PetscComplex *q,PetscComplex *mult,PetscBool query)
283: {
284:   PetscInt i;
285:   const PetscComplex /* m == k+1 */
286:   p1p4[4] = {-5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
287:              -5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
288:              -6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
289:              -6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
290:   p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
291:               3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
292:               6.286704751729261e+00                                ,
293:               5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
294:               5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
295:   p1p3[3] = {-4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
296:              -4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
297:              -5.648485971016893e+00                                },
298:   p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
299:               3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
300:               4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
301:               4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
302:   p1p2[2] = {-4.00000000000000e+00  + 2.000000000000000e+00*PETSC_i,
303:              -4.00000000000000e+00  - 2.000000000000000e+00*PETSC_i},
304:   p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
305:               2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
306:               3.637834252744491e+00                               },
307:   p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
308:               2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
309:   const PetscComplex /* m == k-1 */
310:   m1p5[5] = {-3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
311:              -3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
312:              -6.286704751729261e+00                                ,
313:              -5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
314:              -5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
315:   m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
316:               5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
317:               6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
318:               6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
319:   m1p4[4] = {-3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
320:              -3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
321:              -4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
322:              -4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
323:   m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
324:               4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
325:               5.648485971016893e+00                                },
326:   m1p3[3] = {-2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
327:              -2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
328:              -3.637834252744491e+00                                },
329:   m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
330:               4.000000000000000e+00 - 2.000000000000001e+00*PETSC_i},
331:   m1p2[2] = {-2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
332:              -2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};

335:   if (query) {
336:     if (m == k+1) {
337:       *mult = 1;
338:       *p = k;
339:       *q = k+1;
340:       return(0);
341:     }
342:     if (m==k-1) {
343:       *mult = 1;
344:       *p = k;
345:       *q = k-1;
346:     }
347:   } else {
348:     if (m == k+1) {
349:       *mult = PetscPowInt(-1,m);
350:       *mult *= m;
351:       if (k==4) {
352:         for (i=0;i<4;i++) { p[i] = p1p4[i]; q[i] = p1q4[i]; }
353:         q[4] = p1q4[4];
354:       } else if (k==3) {
355:         for (i=0;i<3;i++) { p[i] = p1p3[i]; q[i] = p1q3[i]; }
356:         q[3] = p1q3[3];
357:       } else if (k==2) {
358:         for (i=0;i<2;i++) { p[i] = p1p2[i]; q[i] = p1q2[i]; }
359:         q[2] = p1q2[2];
360:       } else if (k==1) {
361:         p[0] = -3;
362:         for (i=0;i<2;i++) q[i] = p1q1[i];
363:       }
364:       return(0);
365:     }
366:     if (m==k-1) {
367:       *mult = PetscPowInt(-1,m);
368:       *mult /= k;
369:       if (k==5) {
370:         for (i=0;i<4;i++) { p[i] = m1p5[i]; q[i] = m1q5[i]; }
371:         p[4] = m1p5[4];
372:       } else if (k==4) {
373:         for (i=0;i<3;i++) { p[i] = m1p4[i]; q[i] = m1q4[i]; }
374:         p[3] = m1p4[3];
375:       } else if (k==3) {
376:         for (i=0;i<2;i++) { p[i] = m1p3[i]; q[i] = m1q3[i]; }
377:         p[2] = m1p3[2];
378:       } else if (k==2) {
379:         for (i=0;i<2;i++) p[i] = m1p2[i];
380:         q[0] = 3;
381:       }
382:     }
383:   }
384:   return(0);
385: }
386: #endif /* PETSC_HAVE_COMPLEX */

388: #if defined(PETSC_USE_COMPLEX)
389: static PetscErrorCode getisreal(PetscInt n,PetscComplex *a,PetscBool *result)
390: {
391:   PetscInt i;

394:   *result=PETSC_TRUE;
395:   for (i=0;i<n&&*result;i++) {
396:     if (PetscImaginaryPartComplex(a[i])) *result=PETSC_FALSE;
397:   }
398:   return(0);
399: }
400: #endif

402: /*
403:  * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
404:  * and Yuji Nakatsukasa
405:  *
406:  *     Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade
407:  *     Approximation for the Matrix Exponential",
408:  *     SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
409:  *     https://doi.org/10.1137/15M1027553
410:  */
411: PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa(FN fn,Mat A,Mat B)
412: {
413: #if !defined(PETSC_HAVE_COMPLEX)
415:   SETERRQ(PETSC_COMM_SELF,1,"This function requires C99 or C++ complex support");
416: #else
417:   PetscInt       i,j,n_,s,k,m,mod;
418:   PetscBLASInt   n,n2,irsize,rsizediv2,ipsize,iremainsize,info,*piv,minlen,lwork,one=1;
419:   PetscReal      nrm,shift;
420: #if defined(PETSC_USE_COMPLEX) || defined(PETSC_HAVE_ESSL)
421:   PetscReal      *rwork=NULL;
422: #endif
423:   PetscComplex   *As,*RR,*RR2,*expmA,*expmA2,*Maux,*Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
424:   PetscScalar    *Aa,*Ba,*Ba2,*sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*saux;
426:   PetscBool      isreal;
427: #if defined(PETSC_HAVE_ESSL)
428:   PetscScalar    sdummy,*wri;
429:   PetscBLASInt   idummy,io=0;
430: #else
431:   PetscBLASInt   query=-1;
432:   PetscScalar    work1,*work;
433: #endif

436:   MatGetSize(A,&n_,NULL);
437:   PetscBLASIntCast(n_,&n);
438:   MatDenseGetArray(A,&Aa);
439:   MatDenseGetArray(B,&Ba);
440:   Ba2 = Ba;
441:   PetscBLASIntCast(n*n,&n2);

443:   PetscMalloc2(n2,&sMaux,n2,&Maux);
444:   Maux2 = Maux;
445:   PetscMalloc2(n,&wr,n,&wi);
446:   PetscArraycpy(sMaux,Aa,n2);
447:   /* estimate rightmost eigenvalue and shift A with it */
448: #if !defined(PETSC_HAVE_ESSL)
449: #if !defined(PETSC_USE_COMPLEX)
450:   PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,&work1,&query,&info));
451:   SlepcCheckLapackInfo("geev",info);
452:   PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
453:   PetscMalloc1(lwork,&work);
454:   PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,work,&lwork,&info));
455:   PetscFree(work);
456: #else
457:   PetscArraycpy(Maux,Aa,n2);
458:   PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,&work1,&query,rwork,&info));
459:   SlepcCheckLapackInfo("geev",info);
460:   PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
461:   PetscMalloc2(2*n,&rwork,lwork,&work);
462:   PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,work,&lwork,rwork,&info));
463:   PetscFree2(rwork,work);
464: #endif
465:   SlepcCheckLapackInfo("geev",info);
466: #else /* defined(PETSC_HAVE_ESSL) */
467:   PetscBLASIntCast(4*n,&lwork);
468:   PetscMalloc2(lwork,&rwork,2*n,&wri);
469: #if !defined(PETSC_USE_COMPLEX)
470:   PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_(&io,sMaux,&n,wri,&sdummy,&idummy,&idummy,&n,rwork,&lwork));
471:   for (i=0;i<n;i++) {
472:     wr[i] = wri[2*i];
473:     wi[i] = wri[2*i+1];
474:   }
475: #else
476:   PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_(&io,Maux,&n,wri,&sdummy,&idummy,&idummy,&n,rwork,&lwork));
477:   for (i=0;i<n;i++) wr[i] = wri[i];
478: #endif
479:   PetscFree2(rwork,wri);
480: #endif
481:   PetscLogFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n);

483:   shift = PetscRealPart(wr[0]);
484:   for (i=1;i<n;i++) {
485:     if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
486:   }
487:   PetscFree2(wr,wi);
488:   /* shift so that largest real part is (about) 0 */
489:   PetscArraycpy(sMaux,Aa,n2);
490:   for (i=0;i<n;i++) {
491:     sMaux[i+i*n] -= shift;
492:   }
493:   PetscLogFlops(1.0*n);
494: #if defined(PETSC_USE_COMPLEX)
495:   PetscArraycpy(Maux,Aa,n2);
496:   for (i=0;i<n;i++) {
497:     Maux[i+i*n] -= shift;
498:   }
499:   PetscLogFlops(1.0*n);
500: #endif

502:   /* estimate norm(A) and select the scaling factor */
503:   nrm = LAPACKlange_("O",&n,&n,sMaux,&n,NULL);
504:   PetscLogFlops(1.0*n*n);
505:   sexpm_params(nrm,&s,&k,&m);
506:   if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
507:     expshift = PetscExpReal(shift);
508:     for (i=0;i<n;i++) sMaux[i+i*n] += 1.0;
509:     PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&expshift,sMaux,&one));
510:     PetscLogFlops(1.0*(n+n2));
511:     PetscArraycpy(Ba,sMaux,n2);
512:     PetscFree2(sMaux,Maux);
513:     MatDenseRestoreArray(A,&Aa);
514:     MatDenseRestoreArray(B,&Ba);
515:     return(0); /* quick return */
516:   }

518:   PetscMalloc4(n2,&expmA,n2,&As,n2,&RR,n,&piv);
519:   expmA2 = expmA; RR2 = RR;
520:   /* scale matrix */
521: #if !defined(PETSC_USE_COMPLEX)
522:   for (i=0;i<n2;i++) {
523:     As[i] = sMaux[i];
524:   }
525: #else
526:   PetscArraycpy(As,sMaux,n2);
527: #endif
528:   scale = 1.0/PetscPowRealInt(2.0,s);
529:   PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&scale,As,&one));
530:   SlepcLogFlopsComplex(1.0*n2);

532:   /* evaluate Pade approximant (partial fraction or product form) */
533:   if (fn->method==3 || !m) { /* partial fraction */
534:     getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE);
535:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
536:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
537:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize);
538:     PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm);
539:     getcoeffs(k,m,r,p,remainterm,PETSC_FALSE);

541:     PetscArrayzero(expmA,n2);
542: #if !defined(PETSC_USE_COMPLEX)
543:     isreal = PETSC_TRUE;
544: #else
545:     getisreal(n2,Maux,&isreal);
546: #endif
547:     if (isreal) {
548:       rsizediv2 = irsize/2;
549:       for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
550:         PetscArraycpy(Maux,As,n2);
551:         PetscArrayzero(RR,n2);
552:         for (j=0;j<n;j++) {
553:           Maux[j+j*n] -= p[2*i];
554:           RR[j+j*n] = r[2*i];
555:         }
556:         PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
557:         SlepcCheckLapackInfo("gesv",info);
558:         for (j=0;j<n2;j++) {
559:           expmA[j] += RR[j] + PetscConj(RR[j]);
560:         }
561:         /* loop(n) + gesv + loop(n2) */
562:         SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2);
563:       }

565:       mod = ipsize % 2;
566:       if (mod) {
567:         PetscArraycpy(Maux,As,n2);
568:         PetscArrayzero(RR,n2);
569:         for (j=0;j<n;j++) {
570:           Maux[j+j*n] -= p[ipsize-1];
571:           RR[j+j*n] = r[irsize-1];
572:         }
573:         PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
574:         SlepcCheckLapackInfo("gesv",info);
575:         for (j=0;j<n2;j++) {
576:           expmA[j] += RR[j];
577:         }
578:         SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
579:       }
580:     } else { /* complex */
581:       for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
582:         PetscArraycpy(Maux,As,n2);
583:         PetscArrayzero(RR,n2);
584:         for (j=0;j<n;j++) {
585:           Maux[j+j*n] -= p[i];
586:           RR[j+j*n] = r[i];
587:         }
588:         PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
589:         SlepcCheckLapackInfo("gesv",info);
590:         for (j=0;j<n2;j++) {
591:           expmA[j] += RR[j];
592:         }
593:         SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
594:       }
595:     }
596:     for (i=0;i<iremainsize;i++) {
597:       if (!i) {
598:         PetscArrayzero(RR,n2);
599:         for (j=0;j<n;j++) {
600:           RR[j+j*n] = remainterm[iremainsize-1];
601:         }
602:       } else {
603:         PetscArraycpy(RR,As,n2);
604:         for (j=1;j<i;j++) {
605:           PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,RR,&n,&czero,Maux,&n));
606:           SWAP(RR,Maux,aux);
607:           SlepcLogFlopsComplex(2.0*n*n*n);
608:         }
609:         PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&remainterm[iremainsize-1-i],RR,&one));
610:         SlepcLogFlopsComplex(1.0*n2);
611:       }
612:       for (j=0;j<n2;j++) {
613:         expmA[j] += RR[j];
614:       }
615:       SlepcLogFlopsComplex(1.0*n2);
616:     }
617:     PetscFree3(r,p,remainterm);
618:   } else { /* product form, default */
619:     getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE);
620:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
621:     PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
622:     PetscMalloc2(irsize,&rootp,ipsize,&rootq);
623:     getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE);

625:     PetscArrayzero(expmA,n2);
626:     for (i=0;i<n;i++) { /* initialize */
627:       expmA[i+i*n] = 1.0;
628:     }
629:     minlen = PetscMin(irsize,ipsize);
630:     for (i=0;i<minlen;i++) {
631:       PetscArraycpy(RR,As,n2);
632:       for (j=0;j<n;j++) {
633:         RR[j+j*n] -= rootp[i];
634:       }
635:       PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
636:       SWAP(expmA,Maux,aux);
637:       PetscArraycpy(RR,As,n2);
638:       for (j=0;j<n;j++) {
639:         RR[j+j*n] -= rootq[i];
640:       }
641:       PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
642:       SlepcCheckLapackInfo("gesv",info);
643:       /* loop(n) + gemm + loop(n) + gesv */
644:       SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
645:     }
646:     /* extra numerator */
647:     for (i=minlen;i<irsize;i++) {
648:       PetscArraycpy(RR,As,n2);
649:       for (j=0;j<n;j++) {
650:         RR[j+j*n] -= rootp[i];
651:       }
652:       PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
653:       SWAP(expmA,Maux,aux);
654:       SlepcLogFlopsComplex(1.0*n+2.0*n*n*n);
655:     }
656:     /* extra denominator */
657:     for (i=minlen;i<ipsize;i++) {
658:       PetscArraycpy(RR,As,n2);
659:       for (j=0;j<n;j++) RR[j+j*n] -= rootq[i];
660:       PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
661:       SlepcCheckLapackInfo("gesv",info);
662:       SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
663:     }
664:     PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&mult,expmA,&one));
665:     SlepcLogFlopsComplex(1.0*n2);
666:     PetscFree2(rootp,rootq);
667:   }

669: #if !defined(PETSC_USE_COMPLEX)
670:   for (i=0;i<n2;i++) {
671:     Ba2[i] = PetscRealPartComplex(expmA[i]);
672:   }
673: #else
674:   PetscArraycpy(Ba2,expmA,n2);
675: #endif

677:   /* perform repeated squaring */
678:   for (i=0;i<s;i++) { /* final squaring */
679:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,Ba2,&n,Ba2,&n,&szero,sMaux,&n));
680:     SWAP(Ba2,sMaux,saux);
681:     PetscLogFlops(2.0*n*n*n);
682:   }
683:   if (Ba2!=Ba) {
684:     PetscArraycpy(Ba,Ba2,n2);
685:     sMaux = Ba2;
686:   }
687:   expshift = PetscExpReal(shift);
688:   PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&expshift,Ba,&one));
689:   PetscLogFlops(1.0*n2);

691:   /* restore pointers */
692:   Maux = Maux2; expmA = expmA2; RR = RR2;
693:   PetscFree2(sMaux,Maux);
694:   PetscFree4(expmA,As,RR,piv);
695:   MatDenseRestoreArray(A,&Aa);
696:   MatDenseRestoreArray(B,&Ba);
697:   return(0);
698: #endif
699: }

701: #define SMALLN 100

703: /*
704:  * Function needed to compute optimal parameters (required workspace is 3*n*n)
705:  */
706: static PetscInt ell(PetscBLASInt n,PetscScalar *A,PetscReal coeff,PetscInt m,PetscScalar *work,PetscRandom rand)
707: {
708:   PetscScalar    *Ascaled=work;
709:   PetscReal      nrm,alpha,beta,rwork[1];
710:   PetscInt       t;
711:   PetscBLASInt   i,j;

715:   beta = PetscPowReal(coeff,1.0/(2*m+1));
716:   for (i=0;i<n;i++)
717:     for (j=0;j<n;j++)
718:       Ascaled[i+j*n] = beta*PetscAbsScalar(A[i+j*n]);
719:   nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
720:   PetscLogFlops(2.0*n*n);
721:   SlepcNormAm(n,Ascaled,2*m+1,work+n*n,rand,&alpha);
722:   alpha /= nrm;
723:   t = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(2.0*alpha/PETSC_MACHINE_EPSILON)/PetscLogReal(2.0)/(2*m)),0);
724:   PetscFunctionReturn(t);
725: }

727: /*
728:  * Compute scaling parameter (s) and order of Pade approximant (m)  (required workspace is 4*n*n)
729:  */
730: static PetscErrorCode expm_params(PetscInt n,PetscScalar **Apowers,PetscInt *s,PetscInt *m,PetscScalar *work)
731: {
732:   PetscErrorCode  ierr;
733:   PetscScalar     sfactor,sone=1.0,szero=0.0,*A=Apowers[0],*Ascaled;
734:   PetscReal       d4,d6,d8,d10,eta1,eta3,eta4,eta5,rwork[1];
735:   PetscBLASInt    n_,n2,one=1;
736:   PetscRandom     rand;
737:   const PetscReal coeff[5] = { 9.92063492063492e-06, 9.94131285136576e-11,  /* backward error function */
738:                                2.22819456055356e-16, 1.69079293431187e-22, 8.82996160201868e-36 };
739:   const PetscReal theta[5] = { 1.495585217958292e-002,    /* m = 3  */
740:                                2.539398330063230e-001,    /* m = 5  */
741:                                9.504178996162932e-001,    /* m = 7  */
742:                                2.097847961257068e+000,    /* m = 9  */
743:                                5.371920351148152e+000 };  /* m = 13 */

746:   *s = 0;
747:   *m = 13;
748:   PetscBLASIntCast(n,&n_);
749:   PetscRandomCreate(PETSC_COMM_SELF,&rand);
750:   d4 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[2],&n_,rwork),1.0/4.0);
751:   if (d4==0.0) { /* safeguard for the case A = 0 */
752:     *m = 3;
753:     goto done;
754:   }
755:   d6 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[3],&n_,rwork),1.0/6.0);
756:   PetscLogFlops(2.0*n*n);
757:   eta1 = PetscMax(d4,d6);
758:   if (eta1<=theta[0] && !ell(n_,A,coeff[0],3,work,rand)) {
759:     *m = 3;
760:     goto done;
761:   }
762:   if (eta1<=theta[1] && !ell(n_,A,coeff[1],5,work,rand)) {
763:     *m = 5;
764:     goto done;
765:   }
766:   if (n<SMALLN) {
767:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[2],&n_,&szero,work,&n_));
768:     d8 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/8.0);
769:     PetscLogFlops(2.0*n*n*n+1.0*n*n);
770:   } else {
771:     SlepcNormAm(n_,Apowers[2],2,work,rand,&d8);
772:     d8 = PetscPowReal(d8,1.0/8.0);
773:   }
774:   eta3 = PetscMax(d6,d8);
775:   if (eta3<=theta[2] && !ell(n_,A,coeff[2],7,work,rand)) {
776:     *m = 7;
777:     goto done;
778:   }
779:   if (eta3<=theta[3] && !ell(n_,A,coeff[3],9,work,rand)) {
780:     *m = 9;
781:     goto done;
782:   }
783:   if (n<SMALLN) {
784:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[3],&n_,&szero,work,&n_));
785:     d10 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/10.0);
786:     PetscLogFlops(2.0*n*n*n+1.0*n*n);
787:   } else {
788:     SlepcNormAm(n_,Apowers[1],5,work,rand,&d10);
789:     d10 = PetscPowReal(d10,1.0/10.0);
790:   }
791:   eta4 = PetscMax(d8,d10);
792:   eta5 = PetscMin(eta3,eta4);
793:   *s = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(eta5/theta[4])/PetscLogReal(2.0)),0);
794:   if (*s) {
795:     Ascaled = work+3*n*n;
796:     n2 = n_*n_;
797:     PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,A,&one,Ascaled,&one));
798:     sfactor = PetscPowRealInt(2.0,-(*s));
799:     PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&sfactor,Ascaled,&one));
800:     PetscLogFlops(1.0*n*n);
801:   } else Ascaled = A;
802:   *s += ell(n_,Ascaled,coeff[4],13,work,rand);
803: done:
804:   PetscRandomDestroy(&rand);
805:   return(0);
806: }

808: /*
809:  * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
810:  *
811:  *     N. J. Higham, "The scaling and squaring method for the matrix exponential
812:  *     revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
813:  */
814: PetscErrorCode FNEvaluateFunctionMat_Exp_Higham(FN fn,Mat A,Mat B)
815: {
816:   PetscErrorCode    ierr;
817:   PetscBLASInt      n_,n2,*ipiv,info,one=1;
818:   PetscInt          n,m,j,s;
819:   PetscScalar       scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
820:   PetscScalar       *Aa,*Ba,*Apowers[5],*Q,*P,*W,*work,*aux;
821:   const PetscScalar *c;
822:   const PetscScalar c3[4]   = { 120, 60, 12, 1 };
823:   const PetscScalar c5[6]   = { 30240, 15120, 3360, 420, 30, 1 };
824:   const PetscScalar c7[8]   = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
825:   const PetscScalar c9[10]  = { 17643225600, 8821612800, 2075673600, 302702400, 30270240,
826:                                 2162160, 110880, 3960, 90, 1 };
827:   const PetscScalar c13[14] = { 64764752532480000, 32382376266240000, 7771770303897600,
828:                                 1187353796428800,  129060195264000,   10559470521600,
829:                                 670442572800,      33522128640,       1323241920,
830:                                 40840800,          960960,            16380,  182,  1 };

833:   MatDenseGetArray(A,&Aa);
834:   MatDenseGetArray(B,&Ba);
835:   MatGetSize(A,&n,NULL);
836:   PetscBLASIntCast(n,&n_);
837:   n2 = n_*n_;
838:   PetscMalloc2(8*n*n,&work,n,&ipiv);

840:   /* Matrix powers */
841:   Apowers[0] = work;                  /* Apowers[0] = A   */
842:   Apowers[1] = Apowers[0] + n*n;      /* Apowers[1] = A^2 */
843:   Apowers[2] = Apowers[1] + n*n;      /* Apowers[2] = A^4 */
844:   Apowers[3] = Apowers[2] + n*n;      /* Apowers[3] = A^6 */
845:   Apowers[4] = Apowers[3] + n*n;      /* Apowers[4] = A^8 */

847:   PetscArraycpy(Apowers[0],Aa,n2);
848:   PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,Apowers[0],&n_,&szero,Apowers[1],&n_));
849:   PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[1],&n_,&szero,Apowers[2],&n_));
850:   PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[2],&n_,&szero,Apowers[3],&n_));
851:   PetscLogFlops(6.0*n*n*n);

853:   /* Compute scaling parameter and order of Pade approximant */
854:   expm_params(n,Apowers,&s,&m,Apowers[4]);

856:   if (s) { /* rescale */
857:     for (j=0;j<4;j++) {
858:       scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
859:       PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&scale,Apowers[j],&one));
860:     }
861:     PetscLogFlops(4.0*n*n);
862:   }

864:   /* Evaluate the Pade approximant */
865:   switch (m) {
866:     case 3:  c = c3;  break;
867:     case 5:  c = c5;  break;
868:     case 7:  c = c7;  break;
869:     case 9:  c = c9;  break;
870:     case 13: c = c13; break;
871:     default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
872:   }
873:   P = Ba;
874:   Q = Apowers[4] + n*n;
875:   W = Q + n*n;
876:   switch (m) {
877:     case 3:
878:     case 5:
879:     case 7:
880:     case 9:
881:       if (m==9) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[3],&n_,&szero,Apowers[4],&n_));
882:       PetscArrayzero(P,n2);
883:       PetscArrayzero(Q,n2);
884:       for (j=0;j<n;j++) {
885:         P[j+j*n] = c[1];
886:         Q[j+j*n] = c[0];
887:       }
888:       for (j=m;j>=3;j-=2) {
889:         PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j],Apowers[(j+1)/2-1],&one,P,&one));
890:         PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j-1],Apowers[(j+1)/2-1],&one,Q,&one));
891:         PetscLogFlops(4.0*n*n);
892:       }
893:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,P,&n_,&szero,W,&n_));
894:       PetscLogFlops(2.0*n*n*n);
895:       SWAP(P,W,aux);
896:       break;
897:     case 13:
898:       /*  P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
899:               + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I)       */
900:       PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,P,&one));
901:       PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&c[13],P,&one));
902:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[11],Apowers[2],&one,P,&one));
903:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[9],Apowers[1],&one,P,&one));
904:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,P,&n_,&szero,W,&n_));
905:       PetscLogFlops(5.0*n*n+2.0*n*n*n);
906:       PetscArrayzero(P,n2);
907:       for (j=0;j<n;j++) P[j+j*n] = c[1];
908:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[7],Apowers[3],&one,P,&one));
909:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[5],Apowers[2],&one,P,&one));
910:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[3],Apowers[1],&one,P,&one));
911:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,P,&one,W,&one));
912:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,W,&n_,&szero,P,&n_));
913:       PetscLogFlops(7.0*n*n+2.0*n*n*n);
914:       /*  Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
915:               + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I        */
916:       PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,Q,&one));
917:       PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&c[12],Q,&one));
918:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[10],Apowers[2],&one,Q,&one));
919:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[8],Apowers[1],&one,Q,&one));
920:       PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,Q,&n_,&szero,W,&n_));
921:       PetscLogFlops(5.0*n*n+2.0*n*n*n);
922:       PetscArrayzero(Q,n2);
923:       for (j=0;j<n;j++) Q[j+j*n] = c[0];
924:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[6],Apowers[3],&one,Q,&one));
925:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[4],Apowers[2],&one,Q,&one));
926:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[2],Apowers[1],&one,Q,&one));
927:       PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,W,&one,Q,&one));
928:       PetscLogFlops(7.0*n*n);
929:       break;
930:     default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
931:   }
932:   PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&smone,P,&one,Q,&one));
933:   PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n_,&n_,Q,&n_,ipiv,P,&n_,&info));
934:   SlepcCheckLapackInfo("gesv",info);
935:   PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&stwo,P,&one));
936:   for (j=0;j<n;j++) P[j+j*n] += 1.0;
937:   PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n);

939:   /* Squaring */
940:   for (j=1;j<=s;j++) {
941:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,P,&n_,P,&n_,&szero,W,&n_));
942:     SWAP(P,W,aux);
943:   }
944:   if (P!=Ba) { PetscArraycpy(Ba,P,n2); }
945:   PetscLogFlops(2.0*n*n*n*s);

947:   PetscFree2(work,ipiv);
948:   MatDenseRestoreArray(A,&Aa);
949:   MatDenseRestoreArray(B,&Ba);
950:   return(0);
951: }

953: PetscErrorCode FNView_Exp(FN fn,PetscViewer viewer)
954: {
956:   PetscBool      isascii;
957:   char           str[50];
958:   const char     *methodname[] = {
959:                   "scaling & squaring, [m/m] Pade approximant (Higham)",
960:                   "scaling & squaring, [6/6] Pade approximant",
961:                   "scaling & squaring, subdiagonal Pade approximant (product form)",
962:                   "scaling & squaring, subdiagonal Pade approximant (partial fraction)"
963:   };
964:   const int      nmeth=sizeof(methodname)/sizeof(methodname[0]);

967:   PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&isascii);
968:   if (isascii) {
969:     if (fn->beta==(PetscScalar)1.0) {
970:       if (fn->alpha==(PetscScalar)1.0) {
971:         PetscViewerASCIIPrintf(viewer,"  Exponential: exp(x)\n");
972:       } else {
973:         SlepcSNPrintfScalar(str,50,fn->alpha,PETSC_TRUE);
974:         PetscViewerASCIIPrintf(viewer,"  Exponential: exp(%s*x)\n",str);
975:       }
976:     } else {
977:       SlepcSNPrintfScalar(str,50,fn->beta,PETSC_TRUE);
978:       if (fn->alpha==(PetscScalar)1.0) {
979:         PetscViewerASCIIPrintf(viewer,"  Exponential: %s*exp(x)\n",str);
980:       } else {
981:         PetscViewerASCIIPrintf(viewer,"  Exponential: %s",str);
982:         PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
983:         SlepcSNPrintfScalar(str,50,fn->alpha,PETSC_TRUE);
984:         PetscViewerASCIIPrintf(viewer,"*exp(%s*x)\n",str);
985:         PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
986:       }
987:     }
988:     if (fn->method<nmeth) {
989:       PetscViewerASCIIPrintf(viewer,"  computing matrix functions with: %s\n",methodname[fn->method]);
990:     }
991:   }
992:   return(0);
993: }

995: SLEPC_EXTERN PetscErrorCode FNCreate_Exp(FN fn)
996: {
998:   fn->ops->evaluatefunction       = FNEvaluateFunction_Exp;
999:   fn->ops->evaluatederivative     = FNEvaluateDerivative_Exp;
1000:   fn->ops->evaluatefunctionmat[0] = FNEvaluateFunctionMat_Exp_Higham;
1001:   fn->ops->evaluatefunctionmat[1] = FNEvaluateFunctionMat_Exp_Pade;
1002:   fn->ops->evaluatefunctionmat[2] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* product form */
1003:   fn->ops->evaluatefunctionmat[3] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* partial fraction */
1004:   fn->ops->view                   = FNView_Exp;
1005:   return(0);
1006: }