Actual source code: fnexp.c
slepc-3.10.1 2018-10-23
1: /*
2: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3: SLEPc - Scalable Library for Eigenvalue Problem Computations
4: Copyright (c) 2002-2018, Universitat Politecnica de Valencia, Spain
6: This file is part of SLEPc.
7: SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9: */
10: /*
11: Exponential function exp(x)
12: */
14: #include <slepc/private/fnimpl.h> /*I "slepcfn.h" I*/
15: #include <slepcblaslapack.h>
17: PetscErrorCode FNEvaluateFunction_Exp(FN fn,PetscScalar x,PetscScalar *y)
18: {
20: *y = PetscExpScalar(x);
21: return(0);
22: }
24: PetscErrorCode FNEvaluateDerivative_Exp(FN fn,PetscScalar x,PetscScalar *y)
25: {
27: *y = PetscExpScalar(x);
28: return(0);
29: }
31: #define MAX_PADE 6
32: #define SWAP(a,b,t) {t=a;a=b;b=t;}
34: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade(FN fn,Mat A,Mat B)
35: {
36: #if defined(PETSC_MISSING_LAPACK_GESV) || defined(SLEPC_MISSING_LAPACK_LANGE)
38: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"GESV/LANGE - Lapack routines are unavailable");
39: #else
41: PetscBLASInt n,ld,ld2,*ipiv,info,inc=1;
42: PetscInt m,j,k,sexp;
43: PetscBool odd;
44: const PetscInt p=MAX_PADE;
45: PetscReal c[MAX_PADE+1],s,*rwork;
46: PetscScalar scale,mone=-1.0,one=1.0,two=2.0,zero=0.0;
47: PetscScalar *Aa,*Ba,*As,*A2,*Q,*P,*W,*aux;
50: MatDenseGetArray(A,&Aa);
51: MatDenseGetArray(B,&Ba);
52: MatGetSize(A,&m,NULL);
53: PetscBLASIntCast(m,&n);
54: ld = n;
55: ld2 = ld*ld;
56: P = Ba;
57: PetscMalloc6(m*m,&Q,m*m,&W,m*m,&As,m*m,&A2,ld,&rwork,ld,&ipiv);
58: PetscMemcpy(As,Aa,ld2*sizeof(PetscScalar));
60: /* Pade' coefficients */
61: c[0] = 1.0;
62: for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
64: /* Scaling */
65: s = LAPACKlange_("I",&n,&n,As,&ld,rwork);
66: PetscLogFlops(1.0*n*n);
67: if (s>0.5) {
68: sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
69: scale = PetscPowRealInt(2.0,-sexp);
70: PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&scale,As,&inc));
71: PetscLogFlops(1.0*n*n);
72: } else sexp = 0;
74: /* Horner evaluation */
75: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,As,&ld,As,&ld,&zero,A2,&ld));
76: PetscLogFlops(2.0*n*n*n);
77: PetscMemzero(Q,ld2*sizeof(PetscScalar));
78: PetscMemzero(P,ld2*sizeof(PetscScalar));
79: for (j=0;j<n;j++) {
80: Q[j+j*ld] = c[p];
81: P[j+j*ld] = c[p-1];
82: }
84: odd = PETSC_TRUE;
85: for (k=p-1;k>0;k--) {
86: if (odd) {
87: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,A2,&ld,&zero,W,&ld));
88: SWAP(Q,W,aux);
89: for (j=0;j<n;j++) Q[j+j*ld] += c[k-1];
90: odd = PETSC_FALSE;
91: } else {
92: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,A2,&ld,&zero,W,&ld));
93: SWAP(P,W,aux);
94: for (j=0;j<n;j++) P[j+j*ld] += c[k-1];
95: odd = PETSC_TRUE;
96: }
97: PetscLogFlops(2.0*n*n*n);
98: }
99: if (odd) {
100: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,As,&ld,&zero,W,&ld));
101: SWAP(Q,W,aux);
102: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
103: PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
104: SlepcCheckLapackInfo("gesv",info);
105: PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
106: for (j=0;j<n;j++) P[j+j*ld] += 1.0;
107: PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&mone,P,&inc));
108: } else {
109: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,As,&ld,&zero,W,&ld));
110: SWAP(P,W,aux);
111: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
112: PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
113: SlepcCheckLapackInfo("gesv",info);
114: PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
115: for (j=0;j<n;j++) P[j+j*ld] += 1.0;
116: }
117: PetscLogFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n);
119: for (k=1;k<=sexp;k++) {
120: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,P,&ld,&zero,W,&ld));
121: PetscMemcpy(P,W,ld2*sizeof(PetscScalar));
122: }
123: if (P!=Ba) { PetscMemcpy(Ba,P,ld2*sizeof(PetscScalar)); }
124: PetscLogFlops(2.0*n*n*n*sexp);
126: PetscFree6(Q,W,As,A2,rwork,ipiv);
127: MatDenseRestoreArray(A,&Aa);
128: MatDenseRestoreArray(B,&Ba);
129: return(0);
130: #endif
131: }
133: #define PARTIAL_FRACTION_FORM 0
134: #define PRODUCT_FORM 1
136: /*
137: * Set scaling factor (s) and Pade degree (k,m)
138: */
139: static PetscErrorCode sexpm_params(PetscReal nrm,PetscInt mode,PetscInt *s,PetscInt *k,PetscInt *m)
140: {
142: if (nrm>1) {
143: if (nrm<200) {*s = 4; *k = 5; *m = *k-1;}
144: else if (nrm<1e4) {*s = 4; *k = 4; *m = *k+1;}
145: else if (nrm<1e6) {*s = 4; *k = 3; *m = *k+1;}
146: else if (nrm<1e9) {*s = 3; *k = 3; *m = *k+1;}
147: else if (nrm<1e11) {*s = 2; *k = 3; *m = *k+1;}
148: else if (nrm<1e12) {*s = 2; *k = 2; *m = *k+1;}
149: else if (nrm<1e14) {*s = 2; *k = 1; *m = *k+1;}
150: else {*s = 1; *k = 1; *m = *k+1;}
151: } else { /* nrm<1 */
152: if (nrm>0.5) {*s = 4; *k = 4; *m = *k-1;}
153: else if (nrm>0.3) {*s = 3; *k = 4; *m = *k-1;}
154: else if (nrm>0.15) {*s = 2; *k = 4; *m = *k-1;}
155: else if (nrm>0.07) {*s = 1; *k = 4; *m = *k-1;}
156: else if (nrm>0.01) {*s = 0; *k = 4; *m = *k-1;}
157: else if (nrm>3e-4) {*s = 0; *k = 3; *m = *k-1;}
158: else if (nrm>1e-5) {*s = 0; *k = 3; *m = 0;}
159: else if (nrm>1e-8) {*s = 0; *k = 2; *m = 0;}
160: else {*s = 0; *k = 1; *m = 0;}
161: }
162: return(0);
163: }
165: #if defined(PETSC_HAVE_COMPLEX)
166: /*
167: * Partial fraction form coefficients.
168: * If query, the function returns the size necessary to store the coefficients.
169: */
170: static PetscErrorCode getcoeffs(PetscInt k,PetscInt m,PetscComplex *r,PetscComplex *q,PetscComplex *remain,PetscBool query)
171: {
172: PetscInt i;
173: const PetscComplex /* m == k+1 */
174: p1r4[5] = {-1.582680186458572e+01 - 2.412564578224361e+01*PETSC_i,
175: -1.582680186458572e+01 + 2.412564578224361e+01*PETSC_i,
176: 1.499984465975511e+02 + 6.804227952202417e+01*PETSC_i,
177: 1.499984465975511e+02 - 6.804227952202417e+01*PETSC_i,
178: -2.733432894659307e+02 },
179: p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
180: 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
181: 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
182: 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i,
183: 6.286704751729261e+00 },
184: p1r3[4] = {-1.130153999597152e+01 + 1.247167585025031e+01*PETSC_i,
185: -1.130153999597152e+01 - 1.247167585025031e+01*PETSC_i,
186: 1.330153999597152e+01 - 6.007173273704750e+01*PETSC_i,
187: 1.330153999597152e+01 + 6.007173273704750e+01*PETSC_i},
188: p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
189: 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
190: 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
191: 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
192: p1r2[3] = { 7.648749087422928e+00 + 4.171640244747463e+00*PETSC_i,
193: 7.648749087422928e+00 - 4.171640244747463e+00*PETSC_i,
194: -1.829749817484586e+01 },
195: p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
196: 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
197: 3.637834252744491e+00 },
198: p1r1[2] = { 1.000000000000000e+00 - 3.535533905932738e+00*PETSC_i,
199: 1.000000000000000e+00 + 3.535533905932738e+00*PETSC_i},
200: p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
201: 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
202: const PetscComplex /* m == k-1 */
203: m1r5[4] = {-1.423367961376821e+02 - 1.385465094833037e+01*PETSC_i,
204: -1.423367961376821e+02 + 1.385465094833037e+01*PETSC_i,
205: 2.647367961376822e+02 - 4.814394493714596e+02*PETSC_i,
206: 2.647367961376822e+02 + 4.814394493714596e+02*PETSC_i},
207: m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
208: 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
209: 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
210: 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
211: m1r4[3] = { 2.484269593165883e+01 + 7.460342395992306e+01*PETSC_i,
212: 2.484269593165883e+01 - 7.460342395992306e+01*PETSC_i,
213: -2.734353918633177e+02 },
214: m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
215: 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
216: 5.648485971016893e+00 },
217: m1r3[2] = { 2.533333333333333e+01 - 2.733333333333333e+01*PETSC_i,
218: 2.533333333333333e+01 + 2.733333333333333e+01*PETSC_i},
219: m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
220: 4.000000000000000e+00 - 2.000000000000000e+00*PETSC_i};
221: const PetscScalar /* m == k-1 */
222: m1remain5[2] = { 2.000000000000000e-01, 9.800000000000000e+00},
223: m1remain4[2] = {-2.500000000000000e-01, -7.750000000000000e+00},
224: m1remain3[2] = { 3.333333333333333e-01, 5.666666666666667e+00},
225: m1remain2[2] = {-0.5, -3.5},
226: remain3[4] = {1.0/6.0, 1.0/2.0, 1, 1},
227: remain2[3] = {1.0/2.0, 1, 1};
230: if (query) { /* query about buffer's size */
231: if (m==k+1) {
232: *remain = 0;
233: if (k==4) {
234: *r = *q = 5;
235: } else if (k==3) {
236: *r = *q = 4;
237: } else if (k==2) {
238: *r = *q = 3;
239: } else if (k==1) {
240: *r = *q = 2;
241: }
242: return(0); /* quick return */
243: }
244: if (m==k-1) {
245: if (k==5) {
246: *r = *q = 4; *remain = 2;
247: } else if (k==4) {
248: *r = *q = 3; *remain = 2;
249: } else if (k==3) {
250: *r = *q = 2; *remain = 2;
251: } else if (k==2) {
252: *r = *q = 1; *remain = 2;
253: }
254: }
255: if (m==0) {
256: *r = *q = 0;
257: if (k==3) {
258: *remain = 4;
259: } else if (k==2) {
260: *remain = 3;
261: }
262: }
263: } else {
264: if (m==k+1) {
265: if (k==4) {
266: for (i=0;i<5;i++) {
267: r[i] = p1r4[i]; q[i] = p1q4[i];
268: }
269: } else if (k==3) {
270: for (i=0;i<4;i++) {
271: r[i] = p1r3[i]; q[i] = p1q3[i];
272: }
273: } else if (k==2) {
274: for (i=0;i<3;i++) {
275: r[i] = p1r2[i]; q[i] = p1q2[i];
276: }
277: } else if (k==1) {
278: for (i=0;i<2;i++) {
279: r[i] = p1r1[i]; q[i] = p1q1[i];
280: }
281: }
282: return(0); /* quick return */
283: }
284: if (m==k-1) {
285: if (k==5) {
286: for (i=0;i<4;i++) {
287: r[i] = m1r5[i]; q[i] = m1q5[i];
288: }
289: for (i=0;i<2;i++) {
290: remain[i] = m1remain5[i];
291: }
292: } else if (k==4) {
293: for (i=0;i<3;i++) {
294: r[i] = m1r4[i]; q[i] = m1q4[i];
295: }
296: for (i=0;i<2;i++) {
297: remain[i] = m1remain4[i];
298: }
299: } else if (k==3) {
300: for (i=0;i<2;i++) {
301: r[i] = m1r3[i]; q[i] = m1q3[i]; remain[i] = m1remain3[i];
302: }
303: } else if (k==2) {
304: r[0] = -13.5;
305: q[0] = 3;
306: for (i=0;i<2;i++) {
307: remain[i] = m1remain2[i];
308: }
309: }
310: }
311: if (m==0) {
312: r = q = 0;
313: if (k==3) {
314: for (i=0;i<4;i++) {
315: remain[i] = remain3[i];
316: }
317: } else if (k==2) {
318: for (i=0;i<3;i++) {
319: remain[i] = remain2[i];
320: }
321: }
322: }
323: }
324: return(0);
325: }
327: /*
328: * Product form coefficients.
329: * If query, the function returns the size necessary to store the coefficients.
330: */
331: static PetscErrorCode getcoeffsproduct(PetscInt k,PetscInt m,PetscComplex *p,PetscComplex *q,PetscComplex *mult,PetscBool query)
332: {
333: PetscInt i;
334: const PetscComplex /* m == k+1 */
335: p1p4[4] = {-5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
336: -5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
337: -6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
338: -6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
339: p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
340: 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
341: 6.286704751729261e+00 ,
342: 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
343: 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
344: p1p3[3] = {-4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
345: -4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
346: -5.648485971016893e+00 },
347: p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
348: 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
349: 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
350: 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
351: p1p2[2] = {-4.00000000000000e+00 + 2.000000000000000e+00*PETSC_i,
352: -4.00000000000000e+00 - 2.000000000000000e+00*PETSC_i},
353: p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
354: 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
355: 3.637834252744491e+00 },
356: p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
357: 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
358: const PetscComplex /* m == k-1 */
359: m1p5[5] = {-3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
360: -3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
361: -6.286704751729261e+00 ,
362: -5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
363: -5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
364: m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
365: 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
366: 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
367: 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
368: m1p4[4] = {-3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
369: -3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
370: -4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
371: -4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
372: m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
373: 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
374: 5.648485971016893e+00 },
375: m1p3[3] = {-2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
376: -2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
377: -3.637834252744491e+00 },
378: m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
379: 4.000000000000000e+00 - 2.000000000000001e+00*PETSC_i},
380: m1p2[2] = {-2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
381: -2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
384: if (query) {
385: if (m == k+1) {
386: *mult = 1;
387: if (k==4) {
388: *p = 4; *q = 5;
389: } else if (k==3) {
390: *p = 3; *q = 4;
391: } else if (k==2) {
392: *p = 2; *q = 3;
393: } else if (k==1) {
394: *p = 1; *q = 2;
395: }
396: return(0);
397: }
398: if (m==k-1) {
399: *mult = 1;
400: if (k==5) {
401: *p = 5; *q = 4;
402: } else if (k==4) {
403: *p = 4; *q = 3;
404: } else if (k==3) {
405: *p = 3; *q = 2;
406: } else if (k==2) {
407: *p = 2; *q = 1;
408: }
409: }
410: } else {
411: if (m == k+1) {
412: *mult = PetscPowInt(-1,m);
413: *mult *= m;
414: if (k==4) {
415: for (i=0;i<4;i++) {
416: p[i] = p1p4[i]; q[i] = p1q4[i];
417: }
418: q[4] = p1q4[4];
419: } else if (k==3) {
420: for (i=0;i<3;i++) {
421: p[i] = p1p3[i]; q[i] = p1q3[i];
422: }
423: q[3] = p1q3[3];
424: } else if (k==2) {
425: for (i=0;i<2;i++) {
426: p[i] = p1p2[i]; q[i] = p1q2[i];
427: }
428: q[2] = p1q2[2];
429: } else if (k==1) {
430: p[0] = -3;
431: for (i=0;i<2;i++) {
432: q[i] = p1q1[i];
433: }
434: }
435: return(0);
436: }
437: if (m==k-1) {
438: *mult = PetscPowInt(-1,m);
439: *mult /= k;
440: if (k==5) {
441: for (i=0;i<4;i++) {
442: p[i] = m1p5[i]; q[i] = m1q5[i];
443: }
444: p[4] = m1p5[4];
445: } else if (k==4) {
446: for (i=0;i<3;i++) {
447: p[i] = m1p4[i]; q[i] = m1q4[i];
448: }
449: p[3] = m1p4[3];
450: } else if (k==3) {
451: for (i=0;i<2;i++) {
452: p[i] = m1p3[i]; q[i] = m1q3[i];
453: }
454: p[2] = m1p3[2];
455: } else if (k==2) {
456: for (i=0;i<2;i++) {
457: p[i] = m1p2[i];
458: }
459: q[0] = 3;
460: }
461: }
462: }
463: return(0);
464: }
465: #endif /* PETSC_HAVE_COMPLEX */
467: #if defined(PETSC_USE_COMPLEX)
468: static PetscErrorCode getisreal(PetscInt n,PetscComplex *a,PetscBool *result)
469: {
470: PetscInt i;
473: *result=PETSC_TRUE;
474: for (i=0;i<n&&*result;i++) {
475: if (PetscImaginaryPartComplex(a[i])) *result=PETSC_FALSE;
476: }
477: return(0);
478: }
479: #endif
481: /*
482: * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
483: * and Yuji Nakatsukasa
484: *
485: * Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade
486: * Approximation for the Matrix Exponential",
487: * SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
488: * https://doi.org/10.1137/15M1027553
489: */
490: PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa(FN fn,Mat A,Mat B)
491: {
492: #if !defined(PETSC_HAVE_COMPLEX)
494: SETERRQ(PETSC_COMM_SELF,1,"This function requires C99 or C++ complex support");
495: #elif defined(PETSC_MISSING_LAPACK_GEEV) || defined(PETSC_MISSING_LAPACK_GESV) || defined(SLEPC_MISSING_LAPACK_LANGE)
497: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"GEEV/GESV/LANGE - Lapack routines are unavailable");
498: #else
499: PetscInt i,j,n_,s,k,m,mode=PRODUCT_FORM,mod;
500: PetscBLASInt n,n2,irsize,rsizediv2,ipsize,iremainsize,query=-1,info,*piv,minlen,lwork,one=1;
501: PetscReal nrm,shift;
502: #if defined(PETSC_USE_COMPLEX)
503: PetscReal *rwork=NULL;
504: #endif
505: PetscComplex *As,*RR,*RR2,*expmA,*expmA2,*Maux,*Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
506: PetscScalar *Aa,*Ba,*Ba2,*sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*work,work1,*saux;
508: PetscBool isreal;
511: MatGetSize(A,&n_,NULL);
512: PetscBLASIntCast(n_,&n);
513: MatDenseGetArray(A,&Aa);
514: MatDenseGetArray(B,&Ba);
515: Ba2 = Ba;
516: PetscBLASIntCast(n*n,&n2);
518: PetscMalloc2(n2,&sMaux,n2,&Maux);
519: Maux2 = Maux;
520: PetscMalloc2(n,&wr,n,&wi);
521: PetscMemcpy(sMaux,Aa,n2*sizeof(PetscScalar));
522: /* estimate rightmost eigenvalue and shift A with it */
523: #if !defined(PETSC_USE_COMPLEX)
524: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,&work1,&query,&info));
525: SlepcCheckLapackInfo("geev",info);
526: PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
527: PetscMalloc1(lwork,&work);
528: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,work,&lwork,&info));
529: PetscFree(work);
530: #else
531: PetscMemcpy(Maux,Aa,n2*sizeof(PetscScalar));
532: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,&work1,&query,rwork,&info));
533: SlepcCheckLapackInfo("geev",info);
534: PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
535: PetscMalloc2(2*n,&rwork,lwork,&work);
536: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,work,&lwork,rwork,&info));
537: PetscFree2(rwork,work);
538: #endif
539: SlepcCheckLapackInfo("geev",info);
540: PetscLogFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n);
542: shift = PetscRealPart(wr[0]);
543: for (i=1;i<n;i++) {
544: if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
545: }
546: PetscFree2(wr,wi);
547: /* shift so that largest real part is (about) 0 */
548: PetscMemcpy(sMaux,Aa,n2*sizeof(PetscScalar));
549: for (i=0;i<n;i++) {
550: sMaux[i+i*n] -= shift;
551: }
552: PetscLogFlops(1.0*n);
553: #if defined(PETSC_USE_COMPLEX)
554: PetscMemcpy(Maux,Aa,n2*sizeof(PetscScalar));
555: for (i=0;i<n;i++) {
556: Maux[i+i*n] -= shift;
557: }
558: PetscLogFlops(1.0*n);
559: #endif
561: /* estimate norm(A) and select the scaling factor */
562: nrm = LAPACKlange_("O",&n,&n,sMaux,&n,NULL);
563: PetscLogFlops(1.0*n*n);
564: sexpm_params(nrm,mode,&s,&k,&m);
565: if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
566: expshift = PetscExpScalar(shift);
567: for (i=0;i<n;i++) {
568: sMaux[i+i*n] += 1.0;
569: }
570: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&expshift,sMaux,&one));
571: PetscLogFlops(1.0*(n+n2));
572: PetscMemcpy(Ba,sMaux,n2*sizeof(PetscScalar));
573: PetscFree(sMaux);
574: MatDenseRestoreArray(A,&Aa);
575: MatDenseRestoreArray(B,&Ba);
576: return(0); /* quick return */
577: }
579: PetscMalloc4(n2,&expmA,n2,&As,n2,&RR,n,&piv);
580: expmA2 = expmA; RR2 = RR;
581: /* scale matrix */
582: #if !defined(PETSC_USE_COMPLEX)
583: for (i=0;i<n2;i++) {
584: As[i] = sMaux[i];
585: }
586: #else
587: PetscMemcpy(As,sMaux,n2*sizeof(PetscScalar));
588: #endif
589: scale = 1.0/PetscPowRealInt(2.0,s);
590: PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&scale,As,&one));
591: SlepcLogFlopsComplex(1.0*n2);
593: /* evaluate Pade approximant (partial fraction or product form) */
594: if (mode==PARTIAL_FRACTION_FORM || !m) { /* partial fraction */
595: getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE);
596: PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
597: PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
598: PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize);
599: PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm);
600: getcoeffs(k,m,r,p,remainterm,PETSC_FALSE);
602: PetscMemzero(expmA,n2*sizeof(PetscComplex));
603: #if !defined(PETSC_USE_COMPLEX)
604: isreal = PETSC_TRUE;
605: #else
606: getisreal(n2,Maux,&isreal);
607: #endif
608: if (isreal) {
609: rsizediv2 = irsize/2;
610: for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
611: PetscMemcpy(Maux,As,n2*sizeof(PetscComplex));
612: PetscMemzero(RR,n2*sizeof(PetscComplex));
613: for (j=0;j<n;j++) {
614: Maux[j+j*n] -= p[2*i];
615: RR[j+j*n] = r[2*i];
616: }
617: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
618: SlepcCheckLapackInfo("gesv",info);
619: for (j=0;j<n2;j++) {
620: expmA[j] += RR[j] + PetscConj(RR[j]);
621: }
622: /* loop(n) + gesv + loop(n2) */
623: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2);
624: }
626: mod = ipsize % 2;
627: if (mod) {
628: PetscMemcpy(Maux,As,n2*sizeof(PetscComplex));
629: PetscMemzero(RR,n2*sizeof(PetscComplex));
630: for (j=0;j<n;j++) {
631: Maux[j+j*n] -= p[ipsize-1];
632: RR[j+j*n] = r[irsize-1];
633: }
634: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
635: SlepcCheckLapackInfo("gesv",info);
636: for (j=0;j<n2;j++) {
637: expmA[j] += RR[j];
638: }
639: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
640: }
641: } else { /* complex */
642: for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
643: PetscMemcpy(Maux,As,n2*sizeof(PetscComplex));
644: PetscMemzero(RR,n2*sizeof(PetscComplex));
645: for (j=0;j<n;j++) {
646: Maux[j+j*n] -= p[i];
647: RR[j+j*n] = r[i];
648: }
649: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
650: SlepcCheckLapackInfo("gesv",info);
651: for (j=0;j<n2;j++) {
652: expmA[j] += RR[j];
653: }
654: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
655: }
656: }
657: for (i=0;i<iremainsize;i++) {
658: if (!i) {
659: PetscMemzero(RR,n2*sizeof(PetscComplex));
660: for (j=0;j<n;j++) {
661: RR[j+j*n] = remainterm[iremainsize-1];
662: }
663: } else {
664: PetscMemcpy(RR,As,n2*sizeof(PetscComplex));
665: for (j=1;j<i;j++) {
666: PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,RR,&n,&czero,Maux,&n));
667: SWAP(RR,Maux,aux);
668: SlepcLogFlopsComplex(2.0*n*n*n);
669: }
670: PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&remainterm[iremainsize-1-i],RR,&one));
671: SlepcLogFlopsComplex(1.0*n2);
672: }
673: for (j=0;j<n2;j++) {
674: expmA[j] += RR[j];
675: }
676: SlepcLogFlopsComplex(1.0*n2);
677: }
678: PetscFree3(r,p,remainterm);
679: } else { /* product form, default */
680: getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE);
681: PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
682: PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
683: PetscMalloc2(irsize,&rootp,ipsize,&rootq);
684: getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE);
686: PetscMemzero(expmA,n2*sizeof(PetscComplex));
687: for (i=0;i<n;i++) { /* initialize */
688: expmA[i+i*n] = 1.0;
689: }
690: minlen = PetscMin(irsize,ipsize);
691: for (i=0;i<minlen;i++) {
692: PetscMemcpy(RR,As,n2*sizeof(PetscComplex));
693: for (j=0;j<n;j++) {
694: RR[j+j*n] -= rootp[i];
695: }
696: PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
697: SWAP(expmA,Maux,aux);
698: PetscMemcpy(RR,As,n2*sizeof(PetscComplex));
699: for (j=0;j<n;j++) {
700: RR[j+j*n] -= rootq[i];
701: }
702: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
703: SlepcCheckLapackInfo("gesv",info);
704: /* loop(n) + gemm + loop(n) + gesv */
705: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
706: }
707: /* extra enumerator */
708: for (i=minlen;i<irsize;i++) {
709: PetscMemcpy(RR,As,n2*sizeof(PetscComplex));
710: for (j=0;j<n;j++) {
711: RR[j+j*n] -= rootp[i];
712: }
713: PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
714: SWAP(expmA,Maux,aux);
715: SlepcLogFlopsComplex(1.0*n+2.0*n*n*n);
716: }
717: /* extra denominator */
718: for (i=minlen;i<ipsize;i++) {
719: PetscMemcpy(RR,As,n2*sizeof(PetscComplex));
720: for (j=0;j<n;j++) {
721: RR[j+j*n] -= rootq[i];
722: }
724: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
725: SlepcCheckLapackInfo("gesv",info);
726: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
727: }
728: PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&mult,expmA,&one));
729: SlepcLogFlopsComplex(1.0*n2);
730: PetscFree2(rootp,rootq);
731: }
733: #if !defined(PETSC_USE_COMPLEX)
734: for (i=0;i<n2;i++) {
735: Ba2[i] = PetscRealPartComplex(expmA[i]);
736: }
737: #else
738: PetscMemcpy(Ba2,expmA,n2*sizeof(PetscScalar));
739: #endif
741: /* perform repeated squaring */
742: for (i=0;i<s;i++) { /* final squaring */
743: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,Ba2,&n,Ba2,&n,&szero,sMaux,&n));
744: SWAP(Ba2,sMaux,saux);
745: PetscLogFlops(2.0*n*n*n);
746: }
747: if (Ba2!=Ba) {
748: PetscMemcpy(Ba,Ba2,n2*sizeof(PetscScalar));
749: sMaux = Ba2;
750: }
751: expshift = PetscExpReal(shift);
752: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&expshift,Ba,&one));
753: PetscLogFlops(1.0*n2);
755: /* restore pointers */
756: Maux = Maux2; expmA = expmA2; RR = RR2;
757: PetscFree2(sMaux,Maux);
758: PetscFree4(expmA,As,RR,piv);
759: MatDenseRestoreArray(A,&Aa);
760: MatDenseRestoreArray(B,&Ba);
761: return(0);
762: #endif
763: }
765: #define ITMAX 5
767: /*
768: * Estimate norm(A^m,1) by block 1-norm power method (required workspace is 11*n)
769: */
770: static PetscReal normest1(PetscBLASInt n,PetscScalar *A,PetscInt m,PetscScalar *work,PetscRandom rand)
771: {
772: PetscScalar *X,*Y,*Z,*S,*S_old,*aux,val,sone=1.0,szero=0.0;
773: PetscReal est=0.0,est_old,vals[2]={0.0,0.0},*zvals,maxzval[2],raux;
774: PetscBLASInt i,j,t=2,it=0,ind[2],est_j=0,m1;
777: X = work;
778: Y = work + 2*n;
779: Z = work + 4*n;
780: S = work + 6*n;
781: S_old = work + 8*n;
782: zvals = (PetscReal*)(work + 10*n);
784: for (i=0;i<n;i++) { /* X has columns of unit 1-norm */
785: X[i] = 1.0/n;
786: PetscRandomGetValue(rand,&val);
787: if (PetscRealPart(val) < 0.5) X[i+n] = -1.0/n;
788: else X[i+n] = 1.0/n;
789: }
790: for (i=0;i<t*n;i++) S[i] = 0.0;
791: ind[0] = 0; ind[1] = 0;
792: est_old = 0;
793: while (1) {
794: it++;
795: for (j=0;j<m;j++) { /* Y = A^m*X */
796: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&t,&n,&sone,A,&n,X,&n,&szero,Y,&n));
797: if (j<m-1) SWAP(X,Y,aux);
798: }
799: for (j=0;j<t;j++) { /* vals[j] = norm(Y(:,j),1) */
800: vals[j] = 0.0;
801: for (i=0;i<n;i++) vals[j] += PetscAbsScalar(Y[i+j*n]);
802: }
803: if (vals[0]<vals[1]) {
804: SWAP(vals[0],vals[1],raux);
805: m1 = 1;
806: } else m1 = 0;
807: est = vals[0];
808: if (est>est_old || it==2) est_j = ind[m1];
809: if (it>=2 && est<=est_old) {
810: est = est_old;
811: break;
812: }
813: est_old = est;
814: if (it>ITMAX) break;
815: SWAP(S,S_old,aux);
816: for (i=0;i<t*n;i++) { /* S = sign(Y) */
817: S[i] = (PetscRealPart(Y[i]) < 0.0)? -1.0: 1.0;
818: }
819: for (j=0;j<m;j++) { /* Z = (A^T)^m*S */
820: PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&n,&t,&n,&sone,A,&n,S,&n,&szero,Z,&n));
821: if (j<m-1) SWAP(S,Z,aux);
822: }
823: maxzval[0] = -1; maxzval[1] = -1;
824: ind[0] = 0; ind[1] = 0;
825: for (i=0;i<n;i++) { /* zvals[i] = norm(Z(i,:),inf) */
826: zvals[i] = PetscMax(PetscAbsScalar(Z[i+0*n]),PetscAbsScalar(Z[i+1*n]));
827: if (zvals[i]>maxzval[0]) {
828: maxzval[0] = zvals[i];
829: ind[0] = i;
830: } else if (zvals[i]>maxzval[1]) {
831: maxzval[1] = zvals[i];
832: ind[1] = i;
833: }
834: }
835: if (it>=2 && maxzval[0]==zvals[est_j]) break;
836: for (i=0;i<t*n;i++) X[i] = 0.0;
837: for (j=0;j<t;j++) X[ind[j]+j*n] = 1.0;
838: }
839: /* Flop count is roughly (it * 2*m * t*gemv) = 4*its*m*t*n*n */
840: PetscLogFlops(4.0*it*m*t*n*n);
841: PetscFunctionReturn(est);
842: }
844: #define SMALLN 100
846: /*
847: * Estimate norm(A^m,1) (required workspace is 2*n*n)
848: */
849: static PetscReal normAm(PetscBLASInt n,PetscScalar *A,PetscInt m,PetscScalar *work,PetscRandom rand)
850: {
851: PetscScalar *v=work,*w=work+n*n,*aux,sone=1.0,szero=0.0;
852: PetscReal nrm,rwork[1],tmp;
853: PetscBLASInt i,j,one=1;
854: PetscBool isrealpos=PETSC_TRUE;
857: if (n<SMALLN) { /* compute matrix power explicitly */
858: if (m==1) {
859: nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
860: PetscLogFlops(1.0*n*n);
861: } else { /* m>=2 */
862: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,A,&n,A,&n,&szero,v,&n));
863: for (j=0;j<m-2;j++) {
864: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,A,&n,v,&n,&szero,w,&n));
865: SWAP(v,w,aux);
866: }
867: nrm = LAPACKlange_("O",&n,&n,v,&n,rwork);
868: PetscLogFlops(2.0*n*n*n*(m-1)+1.0*n*n);
869: }
870: } else {
871: for (i=0;i<n;i++)
872: for (j=0;j<n;j++)
873: #if defined(PETSC_USE_COMPLEX)
874: if (PetscRealPart(A[i+j*n])<0.0 || PetscImaginaryPart(A[i+j*n])!=0.0) { isrealpos = PETSC_FALSE; break; }
875: #else
876: if (A[i+j*n]<0.0) { isrealpos = PETSC_FALSE; break; }
877: #endif
878: if (isrealpos) { /* for positive matrices only */
879: for (i=0;i<n;i++) v[i] = 1.0;
880: for (j=0;j<m;j++) { /* w = A'*v */
881: PetscStackCallBLAS("BLASgemv",BLASgemv_("C",&n,&n,&sone,A,&n,v,&one,&szero,w,&one));
882: SWAP(v,w,aux);
883: }
884: PetscLogFlops(2.0*n*n*m);
885: nrm = 0.0;
886: for (i=0;i<n;i++) if ((tmp = PetscAbsScalar(v[i])) > nrm) nrm = tmp; /* norm(v,inf) */
887: } else {
888: nrm = normest1(n,A,m,work,rand);
889: }
890: }
891: PetscFunctionReturn(nrm);
892: }
894: /*
895: * Function needed to compute optimal parameters (required workspace is 3*n*n)
896: */
897: static PetscInt ell(PetscBLASInt n,PetscScalar *A,PetscReal coeff,PetscInt m,PetscScalar *work,PetscRandom rand)
898: {
899: PetscScalar *Ascaled=work;
900: PetscReal nrm,alpha,beta,rwork[1];
901: PetscInt t;
902: PetscBLASInt i,j;
905: beta = PetscPowReal(coeff,1.0/(2*m+1));
906: for (i=0;i<n;i++)
907: for (j=0;j<n;j++)
908: Ascaled[i+j*n] = beta*PetscAbsScalar(A[i+j*n]);
909: nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
910: PetscLogFlops(2.0*n*n);
911: alpha = normAm(n,Ascaled,2*m+1,work+n*n,rand)/nrm;
912: t = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(2.0*alpha/PETSC_MACHINE_EPSILON)/PetscLogReal(2.0)/(2*m)),0);
913: PetscFunctionReturn(t);
914: }
916: /*
917: * Compute scaling parameter (s) and order of Pade approximant (m) (required workspace is 4*n*n)
918: */
919: static PetscErrorCode expm_params(PetscInt n,PetscScalar **Apowers,PetscInt *s,PetscInt *m,PetscScalar *work)
920: {
921: PetscErrorCode ierr;
922: PetscScalar sfactor,sone=1.0,szero=0.0,*A=Apowers[0],*Ascaled;
923: PetscReal d4,d6,d8,d10,eta1,eta3,eta4,eta5,rwork[1];
924: PetscBLASInt n_,n2,one=1;
925: PetscRandom rand;
926: const PetscReal coeff[5] = { 9.92063492063492e-06, 9.94131285136576e-11, /* backward error function */
927: 2.22819456055356e-16, 1.69079293431187e-22, 8.82996160201868e-36 };
928: const PetscReal theta[5] = { 1.495585217958292e-002, /* m = 3 */
929: 2.539398330063230e-001, /* m = 5 */
930: 9.504178996162932e-001, /* m = 7 */
931: 2.097847961257068e+000, /* m = 9 */
932: 5.371920351148152e+000 }; /* m = 13 */
935: *s = 0;
936: *m = 13;
937: PetscBLASIntCast(n,&n_);
938: PetscRandomCreate(PETSC_COMM_SELF,&rand);
939: d4 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[2],&n_,rwork),1.0/4.0);
940: if (d4==0.0) { /* safeguard for the case A = 0 */
941: *m = 3;
942: goto done;
943: }
944: d6 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[3],&n_,rwork),1.0/6.0);
945: PetscLogFlops(2.0*n*n);
946: eta1 = PetscMax(d4,d6);
947: if (eta1<=theta[0] && !ell(n_,A,coeff[0],3,work,rand)) {
948: *m = 3;
949: goto done;
950: }
951: if (eta1<=theta[1] && !ell(n_,A,coeff[1],5,work,rand)) {
952: *m = 5;
953: goto done;
954: }
955: if (n<SMALLN) {
956: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[2],&n_,&szero,work,&n_));
957: d8 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/8.0);
958: PetscLogFlops(2.0*n*n*n+1.0*n*n);
959: } else {
960: d8 = PetscPowReal(normAm(n_,Apowers[2],2,work,rand),1.0/8.0);
961: }
962: eta3 = PetscMax(d6,d8);
963: if (eta3<=theta[2] && !ell(n_,A,coeff[2],7,work,rand)) {
964: *m = 7;
965: goto done;
966: }
967: if (eta3<=theta[3] && !ell(n_,A,coeff[3],9,work,rand)) {
968: *m = 9;
969: goto done;
970: }
971: if (n<SMALLN) {
972: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[3],&n_,&szero,work,&n_));
973: d10 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/10.0);
974: PetscLogFlops(2.0*n*n*n+1.0*n*n);
975: } else {
976: d10 = PetscPowReal(normAm(n_,Apowers[1],5,work,rand),1.0/10.0);
977: }
978: eta4 = PetscMax(d8,d10);
979: eta5 = PetscMin(eta3,eta4);
980: *s = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(eta5/theta[4])/PetscLogReal(2.0)),0);
981: if (*s) {
982: Ascaled = work+3*n*n;
983: n2 = n_*n_;
984: PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,A,&one,Ascaled,&one));
985: sfactor = PetscPowRealInt(2.0,-(*s));
986: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&sfactor,Ascaled,&one));
987: PetscLogFlops(1.0*n*n);
988: } else Ascaled = A;
989: *s += ell(n_,Ascaled,coeff[4],13,work,rand);
990: done:
991: PetscRandomDestroy(&rand);
992: return(0);
993: }
995: /*
996: * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
997: *
998: * N. J. Higham, "The scaling and squaring method for the matrix exponential
999: * revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
1000: */
1001: PetscErrorCode FNEvaluateFunctionMat_Exp_Higham(FN fn,Mat A,Mat B)
1002: {
1003: #if defined(PETSC_MISSING_LAPACK_GESV) || defined(SLEPC_MISSING_LAPACK_LANGE)
1005: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"GESV/LANGE - Lapack routines are unavailable");
1006: #else
1007: PetscErrorCode ierr;
1008: PetscBLASInt n_,n2,*ipiv,info,one=1;
1009: PetscInt n,m,j,s;
1010: PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
1011: PetscScalar *Aa,*Ba,*Apowers[5],*Q,*P,*W,*work,*aux;
1012: const PetscScalar *c;
1013: const PetscScalar c3[4] = { 120, 60, 12, 1 };
1014: const PetscScalar c5[6] = { 30240, 15120, 3360, 420, 30, 1 };
1015: const PetscScalar c7[8] = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
1016: const PetscScalar c9[10] = { 17643225600, 8821612800, 2075673600, 302702400, 30270240,
1017: 2162160, 110880, 3960, 90, 1 };
1018: const PetscScalar c13[14] = { 64764752532480000, 32382376266240000, 7771770303897600,
1019: 1187353796428800, 129060195264000, 10559470521600,
1020: 670442572800, 33522128640, 1323241920,
1021: 40840800, 960960, 16380, 182, 1 };
1024: MatDenseGetArray(A,&Aa);
1025: MatDenseGetArray(B,&Ba);
1026: MatGetSize(A,&n,NULL);
1027: PetscBLASIntCast(n,&n_);
1028: n2 = n_*n_;
1029: PetscMalloc2(8*n*n,&work,n,&ipiv);
1031: /* Matrix powers */
1032: Apowers[0] = work; /* Apowers[0] = A */
1033: Apowers[1] = Apowers[0] + n*n; /* Apowers[1] = A^2 */
1034: Apowers[2] = Apowers[1] + n*n; /* Apowers[2] = A^4 */
1035: Apowers[3] = Apowers[2] + n*n; /* Apowers[3] = A^6 */
1036: Apowers[4] = Apowers[3] + n*n; /* Apowers[4] = A^8 */
1038: PetscMemcpy(Apowers[0],Aa,n2*sizeof(PetscScalar));
1039: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,Apowers[0],&n_,&szero,Apowers[1],&n_));
1040: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[1],&n_,&szero,Apowers[2],&n_));
1041: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[2],&n_,&szero,Apowers[3],&n_));
1042: PetscLogFlops(6.0*n*n*n);
1044: /* Compute scaling parameter and order of Pade approximant */
1045: expm_params(n,Apowers,&s,&m,Apowers[4]);
1047: if (s) { /* rescale */
1048: for (j=0;j<4;j++) {
1049: scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
1050: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&scale,Apowers[j],&one));
1051: }
1052: PetscLogFlops(4.0*n*n);
1053: }
1055: /* Evaluate the Pade approximant */
1056: switch (m) {
1057: case 3: c = c3; break;
1058: case 5: c = c5; break;
1059: case 7: c = c7; break;
1060: case 9: c = c9; break;
1061: case 13: c = c13; break;
1062: default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
1063: }
1064: P = Ba;
1065: Q = Apowers[4] + n*n;
1066: W = Q + n*n;
1067: switch (m) {
1068: case 3:
1069: case 5:
1070: case 7:
1071: case 9:
1072: if (m==9) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[3],&n_,&szero,Apowers[4],&n_));
1073: PetscMemzero(P,n2*sizeof(PetscScalar));
1074: PetscMemzero(Q,n2*sizeof(PetscScalar));
1075: for (j=0;j<n;j++) {
1076: P[j+j*n] = c[1];
1077: Q[j+j*n] = c[0];
1078: }
1079: for (j=m;j>=3;j-=2) {
1080: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j],Apowers[(j+1)/2-1],&one,P,&one));
1081: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j-1],Apowers[(j+1)/2-1],&one,Q,&one));
1082: PetscLogFlops(4.0*n*n);
1083: }
1084: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,P,&n_,&szero,W,&n_));
1085: PetscLogFlops(2.0*n*n*n);
1086: SWAP(P,W,aux);
1087: break;
1088: case 13:
1089: /* P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
1090: + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I) */
1091: PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,P,&one));
1092: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&c[13],P,&one));
1093: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[11],Apowers[2],&one,P,&one));
1094: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[9],Apowers[1],&one,P,&one));
1095: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,P,&n_,&szero,W,&n_));
1096: PetscLogFlops(5.0*n*n+2.0*n*n*n);
1097: PetscMemzero(P,n2*sizeof(PetscScalar));
1098: for (j=0;j<n;j++) P[j+j*n] = c[1];
1099: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[7],Apowers[3],&one,P,&one));
1100: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[5],Apowers[2],&one,P,&one));
1101: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[3],Apowers[1],&one,P,&one));
1102: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,P,&one,W,&one));
1103: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,W,&n_,&szero,P,&n_));
1104: PetscLogFlops(7.0*n*n+2.0*n*n*n);
1105: /* Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
1106: + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I */
1107: PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,Q,&one));
1108: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&c[12],Q,&one));
1109: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[10],Apowers[2],&one,Q,&one));
1110: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[8],Apowers[1],&one,Q,&one));
1111: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,Q,&n_,&szero,W,&n_));
1112: PetscLogFlops(5.0*n*n+2.0*n*n*n);
1113: PetscMemzero(Q,n2*sizeof(PetscScalar));
1114: for (j=0;j<n;j++) Q[j+j*n] = c[0];
1115: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[6],Apowers[3],&one,Q,&one));
1116: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[4],Apowers[2],&one,Q,&one));
1117: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[2],Apowers[1],&one,Q,&one));
1118: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,W,&one,Q,&one));
1119: PetscLogFlops(7.0*n*n);
1120: break;
1121: default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
1122: }
1123: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&smone,P,&one,Q,&one));
1124: PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n_,&n_,Q,&n_,ipiv,P,&n_,&info));
1125: SlepcCheckLapackInfo("gesv",info);
1126: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&stwo,P,&one));
1127: for (j=0;j<n;j++) P[j+j*n] += 1.0;
1128: PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n);
1130: /* Squaring */
1131: for (j=1;j<=s;j++) {
1132: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,P,&n_,P,&n_,&szero,W,&n_));
1133: SWAP(P,W,aux);
1134: }
1135: if (P!=Ba) { PetscMemcpy(Ba,P,n2*sizeof(PetscScalar)); }
1136: PetscLogFlops(2.0*n*n*n*s);
1138: PetscFree2(work,ipiv);
1139: MatDenseRestoreArray(A,&Aa);
1140: MatDenseRestoreArray(B,&Ba);
1141: return(0);
1142: #endif
1143: }
1145: PetscErrorCode FNView_Exp(FN fn,PetscViewer viewer)
1146: {
1148: PetscBool isascii;
1149: char str[50];
1150: const char *methodname[] = {
1151: "scaling & squaring, [m/m] Pade approximant (Higham)",
1152: "scaling & squaring, [6/6] Pade approximant",
1153: "scaling & squaring, subdiagonal Pade approximant"
1154: };
1155: const int nmeth=sizeof(methodname)/sizeof(methodname[0]);
1158: PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&isascii);
1159: if (isascii) {
1160: if (fn->beta==(PetscScalar)1.0) {
1161: if (fn->alpha==(PetscScalar)1.0) {
1162: PetscViewerASCIIPrintf(viewer," Exponential: exp(x)\n");
1163: } else {
1164: SlepcSNPrintfScalar(str,50,fn->alpha,PETSC_TRUE);
1165: PetscViewerASCIIPrintf(viewer," Exponential: exp(%s*x)\n",str);
1166: }
1167: } else {
1168: SlepcSNPrintfScalar(str,50,fn->beta,PETSC_TRUE);
1169: if (fn->alpha==(PetscScalar)1.0) {
1170: PetscViewerASCIIPrintf(viewer," Exponential: %s*exp(x)\n",str);
1171: } else {
1172: PetscViewerASCIIPrintf(viewer," Exponential: %s",str);
1173: PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
1174: SlepcSNPrintfScalar(str,50,fn->alpha,PETSC_TRUE);
1175: PetscViewerASCIIPrintf(viewer,"*exp(%s*x)\n",str);
1176: PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
1177: }
1178: }
1179: if (fn->method<nmeth) {
1180: PetscViewerASCIIPrintf(viewer," computing matrix functions with: %s\n",methodname[fn->method]);
1181: }
1182: }
1183: return(0);
1184: }
1186: PETSC_EXTERN PetscErrorCode FNCreate_Exp(FN fn)
1187: {
1189: fn->ops->evaluatefunction = FNEvaluateFunction_Exp;
1190: fn->ops->evaluatederivative = FNEvaluateDerivative_Exp;
1191: fn->ops->evaluatefunctionmat[0] = FNEvaluateFunctionMat_Exp_Higham;
1192: fn->ops->evaluatefunctionmat[1] = FNEvaluateFunctionMat_Exp_Pade;
1193: fn->ops->evaluatefunctionmat[2] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa;
1194: fn->ops->view = FNView_Exp;
1195: return(0);
1196: }