1 | ///////////////////////////////////////////////////////////////////////////////// |
---|
2 | //// |
---|
3 | //// Linear algebra operations for the sba package |
---|
4 | //// Copyright (C) 2004 Manolis Lourakis (lourakis@ics.forth.gr) |
---|
5 | //// Institute of Computer Science, Foundation for Research & Technology - Hellas |
---|
6 | //// Heraklion, Crete, Greece. |
---|
7 | //// |
---|
8 | //// This program is free software; you can redistribute it and/or modify |
---|
9 | //// it under the terms of the GNU General Public License as published by |
---|
10 | //// the Free Software Foundation; either version 2 of the License, or |
---|
11 | //// (at your option) any later version. |
---|
12 | //// |
---|
13 | //// This program is distributed in the hope that it will be useful, |
---|
14 | //// but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
15 | //// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
16 | //// GNU General Public License for more details. |
---|
17 | //// |
---|
18 | /////////////////////////////////////////////////////////////////////////////////// |
---|
19 | |
---|
20 | #include <stdio.h> |
---|
21 | #include <stdlib.h> |
---|
22 | #include <string.h> |
---|
23 | #include <math.h> |
---|
24 | #include <float.h> |
---|
25 | |
---|
26 | /* inline */ |
---|
27 | #ifdef _MSC_VER |
---|
28 | #define inline __inline //MSVC |
---|
29 | #elif !defined(__GNUC__) |
---|
30 | #define inline //other than MSVC, GCC: define empty |
---|
31 | #endif |
---|
32 | |
---|
33 | #include "sba.h" |
---|
34 | |
---|
35 | #ifdef SBA_APPEND_UNDERSCORE_SUFFIX |
---|
36 | #define F77_FUNC(func) func ## _ |
---|
37 | #else |
---|
38 | #define F77_FUNC(func) func |
---|
39 | #endif /* SBA_APPEND_UNDERSCORE_SUFFIX */ |
---|
40 | |
---|
41 | |
---|
42 | /* declarations of LAPACK routines employed */ |
---|
43 | |
---|
44 | /* QR decomposition */ |
---|
45 | extern int F77_FUNC(dgeqrf)(int *m, int *n, double *a, int *lda, double *tau, double *work, int *lwork, int *info); |
---|
46 | extern int F77_FUNC(dorgqr)(int *m, int *n, int *k, double *a, int *lda, double *tau, double *work, int *lwork, int *info); |
---|
47 | |
---|
48 | /* solution of triangular system */ |
---|
49 | extern int F77_FUNC(dtrtrs)(char *uplo, char *trans, char *diag, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, int *info); |
---|
50 | |
---|
51 | /* cholesky decomposition, matrix inversion */ |
---|
52 | extern int F77_FUNC(dpotf2)(char *uplo, int *n, double *a, int *lda, int *info); |
---|
53 | extern int F77_FUNC(dpotrf)(char *uplo, int *n, double *a, int *lda, int *info); /* block version of dpotf2 */ |
---|
54 | extern int F77_FUNC(dpotri)(char *uplo, int *n, double *a, int *lda, int *info); |
---|
55 | |
---|
56 | /* LU decomposition, linear system solution and matrix inversion */ |
---|
57 | extern int F77_FUNC(dgetrf)(int *m, int *n, double *a, int *lda, int *ipiv, int *info); |
---|
58 | extern int F77_FUNC(dgetrs)(char *trans, int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info); |
---|
59 | extern int F77_FUNC(dgetri)(int *n, double *a, int *lda, int *ipiv, double *work, int *lwork, int *info); |
---|
60 | |
---|
61 | /* SVD */ |
---|
62 | extern int F77_FUNC(dgesvd)(char *jobu, char *jobvt, int *m, int *n, |
---|
63 | double *a, int *lda, double *s, double *u, int *ldu, |
---|
64 | double *vt, int *ldvt, double *work, int *lwork, |
---|
65 | int *info); |
---|
66 | |
---|
67 | /* lapack 3.0 routine, faster than dgesvd() */ |
---|
68 | extern int F77_FUNC(dgesdd)(char *jobz, int *m, int *n, double *a, int *lda, |
---|
69 | double *s, double *u, int *ldu, double *vt, int *ldvt, |
---|
70 | double *work, int *lwork, int *iwork, int *info); |
---|
71 | |
---|
72 | |
---|
73 | /* Bunch-Kaufman factorization of a real symmetric matrix A and solution of linear systems */ |
---|
74 | extern int F77_FUNC(dsytrf)(char *uplo, int *n, double *a, int *lda, int *ipiv, double *work, int *lwork, int *info); |
---|
75 | extern int F77_FUNC(dsytrs)(char *uplo, int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info); |
---|
76 | |
---|
77 | |
---|
78 | /* |
---|
79 | * This function returns the solution of Ax = b |
---|
80 | * |
---|
81 | * The function is based on QR decomposition with explicit computation of Q: |
---|
82 | * If A=Q R with Q orthogonal and R upper triangular, the linear system becomes |
---|
83 | * Q R x = b or R x = Q^T b. |
---|
84 | * |
---|
85 | * A is mxm, b is mx1. Argument iscolmaj specifies whether A is |
---|
86 | * stored in column or row major order. Note that if iscolmaj==1 |
---|
87 | * this function modifies A! |
---|
88 | * |
---|
89 | * The function returns 0 in case of error, 1 if successfull |
---|
90 | * |
---|
91 | * This function is often called repetitively to solve problems of identical |
---|
92 | * dimensions. To avoid repetitive malloc's and free's, allocated memory is |
---|
93 | * retained between calls and free'd-malloc'ed when not of the appropriate size. |
---|
94 | */ |
---|
95 | int sba_Axb_QR(double *A, double *B, double *x, int m, int iscolmaj) |
---|
96 | { |
---|
97 | static double *buf=NULL; |
---|
98 | static int buf_sz=0; |
---|
99 | |
---|
100 | double *a, *qtb, *r, *tau, *work; |
---|
101 | int a_sz, qtb_sz, r_sz, tau_sz, tot_sz; |
---|
102 | register int i, j; |
---|
103 | int info, worksz, nrhs=1; |
---|
104 | register double sum; |
---|
105 | |
---|
106 | /* calculate required memory size */ |
---|
107 | a_sz=(iscolmaj)? 0 : m*m; |
---|
108 | qtb_sz=m; |
---|
109 | r_sz=m*m; /* only the upper triangular part really needed */ |
---|
110 | tau_sz=m; |
---|
111 | worksz=3*m; /* this is probably too much */ |
---|
112 | tot_sz=a_sz + qtb_sz + r_sz + tau_sz + worksz; |
---|
113 | |
---|
114 | if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ |
---|
115 | if(buf) free(buf); /* free previously allocated memory */ |
---|
116 | |
---|
117 | buf_sz=tot_sz; |
---|
118 | buf=(double *)malloc(buf_sz*sizeof(double)); |
---|
119 | if(!buf){ |
---|
120 | fprintf(stderr, "memory allocation in sba_Axb_QR() failed!\n"); |
---|
121 | exit(1); |
---|
122 | } |
---|
123 | } |
---|
124 | |
---|
125 | if(!iscolmaj){ |
---|
126 | a=buf; |
---|
127 | /* store A (column major!) into a */ |
---|
128 | for(i=0; i<m; i++) |
---|
129 | for(j=0; j<m; j++) |
---|
130 | a[i+j*m]=A[i*m+j]; |
---|
131 | } |
---|
132 | else a=A; /* no copying required */ |
---|
133 | |
---|
134 | qtb=buf+a_sz; |
---|
135 | r=qtb+qtb_sz; |
---|
136 | tau=r+r_sz; |
---|
137 | work=tau+tau_sz; |
---|
138 | |
---|
139 | /* QR decomposition of A */ |
---|
140 | F77_FUNC(dgeqrf)((int *)&m, (int *)&m, a, (int *)&m, tau, work, (int *)&worksz, (int *)&info); |
---|
141 | /* error treatment */ |
---|
142 | if(info!=0){ |
---|
143 | if(info<0){ |
---|
144 | fprintf(stderr, "LAPACK error: illegal value for argument %d of dgeqrf in sba_Axb_QR()\n", -info); |
---|
145 | exit(1); |
---|
146 | } |
---|
147 | else{ |
---|
148 | fprintf(stderr, "Unknown LAPACK error %d for dgeqrf in sba_Axb_QR()\n", info); |
---|
149 | return 0; |
---|
150 | } |
---|
151 | } |
---|
152 | |
---|
153 | /* R is now stored in the upper triangular part of a; copy it in r so that dorgqr() below won't destroy it */ |
---|
154 | for(i=0; i<r_sz; i++) |
---|
155 | r[i]=a[i]; |
---|
156 | |
---|
157 | /* compute Q using the elementary reflectors computed by the above decomposition */ |
---|
158 | F77_FUNC(dorgqr)((int *)&m, (int *)&m, (int *)&m, a, (int *)&m, tau, work, (int *)&worksz, (int *)&info); |
---|
159 | if(info!=0){ |
---|
160 | if(info<0){ |
---|
161 | fprintf(stderr, "LAPACK error: illegal value for argument %d of dorgqr in sba_Axb_QR()\n", -info); |
---|
162 | exit(1); |
---|
163 | } |
---|
164 | else{ |
---|
165 | fprintf(stderr, "Unknown LAPACK error (%d) in sba_Axb_QR()\n", info); |
---|
166 | return 0; |
---|
167 | } |
---|
168 | } |
---|
169 | |
---|
170 | /* Q is now in a; compute Q^T b in qtb */ |
---|
171 | for(i=0; i<m; i++){ |
---|
172 | for(j=0, sum=0.0; j<m; j++) |
---|
173 | sum+=a[i*m+j]*B[j]; |
---|
174 | qtb[i]=sum; |
---|
175 | } |
---|
176 | |
---|
177 | /* solve the linear system R x = Q^t b */ |
---|
178 | F77_FUNC(dtrtrs)("U", "N", "N", (int *)&m, (int *)&nrhs, r, (int *)&m, qtb, (int *)&m, &info); |
---|
179 | /* error treatment */ |
---|
180 | if(info!=0){ |
---|
181 | if(info<0){ |
---|
182 | fprintf(stderr, "LAPACK error: illegal value for argument %d of dtrtrs in sba_Axb_QR()\n", -info); |
---|
183 | exit(1); |
---|
184 | } |
---|
185 | else{ |
---|
186 | fprintf(stderr, "LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in sba_Axb_QR()\n", info); |
---|
187 | return 0; |
---|
188 | } |
---|
189 | } |
---|
190 | |
---|
191 | /* copy the result in x */ |
---|
192 | for(i=0; i<m; i++) |
---|
193 | x[i]=qtb[i]; |
---|
194 | |
---|
195 | return 1; |
---|
196 | } |
---|
197 | |
---|
198 | /* |
---|
199 | * This function returns the solution of Ax = b |
---|
200 | * |
---|
201 | * The function is based on QR decomposition without computation of Q: |
---|
202 | * If A=Q R with Q orthogonal and R upper triangular, the linear system becomes |
---|
203 | * (A^T A) x = A^T b or (R^T Q^T Q R) x = A^T b or (R^T R) x = A^T b. |
---|
204 | * This amounts to solving R^T y = A^T b for y and then R x = y for x |
---|
205 | * Note that Q does not need to be explicitly computed |
---|
206 | * |
---|
207 | * A is mxm, b is mx1. Argument iscolmaj specifies whether A is |
---|
208 | * stored in column or row major order. Note that if iscolmaj==1 |
---|
209 | * this function modifies A! |
---|
210 | * |
---|
211 | * The function returns 0 in case of error, 1 if successfull |
---|
212 | * |
---|
213 | * This function is often called repetitively to solve problems of identical |
---|
214 | * dimensions. To avoid repetitive malloc's and free's, allocated memory is |
---|
215 | * retained between calls and free'd-malloc'ed when not of the appropriate size. |
---|
216 | */ |
---|
217 | int sba_Axb_QRnoQ(double *A, double *B, double *x, int m, int iscolmaj) |
---|
218 | { |
---|
219 | static double *buf=NULL; |
---|
220 | static int buf_sz=0; |
---|
221 | |
---|
222 | double *a, *atb, *tau, *work; |
---|
223 | int a_sz, atb_sz, tau_sz, tot_sz; |
---|
224 | register int i, j; |
---|
225 | int info, worksz, nrhs=1; |
---|
226 | register double sum; |
---|
227 | |
---|
228 | /* calculate required memory size */ |
---|
229 | a_sz=(iscolmaj)? 0 : m*m; |
---|
230 | atb_sz=m; |
---|
231 | tau_sz=m; |
---|
232 | worksz=3*m; /* this is probably too much */ |
---|
233 | tot_sz=a_sz + atb_sz + tau_sz + worksz; |
---|
234 | |
---|
235 | if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ |
---|
236 | if(buf) free(buf); /* free previously allocated memory */ |
---|
237 | |
---|
238 | buf_sz=tot_sz; |
---|
239 | buf=(double *)malloc(buf_sz*sizeof(double)); |
---|
240 | if(!buf){ |
---|
241 | fprintf(stderr, "memory allocation in sba_Axb_QRnoQ() failed!\n"); |
---|
242 | exit(1); |
---|
243 | } |
---|
244 | } |
---|
245 | |
---|
246 | if(!iscolmaj){ |
---|
247 | a=buf; |
---|
248 | /* store A (column major!) into a */ |
---|
249 | for(i=0; i<m; i++) |
---|
250 | for(j=0; j<m; j++) |
---|
251 | a[i+j*m]=A[i*m+j]; |
---|
252 | } |
---|
253 | else a=A; /* no copying required */ |
---|
254 | |
---|
255 | atb=buf+a_sz; |
---|
256 | tau=atb+atb_sz; |
---|
257 | work=tau+tau_sz; |
---|
258 | |
---|
259 | /* compute A^T b in atb */ |
---|
260 | for(i=0; i<m; i++){ |
---|
261 | for(j=0, sum=0.0; j<m; j++) |
---|
262 | sum+=a[i*m+j]*B[j]; |
---|
263 | atb[i]=sum; |
---|
264 | } |
---|
265 | |
---|
266 | /* QR decomposition of A */ |
---|
267 | F77_FUNC(dgeqrf)((int *)&m, (int *)&m, a, (int *)&m, tau, work, (int *)&worksz, (int *)&info); |
---|
268 | /* error treatment */ |
---|
269 | if(info!=0){ |
---|
270 | if(info<0){ |
---|
271 | fprintf(stderr, "LAPACK error: illegal value for argument %d of dgeqrf in sba_Axb_QRnoQ()\n", -info); |
---|
272 | exit(1); |
---|
273 | } |
---|
274 | else{ |
---|
275 | fprintf(stderr, "Unknown LAPACK error %d for dgeqrf in sba_Axb_QRnoQ()\n", info); |
---|
276 | return 0; |
---|
277 | } |
---|
278 | } |
---|
279 | |
---|
280 | /* R is stored in the upper triangular part of a */ |
---|
281 | |
---|
282 | /* solve the linear system R^T y = A^t b */ |
---|
283 | F77_FUNC(dtrtrs)("U", "T", "N", (int *)&m, (int *)&nrhs, a, (int *)&m, atb, (int *)&m, &info); |
---|
284 | /* error treatment */ |
---|
285 | if(info!=0){ |
---|
286 | if(info<0){ |
---|
287 | fprintf(stderr, "LAPACK error: illegal value for argument %d of dtrtrs in sba_Axb_QRnoQ()\n", -info); |
---|
288 | exit(1); |
---|
289 | } |
---|
290 | else{ |
---|
291 | fprintf(stderr, "LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in sba_Axb_QRnoQ()\n", info); |
---|
292 | return 0; |
---|
293 | } |
---|
294 | } |
---|
295 | |
---|
296 | /* solve the linear system R x = y */ |
---|
297 | F77_FUNC(dtrtrs)("U", "N", "N", (int *)&m, (int *)&nrhs, a, (int *)&m, atb, (int *)&m, &info); |
---|
298 | /* error treatment */ |
---|
299 | if(info!=0){ |
---|
300 | if(info<0){ |
---|
301 | fprintf(stderr, "LAPACK error: illegal value for argument %d of dtrtrs in sba_Axb_QRnoQ()\n", -info); |
---|
302 | exit(1); |
---|
303 | } |
---|
304 | else{ |
---|
305 | fprintf(stderr, "LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in sba_Axb_QRnoQ()\n", info); |
---|
306 | return 0; |
---|
307 | } |
---|
308 | } |
---|
309 | |
---|
310 | /* copy the result in x */ |
---|
311 | for(i=0; i<m; i++) |
---|
312 | x[i]=atb[i]; |
---|
313 | |
---|
314 | return 1; |
---|
315 | } |
---|
316 | |
---|
317 | /* |
---|
318 | * This function returns the solution of Ax=b |
---|
319 | * |
---|
320 | * The function assumes that A is symmetric & positive definite and employs |
---|
321 | * the Cholesky decomposition: |
---|
322 | * If A=U^T U with U upper triangular, the system to be solved becomes |
---|
323 | * (U^T U) x = b |
---|
324 | * This amount to solving U^T y = b for y and then U x = y for x |
---|
325 | * |
---|
326 | * A is mxm, b is mx1. Argument iscolmaj specifies whether A is |
---|
327 | * stored in column or row major order. Note that if iscolmaj==1 |
---|
328 | * this function modifies A and B! |
---|
329 | * |
---|
330 | * The function returns 0 in case of error, 1 if successfull |
---|
331 | * |
---|
332 | * This function is often called repetitively to solve problems of identical |
---|
333 | * dimensions. To avoid repetitive malloc's and free's, allocated memory is |
---|
334 | * retained between calls and free'd-malloc'ed when not of the appropriate size. |
---|
335 | */ |
---|
336 | int sba_Axb_Chol(double *A, double *B, double *x, int m, int iscolmaj) |
---|
337 | { |
---|
338 | static double *buf=NULL; |
---|
339 | static int buf_sz=0; |
---|
340 | |
---|
341 | double *a, *b; |
---|
342 | int a_sz, b_sz, tot_sz; |
---|
343 | register int i, j; |
---|
344 | int info, nrhs=1; |
---|
345 | |
---|
346 | /* calculate required memory size */ |
---|
347 | a_sz=(iscolmaj)? 0 : m*m; |
---|
348 | b_sz=(iscolmaj)? 0 : m; |
---|
349 | tot_sz=a_sz + b_sz; |
---|
350 | |
---|
351 | if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ |
---|
352 | if(buf) free(buf); /* free previously allocated memory */ |
---|
353 | |
---|
354 | buf_sz=tot_sz; |
---|
355 | buf=(double *)malloc(buf_sz*sizeof(double)); |
---|
356 | if(!buf){ |
---|
357 | fprintf(stderr, "memory allocation in sba_Axb_Chol() failed!\n"); |
---|
358 | exit(1); |
---|
359 | } |
---|
360 | } |
---|
361 | |
---|
362 | if(!iscolmaj){ |
---|
363 | a=buf; |
---|
364 | b=a+a_sz; |
---|
365 | |
---|
366 | /* store A (column major!) into a anb B into b */ |
---|
367 | for(i=0; i<m; i++){ |
---|
368 | for(j=0; j<m; j++) |
---|
369 | a[i+j*m]=A[i*m+j]; |
---|
370 | |
---|
371 | b[i]=B[i]; |
---|
372 | } |
---|
373 | } |
---|
374 | else{ /* no copying is necessary */ |
---|
375 | a=A; |
---|
376 | b=B; |
---|
377 | } |
---|
378 | |
---|
379 | /* Cholesky decomposition of A */ |
---|
380 | //F77_FUNC(dpotf2)("U", (int *)&m, a, (int *)&m, (int *)&info); |
---|
381 | F77_FUNC(dpotrf)("U", (int *)&m, a, (int *)&m, (int *)&info); |
---|
382 | /* error treatment */ |
---|
383 | if(info!=0){ |
---|
384 | if(info<0){ |
---|
385 | fprintf(stderr, "LAPACK error: illegal value for argument %d of dpotf2/dpotrf in sba_Axb_Chol()\n", -info); |
---|
386 | exit(1); |
---|
387 | } |
---|
388 | else{ |
---|
389 | fprintf(stderr, "LAPACK error: the leading minor of order %d is not positive definite,\nthe factorization could not be completed for dpotf2/dpotrf in sba_Axb_Chol()\n", info); |
---|
390 | return 0; |
---|
391 | } |
---|
392 | } |
---|
393 | |
---|
394 | /* solve the linear system U^T y = b */ |
---|
395 | F77_FUNC(dtrtrs)("U", "T", "N", (int *)&m, (int *)&nrhs, a, (int *)&m, b, (int *)&m, &info); |
---|
396 | /* error treatment */ |
---|
397 | if(info!=0){ |
---|
398 | if(info<0){ |
---|
399 | fprintf(stderr, "LAPACK error: illegal value for argument %d of dtrtrs in sba_Axb_Chol()\n", -info); |
---|
400 | exit(1); |
---|
401 | } |
---|
402 | else{ |
---|
403 | fprintf(stderr, "LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in sba_Axb_Chol()\n", info); |
---|
404 | return 0; |
---|
405 | } |
---|
406 | } |
---|
407 | |
---|
408 | /* solve the linear system U x = y */ |
---|
409 | F77_FUNC(dtrtrs)("U", "N", "N", (int *)&m, (int *)&nrhs, a, (int *)&m, b, (int *)&m, &info); |
---|
410 | /* error treatment */ |
---|
411 | if(info!=0){ |
---|
412 | if(info<0){ |
---|
413 | fprintf(stderr, "LAPACK error: illegal value for argument %d of dtrtrs in sba_Axb_Chol()\n", -info); |
---|
414 | exit(1); |
---|
415 | } |
---|
416 | else{ |
---|
417 | fprintf(stderr, "LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in sba_Axb_Chol()\n", info); |
---|
418 | return 0; |
---|
419 | } |
---|
420 | } |
---|
421 | |
---|
422 | /* copy the result in x */ |
---|
423 | for(i=0; i<m; i++) |
---|
424 | x[i]=b[i]; |
---|
425 | |
---|
426 | return 1; |
---|
427 | } |
---|
428 | |
---|
429 | /* |
---|
430 | * This function returns the solution of Ax = b |
---|
431 | * |
---|
432 | * The function employs LU decomposition: |
---|
433 | * If A=L U with L lower and U upper triangular, then the original system |
---|
434 | * amounts to solving |
---|
435 | * L y = b, U x = y |
---|
436 | * |
---|
437 | * A is mxm, b is mx1. Argument iscolmaj specifies whether A is |
---|
438 | * stored in column or row major order. Note that if iscolmaj==1 |
---|
439 | * this function modifies A and B! |
---|
440 | * |
---|
441 | * The function returns 0 in case of error, |
---|
442 | * 1 if successfull |
---|
443 | * |
---|
444 | * This function is often called repetitively to solve problems of identical |
---|
445 | * dimensions. To avoid repetitive malloc's and free's, allocated memory is |
---|
446 | * retained between calls and free'd-malloc'ed when not of the appropriate size. |
---|
447 | */ |
---|
448 | int sba_Axb_LU(double *A, double *B, double *x, int m, int iscolmaj) |
---|
449 | { |
---|
450 | static double *buf=NULL; |
---|
451 | static int buf_sz=0; |
---|
452 | |
---|
453 | int a_sz, ipiv_sz, b_sz, tot_sz; |
---|
454 | register int i, j; |
---|
455 | int info, *ipiv, nrhs=1; |
---|
456 | double *a, *b; |
---|
457 | |
---|
458 | /* calculate required memory size */ |
---|
459 | ipiv_sz=m; |
---|
460 | a_sz=(iscolmaj)? 0 : m*m; |
---|
461 | b_sz=(iscolmaj)? 0 : m; |
---|
462 | tot_sz=ipiv_sz*sizeof(int) + (a_sz + b_sz)*sizeof(double); |
---|
463 | |
---|
464 | if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ |
---|
465 | if(buf) free(buf); /* free previously allocated memory */ |
---|
466 | |
---|
467 | buf_sz=tot_sz; |
---|
468 | buf=(double *)malloc(buf_sz); |
---|
469 | if(!buf){ |
---|
470 | fprintf(stderr, "memory allocation in sba_Axb_LU() failed!\n"); |
---|
471 | exit(1); |
---|
472 | } |
---|
473 | } |
---|
474 | |
---|
475 | ipiv=(int *)buf; |
---|
476 | if(!iscolmaj){ |
---|
477 | a=(double *)(ipiv + ipiv_sz); |
---|
478 | b=a+a_sz; |
---|
479 | |
---|
480 | /* store A (column major!) into a and B into b */ |
---|
481 | for(i=0; i<m; i++){ |
---|
482 | for(j=0; j<m; j++) |
---|
483 | a[i+j*m]=A[i*m+j]; |
---|
484 | |
---|
485 | b[i]=B[i]; |
---|
486 | } |
---|
487 | } |
---|
488 | else{ /* no copying is necessary */ |
---|
489 | a=A; |
---|
490 | b=B; |
---|
491 | } |
---|
492 | |
---|
493 | /* LU decomposition for A */ |
---|
494 | F77_FUNC(dgetrf)((int *)&m, (int *)&m, a, (int *)&m, ipiv, (int *)&info); |
---|
495 | if(info!=0){ |
---|
496 | if(info<0){ |
---|
497 | fprintf(stderr, "argument %d of dgetrf illegal in sba_Axb_LU()\n", -info); |
---|
498 | exit(1); |
---|
499 | } |
---|
500 | else{ |
---|
501 | fprintf(stderr, "singular matrix A for dgetrf in sba_Axb_LU()\n"); |
---|
502 | return 0; |
---|
503 | } |
---|
504 | } |
---|
505 | |
---|
506 | /* solve the system with the computed LU */ |
---|
507 | F77_FUNC(dgetrs)("N", (int *)&m, (int *)&nrhs, a, (int *)&m, ipiv, b, (int *)&m, (int *)&info); |
---|
508 | if(info!=0){ |
---|
509 | if(info<0){ |
---|
510 | fprintf(stderr, "argument %d of dgetrs illegal in sba_Axb_LU()\n", -info); |
---|
511 | exit(1); |
---|
512 | } |
---|
513 | else{ |
---|
514 | fprintf(stderr, "unknown error for dgetrs in sba_Axb_LU()\n"); |
---|
515 | return 0; |
---|
516 | } |
---|
517 | } |
---|
518 | |
---|
519 | /* copy the result in x */ |
---|
520 | for(i=0; i<m; i++){ |
---|
521 | x[i]=b[i]; |
---|
522 | } |
---|
523 | |
---|
524 | return 1; |
---|
525 | } |
---|
526 | |
---|
527 | /* |
---|
528 | * This function returns the solution of Ax = b |
---|
529 | * |
---|
530 | * The function is based on SVD decomposition: |
---|
531 | * If A=U D V^T with U, V orthogonal and D diagonal, the linear system becomes |
---|
532 | * (U D V^T) x = b or x=V D^{-1} U^T b |
---|
533 | * Note that V D^{-1} U^T is the pseudoinverse A^+ |
---|
534 | * |
---|
535 | * A is mxm, b is mx1. Argument iscolmaj specifies whether A is |
---|
536 | * stored in column or row major order. Note that if iscolmaj==1 |
---|
537 | * this function modifies A! |
---|
538 | * |
---|
539 | * The function returns 0 in case of error, 1 if successfull |
---|
540 | * |
---|
541 | * This function is often called repetitively to solve problems of identical |
---|
542 | * dimensions. To avoid repetitive malloc's and free's, allocated memory is |
---|
543 | * retained between calls and free'd-malloc'ed when not of the appropriate size. |
---|
544 | */ |
---|
545 | int sba_Axb_SVD(double *A, double *B, double *x, int m, int iscolmaj) |
---|
546 | { |
---|
547 | static double *buf=NULL; |
---|
548 | static int buf_sz=0; |
---|
549 | static double eps=-1.0; |
---|
550 | |
---|
551 | register int i, j; |
---|
552 | double *a, *u, *s, *vt, *work; |
---|
553 | int a_sz, u_sz, s_sz, vt_sz, tot_sz; |
---|
554 | double thresh, one_over_denom; |
---|
555 | register double sum; |
---|
556 | int info, rank, worksz, *iwork, iworksz; |
---|
557 | |
---|
558 | /* calculate required memory size */ |
---|
559 | worksz=-1; // workspace query. Keep in mind that dgesdd requires more memory than dgesvd |
---|
560 | /* note that optimal work size is returned in thresh */ |
---|
561 | F77_FUNC(dgesdd)("A", (int *)&m, (int *)&m, NULL, (int *)&m, NULL, NULL, (int *)&m, NULL, (int *)&m, |
---|
562 | (double *)&thresh, (int *)&worksz, NULL, &info); |
---|
563 | /* F77_FUNC(dgesvd)("A", "A", (int *)&m, (int *)&m, NULL, (int *)&m, NULL, NULL, (int *)&m, NULL, (int *)&m, |
---|
564 | (double *)&thresh, (int *)&worksz, &info); */ |
---|
565 | worksz=(int)thresh; |
---|
566 | iworksz=8*m; |
---|
567 | a_sz=(!iscolmaj)? m*m : 0; |
---|
568 | u_sz=m*m; s_sz=m; vt_sz=m*m; |
---|
569 | |
---|
570 | tot_sz=iworksz*sizeof(int) + (a_sz + u_sz + s_sz + vt_sz + worksz)*sizeof(double); |
---|
571 | |
---|
572 | if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ |
---|
573 | if(buf) free(buf); /* free previously allocated memory */ |
---|
574 | |
---|
575 | buf_sz=tot_sz; |
---|
576 | buf=(double *)malloc(buf_sz); |
---|
577 | if(!buf){ |
---|
578 | fprintf(stderr, "memory allocation in sba_Axb_SVD() failed!\n"); |
---|
579 | exit(1); |
---|
580 | } |
---|
581 | } |
---|
582 | |
---|
583 | iwork=(int *)buf; |
---|
584 | if(!iscolmaj){ |
---|
585 | a=(double *)(iwork+iworksz); |
---|
586 | /* store A (column major!) into a */ |
---|
587 | for(i=0; i<m; i++) |
---|
588 | for(j=0; j<m; j++) |
---|
589 | a[i+j*m]=A[i*m+j]; |
---|
590 | } |
---|
591 | else{ |
---|
592 | a=A; /* no copying required */ |
---|
593 | } |
---|
594 | |
---|
595 | u=((double *)(iwork+iworksz)) + a_sz; |
---|
596 | s=u+u_sz; |
---|
597 | vt=s+s_sz; |
---|
598 | work=vt+vt_sz; |
---|
599 | |
---|
600 | /* SVD decomposition of A */ |
---|
601 | F77_FUNC(dgesdd)("A", (int *)&m, (int *)&m, a, (int *)&m, s, u, (int *)&m, vt, (int *)&m, work, (int *)&worksz, iwork, &info); |
---|
602 | //F77_FUNC(dgesvd)("A", "A", (int *)&m, (int *)&m, a, (int *)&m, s, u, (int *)&m, vt, (int *)&m, work, (int *)&worksz, &info); |
---|
603 | |
---|
604 | /* error treatment */ |
---|
605 | if(info!=0){ |
---|
606 | if(info<0){ |
---|
607 | fprintf(stderr, "LAPACK error: illegal value for argument %d of dgesdd/dgesvd in sba_Axb_SVD()\n", -info); |
---|
608 | exit(1); |
---|
609 | } |
---|
610 | else{ |
---|
611 | fprintf(stderr, "LAPACK error: dgesdd (dbdsdc)/dgesvd (dbdsqr) failed to converge in sba_Axb_SVD() [info=%d]\n", info); |
---|
612 | |
---|
613 | return 0; |
---|
614 | } |
---|
615 | } |
---|
616 | |
---|
617 | if(eps<0.0){ |
---|
618 | double aux; |
---|
619 | |
---|
620 | /* compute machine epsilon. DBL_EPSILON should do also */ |
---|
621 | for(eps=1.0; aux=eps+1.0, aux-1.0>0.0; eps*=0.5) |
---|
622 | ; |
---|
623 | eps*=2.0; |
---|
624 | } |
---|
625 | |
---|
626 | /* compute the pseudoinverse in a */ |
---|
627 | memset(a, 0, m*m*sizeof(double)); /* initialize to zero */ |
---|
628 | for(rank=0, thresh=eps*s[0]; rank<m && s[rank]>thresh; rank++){ |
---|
629 | one_over_denom=1.0/s[rank]; |
---|
630 | |
---|
631 | for(j=0; j<m; j++) |
---|
632 | for(i=0; i<m; i++) |
---|
633 | a[i*m+j]+=vt[rank+i*m]*u[j+rank*m]*one_over_denom; |
---|
634 | } |
---|
635 | |
---|
636 | /* compute A^+ b in x */ |
---|
637 | for(i=0; i<m; i++){ |
---|
638 | for(j=0, sum=0.0; j<m; j++) |
---|
639 | sum+=a[i*m+j]*B[j]; |
---|
640 | x[i]=sum; |
---|
641 | } |
---|
642 | |
---|
643 | return 1; |
---|
644 | } |
---|
645 | |
---|
646 | /* |
---|
647 | * This function returns the solution of Ax = b for a real symmetric matrix A |
---|
648 | * |
---|
649 | * The function is based on Bunch-Kaufman factorization: |
---|
650 | * A is factored as U*D*U^T where U is upper triangular and |
---|
651 | * D symmetric and block diagonal |
---|
652 | * |
---|
653 | * A is mxm, b is mx1. Argument iscolmaj specifies whether A is |
---|
654 | * stored in column or row major order. Note that if iscolmaj==1 |
---|
655 | * this function modifies A and B! |
---|
656 | * |
---|
657 | * The function returns 0 in case of error, |
---|
658 | * 1 if successfull |
---|
659 | * |
---|
660 | * This function is often called repetitively to solve problems of identical |
---|
661 | * dimensions. To avoid repetitive malloc's and free's, allocated memory is |
---|
662 | * retained between calls and free'd-malloc'ed when not of the appropriate size. |
---|
663 | */ |
---|
664 | int sba_Axb_BK(double *A, double *B, double *x, int m, int iscolmaj) |
---|
665 | { |
---|
666 | static double *buf=NULL; |
---|
667 | static int buf_sz=0; |
---|
668 | |
---|
669 | int a_sz, ipiv_sz, b_sz, work_sz, tot_sz; |
---|
670 | register int i, j; |
---|
671 | int info, *ipiv, nrhs=1; |
---|
672 | double *a, *b, *work; |
---|
673 | |
---|
674 | /* calculate required memory size */ |
---|
675 | ipiv_sz=m; |
---|
676 | a_sz=(iscolmaj)? 0 : m*m; |
---|
677 | b_sz=(iscolmaj)? 0 : m; |
---|
678 | work_sz=16*m; /* this is probably too much */ |
---|
679 | tot_sz=ipiv_sz*sizeof(int) + (a_sz + b_sz + work_sz)*sizeof(double); |
---|
680 | |
---|
681 | if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ |
---|
682 | if(buf) free(buf); /* free previously allocated memory */ |
---|
683 | |
---|
684 | buf_sz=tot_sz; |
---|
685 | buf=(double *)malloc(buf_sz); |
---|
686 | if(!buf){ |
---|
687 | fprintf(stderr, "memory allocation in sba_Axb_BK() failed!\n"); |
---|
688 | exit(1); |
---|
689 | } |
---|
690 | } |
---|
691 | |
---|
692 | ipiv=(int *)buf; |
---|
693 | if(!iscolmaj){ |
---|
694 | a=(double *)(ipiv + ipiv_sz); |
---|
695 | b=a+a_sz; |
---|
696 | work=b+b_sz; |
---|
697 | |
---|
698 | /* store A (column major!) into a and B into b */ |
---|
699 | for(i=0; i<m; i++){ |
---|
700 | for(j=0; j<m; j++) |
---|
701 | a[i+j*m]=A[i*m+j]; |
---|
702 | |
---|
703 | b[i]=B[i]; |
---|
704 | } |
---|
705 | } |
---|
706 | else{ /* no copying is necessary */ |
---|
707 | a=A; |
---|
708 | b=B; |
---|
709 | work=(double *)(ipiv + ipiv_sz); |
---|
710 | } |
---|
711 | |
---|
712 | /* factorization for A */ |
---|
713 | F77_FUNC(dsytrf)("U", (int *)&m, a, (int *)&m, ipiv, work, (int *)&work_sz, (int *)&info); |
---|
714 | if(info!=0){ |
---|
715 | if(info<0){ |
---|
716 | fprintf(stderr, "argument %d of dsytrf illegal in sba_Axb_BK()\n", -info); |
---|
717 | exit(1); |
---|
718 | } |
---|
719 | else{ |
---|
720 | fprintf(stderr, "singular block diagonal matrix D for dsytrf in sba_Axb_BK() [D(%d, %d) is zero]\n", info, info); |
---|
721 | return 0; |
---|
722 | } |
---|
723 | } |
---|
724 | |
---|
725 | /* solve the system with the computed factorization */ |
---|
726 | F77_FUNC(dsytrs)("U", (int *)&m, (int *)&nrhs, a, (int *)&m, ipiv, b, (int *)&m, (int *)&info); |
---|
727 | if(info!=0){ |
---|
728 | if(info<0){ |
---|
729 | fprintf(stderr, "argument %d of dsytrs illegal in sba_Axb_BK()\n", -info); |
---|
730 | exit(1); |
---|
731 | } |
---|
732 | else{ |
---|
733 | fprintf(stderr, "unknown error for dsytrs in sba_Axb_BK()\n"); |
---|
734 | return 0; |
---|
735 | } |
---|
736 | } |
---|
737 | |
---|
738 | /* copy the result in x */ |
---|
739 | for(i=0; i<m; i++){ |
---|
740 | x[i]=b[i]; |
---|
741 | } |
---|
742 | |
---|
743 | return 1; |
---|
744 | } |
---|
745 | |
---|
746 | /* |
---|
747 | * This function computes the inverse of a square matrix A into B |
---|
748 | * using LU decomposition |
---|
749 | * |
---|
750 | * The function returns 0 in case of error (e.g. A is singular), |
---|
751 | * 1 if successfull |
---|
752 | * |
---|
753 | * This function is often called repetitively to solve problems of identical |
---|
754 | * dimensions. To avoid repetitive malloc's and free's, allocated memory is |
---|
755 | * retained between calls and free'd-malloc'ed when not of the appropriate size. |
---|
756 | */ |
---|
757 | int sba_mat_invert_LU(double *A, double *B, int m) |
---|
758 | { |
---|
759 | static double *buf=NULL; |
---|
760 | static int buf_sz=0; |
---|
761 | |
---|
762 | int a_sz, ipiv_sz, work_sz, tot_sz; |
---|
763 | register int i, j; |
---|
764 | int info, *ipiv; |
---|
765 | double *a, *work; |
---|
766 | |
---|
767 | /* calculate required memory size */ |
---|
768 | ipiv_sz=m; |
---|
769 | a_sz=m*m; |
---|
770 | work_sz=16*m; /* this is probably too much */ |
---|
771 | tot_sz=ipiv_sz*sizeof(int) + (a_sz + work_sz)*sizeof(double); |
---|
772 | |
---|
773 | if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ |
---|
774 | if(buf) free(buf); /* free previously allocated memory */ |
---|
775 | |
---|
776 | buf_sz=tot_sz; |
---|
777 | buf=(double *)malloc(buf_sz); |
---|
778 | if(!buf){ |
---|
779 | fprintf(stderr, "memory allocation in sba_mat_invert_LU() failed!\n"); |
---|
780 | exit(1); |
---|
781 | } |
---|
782 | } |
---|
783 | |
---|
784 | ipiv=(int *)buf; |
---|
785 | a=(double *)(ipiv + ipiv_sz); |
---|
786 | work=a+a_sz; |
---|
787 | |
---|
788 | /* store A (column major!) into a */ |
---|
789 | for(i=0; i<m; i++) |
---|
790 | for(j=0; j<m; j++) |
---|
791 | a[i+j*m]=A[i*m+j]; |
---|
792 | |
---|
793 | /* LU decomposition for A */ |
---|
794 | F77_FUNC(dgetrf)((int *)&m, (int *)&m, a, (int *)&m, ipiv, (int *)&info); |
---|
795 | if(info!=0){ |
---|
796 | if(info<0){ |
---|
797 | fprintf(stderr, "argument %d of dgetrf illegal in sba_mat_invert_LU()\n", -info); |
---|
798 | exit(1); |
---|
799 | } |
---|
800 | else{ |
---|
801 | fprintf(stderr, "singular matrix A for dgetrf in sba_mat_invert_LU()\n"); |
---|
802 | return 0; |
---|
803 | } |
---|
804 | } |
---|
805 | |
---|
806 | /* (A)^{-1} from LU */ |
---|
807 | F77_FUNC(dgetri)((int *)&m, a, (int *)&m, ipiv, work, (int *)&work_sz, (int *)&info); |
---|
808 | if(info!=0){ |
---|
809 | if(info<0){ |
---|
810 | fprintf(stderr, "argument %d of dgetri illegal in sba_mat_invert_LU()\n", -info); |
---|
811 | exit(1); |
---|
812 | } |
---|
813 | else{ |
---|
814 | fprintf(stderr, "singular matrix A for dgetri in sba_mat_invert_LU()\n"); |
---|
815 | return 0; |
---|
816 | } |
---|
817 | } |
---|
818 | |
---|
819 | /* store (A)^{-1} in B */ |
---|
820 | for(i=0; i<m; i++) |
---|
821 | for(j=0; j<m; j++) |
---|
822 | B[i*m+j]=a[i+j*m]; |
---|
823 | |
---|
824 | return 1; |
---|
825 | } |
---|
826 | |
---|
827 | /* |
---|
828 | * This function computes the inverse of a square symmetric positive definite |
---|
829 | * matrix A into B using Cholesky factorization |
---|
830 | * |
---|
831 | * The function returns 0 in case of error (e.g. A is not positive definite or singular), |
---|
832 | * 1 if successfull |
---|
833 | * |
---|
834 | * This function is often called repetitively to solve problems of identical |
---|
835 | * dimensions. To avoid repetitive malloc's and free's, allocated memory is |
---|
836 | * retained between calls and free'd-malloc'ed when not of the appropriate size. |
---|
837 | */ |
---|
838 | int sba_mat_invert_Chol(double *A, double *B, int m) |
---|
839 | { |
---|
840 | static double *buf=NULL; |
---|
841 | static int buf_sz=0; |
---|
842 | |
---|
843 | int a_sz, tot_sz; |
---|
844 | register int i, j; |
---|
845 | int info; |
---|
846 | double *a; |
---|
847 | |
---|
848 | /* calculate required memory size */ |
---|
849 | a_sz=m*m; |
---|
850 | tot_sz=a_sz; |
---|
851 | |
---|
852 | if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ |
---|
853 | if(buf) free(buf); /* free previously allocated memory */ |
---|
854 | |
---|
855 | buf_sz=tot_sz; |
---|
856 | buf=(double *)malloc(buf_sz*sizeof(double)); |
---|
857 | if(!buf){ |
---|
858 | fprintf(stderr, "memory allocation in sba_mat_invert_Chol() failed!\n"); |
---|
859 | exit(1); |
---|
860 | } |
---|
861 | } |
---|
862 | |
---|
863 | a=(double *)buf; |
---|
864 | |
---|
865 | /* store A (column major!) into a */ |
---|
866 | for(i=0; i<m; i++) |
---|
867 | for(j=0; j<m; j++) |
---|
868 | a[i+j*m]=A[i*m+j]; |
---|
869 | |
---|
870 | /* Cholesky factorization for A */ |
---|
871 | F77_FUNC(dpotrf)("L", (int *)&m, a, (int *)&m, (int *)&info); |
---|
872 | /* error treatment */ |
---|
873 | if(info!=0){ |
---|
874 | if(info<0){ |
---|
875 | fprintf(stderr, "LAPACK error: illegal value for argument %d of dpotrf in sba_mat_invert_Chol()\n", -info); |
---|
876 | exit(1); |
---|
877 | } |
---|
878 | else{ |
---|
879 | fprintf(stderr, "LAPACK error: the leading minor of order %d is not positive definite,\nthe factorization could not be completed for dpotrf in sba_mat_invert_Chol()\n", info); |
---|
880 | return 0; |
---|
881 | } |
---|
882 | } |
---|
883 | |
---|
884 | /* (A)^{-1} from Cholesky */ |
---|
885 | F77_FUNC(dpotri)("L", (int *)&m, a, (int *)&m, (int *)&info); |
---|
886 | if(info!=0){ |
---|
887 | if(info<0){ |
---|
888 | fprintf(stderr, "argument %d of dpotri illegal in sba_mat_invert_Chol()\n", -info); |
---|
889 | exit(1); |
---|
890 | } |
---|
891 | else{ |
---|
892 | fprintf(stderr, "the (%d, %d) element of the factor U or L is zero, singular matrix A for dpotri in sba_mat_invert_Chol()\n", info, info); |
---|
893 | return 0; |
---|
894 | } |
---|
895 | } |
---|
896 | |
---|
897 | /* store (A)^{-1} in B. The lower triangle of the symmetric A^{-1} is in the lower triangle of a */ |
---|
898 | for(i=0; i<m; i++) |
---|
899 | for(j=0; j<=i; j++) |
---|
900 | B[i*m+j]=B[j*m+i]=a[i+j*m]; |
---|
901 | |
---|
902 | return 1; |
---|
903 | } |
---|
904 | |
---|
905 | |
---|
906 | #define __CG_LINALG_BLOCKSIZE 8 |
---|
907 | |
---|
908 | /* Dot product of two vectors x and y using loop unrolling and blocking. |
---|
909 | * see http://www.abarnett.demon.co.uk/tutorial.html |
---|
910 | */ |
---|
911 | |
---|
912 | inline static double dprod(int n, double *x, double *y) |
---|
913 | { |
---|
914 | register int i, j1, j2, j3, j4, j5, j6, j7; |
---|
915 | int blockn; |
---|
916 | register double sum0=0.0, sum1=0.0, sum2=0.0, sum3=0.0, |
---|
917 | sum4=0.0, sum5=0.0, sum6=0.0, sum7=0.0; |
---|
918 | |
---|
919 | /* n may not be divisible by __CG_LINALG_BLOCKSIZE, |
---|
920 | * go as near as we can first, then tidy up. |
---|
921 | */ |
---|
922 | blockn = (n / __CG_LINALG_BLOCKSIZE) * __CG_LINALG_BLOCKSIZE; |
---|
923 | |
---|
924 | /* unroll the loop in blocks of `__CG_LINALG_BLOCKSIZE' */ |
---|
925 | for(i=0; i<blockn; i+=__CG_LINALG_BLOCKSIZE){ |
---|
926 | sum0+=x[i]*y[i]; |
---|
927 | j1=i+1; sum1+=x[j1]*y[j1]; |
---|
928 | j2=i+2; sum2+=x[j2]*y[j2]; |
---|
929 | j3=i+3; sum3+=x[j3]*y[j3]; |
---|
930 | j4=i+4; sum4+=x[j4]*y[j4]; |
---|
931 | j5=i+5; sum5+=x[j5]*y[j5]; |
---|
932 | j6=i+6; sum6+=x[j6]*y[j6]; |
---|
933 | j7=i+7; sum7+=x[j7]*y[j7]; |
---|
934 | } |
---|
935 | |
---|
936 | /* |
---|
937 | * There may be some left to do. |
---|
938 | * This could be done as a simple for() loop, |
---|
939 | * but a switch is faster (and more interesting) |
---|
940 | */ |
---|
941 | |
---|
942 | if(i<n){ |
---|
943 | /* Jump into the case at the place that will allow |
---|
944 | * us to finish off the appropriate number of items. |
---|
945 | */ |
---|
946 | |
---|
947 | switch(n - i){ |
---|
948 | case 7 : sum0+=x[i]*y[i]; ++i; |
---|
949 | case 6 : sum1+=x[i]*y[i]; ++i; |
---|
950 | case 5 : sum2+=x[i]*y[i]; ++i; |
---|
951 | case 4 : sum3+=x[i]*y[i]; ++i; |
---|
952 | case 3 : sum4+=x[i]*y[i]; ++i; |
---|
953 | case 2 : sum5+=x[i]*y[i]; ++i; |
---|
954 | case 1 : sum6+=x[i]*y[i]; ++i; |
---|
955 | } |
---|
956 | } |
---|
957 | |
---|
958 | return sum0+sum1+sum2+sum3+sum4+sum5+sum6+sum7; |
---|
959 | } |
---|
960 | |
---|
961 | |
---|
962 | /* Compute z=x+a*y for two vectors x and y and a scalar a; z can be one of x, y. |
---|
963 | * Similarly to the dot product routine, this one uses loop unrolling and blocking |
---|
964 | */ |
---|
965 | |
---|
966 | inline static void daxpy(int n, double *z, double *x, double a, double *y) |
---|
967 | { |
---|
968 | register int i, j1, j2, j3, j4, j5, j6, j7; |
---|
969 | int blockn; |
---|
970 | |
---|
971 | /* n may not be divisible by __CG_LINALG_BLOCKSIZE, |
---|
972 | * go as near as we can first, then tidy up. |
---|
973 | */ |
---|
974 | blockn = (n / __CG_LINALG_BLOCKSIZE) * __CG_LINALG_BLOCKSIZE; |
---|
975 | |
---|
976 | /* unroll the loop in blocks of `__CG_LINALG_BLOCKSIZE' */ |
---|
977 | for(i=0; i<blockn; i+=__CG_LINALG_BLOCKSIZE){ |
---|
978 | z[i]=x[i]+a*y[i]; |
---|
979 | j1=i+1; z[j1]=x[j1]+a*y[j1]; |
---|
980 | j2=i+2; z[j2]=x[j2]+a*y[j2]; |
---|
981 | j3=i+3; z[j3]=x[j3]+a*y[j3]; |
---|
982 | j4=i+4; z[j4]=x[j4]+a*y[j4]; |
---|
983 | j5=i+5; z[j5]=x[j5]+a*y[j5]; |
---|
984 | j6=i+6; z[j6]=x[j6]+a*y[j6]; |
---|
985 | j7=i+7; z[j7]=x[j7]+a*y[j7]; |
---|
986 | } |
---|
987 | |
---|
988 | /* |
---|
989 | * There may be some left to do. |
---|
990 | * This could be done as a simple for() loop, |
---|
991 | * but a switch is faster (and more interesting) |
---|
992 | */ |
---|
993 | |
---|
994 | if(i<n){ |
---|
995 | /* Jump into the case at the place that will allow |
---|
996 | * us to finish off the appropriate number of items. |
---|
997 | */ |
---|
998 | |
---|
999 | switch(n - i){ |
---|
1000 | case 7 : z[i]=x[i]+a*y[i]; ++i; |
---|
1001 | case 6 : z[i]=x[i]+a*y[i]; ++i; |
---|
1002 | case 5 : z[i]=x[i]+a*y[i]; ++i; |
---|
1003 | case 4 : z[i]=x[i]+a*y[i]; ++i; |
---|
1004 | case 3 : z[i]=x[i]+a*y[i]; ++i; |
---|
1005 | case 2 : z[i]=x[i]+a*y[i]; ++i; |
---|
1006 | case 1 : z[i]=x[i]+a*y[i]; ++i; |
---|
1007 | } |
---|
1008 | } |
---|
1009 | } |
---|
1010 | |
---|
1011 | /* |
---|
1012 | * This function returns the solution of Ax = b where A is posititive definite, |
---|
1013 | * based on the conjugate gradients method; see "An intro to the CG method" by J.R. Shewchuk, p. 50-51 |
---|
1014 | * |
---|
1015 | * A is mxm, b, x are is mx1. Argument niter specifies the maximum number of |
---|
1016 | * iterations and eps is the desired solution accuracy. niter<0 signals that |
---|
1017 | * x contains a valid initial approximation to the solution; if niter>0 then |
---|
1018 | * the starting point is taken to be zero. Argument prec selects the desired |
---|
1019 | * preconditioning method as follows: |
---|
1020 | * 0: no preconditioning |
---|
1021 | * 1: jacobi (diagonal) preconditioning |
---|
1022 | * 2: SSOR preconditioning |
---|
1023 | * Argument iscolmaj specifies whether A is stored in column or row major order. |
---|
1024 | * |
---|
1025 | * The function returns 0 in case of error, |
---|
1026 | * the number of iterations performed if successfull |
---|
1027 | * |
---|
1028 | * This function is often called repetitively to solve problems of identical |
---|
1029 | * dimensions. To avoid repetitive malloc's and free's, allocated memory is |
---|
1030 | * retained between calls and free'd-malloc'ed when not of the appropriate size. |
---|
1031 | */ |
---|
1032 | int sba_Axb_CG(double *A, double *B, double *x, int m, int niter, double eps, int prec, int iscolmaj) |
---|
1033 | { |
---|
1034 | static double *buf=NULL; |
---|
1035 | static int buf_sz=0; |
---|
1036 | |
---|
1037 | register int i, j; |
---|
1038 | register double *aim; |
---|
1039 | int iter, a_sz, res_sz, d_sz, q_sz, s_sz, wk_sz, z_sz, tot_sz; |
---|
1040 | double *a, *res, *d, *q, *s, *wk, *z; |
---|
1041 | double delta0, deltaold, deltanew, alpha, beta, eps_sq=eps*eps; |
---|
1042 | register double sum; |
---|
1043 | int rec_res; |
---|
1044 | |
---|
1045 | /* calculate required memory size */ |
---|
1046 | a_sz=(iscolmaj)? m*m : 0; |
---|
1047 | res_sz=m; d_sz=m; q_sz=m; |
---|
1048 | if(prec!=SBA_CG_NOPREC){ |
---|
1049 | s_sz=m; wk_sz=m; |
---|
1050 | z_sz=(prec==SBA_CG_SSOR)? m : 0; |
---|
1051 | } |
---|
1052 | else |
---|
1053 | s_sz=wk_sz=z_sz=0; |
---|
1054 | |
---|
1055 | tot_sz=a_sz+res_sz+d_sz+q_sz+s_sz+wk_sz+z_sz; |
---|
1056 | |
---|
1057 | if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ |
---|
1058 | if(buf) free(buf); /* free previously allocated memory */ |
---|
1059 | |
---|
1060 | buf_sz=tot_sz; |
---|
1061 | buf=(double *)malloc(buf_sz*sizeof(double)); |
---|
1062 | if(!buf){ |
---|
1063 | fprintf(stderr, "memory allocation request failed in sba_Axb_CG()\n"); |
---|
1064 | exit(1); |
---|
1065 | } |
---|
1066 | } |
---|
1067 | |
---|
1068 | if(iscolmaj){ |
---|
1069 | a=buf; |
---|
1070 | /* store A (row major!) into a */ |
---|
1071 | for(i=0; i<m; ++i) |
---|
1072 | for(j=0, aim=a+i*m; j<m; ++j) |
---|
1073 | aim[j]=A[i+j*m]; |
---|
1074 | } |
---|
1075 | else a=A; /* no copying required */ |
---|
1076 | |
---|
1077 | res=buf+a_sz; |
---|
1078 | d=res+res_sz; |
---|
1079 | q=d+d_sz; |
---|
1080 | if(prec!=SBA_CG_NOPREC){ |
---|
1081 | s=q+q_sz; |
---|
1082 | wk=s+s_sz; |
---|
1083 | z=(prec==SBA_CG_SSOR)? wk+wk_sz : NULL; |
---|
1084 | |
---|
1085 | for(i=0; i<m; ++i){ // compute jacobi (i.e. diagonal) preconditioners and save them in wk |
---|
1086 | sum=a[i*m+i]; |
---|
1087 | if(sum>DBL_EPSILON || -sum<-DBL_EPSILON) // != 0.0 |
---|
1088 | wk[i]=1.0/sum; |
---|
1089 | else |
---|
1090 | wk[i]=1.0/DBL_EPSILON; |
---|
1091 | } |
---|
1092 | } |
---|
1093 | else{ |
---|
1094 | s=res; |
---|
1095 | wk=z=NULL; |
---|
1096 | } |
---|
1097 | |
---|
1098 | if(niter>0) |
---|
1099 | for(i=0; i<m; ++i){ // clear solution and initialize residual vector: res <-- B |
---|
1100 | x[i]=0.0; |
---|
1101 | res[i]=B[i]; |
---|
1102 | } |
---|
1103 | else{ |
---|
1104 | niter=-niter; |
---|
1105 | |
---|
1106 | for(i=0; i<m; ++i){ // initialize residual vector: res <-- B - A*x |
---|
1107 | for(j=0, aim=a+i*m, sum=0.0; j<m; ++j) |
---|
1108 | sum+=aim[j]*x[j]; |
---|
1109 | res[i]=B[i]-sum; |
---|
1110 | } |
---|
1111 | } |
---|
1112 | |
---|
1113 | switch(prec){ |
---|
1114 | case SBA_CG_NOPREC: |
---|
1115 | for(i=0, deltanew=0.0; i<m; ++i){ |
---|
1116 | d[i]=res[i]; |
---|
1117 | deltanew+=res[i]*res[i]; |
---|
1118 | } |
---|
1119 | break; |
---|
1120 | case SBA_CG_JACOBI: // jacobi preconditioning |
---|
1121 | for(i=0, deltanew=0.0; i<m; ++i){ |
---|
1122 | d[i]=res[i]*wk[i]; |
---|
1123 | deltanew+=res[i]*d[i]; |
---|
1124 | } |
---|
1125 | break; |
---|
1126 | case SBA_CG_SSOR: // SSOR preconditioning; see the "templates" book, fig. 3.2, p. 44 |
---|
1127 | for(i=0; i<m; ++i){ |
---|
1128 | for(j=0, sum=0.0, aim=a+i*m; j<i; ++j) |
---|
1129 | sum+=aim[j]*z[j]; |
---|
1130 | z[i]=wk[i]*(res[i]-sum); |
---|
1131 | } |
---|
1132 | |
---|
1133 | for(i=m-1; i>=0; --i){ |
---|
1134 | for(j=i+1, sum=0.0, aim=a+i*m; j<m; ++j) |
---|
1135 | sum+=aim[j]*d[j]; |
---|
1136 | d[i]=z[i]-wk[i]*sum; |
---|
1137 | } |
---|
1138 | deltanew=dprod(m, res, d); |
---|
1139 | break; |
---|
1140 | default: |
---|
1141 | fprintf(stderr, "unknown preconditioning option %d in sba_Axb_CG\n", prec); |
---|
1142 | exit(1); |
---|
1143 | } |
---|
1144 | |
---|
1145 | delta0=deltanew; |
---|
1146 | |
---|
1147 | for(iter=1; deltanew>eps_sq*delta0 && iter<=niter; ++iter){ |
---|
1148 | for(i=0; i<m; ++i){ // q <-- A d |
---|
1149 | aim=a+i*m; |
---|
1150 | /*** |
---|
1151 | for(j=0, sum=0.0; j<m; ++j) |
---|
1152 | sum+=aim[j]*d[j]; |
---|
1153 | ***/ |
---|
1154 | q[i]=dprod(m, aim, d); //sum; |
---|
1155 | } |
---|
1156 | |
---|
1157 | /*** |
---|
1158 | for(i=0, sum=0.0; i<m; ++i) |
---|
1159 | sum+=d[i]*q[i]; |
---|
1160 | ***/ |
---|
1161 | alpha=deltanew/dprod(m, d, q); // deltanew/sum; |
---|
1162 | |
---|
1163 | /*** |
---|
1164 | for(i=0; i<m; ++i) |
---|
1165 | x[i]+=alpha*d[i]; |
---|
1166 | ***/ |
---|
1167 | daxpy(m, x, x, alpha, d); |
---|
1168 | |
---|
1169 | if(!(iter%50)){ |
---|
1170 | for(i=0; i<m; ++i){ // accurate computation of the residual vector |
---|
1171 | aim=a+i*m; |
---|
1172 | /*** |
---|
1173 | for(j=0, sum=0.0; j<m; ++j) |
---|
1174 | sum+=aim[j]*x[j]; |
---|
1175 | ***/ |
---|
1176 | res[i]=B[i]-dprod(m, aim, x); //B[i]-sum; |
---|
1177 | } |
---|
1178 | rec_res=0; |
---|
1179 | } |
---|
1180 | else{ |
---|
1181 | /*** |
---|
1182 | for(i=0; i<m; ++i) // approximate computation of the residual vector |
---|
1183 | res[i]-=alpha*q[i]; |
---|
1184 | ***/ |
---|
1185 | daxpy(m, res, res, -alpha, q); |
---|
1186 | rec_res=1; |
---|
1187 | } |
---|
1188 | |
---|
1189 | if(prec){ |
---|
1190 | switch(prec){ |
---|
1191 | case SBA_CG_JACOBI: // jacobi |
---|
1192 | for(i=0; i<m; ++i) |
---|
1193 | s[i]=res[i]*wk[i]; |
---|
1194 | break; |
---|
1195 | case SBA_CG_SSOR: // SSOR |
---|
1196 | for(i=0; i<m; ++i){ |
---|
1197 | for(j=0, sum=0.0, aim=a+i*m; j<i; ++j) |
---|
1198 | sum+=aim[j]*z[j]; |
---|
1199 | z[i]=wk[i]*(res[i]-sum); |
---|
1200 | } |
---|
1201 | |
---|
1202 | for(i=m-1; i>=0; --i){ |
---|
1203 | for(j=i+1, sum=0.0, aim=a+i*m; j<m; ++j) |
---|
1204 | sum+=aim[j]*s[j]; |
---|
1205 | s[i]=z[i]-wk[i]*sum; |
---|
1206 | } |
---|
1207 | break; |
---|
1208 | } |
---|
1209 | } |
---|
1210 | |
---|
1211 | deltaold=deltanew; |
---|
1212 | /*** |
---|
1213 | for(i=0, sum=0.0; i<m; ++i) |
---|
1214 | sum+=res[i]*s[i]; |
---|
1215 | ***/ |
---|
1216 | deltanew=dprod(m, res, s); //sum; |
---|
1217 | |
---|
1218 | /* make sure that we get around small delta that are due to |
---|
1219 | * accumulated floating point roundoff errors |
---|
1220 | */ |
---|
1221 | if(rec_res && deltanew<=eps_sq*delta0){ |
---|
1222 | /* analytically recompute delta */ |
---|
1223 | for(i=0; i<m; ++i){ |
---|
1224 | for(j=0, aim=a+i*m, sum=0.0; j<m; ++j) |
---|
1225 | sum+=aim[j]*x[j]; |
---|
1226 | res[i]=B[i]-sum; |
---|
1227 | } |
---|
1228 | deltanew=dprod(m, res, s); |
---|
1229 | } |
---|
1230 | |
---|
1231 | beta=deltanew/deltaold; |
---|
1232 | |
---|
1233 | /*** |
---|
1234 | for(i=0; i<m; ++i) |
---|
1235 | d[i]=s[i]+beta*d[i]; |
---|
1236 | ***/ |
---|
1237 | daxpy(m, d, s, beta, d); |
---|
1238 | } |
---|
1239 | |
---|
1240 | return iter; |
---|
1241 | } |
---|