[37] | 1 | ///////////////////////////////////////////////////////////////////////////////// |
---|
| 2 | //// |
---|
| 3 | //// Linear algebra operations for the sba package |
---|
| 4 | //// Copyright (C) 2004 Manolis Lourakis (lourakis@ics.forth.gr) |
---|
| 5 | //// Institute of Computer Science, Foundation for Research & Technology - Hellas |
---|
| 6 | //// Heraklion, Crete, Greece. |
---|
| 7 | //// |
---|
| 8 | //// This program is free software; you can redistribute it and/or modify |
---|
| 9 | //// it under the terms of the GNU General Public License as published by |
---|
| 10 | //// the Free Software Foundation; either version 2 of the License, or |
---|
| 11 | //// (at your option) any later version. |
---|
| 12 | //// |
---|
| 13 | //// This program is distributed in the hope that it will be useful, |
---|
| 14 | //// but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
| 15 | //// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
| 16 | //// GNU General Public License for more details. |
---|
| 17 | //// |
---|
| 18 | /////////////////////////////////////////////////////////////////////////////////// |
---|
| 19 | |
---|
| 20 | #include <stdio.h> |
---|
| 21 | #include <stdlib.h> |
---|
| 22 | #include <string.h> |
---|
| 23 | #include <math.h> |
---|
| 24 | #include <float.h> |
---|
| 25 | |
---|
| 26 | /* inline */ |
---|
| 27 | #ifdef _MSC_VER |
---|
| 28 | #define inline __inline //MSVC |
---|
| 29 | #elif !defined(__GNUC__) |
---|
| 30 | #define inline //other than MSVC, GCC: define empty |
---|
| 31 | #endif |
---|
| 32 | |
---|
| 33 | #include "sba.h" |
---|
| 34 | |
---|
| 35 | #ifdef SBA_APPEND_UNDERSCORE_SUFFIX |
---|
| 36 | #define F77_FUNC(func) func ## _ |
---|
| 37 | #else |
---|
| 38 | #define F77_FUNC(func) func |
---|
| 39 | #endif /* SBA_APPEND_UNDERSCORE_SUFFIX */ |
---|
| 40 | |
---|
| 41 | |
---|
| 42 | /* declarations of LAPACK routines employed */ |
---|
| 43 | |
---|
| 44 | /* QR decomposition */ |
---|
| 45 | extern int F77_FUNC(dgeqrf)(int *m, int *n, double *a, int *lda, double *tau, double *work, int *lwork, int *info); |
---|
| 46 | extern int F77_FUNC(dorgqr)(int *m, int *n, int *k, double *a, int *lda, double *tau, double *work, int *lwork, int *info); |
---|
| 47 | |
---|
| 48 | /* solution of triangular system */ |
---|
| 49 | extern int F77_FUNC(dtrtrs)(char *uplo, char *trans, char *diag, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, int *info); |
---|
| 50 | |
---|
| 51 | /* cholesky decomposition, matrix inversion */ |
---|
| 52 | extern int F77_FUNC(dpotf2)(char *uplo, int *n, double *a, int *lda, int *info); |
---|
| 53 | extern int F77_FUNC(dpotrf)(char *uplo, int *n, double *a, int *lda, int *info); /* block version of dpotf2 */ |
---|
| 54 | extern int F77_FUNC(dpotri)(char *uplo, int *n, double *a, int *lda, int *info); |
---|
| 55 | |
---|
| 56 | /* LU decomposition, linear system solution and matrix inversion */ |
---|
| 57 | extern int F77_FUNC(dgetrf)(int *m, int *n, double *a, int *lda, int *ipiv, int *info); |
---|
| 58 | extern int F77_FUNC(dgetrs)(char *trans, int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info); |
---|
| 59 | extern int F77_FUNC(dgetri)(int *n, double *a, int *lda, int *ipiv, double *work, int *lwork, int *info); |
---|
| 60 | |
---|
| 61 | /* SVD */ |
---|
| 62 | extern int F77_FUNC(dgesvd)(char *jobu, char *jobvt, int *m, int *n, |
---|
| 63 | double *a, int *lda, double *s, double *u, int *ldu, |
---|
| 64 | double *vt, int *ldvt, double *work, int *lwork, |
---|
| 65 | int *info); |
---|
| 66 | |
---|
| 67 | /* lapack 3.0 routine, faster than dgesvd() */ |
---|
| 68 | extern int F77_FUNC(dgesdd)(char *jobz, int *m, int *n, double *a, int *lda, |
---|
| 69 | double *s, double *u, int *ldu, double *vt, int *ldvt, |
---|
| 70 | double *work, int *lwork, int *iwork, int *info); |
---|
| 71 | |
---|
| 72 | |
---|
| 73 | /* Bunch-Kaufman factorization of a real symmetric matrix A and solution of linear systems */ |
---|
| 74 | extern int F77_FUNC(dsytrf)(char *uplo, int *n, double *a, int *lda, int *ipiv, double *work, int *lwork, int *info); |
---|
| 75 | extern int F77_FUNC(dsytrs)(char *uplo, int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info); |
---|
| 76 | |
---|
| 77 | |
---|
| 78 | /* |
---|
| 79 | * This function returns the solution of Ax = b |
---|
| 80 | * |
---|
| 81 | * The function is based on QR decomposition with explicit computation of Q: |
---|
| 82 | * If A=Q R with Q orthogonal and R upper triangular, the linear system becomes |
---|
| 83 | * Q R x = b or R x = Q^T b. |
---|
| 84 | * |
---|
| 85 | * A is mxm, b is mx1. Argument iscolmaj specifies whether A is |
---|
| 86 | * stored in column or row major order. Note that if iscolmaj==1 |
---|
| 87 | * this function modifies A! |
---|
| 88 | * |
---|
| 89 | * The function returns 0 in case of error, 1 if successfull |
---|
| 90 | * |
---|
| 91 | * This function is often called repetitively to solve problems of identical |
---|
| 92 | * dimensions. To avoid repetitive malloc's and free's, allocated memory is |
---|
| 93 | * retained between calls and free'd-malloc'ed when not of the appropriate size. |
---|
| 94 | */ |
---|
| 95 | int sba_Axb_QR(double *A, double *B, double *x, int m, int iscolmaj) |
---|
| 96 | { |
---|
| 97 | static double *buf=NULL; |
---|
| 98 | static int buf_sz=0; |
---|
| 99 | |
---|
| 100 | double *a, *qtb, *r, *tau, *work; |
---|
| 101 | int a_sz, qtb_sz, r_sz, tau_sz, tot_sz; |
---|
| 102 | register int i, j; |
---|
| 103 | int info, worksz, nrhs=1; |
---|
| 104 | register double sum; |
---|
| 105 | |
---|
| 106 | /* calculate required memory size */ |
---|
| 107 | a_sz=(iscolmaj)? 0 : m*m; |
---|
| 108 | qtb_sz=m; |
---|
| 109 | r_sz=m*m; /* only the upper triangular part really needed */ |
---|
| 110 | tau_sz=m; |
---|
| 111 | worksz=3*m; /* this is probably too much */ |
---|
| 112 | tot_sz=a_sz + qtb_sz + r_sz + tau_sz + worksz; |
---|
| 113 | |
---|
| 114 | if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ |
---|
| 115 | if(buf) free(buf); /* free previously allocated memory */ |
---|
| 116 | |
---|
| 117 | buf_sz=tot_sz; |
---|
| 118 | buf=(double *)malloc(buf_sz*sizeof(double)); |
---|
| 119 | if(!buf){ |
---|
| 120 | fprintf(stderr, "memory allocation in sba_Axb_QR() failed!\n"); |
---|
| 121 | exit(1); |
---|
| 122 | } |
---|
| 123 | } |
---|
| 124 | |
---|
| 125 | if(!iscolmaj){ |
---|
| 126 | a=buf; |
---|
| 127 | /* store A (column major!) into a */ |
---|
| 128 | for(i=0; i<m; i++) |
---|
| 129 | for(j=0; j<m; j++) |
---|
| 130 | a[i+j*m]=A[i*m+j]; |
---|
| 131 | } |
---|
| 132 | else a=A; /* no copying required */ |
---|
| 133 | |
---|
| 134 | qtb=buf+a_sz; |
---|
| 135 | r=qtb+qtb_sz; |
---|
| 136 | tau=r+r_sz; |
---|
| 137 | work=tau+tau_sz; |
---|
| 138 | |
---|
| 139 | /* QR decomposition of A */ |
---|
| 140 | F77_FUNC(dgeqrf)((int *)&m, (int *)&m, a, (int *)&m, tau, work, (int *)&worksz, (int *)&info); |
---|
| 141 | /* error treatment */ |
---|
| 142 | if(info!=0){ |
---|
| 143 | if(info<0){ |
---|
| 144 | fprintf(stderr, "LAPACK error: illegal value for argument %d of dgeqrf in sba_Axb_QR()\n", -info); |
---|
| 145 | exit(1); |
---|
| 146 | } |
---|
| 147 | else{ |
---|
| 148 | fprintf(stderr, "Unknown LAPACK error %d for dgeqrf in sba_Axb_QR()\n", info); |
---|
| 149 | return 0; |
---|
| 150 | } |
---|
| 151 | } |
---|
| 152 | |
---|
| 153 | /* R is now stored in the upper triangular part of a; copy it in r so that dorgqr() below won't destroy it */ |
---|
| 154 | for(i=0; i<r_sz; i++) |
---|
| 155 | r[i]=a[i]; |
---|
| 156 | |
---|
| 157 | /* compute Q using the elementary reflectors computed by the above decomposition */ |
---|
| 158 | F77_FUNC(dorgqr)((int *)&m, (int *)&m, (int *)&m, a, (int *)&m, tau, work, (int *)&worksz, (int *)&info); |
---|
| 159 | if(info!=0){ |
---|
| 160 | if(info<0){ |
---|
| 161 | fprintf(stderr, "LAPACK error: illegal value for argument %d of dorgqr in sba_Axb_QR()\n", -info); |
---|
| 162 | exit(1); |
---|
| 163 | } |
---|
| 164 | else{ |
---|
| 165 | fprintf(stderr, "Unknown LAPACK error (%d) in sba_Axb_QR()\n", info); |
---|
| 166 | return 0; |
---|
| 167 | } |
---|
| 168 | } |
---|
| 169 | |
---|
| 170 | /* Q is now in a; compute Q^T b in qtb */ |
---|
| 171 | for(i=0; i<m; i++){ |
---|
| 172 | for(j=0, sum=0.0; j<m; j++) |
---|
| 173 | sum+=a[i*m+j]*B[j]; |
---|
| 174 | qtb[i]=sum; |
---|
| 175 | } |
---|
| 176 | |
---|
| 177 | /* solve the linear system R x = Q^t b */ |
---|
| 178 | F77_FUNC(dtrtrs)("U", "N", "N", (int *)&m, (int *)&nrhs, r, (int *)&m, qtb, (int *)&m, &info); |
---|
| 179 | /* error treatment */ |
---|
| 180 | if(info!=0){ |
---|
| 181 | if(info<0){ |
---|
| 182 | fprintf(stderr, "LAPACK error: illegal value for argument %d of dtrtrs in sba_Axb_QR()\n", -info); |
---|
| 183 | exit(1); |
---|
| 184 | } |
---|
| 185 | else{ |
---|
| 186 | fprintf(stderr, "LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in sba_Axb_QR()\n", info); |
---|
| 187 | return 0; |
---|
| 188 | } |
---|
| 189 | } |
---|
| 190 | |
---|
| 191 | /* copy the result in x */ |
---|
| 192 | for(i=0; i<m; i++) |
---|
| 193 | x[i]=qtb[i]; |
---|
| 194 | |
---|
| 195 | return 1; |
---|
| 196 | } |
---|
| 197 | |
---|
| 198 | /* |
---|
| 199 | * This function returns the solution of Ax = b |
---|
| 200 | * |
---|
| 201 | * The function is based on QR decomposition without computation of Q: |
---|
| 202 | * If A=Q R with Q orthogonal and R upper triangular, the linear system becomes |
---|
| 203 | * (A^T A) x = A^T b or (R^T Q^T Q R) x = A^T b or (R^T R) x = A^T b. |
---|
| 204 | * This amounts to solving R^T y = A^T b for y and then R x = y for x |
---|
| 205 | * Note that Q does not need to be explicitly computed |
---|
| 206 | * |
---|
| 207 | * A is mxm, b is mx1. Argument iscolmaj specifies whether A is |
---|
| 208 | * stored in column or row major order. Note that if iscolmaj==1 |
---|
| 209 | * this function modifies A! |
---|
| 210 | * |
---|
| 211 | * The function returns 0 in case of error, 1 if successfull |
---|
| 212 | * |
---|
| 213 | * This function is often called repetitively to solve problems of identical |
---|
| 214 | * dimensions. To avoid repetitive malloc's and free's, allocated memory is |
---|
| 215 | * retained between calls and free'd-malloc'ed when not of the appropriate size. |
---|
| 216 | */ |
---|
| 217 | int sba_Axb_QRnoQ(double *A, double *B, double *x, int m, int iscolmaj) |
---|
| 218 | { |
---|
| 219 | static double *buf=NULL; |
---|
| 220 | static int buf_sz=0; |
---|
| 221 | |
---|
| 222 | double *a, *atb, *tau, *work; |
---|
| 223 | int a_sz, atb_sz, tau_sz, tot_sz; |
---|
| 224 | register int i, j; |
---|
| 225 | int info, worksz, nrhs=1; |
---|
| 226 | register double sum; |
---|
| 227 | |
---|
| 228 | /* calculate required memory size */ |
---|
| 229 | a_sz=(iscolmaj)? 0 : m*m; |
---|
| 230 | atb_sz=m; |
---|
| 231 | tau_sz=m; |
---|
| 232 | worksz=3*m; /* this is probably too much */ |
---|
| 233 | tot_sz=a_sz + atb_sz + tau_sz + worksz; |
---|
| 234 | |
---|
| 235 | if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ |
---|
| 236 | if(buf) free(buf); /* free previously allocated memory */ |
---|
| 237 | |
---|
| 238 | buf_sz=tot_sz; |
---|
| 239 | buf=(double *)malloc(buf_sz*sizeof(double)); |
---|
| 240 | if(!buf){ |
---|
| 241 | fprintf(stderr, "memory allocation in sba_Axb_QRnoQ() failed!\n"); |
---|
| 242 | exit(1); |
---|
| 243 | } |
---|
| 244 | } |
---|
| 245 | |
---|
| 246 | if(!iscolmaj){ |
---|
| 247 | a=buf; |
---|
| 248 | /* store A (column major!) into a */ |
---|
| 249 | for(i=0; i<m; i++) |
---|
| 250 | for(j=0; j<m; j++) |
---|
| 251 | a[i+j*m]=A[i*m+j]; |
---|
| 252 | } |
---|
| 253 | else a=A; /* no copying required */ |
---|
| 254 | |
---|
| 255 | atb=buf+a_sz; |
---|
| 256 | tau=atb+atb_sz; |
---|
| 257 | work=tau+tau_sz; |
---|
| 258 | |
---|
| 259 | /* compute A^T b in atb */ |
---|
| 260 | for(i=0; i<m; i++){ |
---|
| 261 | for(j=0, sum=0.0; j<m; j++) |
---|
| 262 | sum+=a[i*m+j]*B[j]; |
---|
| 263 | atb[i]=sum; |
---|
| 264 | } |
---|
| 265 | |
---|
| 266 | /* QR decomposition of A */ |
---|
| 267 | F77_FUNC(dgeqrf)((int *)&m, (int *)&m, a, (int *)&m, tau, work, (int *)&worksz, (int *)&info); |
---|
| 268 | /* error treatment */ |
---|
| 269 | if(info!=0){ |
---|
| 270 | if(info<0){ |
---|
| 271 | fprintf(stderr, "LAPACK error: illegal value for argument %d of dgeqrf in sba_Axb_QRnoQ()\n", -info); |
---|
| 272 | exit(1); |
---|
| 273 | } |
---|
| 274 | else{ |
---|
| 275 | fprintf(stderr, "Unknown LAPACK error %d for dgeqrf in sba_Axb_QRnoQ()\n", info); |
---|
| 276 | return 0; |
---|
| 277 | } |
---|
| 278 | } |
---|
| 279 | |
---|
| 280 | /* R is stored in the upper triangular part of a */ |
---|
| 281 | |
---|
| 282 | /* solve the linear system R^T y = A^t b */ |
---|
| 283 | F77_FUNC(dtrtrs)("U", "T", "N", (int *)&m, (int *)&nrhs, a, (int *)&m, atb, (int *)&m, &info); |
---|
| 284 | /* error treatment */ |
---|
| 285 | if(info!=0){ |
---|
| 286 | if(info<0){ |
---|
| 287 | fprintf(stderr, "LAPACK error: illegal value for argument %d of dtrtrs in sba_Axb_QRnoQ()\n", -info); |
---|
| 288 | exit(1); |
---|
| 289 | } |
---|
| 290 | else{ |
---|
| 291 | fprintf(stderr, "LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in sba_Axb_QRnoQ()\n", info); |
---|
| 292 | return 0; |
---|
| 293 | } |
---|
| 294 | } |
---|
| 295 | |
---|
| 296 | /* solve the linear system R x = y */ |
---|
| 297 | F77_FUNC(dtrtrs)("U", "N", "N", (int *)&m, (int *)&nrhs, a, (int *)&m, atb, (int *)&m, &info); |
---|
| 298 | /* error treatment */ |
---|
| 299 | if(info!=0){ |
---|
| 300 | if(info<0){ |
---|
| 301 | fprintf(stderr, "LAPACK error: illegal value for argument %d of dtrtrs in sba_Axb_QRnoQ()\n", -info); |
---|
| 302 | exit(1); |
---|
| 303 | } |
---|
| 304 | else{ |
---|
| 305 | fprintf(stderr, "LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in sba_Axb_QRnoQ()\n", info); |
---|
| 306 | return 0; |
---|
| 307 | } |
---|
| 308 | } |
---|
| 309 | |
---|
| 310 | /* copy the result in x */ |
---|
| 311 | for(i=0; i<m; i++) |
---|
| 312 | x[i]=atb[i]; |
---|
| 313 | |
---|
| 314 | return 1; |
---|
| 315 | } |
---|
| 316 | |
---|
| 317 | /* |
---|
| 318 | * This function returns the solution of Ax=b |
---|
| 319 | * |
---|
| 320 | * The function assumes that A is symmetric & positive definite and employs |
---|
| 321 | * the Cholesky decomposition: |
---|
| 322 | * If A=U^T U with U upper triangular, the system to be solved becomes |
---|
| 323 | * (U^T U) x = b |
---|
| 324 | * This amount to solving U^T y = b for y and then U x = y for x |
---|
| 325 | * |
---|
| 326 | * A is mxm, b is mx1. Argument iscolmaj specifies whether A is |
---|
| 327 | * stored in column or row major order. Note that if iscolmaj==1 |
---|
| 328 | * this function modifies A and B! |
---|
| 329 | * |
---|
| 330 | * The function returns 0 in case of error, 1 if successfull |
---|
| 331 | * |
---|
| 332 | * This function is often called repetitively to solve problems of identical |
---|
| 333 | * dimensions. To avoid repetitive malloc's and free's, allocated memory is |
---|
| 334 | * retained between calls and free'd-malloc'ed when not of the appropriate size. |
---|
| 335 | */ |
---|
| 336 | int sba_Axb_Chol(double *A, double *B, double *x, int m, int iscolmaj) |
---|
| 337 | { |
---|
| 338 | static double *buf=NULL; |
---|
| 339 | static int buf_sz=0; |
---|
| 340 | |
---|
| 341 | double *a, *b; |
---|
| 342 | int a_sz, b_sz, tot_sz; |
---|
| 343 | register int i, j; |
---|
| 344 | int info, nrhs=1; |
---|
| 345 | |
---|
| 346 | /* calculate required memory size */ |
---|
| 347 | a_sz=(iscolmaj)? 0 : m*m; |
---|
| 348 | b_sz=(iscolmaj)? 0 : m; |
---|
| 349 | tot_sz=a_sz + b_sz; |
---|
| 350 | |
---|
| 351 | if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ |
---|
| 352 | if(buf) free(buf); /* free previously allocated memory */ |
---|
| 353 | |
---|
| 354 | buf_sz=tot_sz; |
---|
| 355 | buf=(double *)malloc(buf_sz*sizeof(double)); |
---|
| 356 | if(!buf){ |
---|
| 357 | fprintf(stderr, "memory allocation in sba_Axb_Chol() failed!\n"); |
---|
| 358 | exit(1); |
---|
| 359 | } |
---|
| 360 | } |
---|
| 361 | |
---|
| 362 | if(!iscolmaj){ |
---|
| 363 | a=buf; |
---|
| 364 | b=a+a_sz; |
---|
| 365 | |
---|
| 366 | /* store A (column major!) into a anb B into b */ |
---|
| 367 | for(i=0; i<m; i++){ |
---|
| 368 | for(j=0; j<m; j++) |
---|
| 369 | a[i+j*m]=A[i*m+j]; |
---|
| 370 | |
---|
| 371 | b[i]=B[i]; |
---|
| 372 | } |
---|
| 373 | } |
---|
| 374 | else{ /* no copying is necessary */ |
---|
| 375 | a=A; |
---|
| 376 | b=B; |
---|
| 377 | } |
---|
| 378 | |
---|
| 379 | /* Cholesky decomposition of A */ |
---|
| 380 | //F77_FUNC(dpotf2)("U", (int *)&m, a, (int *)&m, (int *)&info); |
---|
| 381 | F77_FUNC(dpotrf)("U", (int *)&m, a, (int *)&m, (int *)&info); |
---|
| 382 | /* error treatment */ |
---|
| 383 | if(info!=0){ |
---|
| 384 | if(info<0){ |
---|
| 385 | fprintf(stderr, "LAPACK error: illegal value for argument %d of dpotf2/dpotrf in sba_Axb_Chol()\n", -info); |
---|
| 386 | exit(1); |
---|
| 387 | } |
---|
| 388 | else{ |
---|
| 389 | fprintf(stderr, "LAPACK error: the leading minor of order %d is not positive definite,\nthe factorization could not be completed for dpotf2/dpotrf in sba_Axb_Chol()\n", info); |
---|
| 390 | return 0; |
---|
| 391 | } |
---|
| 392 | } |
---|
| 393 | |
---|
| 394 | /* solve the linear system U^T y = b */ |
---|
| 395 | F77_FUNC(dtrtrs)("U", "T", "N", (int *)&m, (int *)&nrhs, a, (int *)&m, b, (int *)&m, &info); |
---|
| 396 | /* error treatment */ |
---|
| 397 | if(info!=0){ |
---|
| 398 | if(info<0){ |
---|
| 399 | fprintf(stderr, "LAPACK error: illegal value for argument %d of dtrtrs in sba_Axb_Chol()\n", -info); |
---|
| 400 | exit(1); |
---|
| 401 | } |
---|
| 402 | else{ |
---|
| 403 | fprintf(stderr, "LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in sba_Axb_Chol()\n", info); |
---|
| 404 | return 0; |
---|
| 405 | } |
---|
| 406 | } |
---|
| 407 | |
---|
| 408 | /* solve the linear system U x = y */ |
---|
| 409 | F77_FUNC(dtrtrs)("U", "N", "N", (int *)&m, (int *)&nrhs, a, (int *)&m, b, (int *)&m, &info); |
---|
| 410 | /* error treatment */ |
---|
| 411 | if(info!=0){ |
---|
| 412 | if(info<0){ |
---|
| 413 | fprintf(stderr, "LAPACK error: illegal value for argument %d of dtrtrs in sba_Axb_Chol()\n", -info); |
---|
| 414 | exit(1); |
---|
| 415 | } |
---|
| 416 | else{ |
---|
| 417 | fprintf(stderr, "LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in sba_Axb_Chol()\n", info); |
---|
| 418 | return 0; |
---|
| 419 | } |
---|
| 420 | } |
---|
| 421 | |
---|
| 422 | /* copy the result in x */ |
---|
| 423 | for(i=0; i<m; i++) |
---|
| 424 | x[i]=b[i]; |
---|
| 425 | |
---|
| 426 | return 1; |
---|
| 427 | } |
---|
| 428 | |
---|
| 429 | /* |
---|
| 430 | * This function returns the solution of Ax = b |
---|
| 431 | * |
---|
| 432 | * The function employs LU decomposition: |
---|
| 433 | * If A=L U with L lower and U upper triangular, then the original system |
---|
| 434 | * amounts to solving |
---|
| 435 | * L y = b, U x = y |
---|
| 436 | * |
---|
| 437 | * A is mxm, b is mx1. Argument iscolmaj specifies whether A is |
---|
| 438 | * stored in column or row major order. Note that if iscolmaj==1 |
---|
| 439 | * this function modifies A and B! |
---|
| 440 | * |
---|
| 441 | * The function returns 0 in case of error, |
---|
| 442 | * 1 if successfull |
---|
| 443 | * |
---|
| 444 | * This function is often called repetitively to solve problems of identical |
---|
| 445 | * dimensions. To avoid repetitive malloc's and free's, allocated memory is |
---|
| 446 | * retained between calls and free'd-malloc'ed when not of the appropriate size. |
---|
| 447 | */ |
---|
| 448 | int sba_Axb_LU(double *A, double *B, double *x, int m, int iscolmaj) |
---|
| 449 | { |
---|
| 450 | static double *buf=NULL; |
---|
| 451 | static int buf_sz=0; |
---|
| 452 | |
---|
| 453 | int a_sz, ipiv_sz, b_sz, tot_sz; |
---|
| 454 | register int i, j; |
---|
| 455 | int info, *ipiv, nrhs=1; |
---|
| 456 | double *a, *b; |
---|
| 457 | |
---|
| 458 | /* calculate required memory size */ |
---|
| 459 | ipiv_sz=m; |
---|
| 460 | a_sz=(iscolmaj)? 0 : m*m; |
---|
| 461 | b_sz=(iscolmaj)? 0 : m; |
---|
| 462 | tot_sz=ipiv_sz*sizeof(int) + (a_sz + b_sz)*sizeof(double); |
---|
| 463 | |
---|
| 464 | if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ |
---|
| 465 | if(buf) free(buf); /* free previously allocated memory */ |
---|
| 466 | |
---|
| 467 | buf_sz=tot_sz; |
---|
| 468 | buf=(double *)malloc(buf_sz); |
---|
| 469 | if(!buf){ |
---|
| 470 | fprintf(stderr, "memory allocation in sba_Axb_LU() failed!\n"); |
---|
| 471 | exit(1); |
---|
| 472 | } |
---|
| 473 | } |
---|
| 474 | |
---|
| 475 | ipiv=(int *)buf; |
---|
| 476 | if(!iscolmaj){ |
---|
| 477 | a=(double *)(ipiv + ipiv_sz); |
---|
| 478 | b=a+a_sz; |
---|
| 479 | |
---|
| 480 | /* store A (column major!) into a and B into b */ |
---|
| 481 | for(i=0; i<m; i++){ |
---|
| 482 | for(j=0; j<m; j++) |
---|
| 483 | a[i+j*m]=A[i*m+j]; |
---|
| 484 | |
---|
| 485 | b[i]=B[i]; |
---|
| 486 | } |
---|
| 487 | } |
---|
| 488 | else{ /* no copying is necessary */ |
---|
| 489 | a=A; |
---|
| 490 | b=B; |
---|
| 491 | } |
---|
| 492 | |
---|
| 493 | /* LU decomposition for A */ |
---|
| 494 | F77_FUNC(dgetrf)((int *)&m, (int *)&m, a, (int *)&m, ipiv, (int *)&info); |
---|
| 495 | if(info!=0){ |
---|
| 496 | if(info<0){ |
---|
| 497 | fprintf(stderr, "argument %d of dgetrf illegal in sba_Axb_LU()\n", -info); |
---|
| 498 | exit(1); |
---|
| 499 | } |
---|
| 500 | else{ |
---|
| 501 | fprintf(stderr, "singular matrix A for dgetrf in sba_Axb_LU()\n"); |
---|
| 502 | return 0; |
---|
| 503 | } |
---|
| 504 | } |
---|
| 505 | |
---|
| 506 | /* solve the system with the computed LU */ |
---|
| 507 | F77_FUNC(dgetrs)("N", (int *)&m, (int *)&nrhs, a, (int *)&m, ipiv, b, (int *)&m, (int *)&info); |
---|
| 508 | if(info!=0){ |
---|
| 509 | if(info<0){ |
---|
| 510 | fprintf(stderr, "argument %d of dgetrs illegal in sba_Axb_LU()\n", -info); |
---|
| 511 | exit(1); |
---|
| 512 | } |
---|
| 513 | else{ |
---|
| 514 | fprintf(stderr, "unknown error for dgetrs in sba_Axb_LU()\n"); |
---|
| 515 | return 0; |
---|
| 516 | } |
---|
| 517 | } |
---|
| 518 | |
---|
| 519 | /* copy the result in x */ |
---|
| 520 | for(i=0; i<m; i++){ |
---|
| 521 | x[i]=b[i]; |
---|
| 522 | } |
---|
| 523 | |
---|
| 524 | return 1; |
---|
| 525 | } |
---|
| 526 | |
---|
| 527 | /* |
---|
| 528 | * This function returns the solution of Ax = b |
---|
| 529 | * |
---|
| 530 | * The function is based on SVD decomposition: |
---|
| 531 | * If A=U D V^T with U, V orthogonal and D diagonal, the linear system becomes |
---|
| 532 | * (U D V^T) x = b or x=V D^{-1} U^T b |
---|
| 533 | * Note that V D^{-1} U^T is the pseudoinverse A^+ |
---|
| 534 | * |
---|
| 535 | * A is mxm, b is mx1. Argument iscolmaj specifies whether A is |
---|
| 536 | * stored in column or row major order. Note that if iscolmaj==1 |
---|
| 537 | * this function modifies A! |
---|
| 538 | * |
---|
| 539 | * The function returns 0 in case of error, 1 if successfull |
---|
| 540 | * |
---|
| 541 | * This function is often called repetitively to solve problems of identical |
---|
| 542 | * dimensions. To avoid repetitive malloc's and free's, allocated memory is |
---|
| 543 | * retained between calls and free'd-malloc'ed when not of the appropriate size. |
---|
| 544 | */ |
---|
| 545 | int sba_Axb_SVD(double *A, double *B, double *x, int m, int iscolmaj) |
---|
| 546 | { |
---|
| 547 | static double *buf=NULL; |
---|
| 548 | static int buf_sz=0; |
---|
| 549 | static double eps=-1.0; |
---|
| 550 | |
---|
| 551 | register int i, j; |
---|
| 552 | double *a, *u, *s, *vt, *work; |
---|
| 553 | int a_sz, u_sz, s_sz, vt_sz, tot_sz; |
---|
| 554 | double thresh, one_over_denom; |
---|
| 555 | register double sum; |
---|
| 556 | int info, rank, worksz, *iwork, iworksz; |
---|
| 557 | |
---|
| 558 | /* calculate required memory size */ |
---|
| 559 | worksz=-1; // workspace query. Keep in mind that dgesdd requires more memory than dgesvd |
---|
| 560 | /* note that optimal work size is returned in thresh */ |
---|
| 561 | F77_FUNC(dgesdd)("A", (int *)&m, (int *)&m, NULL, (int *)&m, NULL, NULL, (int *)&m, NULL, (int *)&m, |
---|
| 562 | (double *)&thresh, (int *)&worksz, NULL, &info); |
---|
| 563 | /* F77_FUNC(dgesvd)("A", "A", (int *)&m, (int *)&m, NULL, (int *)&m, NULL, NULL, (int *)&m, NULL, (int *)&m, |
---|
| 564 | (double *)&thresh, (int *)&worksz, &info); */ |
---|
| 565 | worksz=(int)thresh; |
---|
| 566 | iworksz=8*m; |
---|
| 567 | a_sz=(!iscolmaj)? m*m : 0; |
---|
| 568 | u_sz=m*m; s_sz=m; vt_sz=m*m; |
---|
| 569 | |
---|
| 570 | tot_sz=iworksz*sizeof(int) + (a_sz + u_sz + s_sz + vt_sz + worksz)*sizeof(double); |
---|
| 571 | |
---|
| 572 | if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ |
---|
| 573 | if(buf) free(buf); /* free previously allocated memory */ |
---|
| 574 | |
---|
| 575 | buf_sz=tot_sz; |
---|
| 576 | buf=(double *)malloc(buf_sz); |
---|
| 577 | if(!buf){ |
---|
| 578 | fprintf(stderr, "memory allocation in sba_Axb_SVD() failed!\n"); |
---|
| 579 | exit(1); |
---|
| 580 | } |
---|
| 581 | } |
---|
| 582 | |
---|
| 583 | iwork=(int *)buf; |
---|
| 584 | if(!iscolmaj){ |
---|
| 585 | a=(double *)(iwork+iworksz); |
---|
| 586 | /* store A (column major!) into a */ |
---|
| 587 | for(i=0; i<m; i++) |
---|
| 588 | for(j=0; j<m; j++) |
---|
| 589 | a[i+j*m]=A[i*m+j]; |
---|
| 590 | } |
---|
| 591 | else{ |
---|
| 592 | a=A; /* no copying required */ |
---|
| 593 | } |
---|
| 594 | |
---|
| 595 | u=((double *)(iwork+iworksz)) + a_sz; |
---|
| 596 | s=u+u_sz; |
---|
| 597 | vt=s+s_sz; |
---|
| 598 | work=vt+vt_sz; |
---|
| 599 | |
---|
| 600 | /* SVD decomposition of A */ |
---|
| 601 | F77_FUNC(dgesdd)("A", (int *)&m, (int *)&m, a, (int *)&m, s, u, (int *)&m, vt, (int *)&m, work, (int *)&worksz, iwork, &info); |
---|
| 602 | //F77_FUNC(dgesvd)("A", "A", (int *)&m, (int *)&m, a, (int *)&m, s, u, (int *)&m, vt, (int *)&m, work, (int *)&worksz, &info); |
---|
| 603 | |
---|
| 604 | /* error treatment */ |
---|
| 605 | if(info!=0){ |
---|
| 606 | if(info<0){ |
---|
| 607 | fprintf(stderr, "LAPACK error: illegal value for argument %d of dgesdd/dgesvd in sba_Axb_SVD()\n", -info); |
---|
| 608 | exit(1); |
---|
| 609 | } |
---|
| 610 | else{ |
---|
| 611 | fprintf(stderr, "LAPACK error: dgesdd (dbdsdc)/dgesvd (dbdsqr) failed to converge in sba_Axb_SVD() [info=%d]\n", info); |
---|
| 612 | |
---|
| 613 | return 0; |
---|
| 614 | } |
---|
| 615 | } |
---|
| 616 | |
---|
| 617 | if(eps<0.0){ |
---|
| 618 | double aux; |
---|
| 619 | |
---|
| 620 | /* compute machine epsilon. DBL_EPSILON should do also */ |
---|
| 621 | for(eps=1.0; aux=eps+1.0, aux-1.0>0.0; eps*=0.5) |
---|
| 622 | ; |
---|
| 623 | eps*=2.0; |
---|
| 624 | } |
---|
| 625 | |
---|
| 626 | /* compute the pseudoinverse in a */ |
---|
| 627 | memset(a, 0, m*m*sizeof(double)); /* initialize to zero */ |
---|
| 628 | for(rank=0, thresh=eps*s[0]; rank<m && s[rank]>thresh; rank++){ |
---|
| 629 | one_over_denom=1.0/s[rank]; |
---|
| 630 | |
---|
| 631 | for(j=0; j<m; j++) |
---|
| 632 | for(i=0; i<m; i++) |
---|
| 633 | a[i*m+j]+=vt[rank+i*m]*u[j+rank*m]*one_over_denom; |
---|
| 634 | } |
---|
| 635 | |
---|
| 636 | /* compute A^+ b in x */ |
---|
| 637 | for(i=0; i<m; i++){ |
---|
| 638 | for(j=0, sum=0.0; j<m; j++) |
---|
| 639 | sum+=a[i*m+j]*B[j]; |
---|
| 640 | x[i]=sum; |
---|
| 641 | } |
---|
| 642 | |
---|
| 643 | return 1; |
---|
| 644 | } |
---|
| 645 | |
---|
| 646 | /* |
---|
| 647 | * This function returns the solution of Ax = b for a real symmetric matrix A |
---|
| 648 | * |
---|
| 649 | * The function is based on Bunch-Kaufman factorization: |
---|
| 650 | * A is factored as U*D*U^T where U is upper triangular and |
---|
| 651 | * D symmetric and block diagonal |
---|
| 652 | * |
---|
| 653 | * A is mxm, b is mx1. Argument iscolmaj specifies whether A is |
---|
| 654 | * stored in column or row major order. Note that if iscolmaj==1 |
---|
| 655 | * this function modifies A and B! |
---|
| 656 | * |
---|
| 657 | * The function returns 0 in case of error, |
---|
| 658 | * 1 if successfull |
---|
| 659 | * |
---|
| 660 | * This function is often called repetitively to solve problems of identical |
---|
| 661 | * dimensions. To avoid repetitive malloc's and free's, allocated memory is |
---|
| 662 | * retained between calls and free'd-malloc'ed when not of the appropriate size. |
---|
| 663 | */ |
---|
| 664 | int sba_Axb_BK(double *A, double *B, double *x, int m, int iscolmaj) |
---|
| 665 | { |
---|
| 666 | static double *buf=NULL; |
---|
| 667 | static int buf_sz=0; |
---|
| 668 | |
---|
| 669 | int a_sz, ipiv_sz, b_sz, work_sz, tot_sz; |
---|
| 670 | register int i, j; |
---|
| 671 | int info, *ipiv, nrhs=1; |
---|
| 672 | double *a, *b, *work; |
---|
| 673 | |
---|
| 674 | /* calculate required memory size */ |
---|
| 675 | ipiv_sz=m; |
---|
| 676 | a_sz=(iscolmaj)? 0 : m*m; |
---|
| 677 | b_sz=(iscolmaj)? 0 : m; |
---|
| 678 | work_sz=16*m; /* this is probably too much */ |
---|
| 679 | tot_sz=ipiv_sz*sizeof(int) + (a_sz + b_sz + work_sz)*sizeof(double); |
---|
| 680 | |
---|
| 681 | if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ |
---|
| 682 | if(buf) free(buf); /* free previously allocated memory */ |
---|
| 683 | |
---|
| 684 | buf_sz=tot_sz; |
---|
| 685 | buf=(double *)malloc(buf_sz); |
---|
| 686 | if(!buf){ |
---|
| 687 | fprintf(stderr, "memory allocation in sba_Axb_BK() failed!\n"); |
---|
| 688 | exit(1); |
---|
| 689 | } |
---|
| 690 | } |
---|
| 691 | |
---|
| 692 | ipiv=(int *)buf; |
---|
| 693 | if(!iscolmaj){ |
---|
| 694 | a=(double *)(ipiv + ipiv_sz); |
---|
| 695 | b=a+a_sz; |
---|
| 696 | work=b+b_sz; |
---|
| 697 | |
---|
| 698 | /* store A (column major!) into a and B into b */ |
---|
| 699 | for(i=0; i<m; i++){ |
---|
| 700 | for(j=0; j<m; j++) |
---|
| 701 | a[i+j*m]=A[i*m+j]; |
---|
| 702 | |
---|
| 703 | b[i]=B[i]; |
---|
| 704 | } |
---|
| 705 | } |
---|
| 706 | else{ /* no copying is necessary */ |
---|
| 707 | a=A; |
---|
| 708 | b=B; |
---|
| 709 | work=(double *)(ipiv + ipiv_sz); |
---|
| 710 | } |
---|
| 711 | |
---|
| 712 | /* factorization for A */ |
---|
| 713 | F77_FUNC(dsytrf)("U", (int *)&m, a, (int *)&m, ipiv, work, (int *)&work_sz, (int *)&info); |
---|
| 714 | if(info!=0){ |
---|
| 715 | if(info<0){ |
---|
| 716 | fprintf(stderr, "argument %d of dsytrf illegal in sba_Axb_BK()\n", -info); |
---|
| 717 | exit(1); |
---|
| 718 | } |
---|
| 719 | else{ |
---|
| 720 | fprintf(stderr, "singular block diagonal matrix D for dsytrf in sba_Axb_BK() [D(%d, %d) is zero]\n", info, info); |
---|
| 721 | return 0; |
---|
| 722 | } |
---|
| 723 | } |
---|
| 724 | |
---|
| 725 | /* solve the system with the computed factorization */ |
---|
| 726 | F77_FUNC(dsytrs)("U", (int *)&m, (int *)&nrhs, a, (int *)&m, ipiv, b, (int *)&m, (int *)&info); |
---|
| 727 | if(info!=0){ |
---|
| 728 | if(info<0){ |
---|
| 729 | fprintf(stderr, "argument %d of dsytrs illegal in sba_Axb_BK()\n", -info); |
---|
| 730 | exit(1); |
---|
| 731 | } |
---|
| 732 | else{ |
---|
| 733 | fprintf(stderr, "unknown error for dsytrs in sba_Axb_BK()\n"); |
---|
| 734 | return 0; |
---|
| 735 | } |
---|
| 736 | } |
---|
| 737 | |
---|
| 738 | /* copy the result in x */ |
---|
| 739 | for(i=0; i<m; i++){ |
---|
| 740 | x[i]=b[i]; |
---|
| 741 | } |
---|
| 742 | |
---|
| 743 | return 1; |
---|
| 744 | } |
---|
| 745 | |
---|
| 746 | /* |
---|
| 747 | * This function computes the inverse of a square matrix A into B |
---|
| 748 | * using LU decomposition |
---|
| 749 | * |
---|
| 750 | * The function returns 0 in case of error (e.g. A is singular), |
---|
| 751 | * 1 if successfull |
---|
| 752 | * |
---|
| 753 | * This function is often called repetitively to solve problems of identical |
---|
| 754 | * dimensions. To avoid repetitive malloc's and free's, allocated memory is |
---|
| 755 | * retained between calls and free'd-malloc'ed when not of the appropriate size. |
---|
| 756 | */ |
---|
| 757 | int sba_mat_invert_LU(double *A, double *B, int m) |
---|
| 758 | { |
---|
| 759 | static double *buf=NULL; |
---|
| 760 | static int buf_sz=0; |
---|
| 761 | |
---|
| 762 | int a_sz, ipiv_sz, work_sz, tot_sz; |
---|
| 763 | register int i, j; |
---|
| 764 | int info, *ipiv; |
---|
| 765 | double *a, *work; |
---|
| 766 | |
---|
| 767 | /* calculate required memory size */ |
---|
| 768 | ipiv_sz=m; |
---|
| 769 | a_sz=m*m; |
---|
| 770 | work_sz=16*m; /* this is probably too much */ |
---|
| 771 | tot_sz=ipiv_sz*sizeof(int) + (a_sz + work_sz)*sizeof(double); |
---|
| 772 | |
---|
| 773 | if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ |
---|
| 774 | if(buf) free(buf); /* free previously allocated memory */ |
---|
| 775 | |
---|
| 776 | buf_sz=tot_sz; |
---|
| 777 | buf=(double *)malloc(buf_sz); |
---|
| 778 | if(!buf){ |
---|
| 779 | fprintf(stderr, "memory allocation in sba_mat_invert_LU() failed!\n"); |
---|
| 780 | exit(1); |
---|
| 781 | } |
---|
| 782 | } |
---|
| 783 | |
---|
| 784 | ipiv=(int *)buf; |
---|
| 785 | a=(double *)(ipiv + ipiv_sz); |
---|
| 786 | work=a+a_sz; |
---|
| 787 | |
---|
| 788 | /* store A (column major!) into a */ |
---|
| 789 | for(i=0; i<m; i++) |
---|
| 790 | for(j=0; j<m; j++) |
---|
| 791 | a[i+j*m]=A[i*m+j]; |
---|
| 792 | |
---|
| 793 | /* LU decomposition for A */ |
---|
| 794 | F77_FUNC(dgetrf)((int *)&m, (int *)&m, a, (int *)&m, ipiv, (int *)&info); |
---|
| 795 | if(info!=0){ |
---|
| 796 | if(info<0){ |
---|
| 797 | fprintf(stderr, "argument %d of dgetrf illegal in sba_mat_invert_LU()\n", -info); |
---|
| 798 | exit(1); |
---|
| 799 | } |
---|
| 800 | else{ |
---|
| 801 | fprintf(stderr, "singular matrix A for dgetrf in sba_mat_invert_LU()\n"); |
---|
| 802 | return 0; |
---|
| 803 | } |
---|
| 804 | } |
---|
| 805 | |
---|
| 806 | /* (A)^{-1} from LU */ |
---|
| 807 | F77_FUNC(dgetri)((int *)&m, a, (int *)&m, ipiv, work, (int *)&work_sz, (int *)&info); |
---|
| 808 | if(info!=0){ |
---|
| 809 | if(info<0){ |
---|
| 810 | fprintf(stderr, "argument %d of dgetri illegal in sba_mat_invert_LU()\n", -info); |
---|
| 811 | exit(1); |
---|
| 812 | } |
---|
| 813 | else{ |
---|
| 814 | fprintf(stderr, "singular matrix A for dgetri in sba_mat_invert_LU()\n"); |
---|
| 815 | return 0; |
---|
| 816 | } |
---|
| 817 | } |
---|
| 818 | |
---|
| 819 | /* store (A)^{-1} in B */ |
---|
| 820 | for(i=0; i<m; i++) |
---|
| 821 | for(j=0; j<m; j++) |
---|
| 822 | B[i*m+j]=a[i+j*m]; |
---|
| 823 | |
---|
| 824 | return 1; |
---|
| 825 | } |
---|
| 826 | |
---|
| 827 | /* |
---|
| 828 | * This function computes the inverse of a square symmetric positive definite |
---|
| 829 | * matrix A into B using Cholesky factorization |
---|
| 830 | * |
---|
| 831 | * The function returns 0 in case of error (e.g. A is not positive definite or singular), |
---|
| 832 | * 1 if successfull |
---|
| 833 | * |
---|
| 834 | * This function is often called repetitively to solve problems of identical |
---|
| 835 | * dimensions. To avoid repetitive malloc's and free's, allocated memory is |
---|
| 836 | * retained between calls and free'd-malloc'ed when not of the appropriate size. |
---|
| 837 | */ |
---|
| 838 | int sba_mat_invert_Chol(double *A, double *B, int m) |
---|
| 839 | { |
---|
| 840 | static double *buf=NULL; |
---|
| 841 | static int buf_sz=0; |
---|
| 842 | |
---|
| 843 | int a_sz, tot_sz; |
---|
| 844 | register int i, j; |
---|
| 845 | int info; |
---|
| 846 | double *a; |
---|
| 847 | |
---|
| 848 | /* calculate required memory size */ |
---|
| 849 | a_sz=m*m; |
---|
| 850 | tot_sz=a_sz; |
---|
| 851 | |
---|
| 852 | if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ |
---|
| 853 | if(buf) free(buf); /* free previously allocated memory */ |
---|
| 854 | |
---|
| 855 | buf_sz=tot_sz; |
---|
| 856 | buf=(double *)malloc(buf_sz*sizeof(double)); |
---|
| 857 | if(!buf){ |
---|
| 858 | fprintf(stderr, "memory allocation in sba_mat_invert_Chol() failed!\n"); |
---|
| 859 | exit(1); |
---|
| 860 | } |
---|
| 861 | } |
---|
| 862 | |
---|
| 863 | a=(double *)buf; |
---|
| 864 | |
---|
| 865 | /* store A (column major!) into a */ |
---|
| 866 | for(i=0; i<m; i++) |
---|
| 867 | for(j=0; j<m; j++) |
---|
| 868 | a[i+j*m]=A[i*m+j]; |
---|
| 869 | |
---|
| 870 | /* Cholesky factorization for A */ |
---|
| 871 | F77_FUNC(dpotrf)("L", (int *)&m, a, (int *)&m, (int *)&info); |
---|
| 872 | /* error treatment */ |
---|
| 873 | if(info!=0){ |
---|
| 874 | if(info<0){ |
---|
| 875 | fprintf(stderr, "LAPACK error: illegal value for argument %d of dpotrf in sba_mat_invert_Chol()\n", -info); |
---|
| 876 | exit(1); |
---|
| 877 | } |
---|
| 878 | else{ |
---|
| 879 | fprintf(stderr, "LAPACK error: the leading minor of order %d is not positive definite,\nthe factorization could not be completed for dpotrf in sba_mat_invert_Chol()\n", info); |
---|
| 880 | return 0; |
---|
| 881 | } |
---|
| 882 | } |
---|
| 883 | |
---|
| 884 | /* (A)^{-1} from Cholesky */ |
---|
| 885 | F77_FUNC(dpotri)("L", (int *)&m, a, (int *)&m, (int *)&info); |
---|
| 886 | if(info!=0){ |
---|
| 887 | if(info<0){ |
---|
| 888 | fprintf(stderr, "argument %d of dpotri illegal in sba_mat_invert_Chol()\n", -info); |
---|
| 889 | exit(1); |
---|
| 890 | } |
---|
| 891 | else{ |
---|
| 892 | fprintf(stderr, "the (%d, %d) element of the factor U or L is zero, singular matrix A for dpotri in sba_mat_invert_Chol()\n", info, info); |
---|
| 893 | return 0; |
---|
| 894 | } |
---|
| 895 | } |
---|
| 896 | |
---|
| 897 | /* store (A)^{-1} in B. The lower triangle of the symmetric A^{-1} is in the lower triangle of a */ |
---|
| 898 | for(i=0; i<m; i++) |
---|
| 899 | for(j=0; j<=i; j++) |
---|
| 900 | B[i*m+j]=B[j*m+i]=a[i+j*m]; |
---|
| 901 | |
---|
| 902 | return 1; |
---|
| 903 | } |
---|
| 904 | |
---|
| 905 | |
---|
| 906 | #define __CG_LINALG_BLOCKSIZE 8 |
---|
| 907 | |
---|
| 908 | /* Dot product of two vectors x and y using loop unrolling and blocking. |
---|
| 909 | * see http://www.abarnett.demon.co.uk/tutorial.html |
---|
| 910 | */ |
---|
| 911 | |
---|
| 912 | inline static double dprod(int n, double *x, double *y) |
---|
| 913 | { |
---|
| 914 | register int i, j1, j2, j3, j4, j5, j6, j7; |
---|
| 915 | int blockn; |
---|
| 916 | register double sum0=0.0, sum1=0.0, sum2=0.0, sum3=0.0, |
---|
| 917 | sum4=0.0, sum5=0.0, sum6=0.0, sum7=0.0; |
---|
| 918 | |
---|
| 919 | /* n may not be divisible by __CG_LINALG_BLOCKSIZE, |
---|
| 920 | * go as near as we can first, then tidy up. |
---|
| 921 | */ |
---|
| 922 | blockn = (n / __CG_LINALG_BLOCKSIZE) * __CG_LINALG_BLOCKSIZE; |
---|
| 923 | |
---|
| 924 | /* unroll the loop in blocks of `__CG_LINALG_BLOCKSIZE' */ |
---|
| 925 | for(i=0; i<blockn; i+=__CG_LINALG_BLOCKSIZE){ |
---|
| 926 | sum0+=x[i]*y[i]; |
---|
| 927 | j1=i+1; sum1+=x[j1]*y[j1]; |
---|
| 928 | j2=i+2; sum2+=x[j2]*y[j2]; |
---|
| 929 | j3=i+3; sum3+=x[j3]*y[j3]; |
---|
| 930 | j4=i+4; sum4+=x[j4]*y[j4]; |
---|
| 931 | j5=i+5; sum5+=x[j5]*y[j5]; |
---|
| 932 | j6=i+6; sum6+=x[j6]*y[j6]; |
---|
| 933 | j7=i+7; sum7+=x[j7]*y[j7]; |
---|
| 934 | } |
---|
| 935 | |
---|
| 936 | /* |
---|
| 937 | * There may be some left to do. |
---|
| 938 | * This could be done as a simple for() loop, |
---|
| 939 | * but a switch is faster (and more interesting) |
---|
| 940 | */ |
---|
| 941 | |
---|
| 942 | if(i<n){ |
---|
| 943 | /* Jump into the case at the place that will allow |
---|
| 944 | * us to finish off the appropriate number of items. |
---|
| 945 | */ |
---|
| 946 | |
---|
| 947 | switch(n - i){ |
---|
| 948 | case 7 : sum0+=x[i]*y[i]; ++i; |
---|
| 949 | case 6 : sum1+=x[i]*y[i]; ++i; |
---|
| 950 | case 5 : sum2+=x[i]*y[i]; ++i; |
---|
| 951 | case 4 : sum3+=x[i]*y[i]; ++i; |
---|
| 952 | case 3 : sum4+=x[i]*y[i]; ++i; |
---|
| 953 | case 2 : sum5+=x[i]*y[i]; ++i; |
---|
| 954 | case 1 : sum6+=x[i]*y[i]; ++i; |
---|
| 955 | } |
---|
| 956 | } |
---|
| 957 | |
---|
| 958 | return sum0+sum1+sum2+sum3+sum4+sum5+sum6+sum7; |
---|
| 959 | } |
---|
| 960 | |
---|
| 961 | |
---|
| 962 | /* Compute z=x+a*y for two vectors x and y and a scalar a; z can be one of x, y. |
---|
| 963 | * Similarly to the dot product routine, this one uses loop unrolling and blocking |
---|
| 964 | */ |
---|
| 965 | |
---|
| 966 | inline static void daxpy(int n, double *z, double *x, double a, double *y) |
---|
| 967 | { |
---|
| 968 | register int i, j1, j2, j3, j4, j5, j6, j7; |
---|
| 969 | int blockn; |
---|
| 970 | |
---|
| 971 | /* n may not be divisible by __CG_LINALG_BLOCKSIZE, |
---|
| 972 | * go as near as we can first, then tidy up. |
---|
| 973 | */ |
---|
| 974 | blockn = (n / __CG_LINALG_BLOCKSIZE) * __CG_LINALG_BLOCKSIZE; |
---|
| 975 | |
---|
| 976 | /* unroll the loop in blocks of `__CG_LINALG_BLOCKSIZE' */ |
---|
| 977 | for(i=0; i<blockn; i+=__CG_LINALG_BLOCKSIZE){ |
---|
| 978 | z[i]=x[i]+a*y[i]; |
---|
| 979 | j1=i+1; z[j1]=x[j1]+a*y[j1]; |
---|
| 980 | j2=i+2; z[j2]=x[j2]+a*y[j2]; |
---|
| 981 | j3=i+3; z[j3]=x[j3]+a*y[j3]; |
---|
| 982 | j4=i+4; z[j4]=x[j4]+a*y[j4]; |
---|
| 983 | j5=i+5; z[j5]=x[j5]+a*y[j5]; |
---|
| 984 | j6=i+6; z[j6]=x[j6]+a*y[j6]; |
---|
| 985 | j7=i+7; z[j7]=x[j7]+a*y[j7]; |
---|
| 986 | } |
---|
| 987 | |
---|
| 988 | /* |
---|
| 989 | * There may be some left to do. |
---|
| 990 | * This could be done as a simple for() loop, |
---|
| 991 | * but a switch is faster (and more interesting) |
---|
| 992 | */ |
---|
| 993 | |
---|
| 994 | if(i<n){ |
---|
| 995 | /* Jump into the case at the place that will allow |
---|
| 996 | * us to finish off the appropriate number of items. |
---|
| 997 | */ |
---|
| 998 | |
---|
| 999 | switch(n - i){ |
---|
| 1000 | case 7 : z[i]=x[i]+a*y[i]; ++i; |
---|
| 1001 | case 6 : z[i]=x[i]+a*y[i]; ++i; |
---|
| 1002 | case 5 : z[i]=x[i]+a*y[i]; ++i; |
---|
| 1003 | case 4 : z[i]=x[i]+a*y[i]; ++i; |
---|
| 1004 | case 3 : z[i]=x[i]+a*y[i]; ++i; |
---|
| 1005 | case 2 : z[i]=x[i]+a*y[i]; ++i; |
---|
| 1006 | case 1 : z[i]=x[i]+a*y[i]; ++i; |
---|
| 1007 | } |
---|
| 1008 | } |
---|
| 1009 | } |
---|
| 1010 | |
---|
| 1011 | /* |
---|
| 1012 | * This function returns the solution of Ax = b where A is posititive definite, |
---|
| 1013 | * based on the conjugate gradients method; see "An intro to the CG method" by J.R. Shewchuk, p. 50-51 |
---|
| 1014 | * |
---|
| 1015 | * A is mxm, b, x are is mx1. Argument niter specifies the maximum number of |
---|
| 1016 | * iterations and eps is the desired solution accuracy. niter<0 signals that |
---|
| 1017 | * x contains a valid initial approximation to the solution; if niter>0 then |
---|
| 1018 | * the starting point is taken to be zero. Argument prec selects the desired |
---|
| 1019 | * preconditioning method as follows: |
---|
| 1020 | * 0: no preconditioning |
---|
| 1021 | * 1: jacobi (diagonal) preconditioning |
---|
| 1022 | * 2: SSOR preconditioning |
---|
| 1023 | * Argument iscolmaj specifies whether A is stored in column or row major order. |
---|
| 1024 | * |
---|
| 1025 | * The function returns 0 in case of error, |
---|
| 1026 | * the number of iterations performed if successfull |
---|
| 1027 | * |
---|
| 1028 | * This function is often called repetitively to solve problems of identical |
---|
| 1029 | * dimensions. To avoid repetitive malloc's and free's, allocated memory is |
---|
| 1030 | * retained between calls and free'd-malloc'ed when not of the appropriate size. |
---|
| 1031 | */ |
---|
| 1032 | int sba_Axb_CG(double *A, double *B, double *x, int m, int niter, double eps, int prec, int iscolmaj) |
---|
| 1033 | { |
---|
| 1034 | static double *buf=NULL; |
---|
| 1035 | static int buf_sz=0; |
---|
| 1036 | |
---|
| 1037 | register int i, j; |
---|
| 1038 | register double *aim; |
---|
| 1039 | int iter, a_sz, res_sz, d_sz, q_sz, s_sz, wk_sz, z_sz, tot_sz; |
---|
| 1040 | double *a, *res, *d, *q, *s, *wk, *z; |
---|
| 1041 | double delta0, deltaold, deltanew, alpha, beta, eps_sq=eps*eps; |
---|
| 1042 | register double sum; |
---|
| 1043 | int rec_res; |
---|
| 1044 | |
---|
| 1045 | /* calculate required memory size */ |
---|
| 1046 | a_sz=(iscolmaj)? m*m : 0; |
---|
| 1047 | res_sz=m; d_sz=m; q_sz=m; |
---|
| 1048 | if(prec!=SBA_CG_NOPREC){ |
---|
| 1049 | s_sz=m; wk_sz=m; |
---|
| 1050 | z_sz=(prec==SBA_CG_SSOR)? m : 0; |
---|
| 1051 | } |
---|
| 1052 | else |
---|
| 1053 | s_sz=wk_sz=z_sz=0; |
---|
| 1054 | |
---|
| 1055 | tot_sz=a_sz+res_sz+d_sz+q_sz+s_sz+wk_sz+z_sz; |
---|
| 1056 | |
---|
| 1057 | if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ |
---|
| 1058 | if(buf) free(buf); /* free previously allocated memory */ |
---|
| 1059 | |
---|
| 1060 | buf_sz=tot_sz; |
---|
| 1061 | buf=(double *)malloc(buf_sz*sizeof(double)); |
---|
| 1062 | if(!buf){ |
---|
| 1063 | fprintf(stderr, "memory allocation request failed in sba_Axb_CG()\n"); |
---|
| 1064 | exit(1); |
---|
| 1065 | } |
---|
| 1066 | } |
---|
| 1067 | |
---|
| 1068 | if(iscolmaj){ |
---|
| 1069 | a=buf; |
---|
| 1070 | /* store A (row major!) into a */ |
---|
| 1071 | for(i=0; i<m; ++i) |
---|
| 1072 | for(j=0, aim=a+i*m; j<m; ++j) |
---|
| 1073 | aim[j]=A[i+j*m]; |
---|
| 1074 | } |
---|
| 1075 | else a=A; /* no copying required */ |
---|
| 1076 | |
---|
| 1077 | res=buf+a_sz; |
---|
| 1078 | d=res+res_sz; |
---|
| 1079 | q=d+d_sz; |
---|
| 1080 | if(prec!=SBA_CG_NOPREC){ |
---|
| 1081 | s=q+q_sz; |
---|
| 1082 | wk=s+s_sz; |
---|
| 1083 | z=(prec==SBA_CG_SSOR)? wk+wk_sz : NULL; |
---|
| 1084 | |
---|
| 1085 | for(i=0; i<m; ++i){ // compute jacobi (i.e. diagonal) preconditioners and save them in wk |
---|
| 1086 | sum=a[i*m+i]; |
---|
| 1087 | if(sum>DBL_EPSILON || -sum<-DBL_EPSILON) // != 0.0 |
---|
| 1088 | wk[i]=1.0/sum; |
---|
| 1089 | else |
---|
| 1090 | wk[i]=1.0/DBL_EPSILON; |
---|
| 1091 | } |
---|
| 1092 | } |
---|
| 1093 | else{ |
---|
| 1094 | s=res; |
---|
| 1095 | wk=z=NULL; |
---|
| 1096 | } |
---|
| 1097 | |
---|
| 1098 | if(niter>0) |
---|
| 1099 | for(i=0; i<m; ++i){ // clear solution and initialize residual vector: res <-- B |
---|
| 1100 | x[i]=0.0; |
---|
| 1101 | res[i]=B[i]; |
---|
| 1102 | } |
---|
| 1103 | else{ |
---|
| 1104 | niter=-niter; |
---|
| 1105 | |
---|
| 1106 | for(i=0; i<m; ++i){ // initialize residual vector: res <-- B - A*x |
---|
| 1107 | for(j=0, aim=a+i*m, sum=0.0; j<m; ++j) |
---|
| 1108 | sum+=aim[j]*x[j]; |
---|
| 1109 | res[i]=B[i]-sum; |
---|
| 1110 | } |
---|
| 1111 | } |
---|
| 1112 | |
---|
| 1113 | switch(prec){ |
---|
| 1114 | case SBA_CG_NOPREC: |
---|
| 1115 | for(i=0, deltanew=0.0; i<m; ++i){ |
---|
| 1116 | d[i]=res[i]; |
---|
| 1117 | deltanew+=res[i]*res[i]; |
---|
| 1118 | } |
---|
| 1119 | break; |
---|
| 1120 | case SBA_CG_JACOBI: // jacobi preconditioning |
---|
| 1121 | for(i=0, deltanew=0.0; i<m; ++i){ |
---|
| 1122 | d[i]=res[i]*wk[i]; |
---|
| 1123 | deltanew+=res[i]*d[i]; |
---|
| 1124 | } |
---|
| 1125 | break; |
---|
| 1126 | case SBA_CG_SSOR: // SSOR preconditioning; see the "templates" book, fig. 3.2, p. 44 |
---|
| 1127 | for(i=0; i<m; ++i){ |
---|
| 1128 | for(j=0, sum=0.0, aim=a+i*m; j<i; ++j) |
---|
| 1129 | sum+=aim[j]*z[j]; |
---|
| 1130 | z[i]=wk[i]*(res[i]-sum); |
---|
| 1131 | } |
---|
| 1132 | |
---|
| 1133 | for(i=m-1; i>=0; --i){ |
---|
| 1134 | for(j=i+1, sum=0.0, aim=a+i*m; j<m; ++j) |
---|
| 1135 | sum+=aim[j]*d[j]; |
---|
| 1136 | d[i]=z[i]-wk[i]*sum; |
---|
| 1137 | } |
---|
| 1138 | deltanew=dprod(m, res, d); |
---|
| 1139 | break; |
---|
| 1140 | default: |
---|
| 1141 | fprintf(stderr, "unknown preconditioning option %d in sba_Axb_CG\n", prec); |
---|
| 1142 | exit(1); |
---|
| 1143 | } |
---|
| 1144 | |
---|
| 1145 | delta0=deltanew; |
---|
| 1146 | |
---|
| 1147 | for(iter=1; deltanew>eps_sq*delta0 && iter<=niter; ++iter){ |
---|
| 1148 | for(i=0; i<m; ++i){ // q <-- A d |
---|
| 1149 | aim=a+i*m; |
---|
| 1150 | /*** |
---|
| 1151 | for(j=0, sum=0.0; j<m; ++j) |
---|
| 1152 | sum+=aim[j]*d[j]; |
---|
| 1153 | ***/ |
---|
| 1154 | q[i]=dprod(m, aim, d); //sum; |
---|
| 1155 | } |
---|
| 1156 | |
---|
| 1157 | /*** |
---|
| 1158 | for(i=0, sum=0.0; i<m; ++i) |
---|
| 1159 | sum+=d[i]*q[i]; |
---|
| 1160 | ***/ |
---|
| 1161 | alpha=deltanew/dprod(m, d, q); // deltanew/sum; |
---|
| 1162 | |
---|
| 1163 | /*** |
---|
| 1164 | for(i=0; i<m; ++i) |
---|
| 1165 | x[i]+=alpha*d[i]; |
---|
| 1166 | ***/ |
---|
| 1167 | daxpy(m, x, x, alpha, d); |
---|
| 1168 | |
---|
| 1169 | if(!(iter%50)){ |
---|
| 1170 | for(i=0; i<m; ++i){ // accurate computation of the residual vector |
---|
| 1171 | aim=a+i*m; |
---|
| 1172 | /*** |
---|
| 1173 | for(j=0, sum=0.0; j<m; ++j) |
---|
| 1174 | sum+=aim[j]*x[j]; |
---|
| 1175 | ***/ |
---|
| 1176 | res[i]=B[i]-dprod(m, aim, x); //B[i]-sum; |
---|
| 1177 | } |
---|
| 1178 | rec_res=0; |
---|
| 1179 | } |
---|
| 1180 | else{ |
---|
| 1181 | /*** |
---|
| 1182 | for(i=0; i<m; ++i) // approximate computation of the residual vector |
---|
| 1183 | res[i]-=alpha*q[i]; |
---|
| 1184 | ***/ |
---|
| 1185 | daxpy(m, res, res, -alpha, q); |
---|
| 1186 | rec_res=1; |
---|
| 1187 | } |
---|
| 1188 | |
---|
| 1189 | if(prec){ |
---|
| 1190 | switch(prec){ |
---|
| 1191 | case SBA_CG_JACOBI: // jacobi |
---|
| 1192 | for(i=0; i<m; ++i) |
---|
| 1193 | s[i]=res[i]*wk[i]; |
---|
| 1194 | break; |
---|
| 1195 | case SBA_CG_SSOR: // SSOR |
---|
| 1196 | for(i=0; i<m; ++i){ |
---|
| 1197 | for(j=0, sum=0.0, aim=a+i*m; j<i; ++j) |
---|
| 1198 | sum+=aim[j]*z[j]; |
---|
| 1199 | z[i]=wk[i]*(res[i]-sum); |
---|
| 1200 | } |
---|
| 1201 | |
---|
| 1202 | for(i=m-1; i>=0; --i){ |
---|
| 1203 | for(j=i+1, sum=0.0, aim=a+i*m; j<m; ++j) |
---|
| 1204 | sum+=aim[j]*s[j]; |
---|
| 1205 | s[i]=z[i]-wk[i]*sum; |
---|
| 1206 | } |
---|
| 1207 | break; |
---|
| 1208 | } |
---|
| 1209 | } |
---|
| 1210 | |
---|
| 1211 | deltaold=deltanew; |
---|
| 1212 | /*** |
---|
| 1213 | for(i=0, sum=0.0; i<m; ++i) |
---|
| 1214 | sum+=res[i]*s[i]; |
---|
| 1215 | ***/ |
---|
| 1216 | deltanew=dprod(m, res, s); //sum; |
---|
| 1217 | |
---|
| 1218 | /* make sure that we get around small delta that are due to |
---|
| 1219 | * accumulated floating point roundoff errors |
---|
| 1220 | */ |
---|
| 1221 | if(rec_res && deltanew<=eps_sq*delta0){ |
---|
| 1222 | /* analytically recompute delta */ |
---|
| 1223 | for(i=0; i<m; ++i){ |
---|
| 1224 | for(j=0, aim=a+i*m, sum=0.0; j<m; ++j) |
---|
| 1225 | sum+=aim[j]*x[j]; |
---|
| 1226 | res[i]=B[i]-sum; |
---|
| 1227 | } |
---|
| 1228 | deltanew=dprod(m, res, s); |
---|
| 1229 | } |
---|
| 1230 | |
---|
| 1231 | beta=deltanew/deltaold; |
---|
| 1232 | |
---|
| 1233 | /*** |
---|
| 1234 | for(i=0; i<m; ++i) |
---|
| 1235 | d[i]=s[i]+beta*d[i]; |
---|
| 1236 | ***/ |
---|
| 1237 | daxpy(m, d, s, beta, d); |
---|
| 1238 | } |
---|
| 1239 | |
---|
| 1240 | return iter; |
---|
| 1241 | } |
---|