Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

sba_lapack.c @ 37

Last change on this file since 37 was 37, checked in by (none), 14 years ago
Added original make3d
File size: 35.3 KB

Line
1	/////////////////////////////////////////////////////////////////////////////////
2	////
3	//// Linear algebra operations for the sba package
4	//// Copyright (C) 2004 Manolis Lourakis (lourakis@ics.forth.gr)
5	//// Institute of Computer Science, Foundation for Research & Technology - Hellas
6	//// Heraklion, Crete, Greece.
7	////
8	//// This program is free software; you can redistribute it and/or modify
9	//// it under the terms of the GNU General Public License as published by
10	//// the Free Software Foundation; either version 2 of the License, or
11	//// (at your option) any later version.
12	////
13	//// This program is distributed in the hope that it will be useful,
14	//// but WITHOUT ANY WARRANTY; without even the implied warranty of
15	//// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16	//// GNU General Public License for more details.
17	////
18	///////////////////////////////////////////////////////////////////////////////////
19
20	#include <stdio.h>
21	#include <stdlib.h>
22	#include <string.h>
23	#include <math.h>
24	#include <float.h>
25
/*
 * Portability shim for the `inline` keyword:
 *  - MSVC spells it `__inline` when compiling C,
 *  - GCC-compatible compilers understand `inline` natively,
 *  - any other compiler gets an empty definition so the keyword disappears.
 */
#ifdef _MSC_VER
#define inline __inline /* MSVC */
#elif !defined(__GNUC__)
#define inline /* other than MSVC, GCC: define empty */
#endif
32
33	#include "sba.h"
34
/*
 * Name mangling for Fortran symbols: when SBA_APPEND_UNDERSCORE_SUFFIX is
 * defined a trailing underscore is appended to the routine name, otherwise
 * the name is used verbatim.
 */
#ifdef SBA_APPEND_UNDERSCORE_SUFFIX
#define F77_FUNC(func) func ## _
#else
#define F77_FUNC(func) func
#endif /* SBA_APPEND_UNDERSCORE_SUFFIX */


/* declarations of LAPACK routines employed.
 * Fortran passes every argument by reference, hence all parameters are pointers.
 */

/* QR decomposition */
extern int F77_FUNC(dgeqrf)(int *m, int *n, double *a, int *lda, double *tau, double *work, int *lwork, int *info);
extern int F77_FUNC(dorgqr)(int *m, int *n, int *k, double *a, int *lda, double *tau, double *work, int *lwork, int *info);

/* solution of triangular system */
extern int F77_FUNC(dtrtrs)(char *uplo, char *trans, char *diag, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, int *info);

/* cholesky decomposition, matrix inversion */
extern int F77_FUNC(dpotf2)(char *uplo, int *n, double *a, int *lda, int *info);
extern int F77_FUNC(dpotrf)(char *uplo, int *n, double *a, int *lda, int *info); /* block version of dpotf2 */
extern int F77_FUNC(dpotri)(char *uplo, int *n, double *a, int *lda, int *info);

/* LU decomposition, linear system solution and matrix inversion */
extern int F77_FUNC(dgetrf)(int *m, int *n, double *a, int *lda, int *ipiv, int *info);
extern int F77_FUNC(dgetrs)(char *trans, int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info);
extern int F77_FUNC(dgetri)(int *n, double *a, int *lda, int *ipiv, double *work, int *lwork, int *info);

/* SVD */
extern int F77_FUNC(dgesvd)(char *jobu, char *jobvt, int *m, int *n,
                            double *a, int *lda, double *s, double *u, int *ldu,
                            double *vt, int *ldvt, double *work, int *lwork,
                            int *info);

/* lapack 3.0 routine, faster than dgesvd() */
extern int F77_FUNC(dgesdd)(char *jobz, int *m, int *n, double *a, int *lda,
                            double *s, double *u, int *ldu, double *vt, int *ldvt,
                            double *work, int *lwork, int *iwork, int *info);


/* Bunch-Kaufman factorization of a real symmetric matrix A and solution of linear systems */
extern int F77_FUNC(dsytrf)(char *uplo, int *n, double *a, int *lda, int *ipiv, double *work, int *lwork, int *info);
extern int F77_FUNC(dsytrs)(char *uplo, int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, int *info);
76
77
78	/*
79	* This function returns the solution of Ax = b
80	*
81	* The function is based on QR decomposition with explicit computation of Q:
82	* If A=Q R with Q orthogonal and R upper triangular, the linear system becomes
83	* Q R x = b or R x = Q^T b.
84	*
85	* A is mxm, b is mx1. Argument iscolmaj specifies whether A is
86	* stored in column or row major order. Note that if iscolmaj==1
87	* this function modifies A!
88	*
89	* The function returns 0 in case of error, 1 if successfull
90	*
91	* This function is often called repetitively to solve problems of identical
92	* dimensions. To avoid repetitive malloc's and free's, allocated memory is
93	* retained between calls and free'd-malloc'ed when not of the appropriate size.
94	*/
95	int sba_Axb_QR(double A, double B, double *x, int m, int iscolmaj)
96	{
97	static double *buf=NULL;
98	static int buf_sz=0;
99
100	double a, qtb, r, tau, *work;
101	int a_sz, qtb_sz, r_sz, tau_sz, tot_sz;
102	register int i, j;
103	int info, worksz, nrhs=1;
104	register double sum;
105
106	/* calculate required memory size */
107	a_sz=(iscolmaj)? 0 : m*m;
108	qtb_sz=m;
109	r_sz=mm; / only the upper triangular part really needed */
110	tau_sz=m;
111	worksz=3m; / this is probably too much */
112	tot_sz=a_sz + qtb_sz + r_sz + tau_sz + worksz;
113
114	if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */
115	if(buf) free(buf); /* free previously allocated memory */
116
117	buf_sz=tot_sz;
118	buf=(double )malloc(buf_szsizeof(double));
119	if(!buf){
120	fprintf(stderr, "memory allocation in sba_Axb_QR() failed!\n");
121	exit(1);
122	}
123	}
124
125	if(!iscolmaj){
126	a=buf;
127	/* store A (column major!) into a */
128	for(i=0; i<m; i++)
129	for(j=0; j<m; j++)
130	a[i+jm]=A[im+j];
131	}
132	else a=A; /* no copying required */
133
134	qtb=buf+a_sz;
135	r=qtb+qtb_sz;
136	tau=r+r_sz;
137	work=tau+tau_sz;
138
139	/* QR decomposition of A */
140	F77_FUNC(dgeqrf)((int )&m, (int )&m, a, (int )&m, tau, work, (int )&worksz, (int *)&info);
141	/* error treatment */
142	if(info!=0){
143	if(info<0){
144	fprintf(stderr, "LAPACK error: illegal value for argument %d of dgeqrf in sba_Axb_QR()\n", -info);
145	exit(1);
146	}
147	else{
148	fprintf(stderr, "Unknown LAPACK error %d for dgeqrf in sba_Axb_QR()\n", info);
149	return 0;
150	}
151	}
152
153	/* R is now stored in the upper triangular part of a; copy it in r so that dorgqr() below won't destroy it */
154	for(i=0; i<r_sz; i++)
155	r[i]=a[i];
156
157	/* compute Q using the elementary reflectors computed by the above decomposition */
158	F77_FUNC(dorgqr)((int )&m, (int )&m, (int )&m, a, (int )&m, tau, work, (int )&worksz, (int )&info);
159	if(info!=0){
160	if(info<0){
161	fprintf(stderr, "LAPACK error: illegal value for argument %d of dorgqr in sba_Axb_QR()\n", -info);
162	exit(1);
163	}
164	else{
165	fprintf(stderr, "Unknown LAPACK error (%d) in sba_Axb_QR()\n", info);
166	return 0;
167	}
168	}
169
170	/* Q is now in a; compute Q^T b in qtb */
171	for(i=0; i<m; i++){
172	for(j=0, sum=0.0; j<m; j++)
173	sum+=a[im+j]B[j];
174	qtb[i]=sum;
175	}
176
177	/* solve the linear system R x = Q^t b */
178	F77_FUNC(dtrtrs)("U", "N", "N", (int )&m, (int )&nrhs, r, (int )&m, qtb, (int )&m, &info);
179	/* error treatment */
180	if(info!=0){
181	if(info<0){
182	fprintf(stderr, "LAPACK error: illegal value for argument %d of dtrtrs in sba_Axb_QR()\n", -info);
183	exit(1);
184	}
185	else{
186	fprintf(stderr, "LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in sba_Axb_QR()\n", info);
187	return 0;
188	}
189	}
190
191	/* copy the result in x */
192	for(i=0; i<m; i++)
193	x[i]=qtb[i];
194
195	return 1;
196	}
197
198	/*
199	* This function returns the solution of Ax = b
200	*
201	* The function is based on QR decomposition without computation of Q:
202	* If A=Q R with Q orthogonal and R upper triangular, the linear system becomes
203	* (A^T A) x = A^T b or (R^T Q^T Q R) x = A^T b or (R^T R) x = A^T b.
204	* This amounts to solving R^T y = A^T b for y and then R x = y for x
205	* Note that Q does not need to be explicitly computed
206	*
207	* A is mxm, b is mx1. Argument iscolmaj specifies whether A is
208	* stored in column or row major order. Note that if iscolmaj==1
209	* this function modifies A!
210	*
211	* The function returns 0 in case of error, 1 if successfull
212	*
213	* This function is often called repetitively to solve problems of identical
214	* dimensions. To avoid repetitive malloc's and free's, allocated memory is
215	* retained between calls and free'd-malloc'ed when not of the appropriate size.
216	*/
217	int sba_Axb_QRnoQ(double A, double B, double *x, int m, int iscolmaj)
218	{
219	static double *buf=NULL;
220	static int buf_sz=0;
221
222	double a, atb, tau, work;
223	int a_sz, atb_sz, tau_sz, tot_sz;
224	register int i, j;
225	int info, worksz, nrhs=1;
226	register double sum;
227
228	/* calculate required memory size */
229	a_sz=(iscolmaj)? 0 : m*m;
230	atb_sz=m;
231	tau_sz=m;
232	worksz=3m; / this is probably too much */
233	tot_sz=a_sz + atb_sz + tau_sz + worksz;
234
235	if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */
236	if(buf) free(buf); /* free previously allocated memory */
237
238	buf_sz=tot_sz;
239	buf=(double )malloc(buf_szsizeof(double));
240	if(!buf){
241	fprintf(stderr, "memory allocation in sba_Axb_QRnoQ() failed!\n");
242	exit(1);
243	}
244	}
245
246	if(!iscolmaj){
247	a=buf;
248	/* store A (column major!) into a */
249	for(i=0; i<m; i++)
250	for(j=0; j<m; j++)
251	a[i+jm]=A[im+j];
252	}
253	else a=A; /* no copying required */
254
255	atb=buf+a_sz;
256	tau=atb+atb_sz;
257	work=tau+tau_sz;
258
259	/* compute A^T b in atb */
260	for(i=0; i<m; i++){
261	for(j=0, sum=0.0; j<m; j++)
262	sum+=a[im+j]B[j];
263	atb[i]=sum;
264	}
265
266	/* QR decomposition of A */
267	F77_FUNC(dgeqrf)((int )&m, (int )&m, a, (int )&m, tau, work, (int )&worksz, (int *)&info);
268	/* error treatment */
269	if(info!=0){
270	if(info<0){
271	fprintf(stderr, "LAPACK error: illegal value for argument %d of dgeqrf in sba_Axb_QRnoQ()\n", -info);
272	exit(1);
273	}
274	else{
275	fprintf(stderr, "Unknown LAPACK error %d for dgeqrf in sba_Axb_QRnoQ()\n", info);
276	return 0;
277	}
278	}
279
280	/* R is stored in the upper triangular part of a */
281
282	/* solve the linear system R^T y = A^t b */
283	F77_FUNC(dtrtrs)("U", "T", "N", (int )&m, (int )&nrhs, a, (int )&m, atb, (int )&m, &info);
284	/* error treatment */
285	if(info!=0){
286	if(info<0){
287	fprintf(stderr, "LAPACK error: illegal value for argument %d of dtrtrs in sba_Axb_QRnoQ()\n", -info);
288	exit(1);
289	}
290	else{
291	fprintf(stderr, "LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in sba_Axb_QRnoQ()\n", info);
292	return 0;
293	}
294	}
295
296	/* solve the linear system R x = y */
297	F77_FUNC(dtrtrs)("U", "N", "N", (int )&m, (int )&nrhs, a, (int )&m, atb, (int )&m, &info);
298	/* error treatment */
299	if(info!=0){
300	if(info<0){
301	fprintf(stderr, "LAPACK error: illegal value for argument %d of dtrtrs in sba_Axb_QRnoQ()\n", -info);
302	exit(1);
303	}
304	else{
305	fprintf(stderr, "LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in sba_Axb_QRnoQ()\n", info);
306	return 0;
307	}
308	}
309
310	/* copy the result in x */
311	for(i=0; i<m; i++)
312	x[i]=atb[i];
313
314	return 1;
315	}
316
317	/*
318	* This function returns the solution of Ax=b
319	*
320	* The function assumes that A is symmetric & positive definite and employs
321	* the Cholesky decomposition:
322	* If A=U^T U with U upper triangular, the system to be solved becomes
323	* (U^T U) x = b
324	* This amount to solving U^T y = b for y and then U x = y for x
325	*
326	* A is mxm, b is mx1. Argument iscolmaj specifies whether A is
327	* stored in column or row major order. Note that if iscolmaj==1
328	* this function modifies A and B!
329	*
330	* The function returns 0 in case of error, 1 if successfull
331	*
332	* This function is often called repetitively to solve problems of identical
333	* dimensions. To avoid repetitive malloc's and free's, allocated memory is
334	* retained between calls and free'd-malloc'ed when not of the appropriate size.
335	*/
336	int sba_Axb_Chol(double A, double B, double *x, int m, int iscolmaj)
337	{
338	static double *buf=NULL;
339	static int buf_sz=0;
340
341	double a, b;
342	int a_sz, b_sz, tot_sz;
343	register int i, j;
344	int info, nrhs=1;
345
346	/* calculate required memory size */
347	a_sz=(iscolmaj)? 0 : m*m;
348	b_sz=(iscolmaj)? 0 : m;
349	tot_sz=a_sz + b_sz;
350
351	if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */
352	if(buf) free(buf); /* free previously allocated memory */
353
354	buf_sz=tot_sz;
355	buf=(double )malloc(buf_szsizeof(double));
356	if(!buf){
357	fprintf(stderr, "memory allocation in sba_Axb_Chol() failed!\n");
358	exit(1);
359	}
360	}
361
362	if(!iscolmaj){
363	a=buf;
364	b=a+a_sz;
365
366	/* store A (column major!) into a anb B into b */
367	for(i=0; i<m; i++){
368	for(j=0; j<m; j++)
369	a[i+jm]=A[im+j];
370
371	b[i]=B[i];
372	}
373	}
374	else{ /* no copying is necessary */
375	a=A;
376	b=B;
377	}
378
379	/* Cholesky decomposition of A */
380	//F77_FUNC(dpotf2)("U", (int )&m, a, (int )&m, (int *)&info);
381	F77_FUNC(dpotrf)("U", (int )&m, a, (int )&m, (int *)&info);
382	/* error treatment */
383	if(info!=0){
384	if(info<0){
385	fprintf(stderr, "LAPACK error: illegal value for argument %d of dpotf2/dpotrf in sba_Axb_Chol()\n", -info);
386	exit(1);
387	}
388	else{
389	fprintf(stderr, "LAPACK error: the leading minor of order %d is not positive definite,\nthe factorization could not be completed for dpotf2/dpotrf in sba_Axb_Chol()\n", info);
390	return 0;
391	}
392	}
393
394	/* solve the linear system U^T y = b */
395	F77_FUNC(dtrtrs)("U", "T", "N", (int )&m, (int )&nrhs, a, (int )&m, b, (int )&m, &info);
396	/* error treatment */
397	if(info!=0){
398	if(info<0){
399	fprintf(stderr, "LAPACK error: illegal value for argument %d of dtrtrs in sba_Axb_Chol()\n", -info);
400	exit(1);
401	}
402	else{
403	fprintf(stderr, "LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in sba_Axb_Chol()\n", info);
404	return 0;
405	}
406	}
407
408	/* solve the linear system U x = y */
409	F77_FUNC(dtrtrs)("U", "N", "N", (int )&m, (int )&nrhs, a, (int )&m, b, (int )&m, &info);
410	/* error treatment */
411	if(info!=0){
412	if(info<0){
413	fprintf(stderr, "LAPACK error: illegal value for argument %d of dtrtrs in sba_Axb_Chol()\n", -info);
414	exit(1);
415	}
416	else{
417	fprintf(stderr, "LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in sba_Axb_Chol()\n", info);
418	return 0;
419	}
420	}
421
422	/* copy the result in x */
423	for(i=0; i<m; i++)
424	x[i]=b[i];
425
426	return 1;
427	}
428
429	/*
430	* This function returns the solution of Ax = b
431	*
432	* The function employs LU decomposition:
433	* If A=L U with L lower and U upper triangular, then the original system
434	* amounts to solving
435	* L y = b, U x = y
436	*
437	* A is mxm, b is mx1. Argument iscolmaj specifies whether A is
438	* stored in column or row major order. Note that if iscolmaj==1
439	* this function modifies A and B!
440	*
441	* The function returns 0 in case of error,
442	* 1 if successfull
443	*
444	* This function is often called repetitively to solve problems of identical
445	* dimensions. To avoid repetitive malloc's and free's, allocated memory is
446	* retained between calls and free'd-malloc'ed when not of the appropriate size.
447	*/
448	int sba_Axb_LU(double A, double B, double *x, int m, int iscolmaj)
449	{
450	static double *buf=NULL;
451	static int buf_sz=0;
452
453	int a_sz, ipiv_sz, b_sz, tot_sz;
454	register int i, j;
455	int info, *ipiv, nrhs=1;
456	double a, b;
457
458	/* calculate required memory size */
459	ipiv_sz=m;
460	a_sz=(iscolmaj)? 0 : m*m;
461	b_sz=(iscolmaj)? 0 : m;
462	tot_sz=ipiv_szsizeof(int) + (a_sz + b_sz)sizeof(double);
463
464	if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */
465	if(buf) free(buf); /* free previously allocated memory */
466
467	buf_sz=tot_sz;
468	buf=(double *)malloc(buf_sz);
469	if(!buf){
470	fprintf(stderr, "memory allocation in sba_Axb_LU() failed!\n");
471	exit(1);
472	}
473	}
474
475	ipiv=(int *)buf;
476	if(!iscolmaj){
477	a=(double *)(ipiv + ipiv_sz);
478	b=a+a_sz;
479
480	/* store A (column major!) into a and B into b */
481	for(i=0; i<m; i++){
482	for(j=0; j<m; j++)
483	a[i+jm]=A[im+j];
484
485	b[i]=B[i];
486	}
487	}
488	else{ /* no copying is necessary */
489	a=A;
490	b=B;
491	}
492
493	/* LU decomposition for A */
494	F77_FUNC(dgetrf)((int )&m, (int )&m, a, (int )&m, ipiv, (int )&info);
495	if(info!=0){
496	if(info<0){
497	fprintf(stderr, "argument %d of dgetrf illegal in sba_Axb_LU()\n", -info);
498	exit(1);
499	}
500	else{
501	fprintf(stderr, "singular matrix A for dgetrf in sba_Axb_LU()\n");
502	return 0;
503	}
504	}
505
506	/* solve the system with the computed LU */
507	F77_FUNC(dgetrs)("N", (int )&m, (int )&nrhs, a, (int )&m, ipiv, b, (int )&m, (int *)&info);
508	if(info!=0){
509	if(info<0){
510	fprintf(stderr, "argument %d of dgetrs illegal in sba_Axb_LU()\n", -info);
511	exit(1);
512	}
513	else{
514	fprintf(stderr, "unknown error for dgetrs in sba_Axb_LU()\n");
515	return 0;
516	}
517	}
518
519	/* copy the result in x */
520	for(i=0; i<m; i++){
521	x[i]=b[i];
522	}
523
524	return 1;
525	}
526
527	/*
528	* This function returns the solution of Ax = b
529	*
530	* The function is based on SVD decomposition:
531	* If A=U D V^T with U, V orthogonal and D diagonal, the linear system becomes
532	* (U D V^T) x = b or x=V D^{-1} U^T b
533	* Note that V D^{-1} U^T is the pseudoinverse A^+
534	*
535	* A is mxm, b is mx1. Argument iscolmaj specifies whether A is
536	* stored in column or row major order. Note that if iscolmaj==1
537	* this function modifies A!
538	*
539	* The function returns 0 in case of error, 1 if successfull
540	*
541	* This function is often called repetitively to solve problems of identical
542	* dimensions. To avoid repetitive malloc's and free's, allocated memory is
543	* retained between calls and free'd-malloc'ed when not of the appropriate size.
544	*/
545	int sba_Axb_SVD(double A, double B, double *x, int m, int iscolmaj)
546	{
547	static double *buf=NULL;
548	static int buf_sz=0;
549	static double eps=-1.0;
550
551	register int i, j;
552	double a, u, s, vt, *work;
553	int a_sz, u_sz, s_sz, vt_sz, tot_sz;
554	double thresh, one_over_denom;
555	register double sum;
556	int info, rank, worksz, *iwork, iworksz;
557
558	/* calculate required memory size */
559	worksz=-1; // workspace query. Keep in mind that dgesdd requires more memory than dgesvd
560	/* note that optimal work size is returned in thresh */
561	F77_FUNC(dgesdd)("A", (int )&m, (int )&m, NULL, (int )&m, NULL, NULL, (int )&m, NULL, (int *)&m,
562	(double )&thresh, (int )&worksz, NULL, &info);
563	/* F77_FUNC(dgesvd)("A", "A", (int )&m, (int )&m, NULL, (int )&m, NULL, NULL, (int )&m, NULL, (int *)&m,
564	(double )&thresh, (int )&worksz, &info); */
565	worksz=(int)thresh;
566	iworksz=8*m;
567	a_sz=(!iscolmaj)? m*m : 0;
568	u_sz=mm; s_sz=m; vt_sz=mm;
569
570	tot_sz=iworkszsizeof(int) + (a_sz + u_sz + s_sz + vt_sz + worksz)sizeof(double);
571
572	if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */
573	if(buf) free(buf); /* free previously allocated memory */
574
575	buf_sz=tot_sz;
576	buf=(double *)malloc(buf_sz);
577	if(!buf){
578	fprintf(stderr, "memory allocation in sba_Axb_SVD() failed!\n");
579	exit(1);
580	}
581	}
582
583	iwork=(int *)buf;
584	if(!iscolmaj){
585	a=(double *)(iwork+iworksz);
586	/* store A (column major!) into a */
587	for(i=0; i<m; i++)
588	for(j=0; j<m; j++)
589	a[i+jm]=A[im+j];
590	}
591	else{
592	a=A; /* no copying required */
593	}
594
595	u=((double *)(iwork+iworksz)) + a_sz;
596	s=u+u_sz;
597	vt=s+s_sz;
598	work=vt+vt_sz;
599
600	/* SVD decomposition of A */
601	F77_FUNC(dgesdd)("A", (int )&m, (int )&m, a, (int )&m, s, u, (int )&m, vt, (int )&m, work, (int )&worksz, iwork, &info);
602	//F77_FUNC(dgesvd)("A", "A", (int )&m, (int )&m, a, (int )&m, s, u, (int )&m, vt, (int )&m, work, (int )&worksz, &info);
603
604	/* error treatment */
605	if(info!=0){
606	if(info<0){
607	fprintf(stderr, "LAPACK error: illegal value for argument %d of dgesdd/dgesvd in sba_Axb_SVD()\n", -info);
608	exit(1);
609	}
610	else{
611	fprintf(stderr, "LAPACK error: dgesdd (dbdsdc)/dgesvd (dbdsqr) failed to converge in sba_Axb_SVD() [info=%d]\n", info);
612
613	return 0;
614	}
615	}
616
617	if(eps<0.0){
618	double aux;
619
620	/* compute machine epsilon. DBL_EPSILON should do also */
621	for(eps=1.0; aux=eps+1.0, aux-1.0>0.0; eps*=0.5)
622	;
623	eps*=2.0;
624	}
625
626	/* compute the pseudoinverse in a */
627	memset(a, 0, mmsizeof(double)); /* initialize to zero */
628	for(rank=0, thresh=eps*s[0]; rank<m && s[rank]>thresh; rank++){
629	one_over_denom=1.0/s[rank];
630
631	for(j=0; j<m; j++)
632	for(i=0; i<m; i++)
633	a[im+j]+=vt[rank+im]u[j+rankm]*one_over_denom;
634	}
635
636	/* compute A^+ b in x */
637	for(i=0; i<m; i++){
638	for(j=0, sum=0.0; j<m; j++)
639	sum+=a[im+j]B[j];
640	x[i]=sum;
641	}
642
643	return 1;
644	}
645
646	/*
647	* This function returns the solution of Ax = b for a real symmetric matrix A
648	*
649	* The function is based on Bunch-Kaufman factorization:
650	* A is factored as UDU^T where U is upper triangular and
651	* D symmetric and block diagonal
652	*
653	* A is mxm, b is mx1. Argument iscolmaj specifies whether A is
654	* stored in column or row major order. Note that if iscolmaj==1
655	* this function modifies A and B!
656	*
657	* The function returns 0 in case of error,
658	* 1 if successfull
659	*
660	* This function is often called repetitively to solve problems of identical
661	* dimensions. To avoid repetitive malloc's and free's, allocated memory is
662	* retained between calls and free'd-malloc'ed when not of the appropriate size.
663	*/
664	int sba_Axb_BK(double A, double B, double *x, int m, int iscolmaj)
665	{
666	static double *buf=NULL;
667	static int buf_sz=0;
668
669	int a_sz, ipiv_sz, b_sz, work_sz, tot_sz;
670	register int i, j;
671	int info, *ipiv, nrhs=1;
672	double a, b, *work;
673
674	/* calculate required memory size */
675	ipiv_sz=m;
676	a_sz=(iscolmaj)? 0 : m*m;
677	b_sz=(iscolmaj)? 0 : m;
678	work_sz=16m; / this is probably too much */
679	tot_sz=ipiv_szsizeof(int) + (a_sz + b_sz + work_sz)sizeof(double);
680
681	if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */
682	if(buf) free(buf); /* free previously allocated memory */
683
684	buf_sz=tot_sz;
685	buf=(double *)malloc(buf_sz);
686	if(!buf){
687	fprintf(stderr, "memory allocation in sba_Axb_BK() failed!\n");
688	exit(1);
689	}
690	}
691
692	ipiv=(int *)buf;
693	if(!iscolmaj){
694	a=(double *)(ipiv + ipiv_sz);
695	b=a+a_sz;
696	work=b+b_sz;
697
698	/* store A (column major!) into a and B into b */
699	for(i=0; i<m; i++){
700	for(j=0; j<m; j++)
701	a[i+jm]=A[im+j];
702
703	b[i]=B[i];
704	}
705	}
706	else{ /* no copying is necessary */
707	a=A;
708	b=B;
709	work=(double *)(ipiv + ipiv_sz);
710	}
711
712	/* factorization for A */
713	F77_FUNC(dsytrf)("U", (int )&m, a, (int )&m, ipiv, work, (int )&work_sz, (int )&info);
714	if(info!=0){
715	if(info<0){
716	fprintf(stderr, "argument %d of dsytrf illegal in sba_Axb_BK()\n", -info);
717	exit(1);
718	}
719	else{
720	fprintf(stderr, "singular block diagonal matrix D for dsytrf in sba_Axb_BK() [D(%d, %d) is zero]\n", info, info);
721	return 0;
722	}
723	}
724
725	/* solve the system with the computed factorization */
726	F77_FUNC(dsytrs)("U", (int )&m, (int )&nrhs, a, (int )&m, ipiv, b, (int )&m, (int *)&info);
727	if(info!=0){
728	if(info<0){
729	fprintf(stderr, "argument %d of dsytrs illegal in sba_Axb_BK()\n", -info);
730	exit(1);
731	}
732	else{
733	fprintf(stderr, "unknown error for dsytrs in sba_Axb_BK()\n");
734	return 0;
735	}
736	}
737
738	/* copy the result in x */
739	for(i=0; i<m; i++){
740	x[i]=b[i];
741	}
742
743	return 1;
744	}
745
746	/*
747	* This function computes the inverse of a square matrix A into B
748	* using LU decomposition
749	*
750	* The function returns 0 in case of error (e.g. A is singular),
751	* 1 if successfull
752	*
753	* This function is often called repetitively to solve problems of identical
754	* dimensions. To avoid repetitive malloc's and free's, allocated memory is
755	* retained between calls and free'd-malloc'ed when not of the appropriate size.
756	*/
757	int sba_mat_invert_LU(double A, double B, int m)
758	{
759	static double *buf=NULL;
760	static int buf_sz=0;
761
762	int a_sz, ipiv_sz, work_sz, tot_sz;
763	register int i, j;
764	int info, *ipiv;
765	double a, work;
766
767	/* calculate required memory size */
768	ipiv_sz=m;
769	a_sz=m*m;
770	work_sz=16m; / this is probably too much */
771	tot_sz=ipiv_szsizeof(int) + (a_sz + work_sz)sizeof(double);
772
773	if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */
774	if(buf) free(buf); /* free previously allocated memory */
775
776	buf_sz=tot_sz;
777	buf=(double *)malloc(buf_sz);
778	if(!buf){
779	fprintf(stderr, "memory allocation in sba_mat_invert_LU() failed!\n");
780	exit(1);
781	}
782	}
783
784	ipiv=(int *)buf;
785	a=(double *)(ipiv + ipiv_sz);
786	work=a+a_sz;
787
788	/* store A (column major!) into a */
789	for(i=0; i<m; i++)
790	for(j=0; j<m; j++)
791	a[i+jm]=A[im+j];
792
793	/* LU decomposition for A */
794	F77_FUNC(dgetrf)((int )&m, (int )&m, a, (int )&m, ipiv, (int )&info);
795	if(info!=0){
796	if(info<0){
797	fprintf(stderr, "argument %d of dgetrf illegal in sba_mat_invert_LU()\n", -info);
798	exit(1);
799	}
800	else{
801	fprintf(stderr, "singular matrix A for dgetrf in sba_mat_invert_LU()\n");
802	return 0;
803	}
804	}
805
806	/* (A)^{-1} from LU */
807	F77_FUNC(dgetri)((int )&m, a, (int )&m, ipiv, work, (int )&work_sz, (int )&info);
808	if(info!=0){
809	if(info<0){
810	fprintf(stderr, "argument %d of dgetri illegal in sba_mat_invert_LU()\n", -info);
811	exit(1);
812	}
813	else{
814	fprintf(stderr, "singular matrix A for dgetri in sba_mat_invert_LU()\n");
815	return 0;
816	}
817	}
818
819	/* store (A)^{-1} in B */
820	for(i=0; i<m; i++)
821	for(j=0; j<m; j++)
822	B[im+j]=a[i+jm];
823
824	return 1;
825	}
826
827	/*
828	* This function computes the inverse of a square symmetric positive definite
829	* matrix A into B using Cholesky factorization
830	*
831	* The function returns 0 in case of error (e.g. A is not positive definite or singular),
832	* 1 if successfull
833	*
834	* This function is often called repetitively to solve problems of identical
835	* dimensions. To avoid repetitive malloc's and free's, allocated memory is
836	* retained between calls and free'd-malloc'ed when not of the appropriate size.
837	*/
838	int sba_mat_invert_Chol(double A, double B, int m)
839	{
840	static double *buf=NULL;
841	static int buf_sz=0;
842
843	int a_sz, tot_sz;
844	register int i, j;
845	int info;
846	double *a;
847
848	/* calculate required memory size */
849	a_sz=m*m;
850	tot_sz=a_sz;
851
852	if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */
853	if(buf) free(buf); /* free previously allocated memory */
854
855	buf_sz=tot_sz;
856	buf=(double )malloc(buf_szsizeof(double));
857	if(!buf){
858	fprintf(stderr, "memory allocation in sba_mat_invert_Chol() failed!\n");
859	exit(1);
860	}
861	}
862
863	a=(double *)buf;
864
865	/* store A (column major!) into a */
866	for(i=0; i<m; i++)
867	for(j=0; j<m; j++)
868	a[i+jm]=A[im+j];
869
870	/* Cholesky factorization for A */
871	F77_FUNC(dpotrf)("L", (int )&m, a, (int )&m, (int *)&info);
872	/* error treatment */
873	if(info!=0){
874	if(info<0){
875	fprintf(stderr, "LAPACK error: illegal value for argument %d of dpotrf in sba_mat_invert_Chol()\n", -info);
876	exit(1);
877	}
878	else{
879	fprintf(stderr, "LAPACK error: the leading minor of order %d is not positive definite,\nthe factorization could not be completed for dpotrf in sba_mat_invert_Chol()\n", info);
880	return 0;
881	}
882	}
883
884	/* (A)^{-1} from Cholesky */
885	F77_FUNC(dpotri)("L", (int )&m, a, (int )&m, (int *)&info);
886	if(info!=0){
887	if(info<0){
888	fprintf(stderr, "argument %d of dpotri illegal in sba_mat_invert_Chol()\n", -info);
889	exit(1);
890	}
891	else{
892	fprintf(stderr, "the (%d, %d) element of the factor U or L is zero, singular matrix A for dpotri in sba_mat_invert_Chol()\n", info, info);
893	return 0;
894	}
895	}
896
897	/* store (A)^{-1} in B. The lower triangle of the symmetric A^{-1} is in the lower triangle of a */
898	for(i=0; i<m; i++)
899	for(j=0; j<=i; j++)
900	B[im+j]=B[jm+i]=a[i+j*m];
901
902	return 1;
903	}
904
905
#define __CG_LINALG_BLOCKSIZE 8

/* Dot product of two vectors x and y using loop unrolling and blocking.
 * see http://www.abarnett.demon.co.uk/tutorial.html
 *
 * n is the number of elements in x and y; the function returns
 * x[0]*y[0] + ... + x[n-1]*y[n-1], accumulated in eight partial sums
 * that are added together at the end (0.0 for n<=0).
 */

inline static double dprod(int n, double *x, double *y)
{
  int i, j1, j2, j3, j4, j5, j6, j7;
  int blockn;
  double sum0=0.0, sum1=0.0, sum2=0.0, sum3=0.0,
         sum4=0.0, sum5=0.0, sum6=0.0, sum7=0.0;

  /* n may not be divisible by __CG_LINALG_BLOCKSIZE,
   * go as near as we can first, then tidy up.
   */
  blockn = (n / __CG_LINALG_BLOCKSIZE) * __CG_LINALG_BLOCKSIZE;

  /* unroll the loop in blocks of `__CG_LINALG_BLOCKSIZE' */
  for(i=0; i<blockn; i+=__CG_LINALG_BLOCKSIZE){
            sum0+=x[i]*y[i];
    j1=i+1; sum1+=x[j1]*y[j1];
    j2=i+2; sum2+=x[j2]*y[j2];
    j3=i+3; sum3+=x[j3]*y[j3];
    j4=i+4; sum4+=x[j4]*y[j4];
    j5=i+5; sum5+=x[j5]*y[j5];
    j6=i+6; sum6+=x[j6]*y[j6];
    j7=i+7; sum7+=x[j7]*y[j7];
  }

  /*
   * There may be some left to do.
   * This could be done as a simple for() loop,
   * but a switch is faster (and more interesting)
   */

  if(i<n){
    /* Jump into the case at the place that will allow
     * us to finish off the appropriate number of items.
     * Every case deliberately falls through to the next one.
     */

    switch(n - i){
      case 7 : sum0+=x[i]*y[i]; ++i; /* fallthrough */
      case 6 : sum1+=x[i]*y[i]; ++i; /* fallthrough */
      case 5 : sum2+=x[i]*y[i]; ++i; /* fallthrough */
      case 4 : sum3+=x[i]*y[i]; ++i; /* fallthrough */
      case 3 : sum4+=x[i]*y[i]; ++i; /* fallthrough */
      case 2 : sum5+=x[i]*y[i]; ++i; /* fallthrough */
      case 1 : sum6+=x[i]*y[i]; ++i;
    }
  }

  return sum0+sum1+sum2+sum3+sum4+sum5+sum6+sum7;
}
960
961
962	/* Compute z=x+a*y for two vectors x and y and a scalar a; z can be one of x, y.
963	* Similarly to the dot product routine, this one uses loop unrolling and blocking
964	*/
965
966	inline static void daxpy(int n, double z, double x, double a, double *y)
967	{
968	register int i, j1, j2, j3, j4, j5, j6, j7;
969	int blockn;
970
971	/* n may not be divisible by __CG_LINALG_BLOCKSIZE,
972	* go as near as we can first, then tidy up.
973	*/
974	blockn = (n / __CG_LINALG_BLOCKSIZE) * __CG_LINALG_BLOCKSIZE;
975
976	/* unroll the loop in blocks of `__CG_LINALG_BLOCKSIZE' */
977	for(i=0; i<blockn; i+=__CG_LINALG_BLOCKSIZE){
978	z[i]=x[i]+a*y[i];
979	j1=i+1; z[j1]=x[j1]+a*y[j1];
980	j2=i+2; z[j2]=x[j2]+a*y[j2];
981	j3=i+3; z[j3]=x[j3]+a*y[j3];
982	j4=i+4; z[j4]=x[j4]+a*y[j4];
983	j5=i+5; z[j5]=x[j5]+a*y[j5];
984	j6=i+6; z[j6]=x[j6]+a*y[j6];
985	j7=i+7; z[j7]=x[j7]+a*y[j7];
986	}
987
988	/*
989	* There may be some left to do.
990	* This could be done as a simple for() loop,
991	* but a switch is faster (and more interesting)
992	*/
993
994	if(i<n){
995	/* Jump into the case at the place that will allow
996	* us to finish off the appropriate number of items.
997	*/
998
999	switch(n - i){
1000	case 7 : z[i]=x[i]+a*y[i]; ++i;
1001	case 6 : z[i]=x[i]+a*y[i]; ++i;
1002	case 5 : z[i]=x[i]+a*y[i]; ++i;
1003	case 4 : z[i]=x[i]+a*y[i]; ++i;
1004	case 3 : z[i]=x[i]+a*y[i]; ++i;
1005	case 2 : z[i]=x[i]+a*y[i]; ++i;
1006	case 1 : z[i]=x[i]+a*y[i]; ++i;
1007	}
1008	}
1009	}
1010
1011	/*
1012	* This function returns the solution of Ax = b where A is posititive definite,
1013	* based on the conjugate gradients method; see "An intro to the CG method" by J.R. Shewchuk, p. 50-51
1014	*
1015	* A is mxm, b, x are is mx1. Argument niter specifies the maximum number of
1016	* iterations and eps is the desired solution accuracy. niter<0 signals that
1017	* x contains a valid initial approximation to the solution; if niter>0 then
1018	* the starting point is taken to be zero. Argument prec selects the desired
1019	* preconditioning method as follows:
1020	* 0: no preconditioning
1021	* 1: jacobi (diagonal) preconditioning
1022	* 2: SSOR preconditioning
1023	* Argument iscolmaj specifies whether A is stored in column or row major order.
1024	*
1025	* The function returns 0 in case of error,
1026	* the number of iterations performed if successfull
1027	*
1028	* This function is often called repetitively to solve problems of identical
1029	* dimensions. To avoid repetitive malloc's and free's, allocated memory is
1030	* retained between calls and free'd-malloc'ed when not of the appropriate size.
1031	*/
1032	int sba_Axb_CG(double A, double B, double *x, int m, int niter, double eps, int prec, int iscolmaj)
1033	{
1034	static double *buf=NULL;
1035	static int buf_sz=0;
1036
1037	register int i, j;
1038	register double *aim;
1039	int iter, a_sz, res_sz, d_sz, q_sz, s_sz, wk_sz, z_sz, tot_sz;
1040	double a, res, d, q, s, wk, *z;
1041	double delta0, deltaold, deltanew, alpha, beta, eps_sq=eps*eps;
1042	register double sum;
1043	int rec_res;
1044
1045	/* calculate required memory size */
1046	a_sz=(iscolmaj)? m*m : 0;
1047	res_sz=m; d_sz=m; q_sz=m;
1048	if(prec!=SBA_CG_NOPREC){
1049	s_sz=m; wk_sz=m;
1050	z_sz=(prec==SBA_CG_SSOR)? m : 0;
1051	}
1052	else
1053	s_sz=wk_sz=z_sz=0;
1054
1055	tot_sz=a_sz+res_sz+d_sz+q_sz+s_sz+wk_sz+z_sz;
1056
1057	if(tot_sz>buf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */
1058	if(buf) free(buf); /* free previously allocated memory */
1059
1060	buf_sz=tot_sz;
1061	buf=(double )malloc(buf_szsizeof(double));
1062	if(!buf){
1063	fprintf(stderr, "memory allocation request failed in sba_Axb_CG()\n");
1064	exit(1);
1065	}
1066	}
1067
1068	if(iscolmaj){
1069	a=buf;
1070	/* store A (row major!) into a */
1071	for(i=0; i<m; ++i)
1072	for(j=0, aim=a+i*m; j<m; ++j)
1073	aim[j]=A[i+j*m];
1074	}
1075	else a=A; /* no copying required */
1076
1077	res=buf+a_sz;
1078	d=res+res_sz;
1079	q=d+d_sz;
1080	if(prec!=SBA_CG_NOPREC){
1081	s=q+q_sz;
1082	wk=s+s_sz;
1083	z=(prec==SBA_CG_SSOR)? wk+wk_sz : NULL;
1084
1085	for(i=0; i<m; ++i){ // compute jacobi (i.e. diagonal) preconditioners and save them in wk
1086	sum=a[i*m+i];
1087	if(sum>DBL_EPSILON \|\| -sum<-DBL_EPSILON) // != 0.0
1088	wk[i]=1.0/sum;
1089	else
1090	wk[i]=1.0/DBL_EPSILON;
1091	}
1092	}
1093	else{
1094	s=res;
1095	wk=z=NULL;
1096	}
1097
1098	if(niter>0)
1099	for(i=0; i<m; ++i){ // clear solution and initialize residual vector: res <-- B
1100	x[i]=0.0;
1101	res[i]=B[i];
1102	}
1103	else{
1104	niter=-niter;
1105
1106	for(i=0; i<m; ++i){ // initialize residual vector: res <-- B - A*x
1107	for(j=0, aim=a+i*m, sum=0.0; j<m; ++j)
1108	sum+=aim[j]*x[j];
1109	res[i]=B[i]-sum;
1110	}
1111	}
1112
1113	switch(prec){
1114	case SBA_CG_NOPREC:
1115	for(i=0, deltanew=0.0; i<m; ++i){
1116	d[i]=res[i];
1117	deltanew+=res[i]*res[i];
1118	}
1119	break;
1120	case SBA_CG_JACOBI: // jacobi preconditioning
1121	for(i=0, deltanew=0.0; i<m; ++i){
1122	d[i]=res[i]*wk[i];
1123	deltanew+=res[i]*d[i];
1124	}
1125	break;
1126	case SBA_CG_SSOR: // SSOR preconditioning; see the "templates" book, fig. 3.2, p. 44
1127	for(i=0; i<m; ++i){
1128	for(j=0, sum=0.0, aim=a+i*m; j<i; ++j)
1129	sum+=aim[j]*z[j];
1130	z[i]=wk[i]*(res[i]-sum);
1131	}
1132
1133	for(i=m-1; i>=0; --i){
1134	for(j=i+1, sum=0.0, aim=a+i*m; j<m; ++j)
1135	sum+=aim[j]*d[j];
1136	d[i]=z[i]-wk[i]*sum;
1137	}
1138	deltanew=dprod(m, res, d);
1139	break;
1140	default:
1141	fprintf(stderr, "unknown preconditioning option %d in sba_Axb_CG\n", prec);
1142	exit(1);
1143	}
1144
1145	delta0=deltanew;
1146
1147	for(iter=1; deltanew>eps_sq*delta0 && iter<=niter; ++iter){
1148	for(i=0; i<m; ++i){ // q <-- A d
1149	aim=a+i*m;
1150	/***
1151	for(j=0, sum=0.0; j<m; ++j)
1152	sum+=aim[j]*d[j];
1153	***/
1154	q[i]=dprod(m, aim, d); //sum;
1155	}
1156
1157	/***
1158	for(i=0, sum=0.0; i<m; ++i)
1159	sum+=d[i]*q[i];
1160	***/
1161	alpha=deltanew/dprod(m, d, q); // deltanew/sum;
1162
1163	/***
1164	for(i=0; i<m; ++i)
1165	x[i]+=alpha*d[i];
1166	***/
1167	daxpy(m, x, x, alpha, d);
1168
1169	if(!(iter%50)){
1170	for(i=0; i<m; ++i){ // accurate computation of the residual vector
1171	aim=a+i*m;
1172	/***
1173	for(j=0, sum=0.0; j<m; ++j)
1174	sum+=aim[j]*x[j];
1175	***/
1176	res[i]=B[i]-dprod(m, aim, x); //B[i]-sum;
1177	}
1178	rec_res=0;
1179	}
1180	else{
1181	/***
1182	for(i=0; i<m; ++i) // approximate computation of the residual vector
1183	res[i]-=alpha*q[i];
1184	***/
1185	daxpy(m, res, res, -alpha, q);
1186	rec_res=1;
1187	}
1188
1189	if(prec){
1190	switch(prec){
1191	case SBA_CG_JACOBI: // jacobi
1192	for(i=0; i<m; ++i)
1193	s[i]=res[i]*wk[i];
1194	break;
1195	case SBA_CG_SSOR: // SSOR
1196	for(i=0; i<m; ++i){
1197	for(j=0, sum=0.0, aim=a+i*m; j<i; ++j)
1198	sum+=aim[j]*z[j];
1199	z[i]=wk[i]*(res[i]-sum);
1200	}
1201
1202	for(i=m-1; i>=0; --i){
1203	for(j=i+1, sum=0.0, aim=a+i*m; j<m; ++j)
1204	sum+=aim[j]*s[j];
1205	s[i]=z[i]-wk[i]*sum;
1206	}
1207	break;
1208	}
1209	}
1210
1211	deltaold=deltanew;
1212	/***
1213	for(i=0, sum=0.0; i<m; ++i)
1214	sum+=res[i]*s[i];
1215	***/
1216	deltanew=dprod(m, res, s); //sum;
1217
1218	/* make sure that we get around small delta that are due to
1219	* accumulated floating point roundoff errors
1220	*/
1221	if(rec_res && deltanew<=eps_sq*delta0){
1222	/* analytically recompute delta */
1223	for(i=0; i<m; ++i){
1224	for(j=0, aim=a+i*m, sum=0.0; j<m; ++j)
1225	sum+=aim[j]*x[j];
1226	res[i]=B[i]-sum;
1227	}
1228	deltanew=dprod(m, res, s);
1229	}
1230
1231	beta=deltanew/deltaold;
1232
1233	/***
1234	for(i=0; i<m; ++i)
1235	d[i]=s[i]+beta*d[i];
1236	***/
1237	daxpy(m, d, s, beta, d);
1238	}
1239
1240	return iter;
1241	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: proiecte/pmake3d/make3d_original/Make3dSingleImageStanford_version0.1/third_party/sba-1.3/sba_lapack.c @ 37

Download in other formats: