From c9cf2446c5eb18a29119c7dfe90c5b62f6b374c5 Mon Sep 17 00:00:00 2001 From: Justin Berger Date: Thu, 15 Mar 2018 23:14:29 -0600 Subject: Add sba poser --- redist/sba/sba_lapack.c | 1730 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1730 insertions(+) create mode 100644 redist/sba/sba_lapack.c (limited to 'redist/sba/sba_lapack.c') diff --git a/redist/sba/sba_lapack.c b/redist/sba/sba_lapack.c new file mode 100644 index 0000000..14fc44f --- /dev/null +++ b/redist/sba/sba_lapack.c @@ -0,0 +1,1730 @@ +///////////////////////////////////////////////////////////////////////////////// +//// +//// Linear algebra operations for the sba package +//// Copyright (C) 2004-2009 Manolis Lourakis (lourakis at ics forth gr) +//// Institute of Computer Science, Foundation for Research & Technology - Hellas +//// Heraklion, Crete, Greece. +//// +//// This program is free software; you can redistribute it and/or modify +//// it under the terms of the GNU General Public License as published by +//// the Free Software Foundation; either version 2 of the License, or +//// (at your option) any later version. +//// +//// This program is distributed in the hope that it will be useful, +//// but WITHOUT ANY WARRANTY; without even the implied warranty of +//// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +//// GNU General Public License for more details. +//// +/////////////////////////////////////////////////////////////////////////////////// + +/* A note on memory alignment: + * + * Several of the functions below use a piece of dynamically allocated memory + * to store variables of different size (i.e., ints and doubles). To avoid + * alignment problems, care must be taken so that elements that are larger + * (doubles) are stored before smaller ones (ints). This ensures proper + * alignment under different alignment choices made by different CPUs: + * For instance, a double variable is aligned on x86 to 4 bytes but + * aligned to 8 bytes on AMD64 despite having the same size of 8 bytes. + */ + +#include +#include +#include +#include +#include + +#include "compiler.h" +#include "sba.h" + +#ifdef SBA_APPEND_UNDERSCORE_SUFFIX +#define F77_FUNC(func) func##_ +#else +#define F77_FUNC(func) func +#endif /* SBA_APPEND_UNDERSCORE_SUFFIX */ + +/* declarations of LAPACK routines employed */ + +/* QR decomposition */ +extern int F77_FUNC(dgeqrf)(int *m, int *n, double *a, int *lda, double *tau, double *work, int *lwork, int *info); +extern int F77_FUNC(dorgqr)(int *m, int *n, int *k, double *a, int *lda, double *tau, double *work, int *lwork, + int *info); + +/* solution of triangular system */ +extern int F77_FUNC(dtrtrs)(char *uplo, char *trans, char *diag, int *n, int *nrhs, double *a, int *lda, double *b, + int *ldb, int *info); + +/* Cholesky decomposition, linear system solution and matrix inversion */ +extern int F77_FUNC(dpotf2)(char *uplo, int *n, double *a, int *lda, int *info); /* unblocked Cholesky */ +extern int F77_FUNC(dpotrf)(char *uplo, int *n, double *a, int *lda, int *info); /* block version of dpotf2 */ +extern int F77_FUNC(dpotrs)(char *uplo, int *n, int *nrhs, double *a, int *lda, double *b, int *ldb, int *info); +extern int F77_FUNC(dpotri)(char *uplo, int *n, double *a, int *lda, int *info); + +/* LU decomposition, linear system solution and matrix inversion */ +extern int F77_FUNC(dgetrf)(int *m, int *n, double *a, int *lda, int *ipiv, int *info); /* blocked LU */ +extern int F77_FUNC(dgetf2)(int *m, int *n, double *a, int *lda, int *ipiv, int *info); /* unblocked LU */ +extern int F77_FUNC(dgetrs)(char *trans, int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, + int *info); +extern int F77_FUNC(dgetri)(int *n, double *a, int *lda, int *ipiv, double *work, int *lwork, int *info); + +/* SVD */ +extern int F77_FUNC(dgesvd)(char *jobu, char *jobvt, int *m, int *n, double *a, int *lda, double *s, double *u, + int *ldu, double *vt, int *ldvt, double *work, int *lwork, int *info); + +/* lapack 3.0 routine, faster than dgesvd() */ +extern int F77_FUNC(dgesdd)(char *jobz, int *m, int *n, double *a, int *lda, double *s, double *u, int *ldu, double *vt, + int *ldvt, double *work, int *lwork, int *iwork, int *info); + +/* Bunch-Kaufman factorization of a real symmetric matrix A, solution of linear systems and matrix inverse */ +extern int F77_FUNC(dsytrf)(char *uplo, int *n, double *a, int *lda, int *ipiv, double *work, int *lwork, + int *info); /* blocked ver. */ +extern int F77_FUNC(dsytrs)(char *uplo, int *n, int *nrhs, double *a, int *lda, int *ipiv, double *b, int *ldb, + int *info); +extern int F77_FUNC(dsytri)(char *uplo, int *n, double *a, int *lda, int *ipiv, double *work, int *info); + +/* + * This function returns the solution of Ax = b + * + * The function is based on QR decomposition with explicit computation of Q: + * If A=Q R with Q orthogonal and R upper triangular, the linear system becomes + * Q R x = b or R x = Q^T b. + * + * A is mxm, b is mx1. Argument iscolmaj specifies whether A is + * stored in column or row major order. Note that if iscolmaj==1 + * this function modifies A! + * + * The function returns 0 in case of error, 1 if successfull + * + * This function is often called repetitively to solve problems of identical + * dimensions. To avoid repetitive malloc's and free's, allocated memory is + * retained between calls and free'd-malloc'ed when not of the appropriate size. + * A call with NULL as the first argument forces this memory to be released. + */ +int sba_Axb_QR(double *A, double *B, double *x, int m, int iscolmaj) { + static double *buf = NULL; + static int buf_sz = 0, nb = 0; + + double *a, *r, *tau, *work; + int a_sz, r_sz, tau_sz, tot_sz; + register int i, j; + int info, worksz, nrhs = 1; + register double sum; + + if (A == NULL) { + if (buf) + free(buf); + buf = NULL; + buf_sz = 0; + + return 1; + } + + /* calculate required memory size */ + a_sz = (iscolmaj) ? 0 : m * m; + r_sz = m * m; /* only the upper triangular part really needed */ + tau_sz = m; + if (!nb) { +#ifndef SBA_LS_SCARCE_MEMORY + double tmp; + + worksz = -1; // workspace query; optimal size is returned in tmp + F77_FUNC(dgeqrf)((int *)&m, (int *)&m, NULL, (int *)&m, NULL, (double *)&tmp, (int *)&worksz, (int *)&info); + nb = ((int)tmp) / m; // optimal worksize is m*nb +#else + nb = 1; // min worksize is m +#endif /* SBA_LS_SCARCE_MEMORY */ + } + worksz = nb * m; + tot_sz = a_sz + r_sz + tau_sz + worksz; + + if (tot_sz > buf_sz) { /* insufficient memory, allocate a "big" memory chunk at once */ + if (buf) + free(buf); /* free previously allocated memory */ + + buf_sz = tot_sz; + buf = (double *)malloc(buf_sz * sizeof(double)); + if (!buf) { + fprintf(stderr, "memory allocation in sba_Axb_QR() failed!\n"); + exit(1); + } + } + + if (!iscolmaj) { + a = buf; + /* store A (column major!) into a */ + for (i = 0; i < m; ++i) + for (j = 0; j < m; ++j) + a[i + j * m] = A[i * m + j]; + } else + a = A; /* no copying required */ + + r = buf + a_sz; + tau = r + r_sz; + work = tau + tau_sz; + + /* QR decomposition of A */ + F77_FUNC(dgeqrf)((int *)&m, (int *)&m, a, (int *)&m, tau, work, (int *)&worksz, (int *)&info); + /* error treatment */ + if (info != 0) { + if (info < 0) { + fprintf(stderr, "LAPACK error: illegal value for argument %d of dgeqrf in sba_Axb_QR()\n", -info); + exit(1); + } else { + fprintf(stderr, "Unknown LAPACK error %d for dgeqrf in sba_Axb_QR()\n", info); + return 0; + } + } + + /* R is now stored in the upper triangular part of a; copy it in r so that dorgqr() below won't destroy it */ + for (i = 0; i < r_sz; ++i) + r[i] = a[i]; + + /* compute Q using the elementary reflectors computed by the above decomposition */ + F77_FUNC(dorgqr)((int *)&m, (int *)&m, (int *)&m, a, (int *)&m, tau, work, (int *)&worksz, (int *)&info); + if (info != 0) { + if (info < 0) { + fprintf(stderr, "LAPACK error: illegal value for argument %d of dorgqr in sba_Axb_QR()\n", -info); + exit(1); + } else { + fprintf(stderr, "Unknown LAPACK error (%d) in sba_Axb_QR()\n", info); + return 0; + } + } + + /* Q is now in a; compute Q^T b in x */ + for (i = 0; i < m; ++i) { + for (j = 0, sum = 0.0; j < m; ++j) + sum += a[i * m + j] * B[j]; + x[i] = sum; + } + + /* solve the linear system R x = Q^t b */ + F77_FUNC(dtrtrs)("U", "N", "N", (int *)&m, (int *)&nrhs, r, (int *)&m, x, (int *)&m, &info); + /* error treatment */ + if (info != 0) { + if (info < 0) { + fprintf(stderr, "LAPACK error: illegal value for argument %d of dtrtrs in sba_Axb_QR()\n", -info); + exit(1); + } else { + fprintf(stderr, "LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in sba_Axb_QR()\n", + info); + return 0; + } + } + + return 1; +} + +/* + * This function returns the solution of Ax = b + * + * The function is based on QR decomposition without computation of Q: + * If A=Q R with Q orthogonal and R upper triangular, the linear system becomes + * (A^T A) x = A^T b or (R^T Q^T Q R) x = A^T b or (R^T R) x = A^T b. + * This amounts to solving R^T y = A^T b for y and then R x = y for x + * Note that Q does not need to be explicitly computed + * + * A is mxm, b is mx1. Argument iscolmaj specifies whether A is + * stored in column or row major order. Note that if iscolmaj==1 + * this function modifies A! + * + * The function returns 0 in case of error, 1 if successfull + * + * This function is often called repetitively to solve problems of identical + * dimensions. To avoid repetitive malloc's and free's, allocated memory is + * retained between calls and free'd-malloc'ed when not of the appropriate size. + * A call with NULL as the first argument forces this memory to be released. + */ +int sba_Axb_QRnoQ(double *A, double *B, double *x, int m, int iscolmaj) { + static double *buf = NULL; + static int buf_sz = 0, nb = 0; + + double *a, *tau, *work; + int a_sz, tau_sz, tot_sz; + register int i, j; + int info, worksz, nrhs = 1; + register double sum; + + if (A == NULL) { + if (buf) + free(buf); + buf = NULL; + buf_sz = 0; + + return 1; + } + + /* calculate required memory size */ + a_sz = (iscolmaj) ? 0 : m * m; + tau_sz = m; + if (!nb) { +#ifndef SBA_LS_SCARCE_MEMORY + double tmp; + + worksz = -1; // workspace query; optimal size is returned in tmp + F77_FUNC(dgeqrf)((int *)&m, (int *)&m, NULL, (int *)&m, NULL, (double *)&tmp, (int *)&worksz, (int *)&info); + nb = ((int)tmp) / m; // optimal worksize is m*nb +#else + nb = 1; // min worksize is m +#endif /* SBA_LS_SCARCE_MEMORY */ + } + worksz = nb * m; + tot_sz = a_sz + tau_sz + worksz; + + if (tot_sz > buf_sz) { /* insufficient memory, allocate a "big" memory chunk at once */ + if (buf) + free(buf); /* free previously allocated memory */ + + buf_sz = tot_sz; + buf = (double *)malloc(buf_sz * sizeof(double)); + if (!buf) { + fprintf(stderr, "memory allocation in sba_Axb_QRnoQ() failed!\n"); + exit(1); + } + } + + if (!iscolmaj) { + a = buf; + /* store A (column major!) into a */ + for (i = 0; i < m; ++i) + for (j = 0; j < m; ++j) + a[i + j * m] = A[i * m + j]; + } else + a = A; /* no copying required */ + + tau = buf + a_sz; + work = tau + tau_sz; + + /* compute A^T b in x */ + for (i = 0; i < m; ++i) { + for (j = 0, sum = 0.0; j < m; ++j) + sum += a[i * m + j] * B[j]; + x[i] = sum; + } + + /* QR decomposition of A */ + F77_FUNC(dgeqrf)((int *)&m, (int *)&m, a, (int *)&m, tau, work, (int *)&worksz, (int *)&info); + /* error treatment */ + if (info != 0) { + if (info < 0) { + fprintf(stderr, "LAPACK error: illegal value for argument %d of dgeqrf in sba_Axb_QRnoQ()\n", -info); + exit(1); + } else { + fprintf(stderr, "Unknown LAPACK error %d for dgeqrf in sba_Axb_QRnoQ()\n", info); + return 0; + } + } + + /* R is stored in the upper triangular part of a */ + + /* solve the linear system R^T y = A^t b */ + F77_FUNC(dtrtrs)("U", "T", "N", (int *)&m, (int *)&nrhs, a, (int *)&m, x, (int *)&m, &info); + /* error treatment */ + if (info != 0) { + if (info < 0) { + fprintf(stderr, "LAPACK error: illegal value for argument %d of dtrtrs in sba_Axb_QRnoQ()\n", -info); + exit(1); + } else { + fprintf(stderr, + "LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in sba_Axb_QRnoQ()\n", + info); + return 0; + } + } + + /* solve the linear system R x = y */ + F77_FUNC(dtrtrs)("U", "N", "N", (int *)&m, (int *)&nrhs, a, (int *)&m, x, (int *)&m, &info); + /* error treatment */ + if (info != 0) { + if (info < 0) { + fprintf(stderr, "LAPACK error: illegal value for argument %d of dtrtrs in sba_Axb_QRnoQ()\n", -info); + exit(1); + } else { + fprintf(stderr, + "LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in sba_Axb_QRnoQ()\n", + info); + return 0; + } + } + + return 1; +} + +/* + * This function returns the solution of Ax=b + * + * The function assumes that A is symmetric & positive definite and employs + * the Cholesky decomposition: + * If A=U^T U with U upper triangular, the system to be solved becomes + * (U^T U) x = b + * This amounts to solving U^T y = b for y and then U x = y for x + * + * A is mxm, b is mx1. Argument iscolmaj specifies whether A is + * stored in column or row major order. Note that if iscolmaj==1 + * this function modifies A! + * + * The function returns 0 in case of error, 1 if successfull + * + * This function is often called repetitively to solve problems of identical + * dimensions. To avoid repetitive malloc's and free's, allocated memory is + * retained between calls and free'd-malloc'ed when not of the appropriate size. + * A call with NULL as the first argument forces this memory to be released. + */ +int sba_Axb_Chol(double *A, double *B, double *x, int m, int iscolmaj) { + static double *buf = NULL; + static int buf_sz = 0; + + double *a; + int a_sz, tot_sz; + register int i, j; + int info, nrhs = 1; + + if (A == NULL) { + if (buf) + free(buf); + buf = NULL; + buf_sz = 0; + + return 1; + } + + /* calculate required memory size */ + a_sz = (iscolmaj) ? 0 : m * m; + tot_sz = a_sz; + + if (tot_sz > buf_sz) { /* insufficient memory, allocate a "big" memory chunk at once */ + if (buf) + free(buf); /* free previously allocated memory */ + + buf_sz = tot_sz; + buf = (double *)malloc(buf_sz * sizeof(double)); + if (!buf) { + fprintf(stderr, "memory allocation in sba_Axb_Chol() failed!\n"); + exit(1); + } + } + + if (!iscolmaj) { + a = buf; + + /* store A into a and B into x; A is assumed to be symmetric, hence + * the column and row major order representations are the same + */ + for (i = 0; i < m; ++i) { + a[i] = A[i]; + x[i] = B[i]; + } + for (j = m * m; i < j; ++i) // copy remaining rows; note that i is not re-initialized + a[i] = A[i]; + } else { /* no copying is necessary for A */ + a = A; + for (i = 0; i < m; ++i) + x[i] = B[i]; + } + + /* Cholesky decomposition of A */ + // F77_FUNC(dpotf2)("U", (int *)&m, a, (int *)&m, (int *)&info); + F77_FUNC(dpotrf)("U", (int *)&m, a, (int *)&m, (int *)&info); + /* error treatment */ + if (info != 0) { + if (info < 0) { + fprintf(stderr, "LAPACK error: illegal value for argument %d of dpotf2/dpotrf in sba_Axb_Chol()\n", -info); + exit(1); + } else { + fprintf(stderr, "LAPACK error: the leading minor of order %d is not positive definite,\nthe factorization " + "could not be completed for dpotf2/dpotrf in sba_Axb_Chol()\n", + info); + return 0; + } + } + +/* below are two alternative ways for solving the linear system: */ +#if 1 + /* use the computed Cholesky in one lapack call */ + F77_FUNC(dpotrs)("U", (int *)&m, (int *)&nrhs, a, (int *)&m, x, (int *)&m, &info); + if (info < 0) { + fprintf(stderr, "LAPACK error: illegal value for argument %d of dpotrs in sba_Axb_Chol()\n", -info); + exit(1); + } +#else + /* solve the linear systems U^T y = b, U x = y */ + F77_FUNC(dtrtrs)("U", "T", "N", (int *)&m, (int *)&nrhs, a, (int *)&m, x, (int *)&m, &info); + /* error treatment */ + if (info != 0) { + if (info < 0) { + fprintf(stderr, "LAPACK error: illegal value for argument %d of dtrtrs in sba_Axb_Chol()\n", -info); + exit(1); + } else { + fprintf(stderr, + "LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in sba_Axb_Chol()\n", + info); + return 0; + } + } + + /* solve U x = y */ + F77_FUNC(dtrtrs)("U", "N", "N", (int *)&m, (int *)&nrhs, a, (int *)&m, x, (int *)&m, &info); + /* error treatment */ + if (info != 0) { + if (info < 0) { + fprintf(stderr, "LAPACK error: illegal value for argument %d of dtrtrs in sba_Axb_Chol()\n", -info); + exit(1); + } else { + fprintf(stderr, + "LAPACK error: the %d-th diagonal element of A is zero (singular matrix) in sba_Axb_Chol()\n", + info); + return 0; + } + } +#endif /* 1 */ + + return 1; +} + +/* + * This function returns the solution of Ax = b + * + * The function employs LU decomposition: + * If A=L U with L lower and U upper triangular, then the original system + * amounts to solving + * L y = b, U x = y + * + * A is mxm, b is mx1. Argument iscolmaj specifies whether A is + * stored in column or row major order. Note that if iscolmaj==1 + * this function modifies A! + * + * The function returns 0 in case of error, + * 1 if successfull + * + * This function is often called repetitively to solve problems of identical + * dimensions. To avoid repetitive malloc's and free's, allocated memory is + * retained between calls and free'd-malloc'ed when not of the appropriate size. + * A call with NULL as the first argument forces this memory to be released. + */ +int sba_Axb_LU(double *A, double *B, double *x, int m, int iscolmaj) { + static double *buf = NULL; + static int buf_sz = 0; + + int a_sz, ipiv_sz, tot_sz; + register int i, j; + int info, *ipiv, nrhs = 1; + double *a; + + if (A == NULL) { + if (buf) + free(buf); + buf = NULL; + buf_sz = 0; + + return 1; + } + + /* calculate required memory size */ + ipiv_sz = m; + a_sz = (iscolmaj) ? 0 : m * m; + tot_sz = a_sz * sizeof(double) + + ipiv_sz * sizeof(int); /* should be arranged in that order for proper doubles alignment */ + + if (tot_sz > buf_sz) { /* insufficient memory, allocate a "big" memory chunk at once */ + if (buf) + free(buf); /* free previously allocated memory */ + + buf_sz = tot_sz; + buf = (double *)malloc(buf_sz); + if (!buf) { + fprintf(stderr, "memory allocation in sba_Axb_LU() failed!\n"); + exit(1); + } + } + + if (!iscolmaj) { + a = buf; + ipiv = (int *)(a + a_sz); + + /* store A (column major!) into a and B into x */ + for (i = 0; i < m; ++i) { + for (j = 0; j < m; ++j) + a[i + j * m] = A[i * m + j]; + + x[i] = B[i]; + } + } else { /* no copying is necessary for A */ + a = A; + for (i = 0; i < m; ++i) + x[i] = B[i]; + ipiv = (int *)buf; + } + + /* LU decomposition for A */ + F77_FUNC(dgetrf)((int *)&m, (int *)&m, a, (int *)&m, ipiv, (int *)&info); + if (info != 0) { + if (info < 0) { + fprintf(stderr, "argument %d of dgetrf illegal in sba_Axb_LU()\n", -info); + exit(1); + } else { + fprintf(stderr, "singular matrix A for dgetrf in sba_Axb_LU()\n"); + return 0; + } + } + + /* solve the system with the computed LU */ + F77_FUNC(dgetrs)("N", (int *)&m, (int *)&nrhs, a, (int *)&m, ipiv, x, (int *)&m, (int *)&info); + if (info != 0) { + if (info < 0) { + fprintf(stderr, "argument %d of dgetrs illegal in sba_Axb_LU()\n", -info); + exit(1); + } else { + fprintf(stderr, "unknown error for dgetrs in sba_Axb_LU()\n"); + return 0; + } + } + + return 1; +} + +/* + * This function returns the solution of Ax = b + * + * The function is based on SVD decomposition: + * If A=U D V^T with U, V orthogonal and D diagonal, the linear system becomes + * (U D V^T) x = b or x=V D^{-1} U^T b + * Note that V D^{-1} U^T is the pseudoinverse A^+ + * + * A is mxm, b is mx1. Argument iscolmaj specifies whether A is + * stored in column or row major order. Note that if iscolmaj==1 + * this function modifies A! + * + * The function returns 0 in case of error, 1 if successfull + * + * This function is often called repetitively to solve problems of identical + * dimensions. To avoid repetitive malloc's and free's, allocated memory is + * retained between calls and free'd-malloc'ed when not of the appropriate size. + * A call with NULL as the first argument forces this memory to be released. + */ +int sba_Axb_SVD(double *A, double *B, double *x, int m, int iscolmaj) { + static double *buf = NULL; + static int buf_sz = 0; + static double eps = -1.0; + + register int i, j; + double *a, *u, *s, *vt, *work; + int a_sz, u_sz, s_sz, vt_sz, tot_sz; + double thresh, one_over_denom; + register double sum; + int info, rank, worksz, *iwork, iworksz; + + if (A == NULL) { + if (buf) + free(buf); + buf = NULL; + buf_sz = 0; + + return 1; + } + +/* calculate required memory size */ +#ifndef SBA_LS_SCARCE_MEMORY + worksz = -1; // workspace query. Keep in mind that dgesdd requires more memory than dgesvd + /* note that optimal work size is returned in thresh */ + F77_FUNC(dgesdd) + ("A", (int *)&m, (int *)&m, NULL, (int *)&m, NULL, NULL, (int *)&m, NULL, (int *)&m, (double *)&thresh, + (int *)&worksz, NULL, &info); + /* F77_FUNC(dgesvd)("A", "A", (int *)&m, (int *)&m, NULL, (int *)&m, NULL, NULL, (int *)&m, NULL, (int *)&m, + (double *)&thresh, (int *)&worksz, &info); */ + worksz = (int)thresh; +#else + worksz = m * (7 * m + 4); // min worksize for dgesdd + // worksz=5*m; // min worksize for dgesvd +#endif /* SBA_LS_SCARCE_MEMORY */ + iworksz = 8 * m; + a_sz = (!iscolmaj) ? m * m : 0; + u_sz = m * m; + s_sz = m; + vt_sz = m * m; + + tot_sz = (a_sz + u_sz + s_sz + vt_sz + worksz) * sizeof(double) + + iworksz * sizeof(int); /* should be arranged in that order for proper doubles alignment */ + + if (tot_sz > buf_sz) { /* insufficient memory, allocate a "big" memory chunk at once */ + if (buf) + free(buf); /* free previously allocated memory */ + + buf_sz = tot_sz; + buf = (double *)malloc(buf_sz); + if (!buf) { + fprintf(stderr, "memory allocation in sba_Axb_SVD() failed!\n"); + exit(1); + } + } + + if (!iscolmaj) { + a = buf; + u = a + a_sz; + /* store A (column major!) into a */ + for (i = 0; i < m; ++i) + for (j = 0; j < m; ++j) + a[i + j * m] = A[i * m + j]; + } else { + a = A; /* no copying required */ + u = buf; + } + + s = u + u_sz; + vt = s + s_sz; + work = vt + vt_sz; + iwork = (int *)(work + worksz); + + /* SVD decomposition of A */ + F77_FUNC(dgesdd) + ("A", (int *)&m, (int *)&m, a, (int *)&m, s, u, (int *)&m, vt, (int *)&m, work, (int *)&worksz, iwork, &info); + // F77_FUNC(dgesvd)("A", "A", (int *)&m, (int *)&m, a, (int *)&m, s, u, (int *)&m, vt, (int *)&m, work, (int + // *)&worksz, &info); + + /* error treatment */ + if (info != 0) { + if (info < 0) { + fprintf(stderr, "LAPACK error: illegal value for argument %d of dgesdd/dgesvd in sba_Axb_SVD()\n", -info); + exit(1); + } else { + fprintf(stderr, + "LAPACK error: dgesdd (dbdsdc)/dgesvd (dbdsqr) failed to converge in sba_Axb_SVD() [info=%d]\n", + info); + + return 0; + } + } + + if (eps < 0.0) { + double aux; + + /* compute machine epsilon. DBL_EPSILON should do also */ + for (eps = 1.0; aux = eps + 1.0, aux - 1.0 > 0.0; eps *= 0.5) + ; + eps *= 2.0; + } + + /* compute the pseudoinverse in a */ + memset(a, 0, m * m * sizeof(double)); /* initialize to zero */ + for (rank = 0, thresh = eps * s[0]; rank < m && s[rank] > thresh; ++rank) { + one_over_denom = 1.0 / s[rank]; + + for (j = 0; j < m; ++j) + for (i = 0; i < m; ++i) + a[i * m + j] += vt[rank + i * m] * u[j + rank * m] * one_over_denom; + } + + /* compute A^+ b in x */ + for (i = 0; i < m; ++i) { + for (j = 0, sum = 0.0; j < m; ++j) + sum += a[i * m + j] * B[j]; + x[i] = sum; + } + + return 1; +} + +/* + * This function returns the solution of Ax = b for a real symmetric matrix A + * + * The function is based on UDUT factorization with the pivoting + * strategy of Bunch and Kaufman: + * A is factored as U*D*U^T where U is upper triangular and + * D symmetric and block diagonal (aka spectral decomposition, + * Banachiewicz factorization, modified Cholesky factorization) + * + * A is mxm, b is mx1. Argument iscolmaj specifies whether A is + * stored in column or row major order. Note that if iscolmaj==1 + * this function modifies A! + * + * The function returns 0 in case of error, + * 1 if successfull + * + * This function is often called repetitively to solve problems of identical + * dimensions. To avoid repetitive malloc's and free's, allocated memory is + * retained between calls and free'd-malloc'ed when not of the appropriate size. + * A call with NULL as the first argument forces this memory to be released. + */ +int sba_Axb_BK(double *A, double *B, double *x, int m, int iscolmaj) { + static double *buf = NULL; + static int buf_sz = 0, nb = 0; + + int a_sz, ipiv_sz, work_sz, tot_sz; + register int i, j; + int info, *ipiv, nrhs = 1; + double *a, *work; + + if (A == NULL) { + if (buf) + free(buf); + buf = NULL; + buf_sz = 0; + + return 1; + } + + /* calculate required memory size */ + ipiv_sz = m; + a_sz = (iscolmaj) ? 0 : m * m; + if (!nb) { +#ifndef SBA_LS_SCARCE_MEMORY + double tmp; + + work_sz = -1; // workspace query; optimal size is returned in tmp + F77_FUNC(dsytrf)("U", (int *)&m, NULL, (int *)&m, NULL, (double *)&tmp, (int *)&work_sz, (int *)&info); + nb = ((int)tmp) / m; // optimal worksize is m*nb +#else + nb = -1; // min worksize is 1 +#endif /* SBA_LS_SCARCE_MEMORY */ + } + work_sz = (nb != -1) ? nb * m : 1; + tot_sz = (a_sz + work_sz) * sizeof(double) + + ipiv_sz * sizeof(int); /* should be arranged in that order for proper doubles alignment */ + + if (tot_sz > buf_sz) { /* insufficient memory, allocate a "big" memory chunk at once */ + if (buf) + free(buf); /* free previously allocated memory */ + + buf_sz = tot_sz; + buf = (double *)malloc(buf_sz); + if (!buf) { + fprintf(stderr, "memory allocation in sba_Axb_BK() failed!\n"); + exit(1); + } + } + + if (!iscolmaj) { + a = buf; + work = a + a_sz; + + /* store A into a and B into x; A is assumed to be symmetric, hence + * the column and row major order representations are the same + */ + for (i = 0; i < m; ++i) { + a[i] = A[i]; + x[i] = B[i]; + } + for (j = m * m; i < j; ++i) // copy remaining rows; note that i is not re-initialized + a[i] = A[i]; + } else { /* no copying is necessary for A */ + a = A; + for (i = 0; i < m; ++i) + x[i] = B[i]; + work = buf; + } + ipiv = (int *)(work + work_sz); + + /* factorization for A */ + F77_FUNC(dsytrf)("U", (int *)&m, a, (int *)&m, ipiv, work, (int *)&work_sz, (int *)&info); + if (info != 0) { + if (info < 0) { + fprintf(stderr, "argument %d of dsytrf illegal in sba_Axb_BK()\n", -info); + exit(1); + } else { + fprintf(stderr, "singular block diagonal matrix D for dsytrf in sba_Axb_BK() [D(%d, %d) is zero]\n", info, + info); + return 0; + } + } + + /* solve the system with the computed factorization */ + F77_FUNC(dsytrs)("U", (int *)&m, (int *)&nrhs, a, (int *)&m, ipiv, x, (int *)&m, (int *)&info); + if (info != 0) { + if (info < 0) { + fprintf(stderr, "argument %d of dsytrs illegal in sba_Axb_BK()\n", -info); + exit(1); + } else { + fprintf(stderr, "unknown error for dsytrs in sba_Axb_BK()\n"); + return 0; + } + } + + return 1; +} + +/* + * This function computes the inverse of a square matrix whose upper triangle + * is stored in A into its lower triangle using LU decomposition + * + * The function returns 0 in case of error (e.g. A is singular), + * 1 if successfull + * + * This function is often called repetitively to solve problems of identical + * dimensions. To avoid repetitive malloc's and free's, allocated memory is + * retained between calls and free'd-malloc'ed when not of the appropriate size. + * A call with NULL as the first argument forces this memory to be released. + */ +int sba_symat_invert_LU(double *A, int m) { + static double *buf = NULL; + static int buf_sz = 0, nb = 0; + + int a_sz, ipiv_sz, work_sz, tot_sz; + register int i, j; + int info, *ipiv; + double *a, *work; + + if (A == NULL) { + if (buf) + free(buf); + buf = NULL; + buf_sz = 0; + + return 1; + } + + /* calculate required memory size */ + ipiv_sz = m; + a_sz = m * m; + if (!nb) { +#ifndef SBA_LS_SCARCE_MEMORY + double tmp; + + work_sz = -1; // workspace query; optimal size is returned in tmp + F77_FUNC(dgetri)((int *)&m, NULL, (int *)&m, NULL, (double *)&tmp, (int *)&work_sz, (int *)&info); + nb = ((int)tmp) / m; // optimal worksize is m*nb +#else + nb = 1; // min worksize is m +#endif /* SBA_LS_SCARCE_MEMORY */ + } + work_sz = nb * m; + tot_sz = (a_sz + work_sz) * sizeof(double) + + ipiv_sz * sizeof(int); /* should be arranged in that order for proper doubles alignment */ + + if (tot_sz > buf_sz) { /* insufficient memory, allocate a "big" memory chunk at once */ + if (buf) + free(buf); /* free previously allocated memory */ + + buf_sz = tot_sz; + buf = (double *)malloc(buf_sz); + if (!buf) { + fprintf(stderr, "memory allocation in sba_symat_invert_LU() failed!\n"); + exit(1); + } + } + + a = buf; + work = a + a_sz; + ipiv = (int *)(work + work_sz); + + /* store A (column major!) into a */ + for (i = 0; i < m; ++i) + for (j = i; j < m; ++j) + a[i + j * m] = a[j + i * m] = A[i * m + j]; // copy A's upper part to a's upper & lower + + /* LU decomposition for A */ + F77_FUNC(dgetrf)((int *)&m, (int *)&m, a, (int *)&m, ipiv, (int *)&info); + if (info != 0) { + if (info < 0) { + fprintf(stderr, "argument %d of dgetrf illegal in sba_symat_invert_LU()\n", -info); + exit(1); + } else { + fprintf(stderr, "singular matrix A for dgetrf in sba_symat_invert_LU()\n"); + return 0; + } + } + + /* (A)^{-1} from LU */ + F77_FUNC(dgetri)((int *)&m, a, (int *)&m, ipiv, work, (int *)&work_sz, (int *)&info); + if (info != 0) { + if (info < 0) { + fprintf(stderr, "argument %d of dgetri illegal in sba_symat_invert_LU()\n", -info); + exit(1); + } else { + fprintf(stderr, "singular matrix A for dgetri in sba_symat_invert_LU()\n"); + return 0; + } + } + + /* store (A)^{-1} in A's lower triangle */ + for (i = 0; i < m; ++i) + for (j = 0; j <= i; ++j) + A[i * m + j] = a[i + j * m]; + + return 1; +} + +/* + * This function computes the inverse of a square symmetric positive definite + * matrix whose upper triangle is stored in A into its lower triangle using + * Cholesky factorization + * + * The function returns 0 in case of error (e.g. A is not positive definite or singular), + * 1 if successfull + * + * This function is often called repetitively to solve problems of identical + * dimensions. To avoid repetitive malloc's and free's, allocated memory is + * retained between calls and free'd-malloc'ed when not of the appropriate size. + * A call with NULL as the first argument forces this memory to be released. + */ +int sba_symat_invert_Chol(double *A, int m) { + static double *buf = NULL; + static int buf_sz = 0; + + int a_sz, tot_sz; + register int i, j; + int info; + double *a; + + if (A == NULL) { + if (buf) + free(buf); + buf = NULL; + buf_sz = 0; + + return 1; + } + + /* calculate required memory size */ + a_sz = m * m; + tot_sz = a_sz; + + if (tot_sz > buf_sz) { /* insufficient memory, allocate a "big" memory chunk at once */ + if (buf) + free(buf); /* free previously allocated memory */ + + buf_sz = tot_sz; + buf = (double *)malloc(buf_sz * sizeof(double)); + if (!buf) { + fprintf(stderr, "memory allocation in sba_symat_invert_Chol() failed!\n"); + exit(1); + } + } + + a = (double *)buf; + + /* store A into a; A is assumed symmetric, hence no transposition is needed */ + for (i = 0, j = a_sz; i < j; ++i) + a[i] = A[i]; + + /* Cholesky factorization for A; a's lower part corresponds to A's upper */ + // F77_FUNC(dpotrf)("L", (int *)&m, a, (int *)&m, (int *)&info); + F77_FUNC(dpotf2)("L", (int *)&m, a, (int *)&m, (int *)&info); + /* error treatment */ + if (info != 0) { + if (info < 0) { + fprintf(stderr, "LAPACK error: illegal value for argument %d of dpotrf in sba_symat_invert_Chol()\n", + -info); + exit(1); + } else { + fprintf(stderr, "LAPACK error: the leading minor of order %d is not positive definite,\nthe factorization " + "could not be completed for dpotrf in sba_symat_invert_Chol()\n", + info); + return 0; + } + } + + /* (A)^{-1} from Cholesky */ + F77_FUNC(dpotri)("L", (int *)&m, a, (int *)&m, (int *)&info); + if (info != 0) { + if (info < 0) { + fprintf(stderr, "argument %d of dpotri illegal in sba_symat_invert_Chol()\n", -info); + exit(1); + } else { + fprintf(stderr, "the (%d, %d) element of the factor U or L is zero, singular matrix A for dpotri in " + "sba_symat_invert_Chol()\n", + info, info); + return 0; + } + } + + /* store (A)^{-1} in A's lower triangle. The lower triangle of the symmetric A^{-1} is in the lower triangle of a */ + for (i = 0; i < m; ++i) + for (j = 0; j <= i; ++j) + A[i * m + j] = a[i + j * m]; + + return 1; +} + +/* + * This function computes the inverse of a symmetric indefinite + * matrix whose upper triangle is stored in A into its lower triangle + * using LDLT factorization with the pivoting strategy of Bunch and Kaufman + * + * The function returns 0 in case of error (e.g. A is singular), + * 1 if successfull + * + * This function is often called repetitively to solve problems of identical + * dimensions. To avoid repetitive malloc's and free's, allocated memory is + * retained between calls and free'd-malloc'ed when not of the appropriate size. + * A call with NULL as the first argument forces this memory to be released. + */ +int sba_symat_invert_BK(double *A, int m) { + static double *buf = NULL; + static int buf_sz = 0, nb = 0; + + int a_sz, ipiv_sz, work_sz, tot_sz; + register int i, j; + int info, *ipiv; + double *a, *work; + + if (A == NULL) { + if (buf) + free(buf); + buf = NULL; + buf_sz = 0; + + return 1; + } + + /* calculate required memory size */ + ipiv_sz = m; + a_sz = m * m; + if (!nb) { +#ifndef SBA_LS_SCARCE_MEMORY + double tmp; + + work_sz = -1; // workspace query; optimal size is returned in tmp + F77_FUNC(dsytrf)("L", (int *)&m, NULL, (int *)&m, NULL, (double *)&tmp, (int *)&work_sz, (int *)&info); + nb = ((int)tmp) / m; // optimal worksize is m*nb +#else + nb = -1; // min worksize is 1 +#endif /* SBA_LS_SCARCE_MEMORY */ + } + work_sz = (nb != -1) ? nb * m : 1; + work_sz = (work_sz >= m) ? work_sz : m; /* ensure that work is at least m elements long, as required by dsytri */ + tot_sz = (a_sz + work_sz) * sizeof(double) + + ipiv_sz * sizeof(int); /* should be arranged in that order for proper doubles alignment */ + + if (tot_sz > buf_sz) { /* insufficient memory, allocate a "big" memory chunk at once */ + if (buf) + free(buf); /* free previously allocated memory */ + + buf_sz = tot_sz; + buf = (double *)malloc(buf_sz); + if (!buf) { + fprintf(stderr, "memory allocation in sba_symat_invert_BK() failed!\n"); + exit(1); + } + } + + a = buf; + work = a + a_sz; + ipiv = (int *)(work + work_sz); + + /* store A into a; A is assumed symmetric, hence no transposition is needed */ + for (i = 0, j = a_sz; i < j; ++i) + a[i] = A[i]; + + /* LDLT factorization for A; a's lower part corresponds to A's upper */ + F77_FUNC(dsytrf)("L", (int *)&m, a, (int *)&m, ipiv, work, (int *)&work_sz, (int *)&info); + if (info != 0) { + if (info < 0) { + fprintf(stderr, "argument %d of dsytrf illegal in sba_symat_invert_BK()\n", -info); + exit(1); + } else { + fprintf(stderr, + "singular block diagonal matrix D for dsytrf in sba_symat_invert_BK() [D(%d, %d) is zero]\n", info, + info); + return 0; + } + } + + /* (A)^{-1} from LDLT */ + F77_FUNC(dsytri)("L", (int *)&m, a, (int *)&m, ipiv, work, (int *)&info); + if (info != 0) { + if (info < 0) { + fprintf(stderr, "argument %d of dsytri illegal in sba_symat_invert_BK()\n", -info); + exit(1); + } else { + fprintf(stderr, + "D(%d, %d)=0, matrix is singular and its inverse could not be computed in sba_symat_invert_BK()\n", + info, info); + return 0; + } + } + + /* store (A)^{-1} in A's lower triangle. The lower triangle of the symmetric A^{-1} is in the lower triangle of a */ + for (i = 0; i < m; ++i) + for (j = 0; j <= i; ++j) + A[i * m + j] = a[i + j * m]; + + return 1; +} + +#define __CG_LINALG_BLOCKSIZE 8 + +/* Dot product of two vectors x and y using loop unrolling and blocking. + * see http://www.abarnett.demon.co.uk/tutorial.html + */ + +inline static double dprod(const int n, const double *const x, const double *const y) { + register int i, j1, j2, j3, j4, j5, j6, j7; + int blockn; + register double sum0 = 0.0, sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, sum6 = 0.0, sum7 = 0.0; + + /* n may not be divisible by __CG_LINALG_BLOCKSIZE, + * go as near as we can first, then tidy up. + */ + blockn = (n / __CG_LINALG_BLOCKSIZE) * __CG_LINALG_BLOCKSIZE; + + /* unroll the loop in blocks of `__CG_LINALG_BLOCKSIZE' */ + for (i = 0; i < blockn; i += __CG_LINALG_BLOCKSIZE) { + sum0 += x[i] * y[i]; + j1 = i + 1; + sum1 += x[j1] * y[j1]; + j2 = i + 2; + sum2 += x[j2] * y[j2]; + j3 = i + 3; + sum3 += x[j3] * y[j3]; + j4 = i + 4; + sum4 += x[j4] * y[j4]; + j5 = i + 5; + sum5 += x[j5] * y[j5]; + j6 = i + 6; + sum6 += x[j6] * y[j6]; + j7 = i + 7; + sum7 += x[j7] * y[j7]; + } + + /* + * There may be some left to do. + * This could be done as a simple for() loop, + * but a switch is faster (and more interesting) + */ + + if (i < n) { + /* Jump into the case at the place that will allow + * us to finish off the appropriate number of items. + */ + + switch (n - i) { + case 7: + sum0 += x[i] * y[i]; + ++i; + case 6: + sum1 += x[i] * y[i]; + ++i; + case 5: + sum2 += x[i] * y[i]; + ++i; + case 4: + sum3 += x[i] * y[i]; + ++i; + case 3: + sum4 += x[i] * y[i]; + ++i; + case 2: + sum5 += x[i] * y[i]; + ++i; + case 1: + sum6 += x[i] * y[i]; + ++i; + } + } + + return sum0 + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7; +} + +/* Compute z=x+a*y for two vectors x and y and a scalar a; z can be one of x, y. + * Similarly to the dot product routine, this one uses loop unrolling and blocking + */ + +inline static void daxpy(const int n, double *const z, const double *const x, const double a, const double *const y) { + register int i, j1, j2, j3, j4, j5, j6, j7; + int blockn; + + /* n may not be divisible by __CG_LINALG_BLOCKSIZE, + * go as near as we can first, then tidy up. + */ + blockn = (n / __CG_LINALG_BLOCKSIZE) * __CG_LINALG_BLOCKSIZE; + + /* unroll the loop in blocks of `__CG_LINALG_BLOCKSIZE' */ + for (i = 0; i < blockn; i += __CG_LINALG_BLOCKSIZE) { + z[i] = x[i] + a * y[i]; + j1 = i + 1; + z[j1] = x[j1] + a * y[j1]; + j2 = i + 2; + z[j2] = x[j2] + a * y[j2]; + j3 = i + 3; + z[j3] = x[j3] + a * y[j3]; + j4 = i + 4; + z[j4] = x[j4] + a * y[j4]; + j5 = i + 5; + z[j5] = x[j5] + a * y[j5]; + j6 = i + 6; + z[j6] = x[j6] + a * y[j6]; + j7 = i + 7; + z[j7] = x[j7] + a * y[j7]; + } + + /* + * There may be some left to do. + * This could be done as a simple for() loop, + * but a switch is faster (and more interesting) + */ + + if (i < n) { + /* Jump into the case at the place that will allow + * us to finish off the appropriate number of items. + */ + + switch (n - i) { + case 7: + z[i] = x[i] + a * y[i]; + ++i; + case 6: + z[i] = x[i] + a * y[i]; + ++i; + case 5: + z[i] = x[i] + a * y[i]; + ++i; + case 4: + z[i] = x[i] + a * y[i]; + ++i; + case 3: + z[i] = x[i] + a * y[i]; + ++i; + case 2: + z[i] = x[i] + a * y[i]; + ++i; + case 1: + z[i] = x[i] + a * y[i]; + ++i; + } + } +} + +/* + * This function returns the solution of Ax = b where A is posititive definite, + * based on the conjugate gradients method; see "An intro to the CG method" by J.R. Shewchuk, p. 50-51 + * + * A is mxm, b, x are is mx1. Argument niter specifies the maximum number of + * iterations and eps is the desired solution accuracy. niter<0 signals that + * x contains a valid initial approximation to the solution; if niter>0 then + * the starting point is taken to be zero. Argument prec selects the desired + * preconditioning method as follows: + * 0: no preconditioning + * 1: jacobi (diagonal) preconditioning + * 2: SSOR preconditioning + * Argument iscolmaj specifies whether A is stored in column or row major order. + * + * The function returns 0 in case of error, + * the number of iterations performed if successfull + * + * This function is often called repetitively to solve problems of identical + * dimensions. To avoid repetitive malloc's and free's, allocated memory is + * retained between calls and free'd-malloc'ed when not of the appropriate size. + * A call with NULL as the first argument forces this memory to be released. + */ +int sba_Axb_CG(double *A, double *B, double *x, int m, int niter, double eps, int prec, int iscolmaj) { + static double *buf = NULL; + static int buf_sz = 0; + + register int i, j; + register double *aim; + int iter, a_sz, res_sz, d_sz, q_sz, s_sz, wk_sz, z_sz, tot_sz; + double *a, *res, *d, *q, *s, *wk, *z; + double delta0, deltaold, deltanew, alpha, beta, eps_sq = eps * eps; + register double sum; + int rec_res; + + if (A == NULL) { + if (buf) + free(buf); + buf = NULL; + buf_sz = 0; + + return 1; + } + + /* calculate required memory size */ + a_sz = (iscolmaj) ? m * m : 0; + res_sz = m; + d_sz = m; + q_sz = m; + if (prec != SBA_CG_NOPREC) { + s_sz = m; + wk_sz = m; + z_sz = (prec == SBA_CG_SSOR) ? m : 0; + } else + s_sz = wk_sz = z_sz = 0; + + tot_sz = a_sz + res_sz + d_sz + q_sz + s_sz + wk_sz + z_sz; + + if (tot_sz > buf_sz) { /* insufficient memory, allocate a "big" memory chunk at once */ + if (buf) + free(buf); /* free previously allocated memory */ + + buf_sz = tot_sz; + buf = (double *)malloc(buf_sz * sizeof(double)); + if (!buf) { + fprintf(stderr, "memory allocation request failed in sba_Axb_CG()\n"); + exit(1); + } + } + + if (iscolmaj) { + a = buf; + /* store A (row major!) into a */ + for (i = 0; i < m; ++i) + for (j = 0, aim = a + i * m; j < m; ++j) + aim[j] = A[i + j * m]; + } else + a = A; /* no copying required */ + + res = buf + a_sz; + d = res + res_sz; + q = d + d_sz; + if (prec != SBA_CG_NOPREC) { + s = q + q_sz; + wk = s + s_sz; + z = (prec == SBA_CG_SSOR) ? wk + wk_sz : NULL; + + for (i = 0; i < m; ++i) { // compute jacobi (i.e. diagonal) preconditioners and save them in wk + sum = a[i * m + i]; + if (sum > DBL_EPSILON || -sum < -DBL_EPSILON) // != 0.0 + wk[i] = 1.0 / sum; + else + wk[i] = 1.0 / DBL_EPSILON; + } + } else { + s = res; + wk = z = NULL; + } + + if (niter > 0) { + for (i = 0; i < m; ++i) { // clear solution and initialize residual vector: res <-- B + x[i] = 0.0; + res[i] = B[i]; + } + } else { + niter = -niter; + + for (i = 0; i < m; ++i) { // initialize residual vector: res <-- B - A*x + for (j = 0, aim = a + i * m, sum = 0.0; j < m; ++j) + sum += aim[j] * x[j]; + res[i] = B[i] - sum; + } + } + + switch (prec) { + case SBA_CG_NOPREC: + for (i = 0, deltanew = 0.0; i < m; ++i) { + d[i] = res[i]; + deltanew += res[i] * res[i]; + } + break; + case SBA_CG_JACOBI: // jacobi preconditioning + for (i = 0, deltanew = 0.0; i < m; ++i) { + d[i] = res[i] * wk[i]; + deltanew += res[i] * d[i]; + } + break; + case SBA_CG_SSOR: // SSOR preconditioning; see the "templates" book, fig. 3.2, p. 44 + for (i = 0; i < m; ++i) { + for (j = 0, sum = 0.0, aim = a + i * m; j < i; ++j) + sum += aim[j] * z[j]; + z[i] = wk[i] * (res[i] - sum); + } + + for (i = m - 1; i >= 0; --i) { + for (j = i + 1, sum = 0.0, aim = a + i * m; j < m; ++j) + sum += aim[j] * d[j]; + d[i] = z[i] - wk[i] * sum; + } + deltanew = dprod(m, res, d); + break; + default: + fprintf(stderr, "unknown preconditioning option %d in sba_Axb_CG\n", prec); + exit(1); + } + + delta0 = deltanew; + + for (iter = 1; deltanew > eps_sq * delta0 && iter <= niter; ++iter) { + for (i = 0; i < m; ++i) { // q <-- A d + aim = a + i * m; + /*** + for(j=0, sum=0.0; j= 0; --i) { + for (j = i + 1, sum = 0.0, aim = a + i * m; j < m; ++j) + sum += aim[j] * s[j]; + s[i] = z[i] - wk[i] * sum; + } + break; + } + } + + deltaold = deltanew; + /*** + for(i=0, sum=0.0; ibuf_sz){ /* insufficient memory, allocate a "big" memory chunk at once */ + if(buf) free(buf); /* free previously allocated memory */ + + buf_sz=tot_sz; + buf=(double *)malloc(buf_sz); + if(!buf){ + fprintf(stderr, "memory allocation in sba_mat_cholinv() failed!\n"); + exit(1); + } + } + + a=buf; + work=a+a_sz; + ipiv=(int *)(work+work_sz); + + /* step 1: invert A */ + /* store A into a; A is assumed symmetric, hence no transposition is needed */ + for(i=0; i