redist/svd.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450

/**************************************************************************
**
**  svd3
**
** Quick singular value decomposition as described by:
** A. McAdams, A. Selle, R. Tamstorf, J. Teran and E. Sifakis,
** "Computing the Singular Value Decomposition of 3x3 matrices
** with minimal branching and elementary floating point operations",
**  University of Wisconsin - Madison technical report TR1690, May 2011
**
**	OPTIMIZED CPU VERSION
** 	Implementation by: Eric Jang
**
**  13 Apr 2014
**
** This file originally retrieved from:
** https://github.com/ericjang/svd3/blob/master/svd3.h  3/26/2017
**
** Original licesnse is MIT per:
** https://github.com/ericjang/svd3/blob/master/LICENSE.md
**
** Ported from C++ to C by Mike Turvey.  All modifications also released 
** under an MIT license.
**************************************************************************/


#ifndef SVD3_H
#define SVD3_H

#define _gamma 5.828427124 // FOUR_GAMMA_SQUARED = sqrt(8)+3;
#define _cstar 0.923879532 // cos(pi/8)
#define _sstar 0.3826834323 // sin(p/8)
#define EPSILON 1e-6

#include <math.h>

/* This is a novel and fast routine for the reciprocal square root of an
IEEE float (single precision).
http://www.lomont.org/Math/Papers/2003/InvSqrt.pdf
http://playstation2-linux.com/download/p2lsd/fastrsqrt.pdf
http://www.beyond3d.com/content/articles/8/
*/
inline float rsqrt(float x) {
	// int ihalf = *(int *)&x - 0x00800000; // Alternative to next line,
	// float xhalf = *(float *)&ihalf;      // for sufficiently large nos.
	float xhalf = 0.5f*x;
	int i = *(int *)&x;          // View x as an int.
								 // i = 0x5f3759df - (i >> 1);   // Initial guess (traditional).
	i = 0x5f375a82 - (i >> 1);   // Initial guess (slightly better).
	x = *(float *)&i;            // View i as float.
	x = x*(1.5f - xhalf*x*x);    // Newton step.
								 // x = x*(1.5008908 - xhalf*x*x);  // Newton step for a balanced error.
	return x;
}

/* This is rsqrt with an additional step of the Newton iteration, for
increased accuracy. The constant 0x5f37599e makes the relative error
range from 0 to -0.00000463.
You can't balance the error by adjusting the constant. */
inline float rsqrt1(float x) {
	float xhalf = 0.5f*x;
	int i = *(int *)&x;          // View x as an int.
	i = 0x5f37599e - (i >> 1);   // Initial guess.
	x = *(float *)&i;            // View i as float.
	x = x*(1.5f - xhalf*x*x);    // Newton step.
	x = x*(1.5f - xhalf*x*x);    // Newton step again.
	return x;
}

inline float accurateSqrt(float x)
{
	return x * rsqrt1(x);
}

inline void condSwap(bool c, float *X, float *Y)
{
	// used in step 2
	float Z = *X;
	*X = c ? *Y : *X;
	*Y = c ? Z : *Y;
}

inline void condNegSwap(bool c, float *X, float *Y)
{
	// used in step 2 and 3
	float Z = -*X;
	*X = c ? *Y : *X;
	*Y = c ? Z : *Y;
}

// matrix multiplication M = A * B
inline void multAB(float a11, float a12, float a13,
	float a21, float a22, float a23,
	float a31, float a32, float a33,
	//
	float b11, float b12, float b13,
	float b21, float b22, float b23,
	float b31, float b32, float b33,
	//
	float *m11, float *m12, float *m13,
	float *m21, float *m22, float *m23,
	float *m31, float *m32, float *m33)
{

	*m11 = a11*b11 + a12*b21 + a13*b31; *m12 = a11*b12 + a12*b22 + a13*b32; *m13 = a11*b13 + a12*b23 + a13*b33;
	*m21 = a21*b11 + a22*b21 + a23*b31; *m22 = a21*b12 + a22*b22 + a23*b32; *m23 = a21*b13 + a22*b23 + a23*b33;
	*m31 = a31*b11 + a32*b21 + a33*b31; *m32 = a31*b12 + a32*b22 + a33*b32; *m33 = a31*b13 + a32*b23 + a33*b33;
}

// matrix multiplication M = Transpose[A] * B
inline void multAtB(float a11, float a12, float a13,
	float a21, float a22, float a23,
	float a31, float a32, float a33,
	//
	float b11, float b12, float b13,
	float b21, float b22, float b23,
	float b31, float b32, float b33,
	//
	float *m11, float *m12, float *m13,
	float *m21, float *m22, float *m23,
	float *m31, float *m32, float *m33)
{
	*m11 = a11*b11 + a21*b21 + a31*b31; *m12 = a11*b12 + a21*b22 + a31*b32; *m13 = a11*b13 + a21*b23 + a31*b33;
	*m21 = a12*b11 + a22*b21 + a32*b31; *m22 = a12*b12 + a22*b22 + a32*b32; *m23 = a12*b13 + a22*b23 + a32*b33;
	*m31 = a13*b11 + a23*b21 + a33*b31; *m32 = a13*b12 + a23*b22 + a33*b32; *m33 = a13*b13 + a23*b23 + a33*b33;
}

inline void quatToMat3(const float * qV,
	float *m11, float *m12, float *m13,
	float *m21, float *m22, float *m23,
	float *m31, float *m32, float *m33
)
{
	float w = qV[3];
	float x = qV[0];
	float y = qV[1];
	float z = qV[2];

	float qxx = x*x;
	float qyy = y*y;
	float qzz = z*z;
	float qxz = x*z;
	float qxy = x*y;
	float qyz = y*z;
	float qwx = w*x;
	float qwy = w*y;
	float qwz = w*z;

	*m11 = 1 - 2 * (qyy + qzz); *m12 = 2 * (qxy - qwz); *m13 = 2 * (qxz + qwy);
	*m21 = 2 * (qxy + qwz); *m22 = 1 - 2 * (qxx + qzz); *m23 = 2 * (qyz - qwx);
	*m31 = 2 * (qxz - qwy); *m32 = 2 * (qyz + qwx); *m33 = 1 - 2 * (qxx + qyy);
}

inline void approximateGivensQuaternion(float a11, float a12, float a22, float *ch, float *sh)
{
	/*
	* Given givens angle computed by approximateGivensAngles,
	* compute the corresponding rotation quaternion.
	*/
	*ch = 2 * (a11 - a22);
	*sh = a12;
	bool b = _gamma* (*sh)*(*sh) < (*ch)*(*ch);
	// fast rsqrt function suffices
	// rsqrt2 (https://code.google.com/p/lppython/source/browse/algorithm/HDcode/newCode/rsqrt.c?r=26)
	// is even faster but results in too much error
	float w = rsqrt((*ch)*(*ch) + (*sh)*(*sh));
	*ch = b ? w*(*ch) : (float)_cstar;
	*sh = b ? w*(*sh) : (float)_sstar;
}

inline void jacobiConjugation(const int x, const int y, const int z,
	float *s11,
	float *s21, float *s22,
	float *s31, float *s32, float *s33,
	float * qV)
{
	float ch, sh;
	approximateGivensQuaternion(*s11, *s21, *s22, &ch, &sh);

	float scale = ch*ch + sh*sh;
	float a = (ch*ch - sh*sh) / scale;
	float b = (2 * sh*ch) / scale;

	// make temp copy of S
	float _s11 = *s11;
	float _s21 = *s21; float _s22 = *s22;
	float _s31 = *s31; float _s32 = *s32; float _s33 = *s33;

	// perform conjugation S = Q'*S*Q
	// Q already implicitly solved from a, b
	*s11 = a*(a*_s11 + b*_s21) + b*(a*_s21 + b*_s22);
	*s21 = a*(-b*_s11 + a*_s21) + b*(-b*_s21 + a*_s22);	*s22 = -b*(-b*_s11 + a*_s21) + a*(-b*_s21 + a*_s22);
	*s31 = a*_s31 + b*_s32;								*s32 = -b*_s31 + a*_s32; *s33 = _s33;

	// update cumulative rotation qV
	float tmp[3];
	tmp[0] = qV[0] * sh;
	tmp[1] = qV[1] * sh;
	tmp[2] = qV[2] * sh;
	sh *= qV[3];

	qV[0] *= ch;
	qV[1] *= ch;
	qV[2] *= ch;
	qV[3] *= ch;

	// (x,y,z) corresponds to ((0,1,2),(1,2,0),(2,0,1))
	// for (p,q) = ((0,1),(1,2),(0,2))
	qV[z] += sh;
	qV[3] -= tmp[z]; // w
	qV[x] += tmp[y];
	qV[y] -= tmp[x];

	// re-arrange matrix for next iteration
	_s11 = *s22;
	_s21 = *s32; _s22 = *s33;
	_s31 = *s21; _s32 = *s31; _s33 = *s11;
	*s11 = _s11;
	*s21 = _s21; *s22 = _s22;
	*s31 = _s31; *s32 = _s32; *s33 = _s33;

}

inline float dist2(float x, float y, float z)
{
	return x*x + y*y + z*z;
}

// finds transformation that diagonalizes a symmetric matrix
inline void jacobiEigenanlysis( // symmetric matrix
	float *s11,
	float *s21, float *s22,
	float *s31, float *s32, float *s33,
	// quaternion representation of V
	float * qV)
{
	qV[3] = 1; qV[0] = 0; qV[1] = 0; qV[2] = 0; // follow same indexing convention as GLM
	for (int i = 0; i<4; i++)
	{
		// we wish to eliminate the maximum off-diagonal element
		// on every iteration, but cycling over all 3 possible rotations
		// in fixed order (p,q) = (1,2) , (2,3), (1,3) still retains
		//  asymptotic convergence
		jacobiConjugation(0, 1, 2, s11, s21, s22, s31, s32, s33, qV); // p,q = 0,1
		jacobiConjugation(1, 2, 0, s11, s21, s22, s31, s32, s33, qV); // p,q = 1,2
		jacobiConjugation(2, 0, 1, s11, s21, s22, s31, s32, s33, qV); // p,q = 0,2
	}
}


inline void sortSingularValues(// matrix that we want to decompose
	float *b11, float *b12, float *b13,
	float *b21, float *b22, float *b23,
	float *b31, float *b32, float *b33,
	// sort V simultaneously
	float *v11, float *v12, float *v13,
	float *v21, float *v22, float *v23,
	float *v31, float *v32, float *v33)
{
	float rho1 = dist2(*b11, *b21, *b31);
	float rho2 = dist2(*b12, *b22, *b32);
	float rho3 = dist2(*b13, *b23, *b33);
	bool c;
	c = rho1 < rho2;
	condNegSwap(c, b11, b12); condNegSwap(c, v11, v12);
	condNegSwap(c, b21, b22); condNegSwap(c, v21, v22);
	condNegSwap(c, b31, b32); condNegSwap(c, v31, v32);
	condSwap(c, &rho1, &rho2);
	c = rho1 < rho3;
	condNegSwap(c, b11, b13); condNegSwap(c, v11, v13);
	condNegSwap(c, b21, b23); condNegSwap(c, v21, v23);
	condNegSwap(c, b31, b33); condNegSwap(c, v31, v33);
	condSwap(c, &rho1, &rho3);
	c = rho2 < rho3;
	condNegSwap(c, b12, b13); condNegSwap(c, v12, v13);
	condNegSwap(c, b22, b23); condNegSwap(c, v22, v23);
	condNegSwap(c, b32, b33); condNegSwap(c, v32, v33);
}


void QRGivensQuaternion(float a1, float a2, float *ch, float *sh)
{
	// a1 = pivot point on diagonal
	// a2 = lower triangular entry we want to annihilate
	float epsilon = (float)EPSILON;
	float rho = accurateSqrt(a1*a1 + a2*a2);

	*sh = rho > epsilon ? a2 : 0;
	*ch = fabsf(a1) + fmaxf(rho, epsilon);
	bool b = a1 < 0;
	condSwap(b, sh, ch);
	float w = rsqrt((*ch)*(*ch) + (*sh)*(*sh));
	*ch *= w;
	*sh *= w;
}


inline void QRDecomposition(// matrix that we want to decompose
	float b11, float b12, float b13,
	float b21, float b22, float b23,
	float b31, float b32, float b33,
	// output Q
	float *q11, float *q12, float *q13,
	float *q21, float *q22, float *q23,
	float *q31, float *q32, float *q33,
	// output R
	float *r11, float *r12, float *r13,
	float *r21, float *r22, float *r23,
	float *r31, float *r32, float *r33)
{
	float ch1, sh1, ch2, sh2, ch3, sh3;
	float a, b;

	// first givens rotation (ch,0,0,sh)
	QRGivensQuaternion(b11, b21, &ch1, &sh1);
	a = 1 - 2 * sh1*sh1;
	b = 2 * ch1*sh1;
	// apply B = Q' * B
	*r11 = a*b11 + b*b21;  *r12 = a*b12 + b*b22;  *r13 = a*b13 + b*b23;
	*r21 = -b*b11 + a*b21; *r22 = -b*b12 + a*b22; *r23 = -b*b13 + a*b23;
	*r31 = b31;          *r32 = b32;          *r33 = b33;

	// second givens rotation (ch,0,-sh,0)
	QRGivensQuaternion(*r11, *r31, &ch2, &sh2);
	a = 1 - 2 * sh2*sh2;
	b = 2 * ch2*sh2;
	// apply B = Q' * B;
	b11 = a*(*r11) + b*(*r31);  b12 = a*(*r12) + b*(*r32);  b13 = a*(*r13) + b*(*r33);
	b21 = *r21;           b22 = *r22;           b23 = *r23;
	b31 = -b*(*r11) + a*(*r31); b32 = -b*(*r12) + a*(*r32); b33 = -b*(*r13) + a*(*r33);

	// third givens rotation (ch,sh,0,0)
	QRGivensQuaternion(b22, b32, &ch3, &sh3);
	a = 1 - 2 * sh3*sh3;
	b = 2 * ch3*sh3;
	// R is now set to desired value
	*r11 = b11;               *r12 = b12;             *r13 = b13;
	*r21 = a*b21 + b*b31;     *r22 = a*b22 + b*b32;   *r23 = a*b23 + b*b33;
	*r31 = -b*b21 + a*b31;    *r32 = -b*b22 + a*b32;  *r33 = -b*b23 + a*b33;

	// construct the cumulative rotation Q=Q1 * Q2 * Q3
	// the number of floating point operations for three quaternion multiplications
	// is more or less comparable to the explicit form of the joined matrix.
	// certainly more memory-efficient!
	float sh12 = sh1*sh1;
	float sh22 = sh2*sh2;
	float sh32 = sh3*sh3;

	*q11 = (-1 + 2 * sh12)*(-1 + 2 * sh22);
	*q12 = 4 * ch2*ch3*(-1 + 2 * sh12)*sh2*sh3 + 2 * ch1*sh1*(-1 + 2 * sh32);
	*q13 = 4 * ch1*ch3*sh1*sh3 - 2 * ch2*(-1 + 2 * sh12)*sh2*(-1 + 2 * sh32);

	*q21 = 2 * ch1*sh1*(1 - 2 * sh22);
	*q22 = -8 * ch1*ch2*ch3*sh1*sh2*sh3 + (-1 + 2 * sh12)*(-1 + 2 * sh32);
	*q23 = -2 * ch3*sh3 + 4 * sh1*(ch3*sh1*sh3 + ch1*ch2*sh2*(-1 + 2 * sh32));

	*q31 = 2 * ch2*sh2;
	*q32 = 2 * ch3*(1 - 2 * sh22)*sh3;
	*q33 = (-1 + 2 * sh22)*(-1 + 2 * sh32);
}

void svd(// input A
	float a11, float a12, float a13,
	float a21, float a22, float a23,
	float a31, float a32, float a33,
	// output U
	float *u11, float *u12, float *u13,
	float *u21, float *u22, float *u23,
	float *u31, float *u32, float *u33,
	// output S
	float *s11, float *s12, float *s13,
	float *s21, float *s22, float *s23,
	float *s31, float *s32, float *s33,
	// output V
	float *v11, float *v12, float *v13,
	float *v21, float *v22, float *v23,
	float *v31, float *v32, float *v33)
{
	// normal equations matrix
	float ATA11, ATA12, ATA13;
	float ATA21, ATA22, ATA23;
	float ATA31, ATA32, ATA33;

	multAtB(a11, a12, a13, a21, a22, a23, a31, a32, a33,
		a11, a12, a13, a21, a22, a23, a31, a32, a33,
		&ATA11, &ATA12, &ATA13, &ATA21, &ATA22, &ATA23, &ATA31, &ATA32, &ATA33);

	// symmetric eigenalysis
	float qV[4];
	jacobiEigenanlysis(&ATA11, &ATA21, &ATA22, &ATA31, &ATA32, &ATA33, qV);
	quatToMat3(qV, v11, v12, v13, v21, v22, v23, v31, v32, v33);

	float b11, b12, b13;
	float b21, b22, b23;
	float b31, b32, b33;
	multAB(a11, a12, a13, a21, a22, a23, a31, a32, a33,
		*v11, *v12, *v13, *v21, *v22, *v23, *v31, *v32, *v33,
		&b11, &b12, &b13, &b21, &b22, &b23, &b31, &b32, &b33);

	// sort singular values and find V
	sortSingularValues(&b11, &b12, &b13, &b21, &b22, &b23, &b31, &b32, &b33,
		v11, v12, v13, v21, v22, v23, v31, v32, v33);

	// QR decomposition
	QRDecomposition(b11, b12, b13, b21, b22, b23, b31, b32, b33,
		u11, u12, u13, u21, u22, u23, u31, u32, u33,
		s11, s12, s13, s21, s22, s23, s31, s32, s33
	);
}

/// polar decomposition can be reconstructed trivially from SVD result
// A = UP
void pd(float a11, float a12, float a13,
	float a21, float a22, float a23,
	float a31, float a32, float a33,
	// output U
	float *u11, float *u12, float *u13,
	float *u21, float *u22, float *u23,
	float *u31, float *u32, float *u33,
	// output P
	float *p11, float *p12, float *p13,
	float *p21, float *p22, float *p23,
	float *p31, float *p32, float *p33)
{
	float w11, w12, w13, w21, w22, w23, w31, w32, w33;
	float s11, s12, s13, s21, s22, s23, s31, s32, s33;
	float v11, v12, v13, v21, v22, v23, v31, v32, v33;

	svd(a11, a12, a13, a21, a22, a23, a31, a32, a33,
		&w11, &w12, &w13, &w21, &w22, &w23, &w31, &w32, &w33,
		&s11, &s12, &s13, &s21, &s22, &s23, &s31, &s32, &s33,
		&v11, &v12, &v13, &v21, &v22, &v23, &v31, &v32, &v33);

	// P = VSV'
	float t11, t12, t13, t21, t22, t23, t31, t32, t33;
	multAB(v11, v12, v13, v21, v22, v23, v31, v32, v33,
		s11, s12, s13, s21, s22, s23, s31, s32, s33,
		&t11, &t12, &t13, &t21, &t22, &t23, &t31, &t32, &t33);

	multAB(t11, t12, t13, t21, t22, t23, t31, t32, t33,
		v11, v21, v31, v12, v22, v32, v13, v23, v33,
		p11, p12, p13, p21, p22, p23, p31, p32, p33);

	// U = WV'
	multAB(w11, w12, w13, w21, w22, w23, w31, w32, w33,
		v11, v21, v31, v12, v22, v32, v13, v23, v33,
		u11, u12, u13, u21, u22, u23, u31, u32, u33);
}

#endif