9 files changed, 583 insertions, 296 deletions
diff --git a/Makefile b/Makefile
index 3e5059a..7c0b946 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,9 @@ all : lib data_recorder test calibrate calibrate_client simple_pose_test
 
 CC?=gcc
 
-CFLAGS:=-Iinclude/libsurvive -fPIC -g -O0 -Iredist -flto -DUSE_DOUBLE -std=gnu99 -rdynamic -llapacke  -lcblas -lm -fsanitize=address -fsanitize=undefined
+CFLAGS:=-Iinclude/libsurvive -fPIC -g -O0 -Iredist -flto -DUSE_DOUBLE -std=gnu99 -rdynamic -llapacke  -lcblas -lm
+CFLAGS_RELEASE:=-Iinclude/libsurvive -fPIC -msse2 -ftree-vectorize -O3 -Iredist -flto -DUSE_DOUBLE -std=gnu99 -rdynamic -llapacke  -lcblas -lm 
+
 #LDFLAGS:=-L/usr/local/lib -lpthread -lusb-1.0 -lz -lm -flto -g
 LDFLAGS:=-L/usr/local/lib -lpthread -lz -lm -flto -g
 
@@ -70,7 +72,7 @@ test : test.c ./lib/libsurvive.so redist/os_generic.o
 simple_pose_test : simple_pose_test.c ./lib/libsurvive.so redist/os_generic.o $(DRAWFUNCTIONS)
 	$(CC) -o $@ $^ $(LDFLAGS) $(CFLAGS)
 
-data_recorder : data_recorder.c ./lib/libsurvive.so redist/os_generic.c $(DRAWFUNCTIONS)
+data_recorder : data_recorder.c ./lib/libsurvive.so redist/os_generic.c
 	$(CC) -o $@ $^ $(LDFLAGS) $(CFLAGS)
 
 calibrate :  calibrate.c ./lib/libsurvive.so redist/os_generic.c $(DRAWFUNCTIONS)
@@ -83,6 +85,18 @@ calibrate_client :  calibrate_client.c ./lib/libsurvive.so redist/os_generic.c $
 static_calibrate : calibrate.c redist/os_generic.c $(DRAWFUNCTIONS) $(LIBSURVIVE_C)
 	tcc -o $@ $^ $(CFLAGS) $(LDFLAGS) -DTCC
 
+./redist/dclhelpers_debuggable.c : ./redist/dclhelpers.c ./redist/dclhelpers.h ./redist/dclapack.h
+	gcc -E ./redist/dclhelpers.c  > ./redist/dclhelpers_debuggable.c
+	clang-format -i ./redist/dclhelpers_debuggable.c
+	sed -i 's/#/\/\/#/g' ./redist/dclhelpers_debuggable.c
+
+
+test_dcl: ./redist/test_dcl.c ./redist/dclhelpers.c ./redist/dclhelpers.h ./redist/dclapack.h redist/os_generic.c ./redist/minimal_opencv.c ./src/epnp/epnp.c
+	$(CC) -o $@ $^ $(LDFLAGS) $(CFLAGS_RELEASE) -DFLT=double
+
+test_dcl_debug: ./redist/test_dcl.c ./redist/dclhelpers_debuggable.c ./redist/dclhelpers.h ./redist/dclapack.h redist/os_generic.c
+	$(CC) -o $@ $^ $(LDFLAGS) $(CFLAGS) -DFLT=double -fsanitize=address -fsanitize=undefined
+
 test_minimal_cv: ./src/epnp/test_minimal_cv.c ./lib/libsurvive.so 
 	$(CC) -o $@ $^ $(LDFLAGS) $(CFLAGS)
 
diff --git a/README.md b/README.md
index 4dce4f9..f404f39 100644
--- a/README.md
+++ b/README.md
@@ -82,6 +82,9 @@ Will ~~I~~ we succeed?  Probably not.  ~~Definitely going to try!~~ Though it's
 * Optionally OpenGL.
 On Debian, ```sudo apt-get install build-essential zlib1g-dev libx11-dev libusb-1.0-0-dev freeglut3-dev``` should be sufficient.
 
+Temporarily needed packages: liblapack and libopenblas
+On Debian, ```sudo apt-get install liblapacke-dev libopenblas-dev```
+
 ## Architecture
 
 <TABLE><TR><TH>Description</TH><TH>Diagram</TH>
diff --git a/data_recorder.c b/data_recorder.c
index 7f53d18..271b996 100644
--- a/data_recorder.c
+++ b/data_recorder.c
@@ -4,7 +4,6 @@
 #include <unistd.h>
 #endif
 
-#include <CNFGFunctions.h>
 #include <os_generic.h>
 #include <stdarg.h>
 #include <stdint.h>
@@ -25,28 +24,6 @@ struct SurviveContext *ctx;
 
 FILE *output_file = 0;
 
-void HandleKey(int keycode, int bDown) {
-	if (!bDown)
-		return;
-
-	if (keycode == 'O' || keycode == 'o') {
-		survive_send_magic(ctx, 1, 0, 0);
-	}
-	if (keycode == 'F' || keycode == 'f') {
-		survive_send_magic(ctx, 0, 0, 0);
-	}
-}
-
-void HandleButton(int x, int y, int button, int bDown) {}
-
-void HandleMotion(int x, int y, int mask) {}
-
-void HandleDestroy() {}
-
-int bufferpts[32 * 2 * 3];
-char buffermts[32 * 128 * 3];
-int buffertimeto[32 * 3];
-
 double timestamp_in_us() {
 	static double start_time_us = 0;
 	if (start_time_us == 0)
@@ -74,6 +51,7 @@ int my_config_process(SurviveObject *so, char *ct0conf, int len) {
 	fwrite("\n", 1, 1, output_file);
 	return survive_default_htc_config_process(so, ct0conf, len);
 }
+
 void my_lighthouse_process(SurviveContext *ctx, uint8_t lighthouse, SurvivePose *pose) {
 	survive_default_lighthouse_pose_process(ctx, lighthouse, pose);
 	write_to_output("%d LH_POSE %0.6f %0.6f %0.6f %0.6f %0.6f %0.6f %0.6f\n", lighthouse, pose->Pos[0], pose->Pos[1],
@@ -98,51 +76,38 @@ void my_light_process(struct SurviveObject *so, int sensor_id, int acode,
 	survive_default_light_process(so, sensor_id, acode, timeinsweep, timecode,
 								  length, lh);
 
-	if (acode == -1 || sensor_id < 0) {
+	if (acode == -1) {
 		write_to_output("%s S %d %d %d %u %u %u\n", so->codename, sensor_id, acode, timeinsweep, timecode, length, lh);
 		return;
 	}
 
-	int jumpoffset = sensor_id;
-
-	if (strcmp(so->codename, "WM0") == 0)
-		jumpoffset += 32;
-	else if (strcmp(so->codename, "WM1") == 0)
-		jumpoffset += 64;
-
 	const char *LH_ID = 0;
 	const char *LH_Axis = 0;
 
 	switch (acode) {
 	case 0:
 	case 2:
-		bufferpts[jumpoffset * 2 + 0] = (timeinsweep - 100000) / 500;
 		LH_ID = "L";
 		LH_Axis = "X";
 		break;
 	case 1:
 	case 3:
-		bufferpts[jumpoffset * 2 + 1] = (timeinsweep - 100000) / 500;
 		LH_ID = "L";
 		LH_Axis = "Y";
 		break;
 	case 4:
 	case 6:
-		bufferpts[jumpoffset * 2 + 0] = (timeinsweep - 100000) / 500;
 		LH_ID = "R";
 		LH_Axis = "X";
 		break;
 	case 5:
 	case 7:
-		bufferpts[jumpoffset * 2 + 1] = (timeinsweep - 100000) / 500;
 		LH_ID = "R";
 		LH_Axis = "Y";
 		break;
 	}
-
 	write_to_output("%s %s %s %d %d %d %u %u %u\n", so->codename, LH_ID, LH_Axis, sensor_id, acode, timeinsweep,
 					timecode, length, lh);
-	buffertimeto[jumpoffset] = 0;
 }
 
 void my_imu_process(struct SurviveObject *so, int mask, FLT *accelgyro,
@@ -152,47 +117,17 @@ void my_imu_process(struct SurviveObject *so, int mask, FLT *accelgyro,
 					accelgyro[1], accelgyro[2], accelgyro[3], accelgyro[4], accelgyro[5], id);
 }
 
-void *GuiThread(void *v) {
-	CNFGBGColor = 0x000000;
-	CNFGDialogColor = 0x444444;
-	CNFGSetup("Survive GUI Debug", 640, 480);
-
-	short screenx, screeny;
-	while (1) {
-		CNFGHandleInput();
-		CNFGClearFrame();
-		CNFGColor(0xFFFFFF);
-		CNFGGetDimensions(&screenx, &screeny);
-
-		int i;
-		for (i = 0; i < 32 * 3; i++) {
-			if (buffertimeto[i] < 50) {
-				uint32_t color = i * 3231349;
-				uint8_t r = color & 0xff;
-				uint8_t g = (color >> 8) & 0xff;
-				uint8_t b = (color >> 16) & 0xff;
-				r = (r * (5 - buffertimeto[i])) / 5;
-				g = (g * (5 - buffertimeto[i])) / 5;
-				b = (b * (5 - buffertimeto[i])) / 5;
-				CNFGColor((b << 16) | (g << 8) | r);
-				CNFGTackRectangle(bufferpts[i * 2 + 0], bufferpts[i * 2 + 1],
-								  bufferpts[i * 2 + 0] + 5,
-								  bufferpts[i * 2 + 1] + 5);
-				CNFGPenX = bufferpts[i * 2 + 0];
-				CNFGPenY = bufferpts[i * 2 + 1];
-				CNFGDrawText(buffermts, 2);
-				buffertimeto[i]++;
-			}
+int main(int argc, char **argv) {
+	if (argc > 1) {
+		output_file = fopen(argv[1], "w");
+		if (output_file == 0) {
+			fprintf(stderr, "Could not open %s for writing", argv[1]);
+			return -1;
 		}
-
-		CNFGSwapBuffers();
-		OGUSleep(10000);
+	} else {
+		output_file = stdout;
 	}
-}
-
-int SurviveThreadLoaded = 0;
 
-void *SurviveThread(void *junk) {
 	ctx = survive_init_with_config_cb(0, my_config_process);
 
 	survive_install_light_fn(ctx, my_light_process);
@@ -201,6 +136,7 @@ void *SurviveThread(void *junk) {
 	survive_install_raw_pose_fn(ctx, my_raw_pose_process);
 	survive_install_angle_fn(ctx, my_angle_process);
 	survive_install_info_fn(ctx, my_info_process);
+
 	survive_cal_install(ctx);
 
 	if (!ctx) {
@@ -208,35 +144,8 @@ void *SurviveThread(void *junk) {
 		exit(1);
 	}
 
-	SurviveThreadLoaded = 1;
-
 	while (survive_poll(ctx) == 0) {
 	}
 
 	return 0;
 }
-
-int main(int argc, char **argv) {
-	if (argc > 1) {
-		output_file = fopen(argv[1], "w");
-		if (output_file == 0) {
-			fprintf(stderr, "Could not open %s for writing", argv[1]);
-			return -1;
-		}
-	} else {
-		output_file = stdout;
-	}
-	SurviveThread(0);
-	/*
-	// Create the libsurvive thread
-	OGCreateThread(SurviveThread, 0);
-
-	// Wait for the survive thread to load
-	while (!SurviveThreadLoaded) {
-		OGUSleep(100);
-	}
-
-	// Run the Gui in the main thread
-	GuiThread(0);
-	*/
-}
diff --git a/redist/dclapack.h b/redist/dclapack.h
index 4e209d3..0950815 100644
--- a/redist/dclapack.h
+++ b/redist/dclapack.h
@@ -1,10 +1,6 @@
 #ifndef __DCLAPACK_H__
 #define __DCLAPACK_H__
 
-#ifndef ORDER
-#define ORDER 50
-#endif
-
 #ifndef FLOAT
 #define FLOAT float
 #endif
@@ -13,146 +9,207 @@
 
 #define _ABS(a)  ( (a)<=0 ? 0-(a) : (a) )
 
+// Tricky: If you want to use this with pointers, instead of 2D arrays, you will
+// need to #define DYNAMIC_INDEX, as well as, for all arrays, suffix their name
+// with 'c'
+
+#ifdef DYNAMIC_INDEX
+#define _(AM, O, P) AM[O * AM##c + P]
+#else
+#define _(AM, O, P) AM[O][P]
+#endif
+
 /*
- * Prints a matrix A (n by m)
+ * Prints a matrix A (m by n)  (with witdth Ac)
  */
-#define PRINT(A,n,m) { \
-    int i,j; \
-    printf(#A "\n"); \
-    for (i=0; i<n; i++) { \
-        for (j=0; j<m; j++) { \
-            printf("%4.3f\t", A[i][j]); \
-        } \
-        printf("\n"); \
-    } \
-    printf("\n"); \
-} 
+#define PRINT(A, m, n)                                                                                                 \
+	{                                                                                                                  \
+		printf(#A " %dx%d\n", m, n);                                                                                   \
+		for (int _i = 0; _i < (m); _i++) {                                                                             \
+			for (int _j = 0; _j < (n); _j++) {                                                                         \
+				printf("%4.3f ", _(A, _i, _j));                                                                        \
+			}                                                                                                          \
+			printf("\n");                                                                                              \
+		}                                                                                                              \
+		printf("\n");                                                                                                  \
+	}
 
 /*
- * Returns the identity matrix
+ * Returns the identity matrix (with size n x n, but width Ic)
  */
-#define IDENTITY(I,n) { \
-    int i,j; \
-    for (i=0; i<n; i++) { \
-        for (j=0; j<i; j++) { I[i][j]=0.0f; } \
-        I[i][i] = 1.0f; \
-        for (j=i+1; j<n; j++) { I[i][j]=0.0f; } \
-    } \
-}
+#define IDENTITY(I, m, n)                                                                                              \
+	{                                                                                                                  \
+		for (int _i = 0; _i < (m); _i++) {                                                                             \
+			for (int _j = 0; _j < _i; _j++) {                                                                          \
+				_(I, _i, _j) = 0.0f;                                                                                   \
+			}                                                                                                          \
+			_(I, _i, _i) = 1.0f;                                                                                       \
+			for (int _j = _i + 1; _j < (n); _j++) {                                                                    \
+				_(I, _i, _j) = 0.0f;                                                                                   \
+			}                                                                                                          \
+		}                                                                                                              \
+	}
 
 /*
- * B = Transpose(A)
- *  A is (n by m)
- *  B is (m by n)
+ * Returns the identity matrix (with size n x n, but width Ic)
  */
-#define TRANSP(A,B,n,m) { \
-   int i,j; \
-   for (i=0; i<n; i++) { \
-      for (j=0; j<m; j++) { \
-         B[j][i] = A[i][j]; \
-      } \
-   } \
-}
+#define ZERO(Z, m, n)                                                                                                  \
+	{                                                                                                                  \
+		for (int _i = 0; _i < (m); _i++)                                                                               \
+			for (int _j = 0; _j < (n); _j++)                                                                           \
+				_(Z, _i, _j) = 0.0f;                                                                                   \
+	}
+
+/*
+ * R = Transpose(A)
+ *  A is (m by n)
+ *  R is (n by m)
+ */
+#define TRANSP(R, A, m, n)                                                                                             \
+	{                                                                                                                  \
+		for (int _i = 0; _i < (m); _i++) {                                                                             \
+			for (int _j = 0; _j < (n); _j++) {                                                                         \
+				_(R, _j, _i) = _(A, _i, _j);                                                                           \
+			}                                                                                                          \
+		}                                                                                                              \
+	}
 
 /*
  * Calculate L,U of a matrix A with pivot table
  */
-#define LU(A,L,U,Piv,n) { \
-    int i,j,k,_tempi; float _tempf; \
-    for (i=0; i<n; i++) { Piv[i]=i; } \
-    for (i=0; i<n; i++) { \
-        for (j=0; j<n; j++) { \
-            U[i][j] = A[i][j]; \
-        } \
-    } \
-    IDENTITY(L,n); \
-    \
-    for (i=0; i<n-1; i++) { \
-        \
-        int max=i; \
-	for (j=i+1; j<n; j++) { \
-		if (_ABS(U[j][i]) > _ABS(U[max][i])) { max = j; } \
-	} \
-	_tempi=Piv[i]; Piv[i]=Piv[max]; Piv[max]=_tempi; \
-	for (k=i; k<n; k++) { \
-		_tempf=U[i][k]; U[i][k]=U[max][k]; U[max][k]=_tempf; \
-	} \
-	for (k=0; k<i; k++) { \
-		_tempf=L[i][k]; L[i][k]=L[max][k]; L[max][k]=_tempf; \
-	} \
-        \
-        FLOAT invDiag = 1.0 / U[i][i]; \
-        for (j=i+1; j<n; j++) { \
-            FLOAT scale = U[j][i] * invDiag; \
-            U[j][i] = 0.0; \
-            for (k=i+1; k<n; k++) { U[j][k] -= U[i][k]*scale; } \
-            L[j][i] = scale; \
-        } \
-    } \
-}
+#define LU(L, U, A, Piv, n)                                                                                            \
+	{                                                                                                                  \
+		int _i, _j, _k, _tempi;                                                                                        \
+		float _tempf;                                                                                                  \
+		for (_i = 0; _i < (n); _i++) {                                                                                 \
+			Piv[_i] = _i;                                                                                              \
+		}                                                                                                              \
+		for (_i = 0; _i < (n); _i++) {                                                                                 \
+			for (_j = 0; _j < (n); _j++) {                                                                             \
+				_(U, _i, _j) = _(A, _i, _j);                                                                           \
+			}                                                                                                          \
+		}                                                                                                              \
+		IDENTITY(L, n, n);                                                                                             \
+                                                                                                                       \
+		for (_i = 0; _i < (n)-1; _i++) {                                                                               \
+                                                                                                                       \
+			int max = _i;                                                                                              \
+			for (_j = _i + 1; _j < (n); _j++) {                                                                        \
+				if (_ABS(_(U, _j, _i)) > _ABS(_(U, max, _i))) {                                                        \
+					max = _j;                                                                                          \
+				}                                                                                                      \
+			}                                                                                                          \
+			_tempi = Piv[_i];                                                                                          \
+			Piv[_i] = Piv[max];                                                                                        \
+			Piv[max] = _tempi;                                                                                         \
+			for (_k = _i; _k < (n); _k++) {                                                                            \
+				_tempf = _(U, _i, _k);                                                                                 \
+				_(U, _i, _k) = _(U, max, _k);                                                                          \
+				_(U, max, _k) = _tempf;                                                                                \
+			}                                                                                                          \
+			for (_k = 0; _k < _i; _k++) {                                                                              \
+				_tempf = _(L, _i, _k);                                                                                 \
+				_(L, _i, _k) = _(L, max, _k);                                                                          \
+				_(L, max, _k) = _tempf;                                                                                \
+			}                                                                                                          \
+                                                                                                                       \
+			FLOAT invDiag = 1.0 / _(U, _i, _i);                                                                        \
+			for (_j = _i + 1; _j < (n); _j++) {                                                                        \
+				FLOAT scale = _(U, _j, _i) * invDiag;                                                                  \
+				_(U, _j, _i) = 0.0;                                                                                    \
+				for (_k = _i + 1; _k < (n); _k++) {                                                                    \
+					_(U, _j, _k) -= _(U, _i, _k) * scale;                                                              \
+				}                                                                                                      \
+				_(L, _j, _i) = scale;                                                                                  \
+			}                                                                                                          \
+		}                                                                                                              \
+	}
 
 /*
  * Pivots a matrix to a different matrix
- *  B = Pivot(A) given table 'Piv'
- *  A and B are (n by m)
+ *  R = Pivot(A) given table 'Piv'
+ *  A and R are (m by n)
  */
-#define PIVOT(A,B,Piv,n,m) { \
-    int i,j; \
-    for (j=0; j<n; j++) { \
-        for (i=0; i<m; i++) { \
-            B[j][i] = A[Piv[j]][i]; \
-        } \
-    } \
-}
+#define PIVOT(R, A, Piv, m, n)                                                                                         \
+	{                                                                                                                  \
+		for (int _i = 0; _i < (m); _i++) {                                                                             \
+			for (int _j = 0; _j < (n); _j++) {                                                                         \
+				_(R, _i, _j) = _(A, Piv[_i], _j);                                                                      \
+			}                                                                                                          \
+		}                                                                                                              \
+	}
 
 /*
  * Solve LX=B for matrix X and B
- *  L is n by n (lower triangular)
- *  B is n by m
+ *  L is m by m (lower triangular)
+ *  B is m by n
  */
-#define L_SUB(L,X,B,n,m) { \
-    int i,j,k; \
-    for (i=0; i<m; i++) { \
-        for (j=0; j<n; j++) { \
-            float sum=0.0; \
-            for (k=0; k<j; k++) { sum += L[j][k]*X[k][i]; } \
-            X[j][i] = (B[j][i] - sum) / L[j][j]; \
-        } \
-    } \
-}
+#define L_SUB(X, L, B, m, n)                                                                                           \
+	{                                                                                                                  \
+		for (int _i = 0; _i < (n); _i++) {                                                                             \
+			for (int _j = 0; _j < (m); _j++) {                                                                         \
+				float sum = 0.0;                                                                                       \
+				for (int _k = 0; _k < _j; _k++) {                                                                      \
+					sum += _(L, _j, _k) * _(X, _k, _i);                                                                \
+				}                                                                                                      \
+				_(X, _j, _i) = (_(B, _j, _i) - sum) / _(L, _j, _j);                                                    \
+			}                                                                                                          \
+		}                                                                                                              \
+	}
 
 /*
  * Solve UX=B for matrix X and B
- *  U is n by n (upper triangular)
- *  B is n by m
+ *  U is m by m (upper triangular)
+ *  B is m by n
  */
-#define U_SUB(U,X,B,n,m) { \
-    int i,j,k; \
-    for (i=0; i<m; i++) { \
-        for (j=n-1; j>=0; j--) { \
-            float sum=0.0; \
-            for (k=n-1; k>j; k--) { sum += U[j][k]*X[k][i]; } \
-            X[j][i] = (B[j][i] - sum) / U[j][j]; \
-        } \
-    } \
-}
+#define U_SUB(X, U, B, m, n)                                                                                           \
+	{                                                                                                                  \
+		for (int _i = 0; _i < (n); _i++) {                                                                             \
+			for (int _j = m - 1; _j >= 0; _j--) {                                                                      \
+				float sum = 0.0;                                                                                       \
+				for (int _k = n - 1; _k > _j; _k--) {                                                                  \
+					sum += _(U, _j, _k) * _(X, _k, _i);                                                                \
+				}                                                                                                      \
+				_(X, _j, _i) = (_(B, _j, _i) - sum) / _(U, _j, _j);                                                    \
+			}                                                                                                          \
+		}                                                                                                              \
+	}
 
 /*
  * Inverts a matrix X (n by n) using the method of LU decomposition
  */
-#define INV(A,Ainv,n) { \
-    FLOAT Ipiv[ORDER][ORDER]; \
-    FLOAT L[ORDER][ORDER]; \
-    FLOAT U[ORDER][ORDER]; \
-    FLOAT I[ORDER][ORDER]; \
-    FLOAT C[ORDER][ORDER]; \
-    int Piv[ORDER]; \
-    IDENTITY(I,n); \
-    LU(A,L,U,Piv,n); \
-    PIVOT(I,Ipiv,Piv,n,n); \
-    L_SUB(L,C,Ipiv,n,n); \
-    U_SUB(U,Ainv,C,n,n); \
-}
+
+#ifdef DYNAMIC_INDEX
+#define INV_SETUP(ORDER)                                                                                               \
+	FLOAT Ipiv[ORDER * ORDER];                                                                                         \
+	const int Ipivc = ORDER;                                                                                           \
+	FLOAT L[ORDER * ORDER];                                                                                            \
+	const int Lc = ORDER;                                                                                              \
+	FLOAT U[ORDER * ORDER];                                                                                            \
+	const int Uc = ORDER;                                                                                              \
+	FLOAT I[ORDER * ORDER];                                                                                            \
+	const int Ic = ORDER;                                                                                              \
+	FLOAT C[ORDER * ORDER];                                                                                            \
+	const int Cc = ORDER;
+#else
+#define INV_SETUP(ORDER)                                                                                               \
+	FLOAT Ipiv[ORDER][ORDER];                                                                                          \
+	FLOAT L[ORDER][ORDER];                                                                                             \
+	FLOAT U[ORDER][ORDER];                                                                                             \
+	FLOAT I[ORDER][ORDER];                                                                                             \
+	FLOAT C[ORDER][ORDER];
+#endif
+
+#define INV(Ainv, A, n, ORDER)                                                                                         \
+	{                                                                                                                  \
+		INV_SETUP(ORDER)                                                                                               \
+		int Piv[ORDER];                                                                                                \
+		IDENTITY(I, n, n);                                                                                             \
+		LU(L, U, A, Piv, n);                                                                                           \
+		PIVOT(Ipiv, I, Piv, n, n);                                                                                     \
+		L_SUB(C, L, Ipiv, n, n);                                                                                       \
+		U_SUB(Ainv, U, C, n, n);                                                                                       \
+	}
 
 /*
 PRINT(A,n,n); \
@@ -165,61 +222,61 @@ PRINT(Ainv,n,n); \
 */
 
 /*
- * Matrix Multiply C = A * B
- *  A (n by m)
+ * Matrix Multiply R = A * B
+ *  R (n by p)
+ *  A (m by n)
  *  B (m by p)
- *  C (n by p)
  */
-#define MUL(A,B,C,n,m,p) { \
-    int i,j,k; \
-    for (i=0; i<n; i++) { \
-        for (j=0; j<p; j++) { \
-            C[i][j] = 0.0f; \
-            for (k=0; k<m; k++) { \
-                C[i][j] += A[i][k] * B[k][j]; \
-            } \
-        } \
-    } \
-}
+#define MUL(R, A, B, m, n, p)                                                                                          \
+	{                                                                                                                  \
+		for (int _i = 0; _i < (m); _i++) {                                                                             \
+			for (int _j = 0; _j < p; _j++) {                                                                           \
+				_(R, _i, _j) = 0.0f;                                                                                   \
+				for (int _k = 0; _k < (n); _k++) {                                                                     \
+					_(R, _i, _j) += _(A, _i, _k) * _(B, _k, _j);                                                       \
+				}                                                                                                      \
+			}                                                                                                          \
+		}                                                                                                              \
+	}
 
 /*
- * Matrix Multiply D = A * B + C
- *  A (n by m)
- *  B (m by p)
- *  C (n by p)
- *  D (n by p)
+ * Matrix Multiply R = A * B + C
+ *  R (m by p)
+ *  A (m by n)
+ *  B (n by p)
+ *  C (m by p)
  */
-#define MULADD(A,B,C,D,n,m,p) { \
-    int i,j,k; \
-    for (i=0; i<n; i++) { \
-        for (j=0; j<p; j++) { \
-            D[i][j] = C[i][j]; \
-            for (k=0; k<m; k++) { \
-                D[i][j] += A[i][k] * B[k][j]; \
-            } \
-        } \
-    } \
-}
+#define MULADD(R, A, B, C, m, n, p)                                                                                    \
+	{                                                                                                                  \
+		for (int _i = 0; _i < (m); _i++) {                                                                             \
+			for (int _j = 0; _j < p; _j++) {                                                                           \
+				_(R, _i, _j) = _(C, _i, _j);                                                                           \
+				for (int _k = 0; _k < (n); _k++) {                                                                     \
+					_(R, _i, _j) += _(A, _i, _k) * _(B, _k, _j);                                                       \
+				}                                                                                                      \
+			}                                                                                                          \
+		}                                                                                                              \
+	}
 
 /*
- * Matrix Multiply D = alpha * A * B + beta * C
- *  A (n by m)
- *  B (m by p)
- *  C (n by p)
- *  D (n by p)
+ * Matrix Multiply R = alpha * A * B + beta * C
+ *  R (m by p)
+ *  A (m by n)
+ *  B (n by p)
+ *  C (m by p)
  */
-#define GMULADD(A,B,C,D,alpha,beta,n,m,p) { \
-    int i,j,k; \
-    float sum; \
-    for (i=0; i<n; i++) { \
-        for (j=0; j<p; j++) { \
-            sum = 0.0f; \
-            for (k=0; k<m; k++) { \
-                sum += A[i][k] * B[k][j]; \
-            } \
-            D[i][j] = alpha * sum + beta * C[i][j]; \
-        } \
-    } \
-}
+#define GMULADD(R, A, B, C, alpha, beta, m, n, p)                                                                      \
+	{                                                                                                                  \
+		float sum;                                                                                                     \
+		for (int _i = 0; _i < m; _i++) {                                                                               \
+			for (int _j = 0; _j < p; _j++) {                                                                           \
+				sum = 0.0f;                                                                                            \
+				for (int _k = 0; _k < n; _k++) {                                                                       \
+					sum += _(A, _i, _k) * _(B, _k, _j);                                                                \
+				}                                                                                                      \
+				_(R, _i, _j) = alpha * sum + beta * _(C, _i, _j);                                                      \
+			}                                                                                                          \
+		}                                                                                                              \
+	}
 
 #endif
diff --git a/redist/dclhelpers.c b/redist/dclhelpers.c
new file mode 100644
index 0000000..9ed7c5f
--- /dev/null
+++ b/redist/dclhelpers.c
@@ -0,0 +1,80 @@
+#include "dclhelpers.h"
+#define FLOAT DCL_FLOAT
+#define DYNAMIC_INDEX
+#include "dclapack.h"
+#include <alloca.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+void dclPrint(const DCL_FLOAT *PMATRIX, int PMATRIXc, int n, int m) { PRINT(PMATRIX, n, m); }
+
+void dclIdentity(DCL_FLOAT *I, int Ic, int m, int n) { IDENTITY(I, m, n); }
+
+/* Returns the zero matrix */
+void dclZero(DCL_FLOAT *Z, int Zc, int m, int n) { memset(Z, 0, m * n * sizeof(DCL_FLOAT)); }
+
+void dclTransp(DCL_FLOAT *R, int Rc, const DCL_FLOAT *A, int Ac, int n, int m) { TRANSP(R, A, n, m); }
+
+void dclLU(DCL_FLOAT *L, int Lc, DCL_FLOAT *U, int Uc, const DCL_FLOAT *A, int Ac, int *Piv, int n) {
+	LU(L, U, A, Piv, n);
+}
+
+void dclPivot(DCL_FLOAT *R, int Rc, const DCL_FLOAT *A, int Ac, int *Piv, int n, int m) { PIVOT(R, A, Piv, n, m); }
+
+void dclLSub(DCL_FLOAT *X, int Xc, const DCL_FLOAT *L, int Lc, const DCL_FLOAT *B, int Bc, int n, int m) {
+	L_SUB(X, L, B, n, m);
+}
+
+void dclUSub(DCL_FLOAT *X, int Xc, const DCL_FLOAT *U, int Uc, const DCL_FLOAT *B, int Bc, int n, int m) {
+	U_SUB(X, U, B, n, m);
+}
+
+void dclInv(DCL_FLOAT *Ainv, int Ainvc, const DCL_FLOAT *A, int Ac, int n) { INV(Ainv, A, n, n); }
+
+void dclMul(DCL_FLOAT *R, int Rc, const DCL_FLOAT *A, int Ac, const DCL_FLOAT *B, int Bc, int n, int m, int p) {
+	MUL(R, A, B, n, m, p);
+}
+
+void dclMulAdd(DCL_FLOAT *R, int Rc, const DCL_FLOAT *A, int Ac, const DCL_FLOAT *B, int Bc, const DCL_FLOAT *C, int Cc,
+			   int n, int m, int p) {
+	MULADD(R, A, B, C, n, m, p);
+}
+
+void dclGMulAdd(DCL_FLOAT *R, int Rc, const DCL_FLOAT *A, int Ac, const DCL_FLOAT *B, int Bc, const DCL_FLOAT *C,
+				int Cc, DCL_FLOAT alpha, DCL_FLOAT beta, int n, int m, int p) {
+	GMULADD(R, A, B, C, alpha, beta, n, m, p);
+}
+
+/* dclGMulAdd( R, ((transA)?TRANS(A):A, (transB)?TRANS(B):B), C, alpha, beta, n, m, p ); */
+void dcldgemm(char transA, char transB, int m, int n, int k, DCL_FLOAT alpha, const DCL_FLOAT *A,
+			  int Ac, // must be n
+			  const DCL_FLOAT *B,
+			  int Bc, // must be m
+			  DCL_FLOAT beta, DCL_FLOAT *C,
+			  int Cc // must be n
+			  ) {
+	const DCL_FLOAT *ta;
+	const DCL_FLOAT *tb;
+	int tac = Ac;
+	int tbc = Bc;
+	if (transA) {
+		DCL_FLOAT *la = alloca(sizeof(DCL_FLOAT) * n * m);
+		const int lac = m;
+		TRANSP(la, A, n, m);
+		ta = la;
+		tac = lac;
+	} else
+		ta = A;
+
+	if (transB) {
+		DCL_FLOAT *lb = alloca(sizeof(DCL_FLOAT) * n * m);
+		const int lbc = m;
+		TRANSP(lb, B, n, m);
+		tb = lb;
+		tbc = lbc;
+	} else
+		tb = B;
+
+	GMULADD(C, ta, tb, C, alpha, beta, m, n, k);
+}
diff --git a/redist/dclhelpers.h b/redist/dclhelpers.h
new file mode 100644
index 0000000..00acdac
--- /dev/null
+++ b/redist/dclhelpers.h
@@ -0,0 +1,74 @@
+#ifndef _DCL_HELPERS_H
+#define _DCL_HELPERS_H
+
+#define DCL_FLOAT FLT
+
+// Use this macro to safely
+#define DMS(m) ((m)[0]), (sizeof((m)[0]) / sizeof((m)[0][0]))
+
+/* Prints matrix A of size[n][m] */
+void dclPrint(const DCL_FLOAT *A, int Ac, int n, int m);
+
+/* Returns the identity matrix */
+void dclIdentity(DCL_FLOAT *I, int Ic, int m, int n);
+
+/* Returns the zero matrix */
+void dclZero(DCL_FLOAT *I, int Ic, int m, int n);
+
+/* R = Transpose(A)
+	 A is (n by m)
+	 R is (m by n) */
+void dclTransp(DCL_FLOAT *R, int Rc, const DCL_FLOAT *A, int Ac, int n, int m);
+
+/* Calculate L,U of a matrix A with pivot table; the pivot table is output. */
+void dclLU(DCL_FLOAT *L, int Lc, DCL_FLOAT *U, int Uc, const DCL_FLOAT *A, int Ac, int *Piv, int n);
+
+/* Pivots a matrix to a different matrix
+	R = Pivot(A) given table 'Piv'
+	A and R are (n by m) */
+void dclPivot(DCL_FLOAT *R, int Rc, const DCL_FLOAT *A, int Ac, int *Piv, int n, int m);
+
+/* Solve LX=B for matrix X and B
+	L is n by n (lower triangular)
+	B is n by m */
+void dclLSub(DCL_FLOAT *X, int Xc, const DCL_FLOAT *L, int Lc, const DCL_FLOAT *B, int Bc, int n, int m);
+
+/* Solve UX=B for matrix X and B
+	U is n by n (upper triangular)
+	B is n by m */
+void dclUSub(DCL_FLOAT *X, int Xc, const DCL_FLOAT *U, int Uc, const DCL_FLOAT *B, int Bc, int n, int m);
+
+/* Inverts a matrix X (n by n) using the method of LU decomposition */
+void dclInv(DCL_FLOAT *Ainv, int Ainvc, const DCL_FLOAT *A, int Ac, int n);
+
+/* Matrix Multiply R = A * B
+	A (n by m)
+	B (m by p)
+	R (n by p) */
+void dclMul(DCL_FLOAT *R, int Rc, const DCL_FLOAT *A, int Ac, const DCL_FLOAT *B, int Bc, int n, int m, int p);
+
+/* Matrix Multiply R = A * B + C
+	A (n by m)
+	B (m by p)
+	C (n by p)
+	R (n by p) */
+void dclMulAdd(DCL_FLOAT *R, int Rc, const DCL_FLOAT *A, int Ac, const DCL_FLOAT *B, int Bc, const DCL_FLOAT *C, int Cc,
+			   int n, int m, int p);
+
+/* Matrix Multiply R = alpha * A * B + beta * C
+	A (n by m)
+	B (m by p)
+	C (n by p)
+	R (n by p) */
+void dclGMulAdd(DCL_FLOAT *R, int Rc, const DCL_FLOAT *A, int Ac, const DCL_FLOAT *B, int Bc, const DCL_FLOAT *C,
+				int Cc, DCL_FLOAT alpha, DCL_FLOAT beta, int n, int m, int p);
+
+/********************************
+ * Auxiliary functionality in C *
+ ********************************/
+
+// Matches dgemm from lapack.
+void dcldgemm(char transA, char transB, int m, int n, int k, DCL_FLOAT alpha, const DCL_FLOAT *A, int Ac,
+			  const DCL_FLOAT *B, int Bc, DCL_FLOAT beta, DCL_FLOAT *C, int Cc);
+
+#endif
diff --git a/redist/minimal_opencv.c b/redist/minimal_opencv.c
index e8a6214..50eb7df 100644
--- a/redist/minimal_opencv.c
+++ b/redist/minimal_opencv.c
@@ -44,11 +44,8 @@ void cvGEMM(const CvMat *src1, const CvMat *src2, double alpha, const CvMat *src
 		beta = 0;
 
 	cblas_dgemm(CblasRowMajor, (tABC & GEMM_1_T) ? CblasTrans : CblasNoTrans,
-				(tABC & GEMM_2_T) ? CblasTrans : CblasNoTrans, src1->rows, dst->cols, src1->cols, alpha,
-
-				src1->data.db, lda, src2->data.db, ldb, beta,
-
-				dst->data.db, dst->cols);
+				(tABC & GEMM_2_T) ? CblasTrans : CblasNoTrans, src1->rows, dst->cols, src1->cols, alpha, src1->data.db,
+				lda, src2->data.db, ldb, beta, dst->data.db, dst->cols);
 }
 
 void cvMulTransposed(const CvMat *src, CvMat *dst, int order, const CvMat *delta, double scale) {
@@ -67,11 +64,7 @@ void cvMulTransposed(const CvMat *src, CvMat *dst, int order, const CvMat *delta
 	lapack_int dstCols = dst->cols;
 
 	cblas_dgemm(CblasRowMajor, isAT ? CblasTrans : CblasNoTrans, isBT ? CblasTrans : CblasNoTrans, cols, dstCols, rows,
-				scale,
-
-				src->data.db, cols, src->data.db, cols, beta,
-
-				dst->data.db, dstCols);
+				scale, src->data.db, cols, src->data.db, cols, beta, dst->data.db, dstCols);
 }
 
 void *cvAlloc(size_t size) { return malloc(size); }
diff --git a/redist/test_dcl.c b/redist/test_dcl.c
new file mode 100644
index 0000000..ded2863
--- /dev/null
+++ b/redist/test_dcl.c
@@ -0,0 +1,155 @@
+// gcc -msse2 -O3 -ftree-vectorize test_dcl.c dclhelpers.c os_generic.c -DFLT=double -lpthread -lcblas && valgrind
+// ./a.out
+
+#include "dclhelpers.h"
+#include "os_generic.h"
+#include <assert.h>
+#include <cblas.h>
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "minimal_opencv.h"
+
+void fill_random(FLT *A, int ld, int m, int n) {
+	assert(ld == n);
+	for (int y = 0; y < m; y++)
+		for (int x = 0; x < n; x++)
+			A[y * ld + x] = (rand()) / (double)RAND_MAX;
+}
+
+void test_dcldgemm_speed(const char *name, char transA, char transB, int m, int n, int k, DCL_FLOAT alpha,
+						 const DCL_FLOAT *A, int Ac, const DCL_FLOAT *B, int Bc, DCL_FLOAT beta) {
+	printf("%s speed test:\n", name);
+	double times[2];
+	FLT emo[2][m][n];
+
+	for (int z = 0; z < 2; z++) {
+		double start = OGGetAbsoluteTime();
+		for (int i = 0; i < 100000; i++) {
+			dclZero(DMS(emo[z]), m, n);
+
+			if (z == 0) {
+				dcldgemm(transA, transB, m, n, k, alpha, A, Ac, B, Bc, beta, DMS(emo[z]));
+			} else {
+
+				cblas_dgemm(CblasRowMajor, transA == 1 ? CblasTrans : CblasNoTrans,
+							transB == 1 ? CblasTrans : CblasNoTrans, m, n, k, alpha, A, Ac, B, Bc, beta, emo[z][0], n);
+			}
+		}
+
+		times[z] = OGGetAbsoluteTime() - start;
+	}
+
+	dclPrint(DMS(emo[0]), m, n);
+	dclPrint(DMS(emo[1]), m, n);
+	printf("dcl  Elapsed: %f\n", times[0]);
+	printf("cblas Elapsed: %f\n", times[1]);
+	printf("%fx difference\n", times[0] / times[1]);
+
+	for (int i = 0; i < m; i++) {
+		for (int j = 0; j < k; j++) {
+			assert(fabs(emo[0][i][j] - emo[1][i][j]) < .00001);
+		}
+	}
+}
+
+void compareToCblas() {
+	srand(0);
+	int m = 12;
+	int n = 20;
+	int k = 20;
+
+	FLT em1[m][n];
+	FLT em2[n][k];
+
+	fill_random(DMS(em1), m, n);
+	fill_random(DMS(em2), n, k);
+
+	dclPrint(DMS(em1), m, n);
+	dclPrint(DMS(em2), n, k);
+
+	test_dcldgemm_speed("Simple", 0, 0, m, n, k, 1.0, DMS(em1), DMS(em2), .1);
+}
+
+void compareToCblasTrans() {
+	srand(0);
+	int m = 12;
+	int n = 20;
+	int k = n;
+
+	FLT em1[m][n];
+
+	fill_random(DMS(em1), m, n);
+
+	dclPrint(DMS(em1), m, n);
+
+	CvMat Em1 = cvMat(m, n, CV_64F, em1);
+	FLT em1tem1[n][n];
+	CvMat Em1tEm1 = cvMat(n, n, CV_64F, em1tem1);
+	cvMulTransposed(&Em1, &Em1tEm1, 1, 0, 1);
+	print_mat(&Em1tEm1);
+
+	test_dcldgemm_speed("Trans", 1, 0,
+						n, // # of rows in OP(A) == em1' -- 20
+						n, // # of cols in OP(B) == em1 -- 20
+						m, // # of cols in OP(A) == em1' -- 12
+						1.0,
+						DMS(em1), // Note that LD stays the same
+						DMS(em1), 0);
+}
+
+int main() {
+	FLT A[2][4] = {{0, 1, 2, 3}, {4, 5, 6, 7}};
+	FLT B[4][2];
+	dclPrint(A[0], 4, 2, 4);
+	dclTransp(B[0], 2, A[0], 4, 2, 4);
+	dclPrint(B[0], 2, 4, 2);
+
+	int i, j;
+	for (i = 0; i < 8; i++) {
+		printf("%f\n", ((float *)(B[0]))[i]);
+	}
+
+	FLT M[3][3] = {{.32, 1, 0}, {0, 1, 2}, {1, 0, 1}};
+	FLT Mo[3][3];
+	dclInv(Mo[0], 3, M[0], 3, 3);
+	dclPrint(Mo[0], 3, 3, 3);
+
+	FLT MM[3][3];
+	dclMul(MM[0], 3, M[0], 3, Mo[0], 3, 3, 3, 3);
+
+	printf("The following should be an identity matrix\n");
+	dclPrint(MM[0], 3, 3, 3);
+
+	{
+		FLT A[3][4];
+		dclIdentity(DMS(A), 3, 4);
+		dclPrint(DMS(A), 3, 4);
+
+		FLT x[4][2] = {
+			{7, -7}, {8, -8}, {9, -9}, {10, -10},
+		};
+		FLT R[4][2];
+		dclZero(DMS(R), 4, 2);
+
+		// dclMul(R, 1, A[0], 4, x, 1, 4, 1, 3);
+		dcldgemm(0, 0, 3, 4, 2, 1, A[0], 4, x[0], 2, 0, R[0], 2);
+
+		dclPrint(DMS(x), 4, 2);
+		dclPrint(DMS(R), 3, 2);
+
+		for (int j = 0; j < 2; j++) {
+			for (int i = 0; i < 3; i++) {
+				printf("[%d][%d]\n", i, j);
+				assert(R[i][j] == x[i][j]);
+			}
+
+			assert(fabs(R[3][j]) < .0000001);
+		}
+	}
+
+	compareToCblas();
+	compareToCblasTrans();
+}
diff --git a/src/poser_daveortho.c b/src/poser_daveortho.c
index 9d02bb3..2bf57c6 100644
--- a/src/poser_daveortho.c
+++ b/src/poser_daveortho.c
@@ -193,17 +193,19 @@ void OrthoSolve(
     FLT invXXt[3][3];
     FLT SXt[2][3];
     FLT M[2][3];           // Morph matrix! (2 by 3)
-    TRANSP(X,Xt,3,nPoints);
-    MUL(X,Xt,XXt,3,nPoints,3);
-    MUL(S,Xt,SXt,2,nPoints,3);
-    INV(XXt,invXXt,3);
-    MUL(SXt,invXXt,M,2,3,3);
-//PRINT(M,2,3);
+	TRANSP(Xt, X, 3, nPoints);
+	MUL(XXt, X, Xt, 3, nPoints, 3);
+	MUL(SXt, S, Xt, 2, nPoints, 3);
+	INV(invXXt, XXt, 3, 3);
+	MUL(M, SXt, invXXt, 2, 3, 3);
+	// PRINT(M,2,3);
 
-// Double checking work
-FLT S_morph[2][SENSORS_PER_OBJECT];
-MUL(M,X,S_morph,2,3,nPoints);
-for (i=0; i<nPoints; i++) { S_morph[0][i]+=sbar[0]; S_morph[1][i]+=sbar[1]; }
+	// Double checking work
+	FLT S_morph[2][SENSORS_PER_OBJECT];
+	MUL(S_morph, M, X, 2, 3, nPoints);
+	for (i = 0; i < nPoints; i++) {
+		S_morph[0][i] += sbar[0];
+		S_morph[1][i] += sbar[1]; }
 
     //--------------------
     // Solve for the non-trivial vector
@@ -218,10 +220,10 @@ for (i=0; i<nPoints; i++) { S_morph[0][i]+=sbar[0]; S_morph[1][i]+=sbar[1]; }
     FLT B[3][1] = { {0.0}, {0.0}, {1.0} };
     FLT inv_uM[3][3];
     FLT uf[3][1];
-    INV(uM,inv_uM,3);
-    MUL(inv_uM,B,uf,3,3,1);
-    
-    //--------------------
+	INV(inv_uM, uM, 3, 3);
+	MUL(uf, inv_uM, B, 3, 3, 1);
+
+	//--------------------
     // Solve for unit length vector
     //  f that goes into the camera
     //--------------------
@@ -250,10 +252,10 @@ for (i=0; i<nPoints; i++) { S_morph[0][i]+=sbar[0]; S_morph[1][i]+=sbar[1]; }
     //  uhat,rhat
     //--------------------
     FLT uhat[2][1], rhat[2][1], fhat[2][1];
-    MUL(M,f,fhat,2,3,1);
-    MUL(M,u,uhat,2,3,1);
-    MUL(M,r,rhat,2,3,1);
-    FLT fhat_len = sqrt( fhat[0][0]*fhat[0][0] + fhat[1][0]*fhat[1][0] );
+	MUL(fhat, M, f, 2, 3, 1);
+	MUL(uhat, M, u, 2, 3, 1);
+	MUL(rhat, M, r, 2, 3, 1);
+	FLT fhat_len = sqrt( fhat[0][0]*fhat[0][0] + fhat[1][0]*fhat[1][0] );
     FLT uhat_len = sqrt( uhat[0][0]*uhat[0][0] + uhat[1][0]*uhat[1][0] );
     FLT rhat_len = sqrt( rhat[0][0]*rhat[0][0] + rhat[1][0]*rhat[1][0] );
     FLT urhat_len = 0.5 * (uhat_len + rhat_len);