From 9fb5cefa0819580b80b5859d38b0aa565d919c6d Mon Sep 17 00:00:00 2001 From: Ethan Smith-Coss Date: Fri, 30 May 2025 23:30:35 +0100 Subject: [PATCH] C implementation of KLT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The committed files are a fully functioning implementation of the Karhunen–Loève transform (KLT) as defined on Wikipedia, and verified using multiple languages - Python, R and MATLAB. Note. It may be common for libraries to call this algorithm 'covariance', 'eig' or similar to distiguish from the SVD alogrithm or otherwise. The source code is built entirely on the GNU GSL (2.7) library, to utilise BLAS optimised functionality. --- .gitignore | 55 ++++++++++++++++ Makefile | 64 +++++++++++++++++++ compile_flags.txt | 4 ++ include/matio.h | 19 ++++++ include/pca.h | 24 +++++++ src/main.c | 51 +++++++++++++++ src/matio.c | 44 +++++++++++++ src/pca.c | 157 ++++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 418 insertions(+) create mode 100644 .gitignore create mode 100644 Makefile create mode 100644 compile_flags.txt create mode 100644 include/matio.h create mode 100644 include/pca.h create mode 100644 src/main.c create mode 100644 src/matio.c create mode 100644 src/pca.c diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7633c34 --- /dev/null +++ b/.gitignore @@ -0,0 +1,55 @@ +# ---> C +# Prerequisites +*.d + +# Object files +*.o +*.ko +*.obj +*.elf + +# Linker output +*.ilk +*.map +*.exp + +# Precompiled Headers +*.gch +*.pch + +# Libraries +*.lib +*.a +*.la +*.lo + +# Shared objects (inc. Windows DLLs) +*.dll +*.so +*.so.* +*.dylib + +# Executables +*.exe +*.out +*.app +*.i*86 +*.x86_64 +*.hex + +# Debug files +*.dSYM/ +*.su +*.idb +*.pdb + +# Kernel Module Compile Results +*.mod* +*.cmd +.tmp_versions/ +modules.order +Module.symvers +Mkfile.old +dkms.conf + +build/ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..d16d5bd --- /dev/null +++ b/Makefile @@ -0,0 +1,64 @@ +CC = gcc +INCLUDE_DIR = ./include +SOURCE_DIR = ./src +CFLAGS += -I$(INCLUDE_DIR) -ansi -std=c99 +LIBS += -lgsl -lopenblas -lm +# Debug flags and release flags for GCC +ifeq ($(BUILD),release) +CFLAGS += -O2 -s -DNDEBUG +_BUILD_DIR = ./build/release +_OBJECT_DIR = ./obj/release +else +CFLAGS += -O0 -g +_BUILD_DIR = ./build/debug +_OBJECT_DIR = ./obj/debug +endif + +# Check for required directories for GCC outputs +ifeq ($(wildcard $(OBJECT_DIR)/.),) +$(shell mkdir -p $(OBJECT_DIR)) +endif + +# Valgrind specific flags for debugging help +VALGRIND_ARGS += -s --leak-check=full --track-origins=yes \ + --show-leak-kinds=all + +# Get the system architecture if not specified +ifeq ($(ARCH),) +ARCH = $(shell uname -m) +BUILD_DIR = $(addsuffix $(ARCH), $(_BUILD_DIR)/) +OBJECT_DIR = $(addsuffix $(ARCH), $(_OBJECT_DIR)/) +endif + +# Create directories if they don't exist +ifeq ($(wildcard $(BUILD_DIR)/.),) +$(shell mkdir -p $(BUILD_DIR)) +endif +ifeq ($(wildcard $(OBJECT_DIR)/.),) +$(shell mkdir -p $(OBJECT_DIR)) +endif + +# Binary name for executable to compile +BINARY_NAME = main +# Source list files +SRC = pca matio +OBJ = $(patsubst %,$(OBJECT_DIR)/%.o,$(SRC) $(BINARY_NAME)) + + +$(OBJECT_DIR)/%.o: $(SOURCE_DIR)/%.c + $(CC) -c -o $@ $< $(CFLAGS) + +$(BUILD_DIR)/$(BINARY_NAME).$(ARCH): $(OBJ) + gcc -std=c99 $(CFLAGS) $(OBJ) $(LIBS) -o $(BUILD_DIR)/$(BINARY_NAME).$(ARCH) + +.PHONY : release +release: + $(MAKE) "BUILD=release" + +.PHONY : clean +clean: + rm -rf $(OBJECT_DIR)/*.o build + +.PHONY : valgrind +valgrind: + valgrind $(VALGRIND_ARGS) $(BUILD_DIR)/$(BINARY_NAME).$(ARCH) diff --git a/compile_flags.txt b/compile_flags.txt new file mode 100644 index 0000000..e812261 --- /dev/null +++ b/compile_flags.txt @@ -0,0 +1,4 @@ +-Iinclude/ +-ansi +-std=c99 +-Wall diff --git a/include/matio.h b/include/matio.h new file mode 100644 index 0000000..7deae74 --- /dev/null +++ b/include/matio.h @@ -0,0 +1,19 @@ +#include +#include + +#ifndef MATIO_H +#define MATIO_H + +void +print_matrix ( gsl_matrix *matrix ); + +void +print_matrix_complex ( gsl_matrix_complex *matrix ); + +void +print_vector ( gsl_vector *vec ); + +void +print_flatten ( gsl_matrix *matrix ); + +#endif diff --git a/include/pca.h b/include/pca.h new file mode 100644 index 0000000..2579440 --- /dev/null +++ b/include/pca.h @@ -0,0 +1,24 @@ +#include +#include + + +#ifndef PCA_H +#define PCA_H + +#define PCA_AUTO_ALGORITHM -1 +#define PCA_KLT_ALGORITHM 1 +#define PCA_SVD_ALGORITHM 2 +#define PCA_ALS_ALGORITHM 3 + +#define PCA_CE_MAGIC -1 + +int +pca (gsl_matrix *A, int nc, int algorithm, gsl_matrix *m); + +int +_klt (gsl_matrix *A, gsl_vector *eval, gsl_matrix *m); + +int +_internal_eigen_sign_correction (gsl_matrix *evec); + +#endif diff --git a/src/main.c b/src/main.c new file mode 100644 index 0000000..1a9d19f --- /dev/null +++ b/src/main.c @@ -0,0 +1,51 @@ +#include +#include +/*** GSL library imports ***/ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "matio.h" +#include "pca.h" + + +#ifndef DISABLE_ERROR_HANDLER +#define DISABLE_ERROR_HANDLER 1 +#endif + + +int main(int argc, char *argv[]) +{ + if (DISABLE_ERROR_HANDLER == 0) + (void)gsl_set_error_handler_off(); + + gsl_rng *rng = gsl_rng_alloc(gsl_rng_taus); + double sds[5] = { 5,2,10, 5, 3, }; + + int col, row = 0; + + gsl_matrix *X = gsl_matrix_alloc(50, 5); + for (col = 0; col < X->size2; col++) { + for (row = 0; row < X->size1; row++) { + int num = (int)(gsl_ran_gaussian(rng, sds[col]) * 1e6); + (void)gsl_matrix_set(X, row, col, (double)(num / 1e6)); + } + } + + gsl_matrix *m_pca = gsl_matrix_alloc(X->size1, 2); + pca(X, PCA_CE_MAGIC, PCA_KLT_ALGORITHM, m_pca); + + print_matrix(m_pca); + + gsl_rng_free(rng); + gsl_matrix_free(X); + gsl_matrix_free(m_pca); + + return EXIT_SUCCESS; +} diff --git a/src/matio.c b/src/matio.c new file mode 100644 index 0000000..ace0980 --- /dev/null +++ b/src/matio.c @@ -0,0 +1,44 @@ +#include +#include + +#include "matio.h" + +void print_matrix ( gsl_matrix *matrix ) { + printf("\n"); + for ( int i = 0 ; i < matrix->size1 ; i++ ) { + for ( int j = 0 ; j < matrix->size2 ; j++ ) { + printf(" %.6f ", gsl_matrix_get(matrix, i, j)); + } + printf("\n"); + } + printf("\n"); +} + +void print_matrix_complex ( gsl_matrix_complex *matrix ) { + printf("\n"); + for ( int i = 0 ; i < matrix->size1 ; i++ ) { + for ( int j = 0 ; j < matrix->size2 ; j++ ) { + printf(" %.6g ", gsl_matrix_complex_get(matrix, i, j)); + } + printf("\n"); + } + printf("\n"); +} + +void print_vector ( gsl_vector *vec ) { + printf("\n"); + for ( int i = 0 ; i < vec->size ; i++ ) { + printf(" %.6f ", gsl_vector_get(vec, i)); + } + printf("\n"); +} + +void print_flatten ( gsl_matrix *matrix ) { + printf("\n"); + for ( int i = 0 ; i < matrix->size1 ; i++ ) { + for (int j = 0; j < matrix->size2; j++) { + printf("%g,", gsl_matrix_get(matrix, i, j)); + } + } + printf("\n"); +} diff --git a/src/pca.c b/src/pca.c new file mode 100644 index 0000000..fc5a960 --- /dev/null +++ b/src/pca.c @@ -0,0 +1,157 @@ +#include +#include + +#include +#include +#include +#include +#include + +#include "pca.h" + + +int pca (gsl_matrix *A, int nc, int algorithm, gsl_matrix *m) { + int i, exit_status = 0; + + gsl_vector *eigenvals = gsl_vector_alloc(A->size2); + gsl_matrix *components = gsl_matrix_alloc(A->size1, A->size2); + + switch (algorithm) { + case PCA_KLT_ALGORITHM: { + exit_status = _klt(A, eigenvals, components); + break; + } + default: exit_status = -1; + } + + // Eigenvector subset based on eigenvalue cumulative energy (min. 90%) + if (nc == PCA_CE_MAGIC) { + double ce = 0, gp = gsl_vector_sum(eigenvals); + for (i = 0; i < eigenvals->size; i++) { + ce += gsl_vector_get(eigenvals, i); + if (ce / gp >= 0.9) { + nc = i + 1; + break; + } + } + + // nc is different and possibly m is no longer sized correctly + gsl_matrix_free(m); + m = gsl_matrix_alloc(A->size1, nc); + } + + // select L first columns to reduce by (L < p) + gsl_vector *tmp = gsl_vector_alloc(m->size1); + for (i = 0; i < nc; i++) { + exit_status = gsl_matrix_get_col(tmp, components, i); + exit_status = gsl_matrix_set_col(m, i, tmp); + } + + gsl_vector_free(tmp); + gsl_vector_free(eigenvals); + gsl_matrix_free(components); + + + return exit_status; +} + + +int _klt (gsl_matrix *A, gsl_vector *eval, gsl_matrix *m) { + int exit_status, row, col = 0; + int size = A->size2; + + // Covariance matrix of the data + gsl_matrix *C = gsl_matrix_alloc(size, size); + exit_status = gsl_blas_dgemm(CblasTrans, + CblasNoTrans, + 1.0, + A, + A, + 0.0, + C); + + // Divide covariance matrix by 1/N-1 (Bessel's Correction) + for (col = 0; col < C->size2; col++) { + for (row = 0; row < C->size1; row++) { + gsl_matrix_set(C, + row, +col, + gsl_matrix_get(C, row, col) / (A->size1 - 1)); + } + } + + // allocate workspace for eigenvalue and eigenvector computation + gsl_eigen_symmv_workspace *eigen_work = gsl_eigen_symmv_alloc(size); + // vector/matrix allocation for eigenvalue/eigenvector + gsl_matrix *V = gsl_matrix_alloc(size, size); + + // compute the unordered eigenvalues and eigenvectors + exit_status = gsl_eigen_symmv(C, + eval, + V, + eigen_work); + + // sort the D and V in descending order of magnitude + exit_status = gsl_eigen_symmv_sort(eval, + V, + GSL_EIGEN_SORT_ABS_DESC); + + exit_status = _internal_eigen_sign_correction(V); + + // Project new data onto the new basis + exit_status = gsl_blas_dgemm(CblasNoTrans, + CblasNoTrans, + 1.0, + A, + V, + 0.0, + m); + + gsl_eigen_symmv_free(eigen_work); + gsl_matrix_free(V); + gsl_matrix_free(C); + + + return exit_status; +} + + +int _internal_eigen_sign_correction(gsl_matrix *evec) { + /* enforce eigenvector sign convention: the largest element in each column + * will have positive sign. + * @NOTE: This is not mentioned on the PCA Wiki page and it's not well + * documented. Found in the MATLAB code for `pca` as the final step. + * The sign of all the elements in the vector are flipped (vector points in + * the opposite direction). + * */ + + // argmax each column in the eigenvector matrix + size_t argmax; + double sign; + int size = evec->size2; + + gsl_vector *cols = gsl_vector_alloc(size); + gsl_matrix *sign_map = gsl_matrix_alloc(size, size); + + int col, row, j, exit_status = 0; + for (col = 0; col < evec->size2; col++) { + exit_status = gsl_matrix_get_col(cols, evec, col); + for (j = 0; j < cols->size; j++) { + gsl_vector_set(cols, + j, + fabs(gsl_vector_get(cols, j))); + } + argmax = gsl_vector_max_index(cols); + sign = gsl_matrix_get(evec, argmax, col) < 0 ? -1.0 : 1.0; + for (row = 0; row < sign_map->size1; row++) { + gsl_matrix_set(sign_map, row, col, sign); + } + } + exit_status = gsl_matrix_mul_elements(evec, sign_map); + + gsl_vector_free(cols); + gsl_matrix_free(sign_map); + + + return exit_status; +}