mmserv

Minimum Mean Square Error detection on RISC-V Vector Extention
git clone https://git.ea.contact/mmserv
Log | Files | Refs | README

commit 10633fb7052c199286994a5580317e064b582c21
parent 54f81882da338cfe10b71aa924dd0df1d73741a2
Author: Egor Achkasov <eaachkasov@edu.hse.ru>
Date:   Fri, 10 Jan 2025 11:55:09 +0100

Init x86 non-vector mmse (no nosqrt yet)

Diffstat:
Minclude/mmserv.h | 117+++++++++++++++++++++++++++++---------------------------------------------------
Mmain.c | 57++++++++++++++++++++++++++++++++++++++++++++++++---------
Msrc/mmserv.c | 700++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
3 files changed, 538 insertions(+), 336 deletions(-)

diff --git a/include/mmserv.h b/include/mmserv.h @@ -3,17 +3,16 @@ #include "define.h" -#include <stdint.h> // for int16_t, int32_t -#include <stddef.h> // for size_t +#include <stddef.h> /* for size_t */ /* * Typedefs */ typedef struct { - data_t re; - data_t im; -} complex; + data_t* re; + data_t* im; +} vcomplex; /** Calculate MMSE estimation of x in y = H*x + n * \param H matrix of channel coefficients. Shape [NUM_RX_ANT][NUM_TX_ANT][NUM_SC] @@ -22,10 +21,10 @@ typedef struct { * \param x_MMSE output MMSE estimation of x. Shape [NUM_TX_ANT][NUM_SC] */ void mmse( - IN complex H[NUM_RX_ANT][NUM_TX_ANT][NUM_SC], - IN complex y[NUM_RX_ANT][NUM_SC], - IN complex R[NUM_TX_ANT][NUM_TX_ANT][NUM_SC], - OUT complex x_MMSE[NUM_TX_ANT][NUM_SC]); + IN vcomplex *H, + IN vcomplex *y, + IN vcomplex *R, + OUT vcomplex *x_MMSE); /** Calculate MMSE estimation of x in y = H*x + n. Uses a sqrt-free Cholesky decomposition. * \param H matrix of channel coefficients. Shape [NUM_RX_ANT][NUM_TX_ANT][NUM_SC] @@ -34,42 +33,10 @@ void mmse( * \param x_MMSE output MMSE estimation of x. Shape [NUM_TX_ANT][NUM_SC] */ void mmse_nosqrt( - IN complex H[NUM_RX_ANT][NUM_TX_ANT][NUM_SC], - IN complex y[NUM_RX_ANT][NUM_SC], - IN complex R[NUM_TX_ANT][NUM_TX_ANT][NUM_SC], - OUT complex x_MMSE[NUM_TX_ANT][NUM_SC]); - -/* - * Complex operations - */ - -/** Create a complex number from real and imaginary parts */ -complex cmake(IN data_t re, IN data_t im); - -/** Complex multiplication */ -complex cmul(IN complex a, IN complex b); - -/** Complex absolute value squared */ -data_t cabs2(IN complex a); - -/** Complex addition */ -complex cadd(IN complex a, IN complex b); -void cadd_acc(IN complex a, IN complex b); - -/** Complex subtraction */ -complex csub(IN complex a, IN complex b); - -/** Complex conjugate */ -complex cconj(IN complex a); - -/** Complex division */ -complex cdiv(IN complex a, IN complex b); - -/** Square root of a natural number */ -data_t sqrt(IN data_t x); - -/** Complex root */ -complex csqrt(IN complex a); + IN vcomplex *H, + IN vcomplex *y, + IN vcomplex *R, + OUT vcomplex *x_MMSE); /* * Complex matrix operations @@ -80,16 +47,16 @@ complex csqrt(IN complex a); * \param AH output matrix. Shape [NUM_TX_ANT][NUM_RX_ANT][NUM_SC] */ void cmat_hermitian_transpose_RxTx( - IN complex A[NUM_RX_ANT][NUM_TX_ANT][NUM_SC], - OUT complex AH[NUM_TX_ANT][NUM_RX_ANT][NUM_SC]); + IN vcomplex *A, + OUT vcomplex *AH); /** Calculate the Hermitian transpose of a matrix A * \param A input matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] * \param AH output matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] */ void cmat_hermitian_transpose_TxTx( - IN complex A[NUM_TX_ANT][NUM_TX_ANT][NUM_SC], - OUT complex AH[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]); + IN vcomplex *A, + OUT vcomplex *AH); /** Multiply two matrices A and B * \param A input matrix. Shape [NUM_TX_ANT][NUM_RX_ANT][NUM_SC] @@ -97,9 +64,9 @@ void cmat_hermitian_transpose_TxTx( * \param result output matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] */ void cmatmul_TxRx_RxTx( - IN complex A[NUM_TX_ANT][NUM_RX_ANT][NUM_SC], - IN complex B[NUM_RX_ANT][NUM_TX_ANT][NUM_SC], - OUT complex result[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]); + IN vcomplex *A, + IN vcomplex *B, + OUT vcomplex *result); /** Multiply two matrices A and B * \param A input matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] @@ -107,9 +74,9 @@ void cmatmul_TxRx_RxTx( * \param result output matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] */ void cmatmul_TxTx_TxTx( - IN complex A[NUM_TX_ANT][NUM_TX_ANT][NUM_SC], - IN complex B[NUM_TX_ANT][NUM_TX_ANT][NUM_SC], - OUT complex result[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]); + IN vcomplex *A, + IN vcomplex *B, + OUT vcomplex *result); /** Add two matrices A and B * \param A input matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] @@ -117,9 +84,9 @@ void cmatmul_TxTx_TxTx( * \param result output matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] */ void cmatadd_TxTx( - IN complex A[NUM_TX_ANT][NUM_TX_ANT][NUM_SC], - IN complex B[NUM_TX_ANT][NUM_TX_ANT][NUM_SC], - OUT complex result[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]); + IN vcomplex *A, + IN vcomplex *B, + OUT vcomplex *result); /** Perfom Cholesky decomposition of a square matrix A * such that A = L*L^H with Cholesky–Banachiewicz algorithm @@ -127,8 +94,8 @@ void cmatadd_TxTx( * \param L output lower triangular matrix L. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] */ void ccholesky_TxTx( - IN complex A[NUM_TX_ANT][NUM_TX_ANT][NUM_SC], - OUT complex L[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]); + IN vcomplex *A, + OUT vcomplex *L); /** Perfom Cholesky decomposition of a square matrix A * such that A = L*D*L^H with Cholesky–Banachiewicz algorithm, @@ -137,12 +104,12 @@ void ccholesky_TxTx( * This function does not calculate use csqrt and sqrt functions. * \param A input square matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] * \param L output lower triangular matrix L. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] - * \param D output diagonal matrix L. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] + * \param D output diagonal of the diagonal matrix D. Shape [NUM_TX_ANT][NUM_SC] */ void ccholesky_nosqrt_TxTx( - IN complex A[NUM_TX_ANT][NUM_TX_ANT][NUM_SC], - OUT complex L[NUM_TX_ANT][NUM_TX_ANT][NUM_SC], - OUT complex D[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]); + IN vcomplex *A, + OUT vcomplex *L, + OUT vcomplex *D); /** Multiply a matrix A with a vector b * \param A input matrix. Shape [NUM_TX_ANT][NUM_RX_ANT][NUM_SC] @@ -150,9 +117,9 @@ void ccholesky_nosqrt_TxTx( * \param result output vector. Shape [NUM_TX_ANT][NUM_SC] */ void cmatvecmul_TxRx( - IN complex A[NUM_TX_ANT][NUM_RX_ANT][NUM_SC], - IN complex b[NUM_RX_ANT][NUM_SC], - OUT complex result[NUM_TX_ANT][NUM_SC]); + IN vcomplex *A, + IN vcomplex *b, + OUT vcomplex *result); /** Find z from L*z = b with a forward substitution * \param L lower triangular matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] @@ -160,9 +127,9 @@ void cmatvecmul_TxRx( * \param result output vector z. Shape [NUM_TX_ANT][NUM_SC] */ void cforwardsub_TxTx( - IN complex L[NUM_TX_ANT][NUM_TX_ANT][NUM_SC], - IN complex b[NUM_TX_ANT][NUM_SC], - OUT complex result[NUM_TX_ANT][NUM_SC]); + IN vcomplex *L, + IN vcomplex *b, + OUT vcomplex *result); /** Find x from U*x = b with a backward substitution * \param U upper triangular matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] @@ -170,9 +137,9 @@ void cforwardsub_TxTx( * \param result output vector x. Shape [NUM_TX_ANT][NUM_SC] */ void cbackwardsub_TxTx( - IN complex U[NUM_TX_ANT][NUM_TX_ANT][NUM_SC], - IN complex b[NUM_TX_ANT][NUM_SC], - OUT complex result[NUM_TX_ANT][NUM_SC]); + IN vcomplex *U, + IN vcomplex *b, + OUT vcomplex *result); /** Calculate mean squared error between the original x and the estimated x * \param x original x. Shape [NUM_TX_ANT][NUM_SC] @@ -180,7 +147,7 @@ void cbackwardsub_TxTx( * \return mean squared error */ acc_t mse( - IN complex x[NUM_TX_ANT][NUM_SC], - IN complex x_MMSE[NUM_TX_ANT][NUM_SC]); + IN vcomplex *x, + IN vcomplex *x_MMSE); #endif /* __MMSERV_H */ diff --git a/main.c b/main.c @@ -1,19 +1,58 @@ #include "include/mmserv.h" -#include "../common/util.h" -#include "printf.h" +#include <stdio.h> /* for printf */ + +/* Import a binary file */ +#define IMPORT_BIN(sect, file, sym) asm (\ + ".section " #sect "\n" /* Change section */\ + ".balign 4\n" /* Word alignment */\ + ".global " #sym "\n" /* Export the object address */\ + #sym ":\n" /* Define the object label */\ + ".incbin \"" file "\"\n" /* Import the file */\ + ".balign 4\n" /* Word alignment */\ + ".section \".text\"\n") /* Restore section */ + +/* Import the data */ +IMPORT_BIN(".rodata", "data/x_re.bin", x_re); +IMPORT_BIN(".rodata", "data/x_im.bin", x_im); +IMPORT_BIN(".rodata", "data/H_re.bin", H_re); +IMPORT_BIN(".rodata", "data/H_im.bin", H_im); +IMPORT_BIN(".rodata", "data/R_re.bin", R_re); +IMPORT_BIN(".rodata", "data/R_im.bin", R_im); +IMPORT_BIN(".rodata", "data/y_re.bin", y_re); +IMPORT_BIN(".rodata", "data/y_im.bin", y_im); /* Transmitted signal */ -extern complex x[NUM_TX_ANT][NUM_SC]; +extern data_t x_re[NUM_TX_ANT][NUM_SC]; +extern data_t x_im[NUM_TX_ANT][NUM_SC]; /* Channel */ -extern complex H[NUM_RX_ANT][NUM_TX_ANT][NUM_SC]; +extern data_t H_re[NUM_RX_ANT][NUM_TX_ANT][NUM_SC]; +extern data_t H_im[NUM_RX_ANT][NUM_TX_ANT][NUM_SC]; /* Noise covariance matrix */ -extern complex R[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; +extern data_t R_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; +extern data_t R_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; /* Received signal */ -extern complex y[NUM_RX_ANT][NUM_SC]; +extern data_t y_re[NUM_RX_ANT][NUM_SC]; +extern data_t y_im[NUM_RX_ANT][NUM_SC]; int main() { - complex x_MMSE[NUM_TX_ANT][NUM_SC]; - mmse(H, y, R, x_MMSE); - printf("MSE: %f\n", mse(x, x_MMSE)); + /* Cast the data to vcomplex */ + vcomplex x, H, R, y; + x.re = (data_t *)x_re; + x.im = (data_t *)x_im; + H.re = (data_t *)H_re; + H.im = (data_t *)H_im; + R.re = (data_t *)R_re; + R.im = (data_t *)R_im; + y.re = (data_t *)y_re; + y.im = (data_t *)y_im; + + /* Calculate the MMSE approximation */ + data_t x_MMSE_re[NUM_TX_ANT][NUM_SC]; + data_t x_MMSE_im[NUM_TX_ANT][NUM_SC]; + vcomplex x_MMSE; + x_MMSE.re = (data_t *)x_MMSE_re; + x_MMSE.im = (data_t *)x_MMSE_im; + mmse(&H, &y, &R, &x_MMSE); + printf("%f\n", mse(&x, &x_MMSE)); } diff --git a/src/mmserv.c b/src/mmserv.c @@ -1,13 +1,30 @@ #include "../include/mmserv.h" -#include "../../common/rivec/vector_defines.h" -#include "riscv_vector.h" +#include <stddef.h> /* for size_t */ #ifdef DEBUG -#include "../../common/runtime.h" -#include "../../common/util.h" -#include "printf.h" -#include <stdio.h> +#include <stdint.h> /* for uint64_t */ +#include <stdio.h> /* for printf */ + +#ifdef _WIN32 +#include <intrin.h> +#else +#include <x86intrin.h> +#endif + +static uint64_t start = 0, end = 0; +void start_timer() +{ + start = __rdtsc(); +} +void stop_timer() +{ + end = __rdtsc(); +} +unsigned long long get_timer() +{ + return end - start; +} #define TIME(msg, func, ...) \ start_timer(); \ @@ -24,63 +41,20 @@ * Complex functions */ -complex cmake(IN data_t re, IN data_t im) -{ - complex t; - t.re = re; - t.im = im; - return t; -} -complex cmul(IN complex a, IN complex b) -{ - complex t; - t.re = a.re * b.re - a.im * b.im; - t.im = a.im * b.re + a.re * b.im; - return t; -} -data_t cabs2(IN complex a) -{ - return a.re * a.re + a.im * a.im; -} -complex cadd(IN complex a, IN complex b) -{ - complex t; - t.re = a.re + b.re; - t.im = a.im + b.im; - return t; -} -void cadd_acc(IN complex a, IN complex b) -{ - a.re += b.re; - a.im += b.im; -} -complex csub(IN complex a, IN complex b) -{ - complex t; - t.re = a.re - b.re; - t.im = a.im - b.im; - return t; -} -complex cconj(IN complex a) -{ - complex t; - t.re = a.re; - t.im = -a.im; - return t; -} -complex cdiv(IN complex a, IN complex b) +void cdiv( + IN data_t a_re, IN data_t a_im, + IN data_t b_re, IN data_t b_im, + OUT data_t *res_re, OUT data_t *res_im) { - complex t; - t.im = b.re * b.re + b.im * b.im; - t.re = (a.re * b.re + a.im * b.im) / t.im; - t.im = (a.im * b.re - a.re * b.im) / t.im; - return t; + *res_im = b_re * b_re + b_im * b_im; + *res_re = (a_re * b_re + a_im * b_im) / *res_im; + *res_im = (a_im * b_re - a_re * b_im) / *res_im; } data_t sqrt(IN data_t x) { data_t lo, hi, mid; - data_t eps = 1e-6; + data_t eps = 1e-4; if (x < eps) return 0; @@ -93,27 +67,33 @@ data_t sqrt(IN data_t x) hi = x; } while (hi - lo > eps){ - mid = (lo + hi) / 2; + mid = (lo + hi) / 2.f; if (mid * mid > x) hi = mid; else lo = mid; } return mid; } -complex csqrt(IN complex a) +void csqrt( + IN data_t re, IN data_t im, + OUT data_t *res_re, OUT data_t *res_im) { - if (a.im == 0){ - if (a.re < 0) - return cmake(0, sqrt(-a.re)); - else - return cmake(sqrt(a.re), 0); + if (im == 0){ + if (re < 0) { + *res_re = 0; + *res_im = sqrt(-re); + return; + } + else { + *res_re = sqrt(re); + *res_im = 0; + return; + } } - data_t length = sqrt(cabs2(a)); - complex res; - res.re = sqrt((length + a.re) / 2); - res.im = sqrt((length - a.re) / 2); - if (a.im < 0) - res.re = -res.re; - return res; + data_t length = sqrt(re * re + im * im); + *res_re = sqrt((length + re) / 2.f); + *res_im = sqrt((length - re) / 2.f); + if (im < 0.f) + *res_re = -*res_re; } /* @@ -121,197 +101,357 @@ complex csqrt(IN complex a) */ void cmat_hermitian_transpose_RxTx( - IN complex A[NUM_RX_ANT][NUM_TX_ANT][NUM_SC], - OUT complex AH[NUM_TX_ANT][NUM_RX_ANT][NUM_SC]) + IN vcomplex *A, + OUT vcomplex *AH) { - uint32_t i, j, k; + size_t i, j, k, off_ijk = 0, off_jik; for (i = 0; i < NUM_RX_ANT; ++i) - for (j = 0; j < NUM_TX_ANT; ++j) - for (k = 0; k < NUM_SC; ++k) - AH[i][j][k] = cconj(A[j][i][k]); + for (j = 0; j < NUM_TX_ANT; ++j) { + off_jik = j * NUM_RX_ANT * NUM_SC + i * NUM_SC; + for (k = 0; k < NUM_SC; ++k) { + AH->re[off_jik] = A->re[off_ijk]; + AH->im[off_jik] = -A->im[off_ijk]; + ++off_ijk; + ++off_jik; + } + } } void cmat_hermitian_transpose_TxTx( - IN complex A[NUM_TX_ANT][NUM_TX_ANT][NUM_SC], - OUT complex AH[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]) + IN vcomplex *A, + OUT vcomplex *AH) { - uint32_t i, j, k; + size_t i, j, k, off_ijk = 0, off_jik; for (i = 0; i < NUM_TX_ANT; ++i) - for (j = 0; j < NUM_TX_ANT; ++j) - for (k = 0; k < NUM_SC; ++k) - AH[i][j][k] = cconj(A[j][i][k]); + for (j = 0; j < NUM_TX_ANT; ++j) { + off_jik = j * NUM_TX_ANT * NUM_SC + i * NUM_SC; + for (k = 0; k < NUM_SC; ++k) { + AH->re[off_jik] = A->re[off_ijk]; + AH->im[off_jik] = -A->im[off_ijk]; + ++off_ijk; + ++off_jik; + } + } } void cmatmul_TxRx_RxTx( - IN complex A[NUM_TX_ANT][NUM_RX_ANT][NUM_SC], - IN complex B[NUM_RX_ANT][NUM_TX_ANT][NUM_SC], - OUT complex result[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]) + IN vcomplex *A, + IN vcomplex *B, + OUT vcomplex *result) { - uint32_t i, j, k, l; - for (i = 0; i < NUM_TX_ANT; ++i) - for (j = 0; j < NUM_TX_ANT; ++j) - for (k = 0; k < NUM_SC; ++k) - result[i][j][k] = cmake(0, 0); - for (i = 0; i < NUM_TX_ANT; ++i) - for (j = 0; j < NUM_TX_ANT; ++j) - for (k = 0; k < NUM_RX_ANT; ++k) - for (l = 0; l < NUM_SC; ++l) - result[i][j][l] = cadd(result[i][j][l], cmul(A[i][k][l], B[k][j][l])); + size_t i, j, k, l; + size_t off_ijk = 0, off_ilk, off_ljk; + data_t A_re, A_im, B_re, B_im; + + for (i = 0; i < NUM_TX_ANT * NUM_TX_ANT * NUM_SC; ++i) + result->re[i] = result->im[i] = 0.f; + + for (i = 0; i != NUM_TX_ANT; ++i) { + for (j = 0; j != NUM_RX_ANT; ++j) { + for (k = 0; k != NUM_SC; ++k) { + off_ilk = i * NUM_RX_ANT * NUM_SC + k; + off_ljk = j * NUM_SC + k; + for (l = 0; l != NUM_RX_ANT; ++l) { + A_re = A->re[off_ilk]; + A_im = A->im[off_ilk]; + B_re = B->re[off_ljk]; + B_im = B->im[off_ljk]; + result->re[off_ijk] += A_re * B_re - A_im * B_im; + result->im[off_ijk] += A_re * B_im + A_im * B_re; + off_ilk += NUM_SC; + off_ljk += NUM_TX_ANT * NUM_SC; + } + ++off_ijk; + } + } + } } void cmatmul_TxTx_TxTx( - IN complex A[NUM_TX_ANT][NUM_TX_ANT][NUM_SC], - IN complex B[NUM_TX_ANT][NUM_TX_ANT][NUM_SC], - OUT complex result[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]) + IN vcomplex *A, + IN vcomplex *B, + OUT vcomplex *result) { - uint32_t i, j, k, l; - for (i = 0; i < NUM_TX_ANT; ++i) - for (j = 0; j < NUM_TX_ANT; ++j) - for (k = 0; k < NUM_SC; ++k) - result[i][j][k] = cmake(0, 0); - for (i = 0; i < NUM_TX_ANT; ++i) - for (j = 0; j < NUM_TX_ANT; ++j) - for (k = 0; k < NUM_TX_ANT; ++k) - for (l = 0; l < NUM_SC; ++l) - result[i][j][l] = cadd(result[i][j][l], cmul(A[i][k][l], B[k][j][l])); + size_t i, j, k, l; + size_t off_ijk = 0, off_ilk, off_ljk; + data_t A_re, A_im, B_re, B_im; + + for (i = 0; i < NUM_TX_ANT * NUM_TX_ANT * NUM_SC; ++i) + result->re[i] = result->im[i] = 0.f; + + for (i = 0; i != NUM_TX_ANT; ++i) { + for (j = 0; j != NUM_TX_ANT; ++j) { + for (k = 0; k != NUM_SC; ++k) { + off_ilk = i * NUM_TX_ANT * NUM_SC + k; + off_ljk = j * NUM_SC + k; + for (l = 0; l != NUM_TX_ANT; ++l) { + A_re = A->re[off_ilk]; + A_im = A->im[off_ilk]; + B_re = B->re[off_ljk]; + B_im = B->im[off_ljk]; + result->re[off_ijk] += A_re * B_re - A_im * B_im; + result->im[off_ijk] += A_re * B_im + A_im * B_re; + off_ilk += NUM_SC; + off_ljk += NUM_TX_ANT * NUM_SC; + } + ++off_ijk; + } + } + } } void cmatadd_TxTx( - IN complex A[NUM_TX_ANT][NUM_TX_ANT][NUM_SC], - IN complex B[NUM_TX_ANT][NUM_TX_ANT][NUM_SC], - OUT complex result[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]) + IN vcomplex *A, + IN vcomplex *B, + OUT vcomplex *result) { - uint32_t i, j, k; - for (i = 0; i < NUM_TX_ANT; ++i) - for (j = 0; j < NUM_TX_ANT; ++j) - for (k = 0; k < NUM_SC; ++k) - result[i][j][k] = cadd(A[i][j][k], B[i][j][k]); + size_t off_ijk = 0, off_result = 0; + for (;off_ijk != NUM_TX_ANT * NUM_TX_ANT * NUM_SC;) { + result->re[off_result] = A->re[off_ijk] + B->re[off_ijk]; + result->im[off_result] = A->im[off_ijk] + B->im[off_ijk]; + ++off_ijk; + ++off_result; + } } void ccholesky_TxTx( - IN complex A[NUM_TX_ANT][NUM_TX_ANT][NUM_SC], - OUT complex L[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]) + IN vcomplex *A, + OUT vcomplex *L) { - uint32_t i, j, k, l; - complex sum; + size_t i, j, k, l; + size_t off_ijk, off_ilk, off_jlk, off_jjk; + data_t sum_re, sum_im; + data_t t1_re, t1_im, t2_re, t2_im; + /* Init upper triangle with zeros */ for (i = 0; i < NUM_TX_ANT; ++i) - for (j = i+1; j < NUM_TX_ANT; ++j) - for (k = 0; k < NUM_SC; ++k) - L[i][j][k] = cmake(0, 0); + for (j = i+1; j < NUM_TX_ANT; ++j) { + off_ijk = i * NUM_TX_ANT * NUM_SC + j * NUM_SC; + for (k = 0; k < NUM_SC; ++k) { + L->re[off_ijk] = L->im[off_ijk] = 0; + ++off_ijk; + } + } /* Calculate diagonal and lower triangular entries */ for (i = 0; i < NUM_TX_ANT; ++i) for (j = 0; j < i+1; ++j) for (k = 0; k < NUM_SC; ++k){ - sum = A[i][j][k]; - for (l = 0; l < j; ++l) - sum = csub(sum, cmul(L[i][l][k], cconj(L[j][l][k]))); + off_ijk = i * NUM_TX_ANT * NUM_SC + j * NUM_SC + k; + sum_re = A->re[off_ijk]; + sum_im = A->im[off_ijk]; + for (l = 0; l < j; ++l) { + off_ilk = i * NUM_TX_ANT * NUM_SC + l * NUM_SC + k; + off_jlk = j * NUM_TX_ANT * NUM_SC + l * NUM_SC + k; + t1_re = L->re[off_ilk]; + t1_im = L->im[off_ilk]; + t2_re = L->re[off_jlk]; + t2_im = L->im[off_jlk]; + sum_re -= t1_re * t2_re + t1_im * t2_im; + sum_im -= t1_im * t2_re - t1_re * t2_im; + } if (i == j) - L[i][j][k] = csqrt(sum); - else - L[i][j][k] = cdiv(sum, L[j][j][k]); + csqrt( + sum_re, sum_im, + &L->re[off_ijk], + &L->im[off_ijk]); + else { + off_jjk = j * NUM_TX_ANT * NUM_SC + j * NUM_SC + k; + cdiv( + sum_re, sum_im, + L->re[off_jjk], + L->im[off_jjk], + &L->re[off_ijk], + &L->im[off_ijk]); + } } } void ccholesky_nosqrt_TxTx( - IN complex A[NUM_TX_ANT][NUM_TX_ANT][NUM_SC], - OUT complex L[NUM_TX_ANT][NUM_TX_ANT][NUM_SC], - OUT complex D[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]) + IN vcomplex *A, + OUT vcomplex *L, + OUT vcomplex *D) { - uint32_t i, j, k, l; - complex sum, t; + size_t i, j, k, l; + size_t off_iik, off_ijk, off_ilk, off_jlk; + size_t off_ik, off_jk; + data_t t1_re, t1_im, t2_re, t2_im; + data_t sum_re, sum_im; + /* Init zeros in upper triangles of L and D */ - for (i = 0; i < NUM_TX_ANT; ++i) - for (j = i+1; j < NUM_TX_ANT; ++j) - for (k = 0; k < NUM_SC; ++k) - D[i][j][k] = L[i][j][k] = cmake(0, 0); + for (off_ijk = 0; off_ijk < NUM_TX_ANT * NUM_TX_ANT * NUM_SC; ++off_ijk) + L->re[off_ijk] = L->im[off_ijk] = D->re[off_ijk] = D->im[off_ijk] = 0; /* Init zeros in lower triangle of D */ - for (i = 0; i < NUM_TX_ANT; ++i) - for (j = 0; j < i; ++j) - for (k = 0; k < NUM_SC; ++k) - D[i][j][k] = cmake(0, 0); /* Init ones in the diagonal of the result */ - for (i = 0; i < NUM_TX_ANT; ++i) - for (k = 0; k < NUM_SC; ++k) - L[i][i][k] = cmake(1, 0); + for (off_ik = 0; off_ik < NUM_TX_ANT * NUM_SC; ++off_ik) { + L->im[off_ik] = D->re[off_ik] = D->im[off_ik] = 0; + L->re[off_ik] = 1; + } /* Calculate the lower triangle of the result */ - for (i = 0; i < NUM_TX_ANT; ++i){ - /* L_ij = (A_ij - \sum_{l=0}^{j-1} L_il L_jl^* D_k) / D_j */ - for (j = 0; j < i; ++j){ - sum.re = 0; - sum.im = 0; - for (l = 0; l < j; ++l) - for (k = 0; k < NUM_SC; ++k){ - /* TODO there is one sum for all scs what leads to contamination. Fix that */ - t = cmul(L[i][l][k], cconj(L[j][l][k])); - t = cmul(t, D[l][l][k]); - cadd_acc(sum, t); + for (k = 0; k != NUM_SC; ++k) { + for (i = 0; i != NUM_TX_ANT; ++i) { + off_iik = i * NUM_TX_ANT * NUM_SC + i * NUM_SC + k; + /* L_ij = (A_ij - \sum_{l=0}^{j-1} L_il L_jl^* D_k) / D_j */ + for (j = 0; j != i; ++j) { + for (l = 0; l != j; ++l) { + off_ilk = i * NUM_TX_ANT * NUM_SC + l * NUM_SC + k; + off_jlk = j * NUM_TX_ANT * NUM_SC + l * NUM_SC + k; + t1_re = L->re[off_ilk]; + t1_im = L->im[off_ilk]; + t2_re = L->re[off_jlk]; + t2_im = L->im[off_jlk]; + sum_re += t1_re * t2_re + t1_im * t2_im; + sum_im += t1_im * t2_re - t1_re * t2_im; } - for (k = 0; k < NUM_SC; ++k){ - L[i][j][k] = csub(A[i][j][k], sum); - L[i][j][k] = cdiv(L[i][j][k], D[j][j][k]); + off_ijk = i * NUM_TX_ANT * NUM_SC + j * NUM_SC + k; + off_jk = j * NUM_SC + k; + t1_re = A->re[off_ijk] - sum_re; + t1_im = A->im[off_ijk] - sum_im; + cdiv( + t1_re, t1_im, + D->re[off_jk], D->im[off_jk], + &L->re[off_ijk], + &L->im[off_ijk]); } - } - /* D_i = A_ii - \sum_{j=0}^{i-1} L_ij L_ij^* D_j */ - sum.re = 0; - sum.im = 0; - for (j = 0; j < i; ++j) - for (k = 0; k < NUM_SC; ++k){ - /* TODO there is one sum for all scs what leads to contamination. Fix that */ - t = cmul(L[i][j][k], cconj(L[i][j][k])); - t = cmul(t, D[j][j][k]); - cadd_acc(sum, t); + /* D_i = A_ii - \sum_{j=0}^{i-1} L_ij L_ij^* D_j */ + sum_re = sum_im = 0; + off_ik = i * NUM_SC + k; + for (j = 0; j != i; ++j) { + off_ijk = i * NUM_TX_ANT * NUM_SC + j * NUM_SC + k; + off_jk = j * NUM_SC + k; + t1_re = L->re[off_ijk]; + t1_im = L->im[off_ijk]; + t2_re = t1_re * t1_re + t1_im * t1_im; + t2_im = t1_re * t1_im - t1_im * t1_re; + t1_re = D->re[off_jk]; + t1_im = D->im[off_jk]; + sum_re += t2_re * t1_re - t2_im * t1_im; + sum_im += t2_re * t1_im + t2_im * t1_re; } - for (k = 0; k < NUM_SC; ++k) - D[i][i][k] = csub(A[i][i][k], sum); + D->re[off_ik] = A->re[off_iik] - sum_re; + D->im[off_ik] = A->im[off_iik] - sum_im; + } } } void cmatvecmul_TxRx( - IN complex A[NUM_TX_ANT][NUM_RX_ANT][NUM_SC], - IN complex b[NUM_RX_ANT][NUM_SC], - OUT complex result[NUM_TX_ANT][NUM_SC]) + IN vcomplex *A, + IN vcomplex *b, + OUT vcomplex *result) { - uint32_t i, j, k; - for (i = 0; i < NUM_TX_ANT; ++i) - for (j = 0; j < NUM_SC; ++j) - result[i][j] = cmake(0., 0.); - for (i = 0; i < NUM_TX_ANT; ++i) - for (j = 0; j < NUM_RX_ANT; ++j) - for (k = 0; k < NUM_SC; ++k) - result[i][k] = cadd(result[i][k], cmul(A[i][j][k], b[j][k])); + size_t i, j, k; + size_t off_ik, off_ijk = 0, off_jk; + size_t off_ik_bck; + data_t A_re, A_im, b_re, b_im; + + for (i = 0; i < NUM_TX_ANT * NUM_SC; ++i) + result->re[i] = result->im[i] = 0.f; + + for (i = 0; i < NUM_TX_ANT; ++i) { + off_jk = 0; + off_ik_bck = i * NUM_SC; + for (j = 0; j < NUM_RX_ANT; ++j) { + off_ik = off_ik_bck; + for (k = 0; k < NUM_SC; ++k) { + A_re = A->re[off_ijk]; + A_im = A->im[off_ijk]; + b_re = b->re[off_jk]; + b_im = b->im[off_jk]; + result->re[off_ik] += A_re * b_re - A_im * b_im; + result->im[off_ik] += A_re * b_im + A_im * b_re; + ++off_ik; + ++off_ijk; + ++off_jk; + } + } + } } void cforwardsub_TxTx( - IN complex L[NUM_TX_ANT][NUM_TX_ANT][NUM_SC], - IN complex b[NUM_TX_ANT][NUM_SC], - OUT complex result[NUM_TX_ANT][NUM_SC]) + IN vcomplex *L, + IN vcomplex *b, + OUT vcomplex *result) { - uint32_t i, j, k; - for (i = 0; i < NUM_TX_ANT; ++i) - for (j = 0; j < NUM_SC; ++j){ - result[i][j] = b[i][j]; - for (k = 0; k < i; ++k) - result[i][j] = csub(result[i][j], cmul(L[i][k][j], result[k][j])); - result[i][j] = cdiv(result[i][j], L[i][i][j]); + size_t i, j, k; + size_t off_ij = 0, off_ikj, off_kj, off_iij; + size_t off_ikj_bck; + data_t L_re, L_im, result_re, result_im; + + for (i = 0; i < NUM_TX_ANT; ++i) { + off_ij = i * NUM_SC; + off_ikj_bck = i * NUM_TX_ANT * NUM_SC; + off_iij = i * NUM_TX_ANT * NUM_SC + i * NUM_SC; + for (j = 0; j < NUM_SC; ++j) { + result->re[off_ij] = b->re[off_ij]; + result->im[off_ij] = b->im[off_ij]; + + off_ikj = off_ikj_bck; + off_kj = j; + for (k = 0; k < i; ++k) { + L_re = L->re[off_ikj]; + L_im = L->im[off_ikj]; + result_re = result->re[off_kj]; + result_im = result->im[off_kj]; + result->re[off_ij] -= L_re * result_re - L_im * result_im; + result->im[off_ij] -= L_re * result_im + L_im * result_re; + off_kj += NUM_SC; + off_ikj += NUM_SC; + } + + result_re = result->re[off_ij]; + result_im = result->im[off_ij]; + cdiv( + result_re, result_im, + L->re[off_iij], L->im[off_iij], + &result->re[off_ij], + &result->im[off_ij]); + + ++off_ij; + ++off_ikj_bck; + ++off_iij; } + } } void cbackwardsub_TxTx( - IN complex U[NUM_TX_ANT][NUM_TX_ANT][NUM_SC], - IN complex b[NUM_TX_ANT][NUM_SC], - OUT complex result[NUM_TX_ANT][NUM_SC]) + IN vcomplex *U, + IN vcomplex *b, + OUT vcomplex *result) { - uint32_t i, j, k; - for (i = NUM_TX_ANT; i != (uint32_t)-1; --i) + /* TODO optimize offsets */ + + size_t i, j, k; + size_t off_ij, off_ikj, off_kj, off_iij; + data_t U_re, U_im, result_re, result_im; + + for (i = NUM_TX_ANT - 1; i != (size_t)-1; --i) { for (j = 0; j < NUM_SC; ++j){ - result[i][j] = b[i][j]; - for (k = i+1; k < NUM_TX_ANT; ++k) - result[i][j] = csub(result[i][j], cmul(U[i][k][j], result[k][j])); - result[i][j] = cdiv(result[i][j], U[i][i][j]); + off_ij = i * NUM_SC + j; + off_iij = i * NUM_TX_ANT * NUM_SC + i * NUM_SC + j; + result->re[off_ij] = b->re[off_ij]; + result->im[off_ij] = b->im[off_ij]; + + for (k = i+1; k < NUM_TX_ANT; ++k){ + off_ikj = i * NUM_TX_ANT * NUM_SC + k * NUM_SC + j; + off_kj = k * NUM_SC + j; + U_re = U->re[off_ikj]; + U_im = U->im[off_ikj]; + result_re = result->re[off_kj]; + result_im = result->im[off_kj]; + result->re[off_ij] -= U_re * result_re - U_im * result_im; + result->im[off_ij] -= U_re * result_im + U_im * result_re; + } + + result_re = result->re[off_ij]; + result_im = result->im[off_ij]; + cdiv( + result_re, result_im, + U->re[off_iij], U->im[off_iij], + &result->re[off_ij], + &result->im[off_ij]); } + } } /* @@ -319,124 +459,180 @@ void cbackwardsub_TxTx( */ void mmse( - IN complex H[NUM_RX_ANT][NUM_TX_ANT][NUM_SC], - IN complex y[NUM_RX_ANT][NUM_SC], - IN complex R[NUM_TX_ANT][NUM_TX_ANT][NUM_SC], - OUT complex x_MMSE[NUM_TX_ANT][NUM_SC]) + IN vcomplex *H, + IN vcomplex *y, + IN vcomplex *R, + OUT vcomplex *x_MMSE) { - complex HH[NUM_TX_ANT][NUM_RX_ANT][NUM_SC]; - complex HH_H[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; - complex L[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; - complex HHy[NUM_TX_ANT][NUM_SC]; - complex z[NUM_TX_ANT][NUM_SC]; - complex LH[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; + vcomplex HH, HH_H, L, HHy, z, LH; + + data_t HH_re[NUM_TX_ANT][NUM_RX_ANT][NUM_SC]; + data_t HH_im[NUM_TX_ANT][NUM_RX_ANT][NUM_SC]; + data_t HH_H_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; + data_t HH_H_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; + data_t L_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; + data_t L_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; + data_t HHy_re[NUM_TX_ANT][NUM_SC]; + data_t HHy_im[NUM_TX_ANT][NUM_SC]; + data_t z_re[NUM_TX_ANT][NUM_SC]; + data_t z_im[NUM_TX_ANT][NUM_SC]; + data_t LH_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; + data_t LH_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; + + HH.re = (data_t *)HH_re; + HH.im = (data_t *)HH_im; + HH_H.re = (data_t *)HH_H_re; + HH_H.im = (data_t *)HH_H_im; + L.re = (data_t *)L_re; + L.im = (data_t *)L_im; + HHy.re = (data_t *)HHy_re; + HHy.im = (data_t *)HHy_im; + z.re = (data_t *)z_re; + z.im = (data_t *)z_im; + LH.re = (data_t *)LH_re; + LH.im = (data_t *)LH_im; /* H^H */ TIME( "Hermitian transpose (RxTx): %ld\n", cmat_hermitian_transpose_RxTx, - H, HH); + H, &HH); /* H^H*H */ TIME( "Matmul (TxRx x RxTx): %ld\n", cmatmul_TxRx_RxTx, - HH, H, HH_H); + &HH, H, &HH_H); /* H^H*H + R */ TIME( "Matadd (TxTx + TxTx): %ld\n", cmatadd_TxTx, - HH_H, R, HH_H); + &HH_H, R, &HH_H); /* L: (H^H*H + R) = L*L^H */ TIME( "Cholesky (TxTx): %ld\n", ccholesky_TxTx, - HH_H, L); + &HH_H, &L); /* z: L*z = H^H*y */ TIME( "Matrix-vector multiplication (TxRx x Rx): %ld\n", cmatvecmul_TxRx, - HH, y, HHy); + &HH, y, &HHy); TIME( "Forward substitution (TxTx): %ld\n", cforwardsub_TxTx, - L, HHy, z); + &L, &HHy, &z); /* x_MMSE: L^H*x_MMSE = z */ TIME( "Hermitian transpose (TxTx): %ld\n", cmat_hermitian_transpose_TxTx, - L, LH); + &L, &LH); TIME( "Backward substitution (TxTx): %ld\n", cbackwardsub_TxTx, - LH, z, x_MMSE); + &LH, &z, x_MMSE); } void mmse_nosqrt( - IN complex H[NUM_RX_ANT][NUM_TX_ANT][NUM_SC], - IN complex y[NUM_RX_ANT][NUM_SC], - IN complex R[NUM_TX_ANT][NUM_TX_ANT][NUM_SC], - OUT complex x_MMSE[NUM_TX_ANT][NUM_SC]) + IN vcomplex *H, + IN vcomplex *y, + IN vcomplex *R, + OUT vcomplex *x_MMSE) { - /* H^H */ - complex HH[NUM_TX_ANT][NUM_RX_ANT][NUM_SC]; - complex HH_H[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; - complex L[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; - /* TODO D should be a vector to save memory */ - complex D[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; - complex HHy[NUM_TX_ANT][NUM_SC]; - complex z[NUM_TX_ANT][NUM_SC]; - complex LH[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; + vcomplex HH, HH_H, L, D, HHy, z, LH; + data_t HH_re[NUM_TX_ANT][NUM_RX_ANT][NUM_SC]; + data_t HH_im[NUM_TX_ANT][NUM_RX_ANT][NUM_SC]; + data_t HH_H_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; + data_t HH_H_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; + data_t L_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; + data_t L_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; + data_t D_re[NUM_TX_ANT][NUM_SC]; + data_t D_im[NUM_TX_ANT][NUM_SC]; + data_t HHy_re[NUM_TX_ANT][NUM_SC]; + data_t HHy_im[NUM_TX_ANT][NUM_SC]; + data_t z_re[NUM_TX_ANT][NUM_SC]; + data_t z_im[NUM_TX_ANT][NUM_SC]; + data_t LH_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; + data_t LH_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; + + HH.re = (data_t *)HH_re; + HH.im = (data_t *)HH_im; + HH_H.re = (data_t *)HH_H_re; + HH_H.im = (data_t *)HH_H_im; + L.re = (data_t *)L_re; + L.im = (data_t *)L_im; + D.re = (data_t *)D_re; + D.im = (data_t *)D_im; + HHy.re = (data_t *)HHy_re; + HHy.im = (data_t *)HHy_im; + z.re = (data_t *)z_re; + z.im = (data_t *)z_im; + LH.re = (data_t *)LH_re; + LH.im = (data_t *)LH_im; + + /* H^H */ TIME( "Hermitian transpose (RxTx): %ld\n", cmat_hermitian_transpose_RxTx, - H, HH); + H, &HH); TIME( "Matmul (TxRx x RxTx): %ld\n", cmatmul_TxRx_RxTx, - HH, H, HH_H); + &HH, H, &HH_H); TIME( "Matadd (TxTx + TxTx): %ld\n", cmatadd_TxTx, - HH_H, R, HH_H); + &HH_H, R, &HH_H); /* L, D: (H^H*H + R) = L*D*L^H */ TIME( "Cholesky nosqrt (TxTx): %ld\n", ccholesky_nosqrt_TxTx, - HH_H, L, D); + &HH_H, &L, &D); /* z: L*z = H^H*y */ TIME( "Matrix-vector multiplication (TxRx x Rx): %ld\n", cmatvecmul_TxRx, - HH, y, HHy); + &HH, y, &HHy); TIME( "Forward substitution (TxTx): %ld\n", cforwardsub_TxTx, - L, HHy, z); + &L, &HHy, &z); /* x_MMSE: L^H*x_MMSE = D^-1*z */ start_timer(); - for (uint32_t i = 0; i != NUM_TX_ANT; ++i) - for (uint32_t j = 0; j != NUM_SC; ++j) - z[i][j] = cdiv(z[i][j], D[i][i][j]); + data_t z_t_re, z_t_im; + for (size_t off_ij = 0; off_ij < NUM_TX_ANT * NUM_SC; ++off_ij) { + z_t_re = z.re[off_ij]; + z_t_im = z.im[off_ij]; + cdiv( + z_t_re, z_t_im, + D.re[off_ij], + D.im[off_ij], + &z.re[off_ij], + &z.im[off_ij]); + } stop_timer(); - printf("Division (TxTx): %ld\n", get_timer()); + printf("Division (Tx): %ld\n", get_timer()); TIME( "Hermitian transpose (TxTx): %ld\n", cmat_hermitian_transpose_TxTx, - L, LH); + &L, &LH); TIME( "Backward substitution (TxTx): %ld\n", cbackwardsub_TxTx, - LH, z, x_MMSE); + &LH, &z, x_MMSE); } acc_t mse( - IN complex x[NUM_TX_ANT][NUM_SC], - IN complex x_MMSE[NUM_TX_ANT][NUM_SC]) + IN vcomplex *x, + IN vcomplex *x_MMSE) { - acc_t sum = 0; - for (uint32_t i = 0; i != NUM_TX_ANT; ++i) - for (uint32_t j = 0; j != NUM_SC; ++j) - sum += cabs2(csub(x[i][j], x_MMSE[i][j])); - return sum / (NUM_TX_ANT * NUM_SC); + acc_t sum = 0.; + size_t off_ij = 0; + data_t sub1, sub2; + for (; off_ij != NUM_TX_ANT * NUM_SC; ++off_ij) { + sub1 = x->re[off_ij] - x_MMSE->re[off_ij]; + sub2 = x->im[off_ij] - x_MMSE->im[off_ij]; + sum += (sub1 * sub1 + sub2 * sub2) / (data_t)(NUM_TX_ANT * NUM_SC); + } + return sum; }