mmserv

Minimum Mean Square Error detection on RISC-V Vector Extension
git clone https://git.ea.contact/mmserv
Log | Files | Refs | README

commit 4f5d4d765d3fcf7e788ce021ff505c6fe88320f5
parent 256ec6ce8ac12fa122825fe5aeb4fdea811642b5
Author: Egor Achkasov <eaachkasov@edu.hse.ru>
Date:   Thu, 13 Feb 2025 21:17:18 +0100

Use extern functions instead of header decls; Fix cback and cforw; Use static global variables instead of args

Diffstat:
Minclude/define.h | 4++++
Dinclude/mmserv.h | 58----------------------------------------------------------
Mmain.c | 38+++++++++++++++++++-------------------
Msrc/mmserv.c | 516+++++++++++++++++++++++++++----------------------------------------------------
4 files changed, 197 insertions(+), 419 deletions(-)

diff --git a/include/define.h b/include/define.h @@ -14,5 +14,9 @@ typedef float data_t; typedef float acc_t; +typedef struct { + data_t *re; + data_t *im; +} vcomplex; #endif diff --git a/include/mmserv.h b/include/mmserv.h @@ -1,58 +0,0 @@ -#ifndef __MMSERV_H -#define __MMSERV_H - -#include "define.h" - -#include <stddef.h> /* for size_t */ - -/* - * Typedefs - */ - -typedef struct { - data_t* re; - data_t* im; -} vcomplex; - -/* - * MMSE - */ - -/** Calculate MMSE estimation of x in y = H*x + n - * \param H matrix of channel coefficients. Shape [NUM_RX_ANT][NUM_TX_ANT][NUM_SC] - * \param y received signal. Shape [NUM_RX_ANT][NUM_SC] - * \param R noise covariance matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] - * \param x_MMSE output MMSE estimation of x. Shape [NUM_TX_ANT][NUM_SC] - */ -void mmse( - IN vcomplex *H, - IN vcomplex *y, - IN vcomplex *R, - OUT vcomplex *x_MMSE); - -/** Calculate MMSE estimation of x in y = H*x + n. Uses a sqrt-free Cholesky decomposition. - * \param H matrix of channel coefficients. Shape [NUM_RX_ANT][NUM_TX_ANT][NUM_SC] - * \param y received signal. Shape [NUM_RX_ANT][NUM_SC] - * \param R noise covariance matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] - * \param x_MMSE output MMSE estimation of x. Shape [NUM_TX_ANT][NUM_SC] - */ -void mmse_nosqrt( - IN vcomplex *H, - IN vcomplex *y, - IN vcomplex *R, - OUT vcomplex *x_MMSE); - -/* - * MSE - */ - -/** Calculate mean squared error between the original x and the estimated x - * \param x original x. Shape [NUM_TX_ANT][NUM_SC] - * \param x_MMSE estimated x. 
Shape [NUM_TX_ANT][NUM_SC] - * \return mean squared error - */ -acc_t mse( - IN vcomplex *x, - IN vcomplex *x_MMSE); - -#endif /* __MMSERV_H */ diff --git a/main.c b/main.c @@ -1,7 +1,12 @@ -#include "include/mmserv.h" +#include "include/define.h" #include "../common/printf.h" +/* extern functions */ +extern void mmse(); +extern acc_t mse(); + +/* Raw data */ /* Transmitted signal */ extern data_t x_re[NUM_TX_ANT][NUM_SC]; extern data_t x_im[NUM_TX_ANT][NUM_SC]; @@ -15,26 +20,21 @@ extern data_t R_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; extern data_t y_re[NUM_RX_ANT][NUM_SC]; extern data_t y_im[NUM_RX_ANT][NUM_SC]; -int main() { - /* Cast the data to vcomplex */ - vcomplex x, H, R, y; - x.re = (data_t *)x_re; - x.im = (data_t *)x_im; - H.re = (data_t *)H_re; - H.im = (data_t *)H_im; - R.re = (data_t *)R_re; - R.im = (data_t *)R_im; - y.re = (data_t *)y_re; - y.im = (data_t *)y_im; +/* Same data but casted to vcomplex */ +vcomplex g_x = { .re = (data_t *)x_re, .im = (data_t *)x_im }; +vcomplex g_H = { .re = (data_t *)H_re, .im = (data_t *)H_im }; +vcomplex g_R = { .re = (data_t *)R_re, .im = (data_t *)R_im }; +vcomplex g_y = { .re = (data_t *)y_re, .im = (data_t *)y_im }; +/* MMSE approximation will be stored in this global*/ +data_t x_MMSE_re[NUM_TX_ANT][NUM_SC]; +data_t x_MMSE_im[NUM_TX_ANT][NUM_SC]; +vcomplex g_x_MMSE = { .re = (data_t *)x_MMSE_re, .im = (data_t *)x_MMSE_im }; + +int main() { /* Calculate the MMSE approximation */ - data_t x_MMSE_re[NUM_TX_ANT][NUM_SC]; - data_t x_MMSE_im[NUM_TX_ANT][NUM_SC]; - vcomplex x_MMSE; - x_MMSE.re = (data_t *)x_MMSE_re; - x_MMSE.im = (data_t *)x_MMSE_im; - mmse(&H, &y, &R, &x_MMSE); - printf("MSE: %f\n", mse(&x, &x_MMSE)); + mmse(); + printf("MSE: %f\n", mse()); printf("Shutting down\n"); } diff --git a/src/mmserv.c b/src/mmserv.c @@ -1,4 +1,4 @@ -#include "../include/mmserv.h" +#include "../include/define.h" #include <stddef.h> /* for size_t */ #include <stdint.h> /* for uint64_t */ @@ -13,11 +13,11 @@ static uint64_t 
g_timer; void start_timer() { - asm volatile("rdcycle %0" : "=r"(g_timer)); + __asm__ volatile("rdcycle %0" : "=r"(g_timer)); } void stop_timer() { - asm volatile( + __asm__ volatile( "rdcycle t0\n" "sub %0, t0, %0" : "+r"(g_timer) @@ -40,6 +40,36 @@ uint64_t get_timer() #endif /* + * Global variables + */ + +/* Externs */ +extern vcomplex g_x, g_H, g_R, g_y; +extern vcomplex g_x_MMSE; + +/* Raw data */ +data_t HH_re[NUM_TX_ANT][NUM_RX_ANT][NUM_SC]; +data_t HH_im[NUM_TX_ANT][NUM_RX_ANT][NUM_SC]; +data_t HH_H_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; +data_t HH_H_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; +data_t L_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; +data_t L_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; +data_t HHy_re[NUM_TX_ANT][NUM_SC]; +data_t HHy_im[NUM_TX_ANT][NUM_SC]; +data_t z_re[NUM_TX_ANT][NUM_SC]; +data_t z_im[NUM_TX_ANT][NUM_SC]; +data_t LH_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; +data_t LH_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; + +/* Same data but casted to vcomplex */ +vcomplex g_HH = { .re = (data_t *)HH_re, .im = (data_t *)HH_im }; +vcomplex g_HH_H = { .re = (data_t *)HH_H_re, .im = (data_t *)HH_H_im }; +vcomplex g_L = { .re = (data_t *)L_re, .im = (data_t *)L_im }; +vcomplex g_HHy = { .re = (data_t *)HHy_re, .im = (data_t *)HHy_im }; +vcomplex g_z = { .re = (data_t *)z_re, .im = (data_t *)z_im }; +vcomplex g_LH = { .re = (data_t *)LH_re, .im = (data_t *)LH_im }; + +/* * Complex functions */ @@ -55,25 +85,8 @@ void cdiv( data_t sqrt(IN data_t x) { - data_t lo, hi, mid; - data_t eps = 1e-4; - if (x < eps) - return 0; - - if (x < 1.){ - lo = x; - hi = 1; - } - else{ - lo = 1; - hi = x; - } - while (hi - lo > eps){ - mid = (lo + hi) / 2.f; - if (mid * mid > x) hi = mid; - else lo = mid; - } - return mid; + __asm__ volatile("fsqrt.s %0, %1" : "=f"(x) : "f"(x)); + return x; } void csqrt( IN data_t re, IN data_t im, @@ -102,57 +115,40 @@ void csqrt( * Complex matrix operations */ -void cmat_hermitian_transpose_TxTx( - IN vcomplex *A, - OUT vcomplex *AH) -{ - size_t i, j, k, off_ijk = 
0, off_jik; - for (i = 0; i < NUM_TX_ANT; ++i) - for (j = 0; j < NUM_TX_ANT; ++j) { - off_jik = j * NUM_TX_ANT * NUM_SC + i * NUM_SC; - for (k = 0; k < NUM_SC; ++k) { - AH->re[off_jik] = A->re[off_ijk]; - AH->im[off_jik] = -A->im[off_ijk]; - ++off_ijk; - ++off_jik; - } - } -} - -/** Complex Gram matrix A^H*A and add complex matrix R (A^H*A + R) - * \param A matrix of channel coefficients. Shape [NUM_RX_ANT][NUM_TX_ANT][NUM_SC] - * \param R noise covariance matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] - * \param result output Gram matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] +/** Complex Gram matrix H^H*H and add complex matrix R + * + * HH_H = H^H*H + R + * + * \global g_H matrix of channel coefficients. Shape [NUM_RX_ANT][NUM_TX_ANT][NUM_SC] + * \global g_R noise covariance matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] + * \global g_HH_H output Gram matrix + R. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] */ -void cmatgram_TxRx_cadd( - IN vcomplex *A, - IN vcomplex *R, - OUT vcomplex *result) +void cmatgram_TxRx_cadd() { size_t t1, t2, r; size_t sz, vl; size_t off_sc, off_A, off_AH; - size_t off_result_L, off_result_U; + size_t off_HH_H_L, off_HH_H_U; data_t *A_re, *A_im, *AH_re, *AH_im; data_t *R_L_re, *R_L_im, *R_U_re, *R_U_im; - data_t *result_L_re, *result_L_im, *result_U_re, *result_U_im; + data_t *HH_H_L_re, *HH_H_L_im, *HH_H_U_re, *HH_H_U_im; for (t1 = 0; t1 != NUM_TX_ANT; ++t1) for (t2 = t1; t2 != NUM_TX_ANT; ++t2) { off_sc = 0; - off_result_L = t1 * NUM_TX_ANT * NUM_SC + t2 * NUM_SC; - off_result_U = t2 * NUM_TX_ANT * NUM_SC + t1 * NUM_SC; - result_L_re = &result->re[off_result_L]; - result_L_im = &result->im[off_result_L]; - result_U_re = &result->re[off_result_U]; - result_U_im = &result->im[off_result_U]; + off_HH_H_L = t1 * NUM_TX_ANT * NUM_SC + t2 * NUM_SC; + off_HH_H_U = t2 * NUM_TX_ANT * NUM_SC + t1 * NUM_SC; + HH_H_L_re = &g_HH_H.re[off_HH_H_L]; + HH_H_L_im = &g_HH_H.im[off_HH_H_L]; + HH_H_U_re = &g_HH_H.re[off_HH_H_U]; + HH_H_U_im = 
&g_HH_H.im[off_HH_H_U]; sz = NUM_SC; while (sz > 0) { - /* Initialize result registers */ - /* v0 - result real part */ - /* v1 - result imaginary part */ - asm volatile( + /* Initialize HH_H registers */ + /* v0 - HH_H real part */ + /* v1 - HH_H imaginary part */ + __asm__ volatile( "vsetvli %0, %1, e32, m1, ta, ma\n" "vmv.v.i v0, 0\n" "vmv.v.i v1, 0\n" @@ -161,17 +157,17 @@ void cmatgram_TxRx_cadd( for (r = 0; r != NUM_RX_ANT; ++r) { off_A = r * NUM_TX_ANT * NUM_SC + t1 * NUM_SC + off_sc; off_AH = r * NUM_TX_ANT * NUM_SC + t2 * NUM_SC + off_sc; - A_re = &A->re[off_A]; - A_im = &A->im[off_A]; - AH_re = &A->re[off_AH]; - AH_im = &A->im[off_AH]; + A_re = &g_H.re[off_A]; + A_im = &g_H.im[off_A]; + AH_re = &g_H.re[off_AH]; + AH_im = &g_H.im[off_AH]; /* Calculate A^H*A */ /* v2 - A real part */ /* v3 - A imaginary part */ /* v4 - A^H real part */ /* v5 - A^H imaginary part */ - asm volatile( + __asm__ volatile( "vle32.v v2, (%0)\n" "vle32.v v3, (%1)\n" "vle32.v v4, (%2)\n" @@ -190,9 +186,9 @@ void cmatgram_TxRx_cadd( /* Add R */ /* v2 - R real part */ /* v3 - R imaginary part */ - R_U_re = &R->re[off_result_U]; - R_U_im = &R->im[off_result_U]; - asm volatile( + R_U_re = &g_R.re[off_HH_H_U]; + R_U_im = &g_R.im[off_HH_H_U]; + __asm__ volatile( "vle32.v v2, (%0)\n" "vle32.v v3, (%1)\n" "vfadd.vv v2, v2, v0\n" @@ -200,12 +196,11 @@ void cmatgram_TxRx_cadd( "vse32.v v2, (%2)\n" "vse32.v v3, (%3)\n" : - : "r"(R_U_re), "r"(R_U_im), "r"(result_U_re), "r"(result_U_im) - : "v0", "v1", "v2", "v3"); + : "r"(R_U_re), "r"(R_U_im), "r"(HH_H_U_re), "r"(HH_H_U_im)); if (t1 != t2) { - R_L_re = &R->re[off_result_L]; - R_L_im = &R->im[off_result_L]; - asm volatile( + R_L_re = &g_R.re[off_HH_H_L]; + R_L_im = &g_R.im[off_HH_H_L]; + __asm__ volatile( /* Lower triangle */ "vle32.v v2, (%0)\n" "vle32.v v3, (%1)\n" @@ -214,7 +209,7 @@ void cmatgram_TxRx_cadd( "vse32.v v2, (%2)\n" "vse32.v v3, (%3)\n" : - : "r"(R_L_re), "r"(R_L_im), "r"(result_L_re), "r"(result_L_im) + : "r"(R_L_re), 
"r"(R_L_im), "r"(HH_H_L_re), "r"(HH_H_L_im) ); } @@ -224,9 +219,14 @@ void cmatgram_TxRx_cadd( } } -void ccholesky_TxTx( - IN vcomplex *A, - OUT vcomplex *L) +/** Complex Cholesky decomposition L of a Hermitian positive-definite matrix HH_H + * + * HH_H = L*L^H + * + * \global g_HH_H matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] + * \global g_L output lower triangular matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] + */ +void ccholesky_TxTx() { size_t i, j, k, l; size_t off_ijk, off_ilk, off_jlk, off_jjk; @@ -238,7 +238,7 @@ void ccholesky_TxTx( for (j = i+1; j < NUM_TX_ANT; ++j) { off_ijk = i * NUM_TX_ANT * NUM_SC + j * NUM_SC; for (k = 0; k < NUM_SC; ++k) { - L->re[off_ijk] = L->im[off_ijk] = 0; + g_L.re[off_ijk] = g_L.im[off_ijk] = 0; ++off_ijk; } } @@ -248,31 +248,31 @@ void ccholesky_TxTx( for (j = 0; j < i+1; ++j) for (k = 0; k < NUM_SC; ++k){ off_ijk = i * NUM_TX_ANT * NUM_SC + j * NUM_SC + k; - sum_re = A->re[off_ijk]; - sum_im = A->im[off_ijk]; + sum_re = g_HH_H.re[off_ijk]; + sum_im = g_HH_H.im[off_ijk]; for (l = 0; l < j; ++l) { off_ilk = i * NUM_TX_ANT * NUM_SC + l * NUM_SC + k; off_jlk = j * NUM_TX_ANT * NUM_SC + l * NUM_SC + k; - t1_re = L->re[off_ilk]; - t1_im = L->im[off_ilk]; - t2_re = L->re[off_jlk]; - t2_im = L->im[off_jlk]; + t1_re = g_L.re[off_ilk]; + t1_im = g_L.im[off_ilk]; + t2_re = g_L.re[off_jlk]; + t2_im = g_L.im[off_jlk]; sum_re -= t1_re * t2_re + t1_im * t2_im; sum_im -= t1_im * t2_re - t1_re * t2_im; } if (i == j) csqrt( sum_re, sum_im, - &L->re[off_ijk], - &L->im[off_ijk]); + &g_L.re[off_ijk], + &g_L.im[off_ijk]); else { off_jjk = j * NUM_TX_ANT * NUM_SC + j * NUM_SC + k; cdiv( sum_re, sum_im, - L->re[off_jjk], - L->im[off_jjk], - &L->re[off_ijk], - &L->im[off_ijk]); + g_L.re[off_jjk], + g_L.im[off_jjk], + &g_L.re[off_ijk], + &g_L.im[off_ijk]); } } } @@ -345,24 +345,27 @@ void ccholesky_nosqrt_TxTx( } } -void cmatvecmul_TxRx( - IN vcomplex *A, - IN vcomplex *b, - OUT vcomplex *result) +/** Complex matrix-vector multiplication HHy = 
HH*y + * + * \global g_HH matrix. Shape [NUM_TX_ANT][NUM_RX_ANT][NUM_SC] + * \global g_y vector. Shape [NUM_RX_ANT][NUM_SC] + * \global g_HHy output vector. Shape [NUM_TX_ANT][NUM_SC] + */ +void cmatvecmul_TxRx() { - size_t i, j, k; - size_t off_A, off_b, off_result, off_sc; + size_t i, j; + size_t off_HH, off_y, off_HHy, off_sc; size_t sz, vl; for (i = 0; i < NUM_TX_ANT; ++i) { - off_result = i * NUM_SC; + off_HHy = i * NUM_SC; off_sc = 0; sz = NUM_SC; /* Initialize result registers */ - /* v0 - result real part */ - /* v1 - result imaginary part */ - asm volatile( + /* v0 - HHy real part */ + /* v1 - HHy imaginary part */ + __asm__ volatile( "vsetvli %0, %1, e32, m1, ta, ma\n" "vmv.v.i v0, 0\n" "vmv.v.i v1, 0\n" @@ -372,9 +375,9 @@ void cmatvecmul_TxRx( while (sz > 0) { for (j = 0; j < NUM_RX_ANT; ++j) { - off_A = i * NUM_RX_ANT * NUM_SC + j * NUM_SC + off_sc; - off_b = j * NUM_SC + off_sc; - asm volatile( + off_HH = i * NUM_RX_ANT * NUM_SC + j * NUM_SC + off_sc; + off_y = j * NUM_SC + off_sc; + __asm__ volatile( "vle32.v v2, (%0)\n" "vle32.v v3, (%1)\n" "vle32.v v4, (%2)\n" @@ -386,71 +389,69 @@ void cmatvecmul_TxRx( "vfmacc.vv v1, v3, v4\n" "vfmacc.vv v1, v2, v5\n" : - : "r"(&A->re[off_A]), "r"(&A->im[off_A]), - "r"(&b->re[off_b]), "r"(&b->im[off_b]) + : "r"(&g_HH.re[off_HH]), "r"(&g_HH.im[off_HH]), + "r"(&g_y.re[off_y]), "r"(&g_y.im[off_y]) ); } /* Store result */ - asm volatile( + __asm__ volatile( "vse32.v v0, (%0)\n" "vse32.v v1, (%1)\n" : - : "r"(&result->re[off_result]), "r"(&result->im[off_result]) + : "r"(&g_HHy.re[off_HHy]), "r"(&g_HHy.im[off_HHy]) ); sz -= vl; - off_result += vl; + off_HHy += vl; off_sc += vl; } } } -/** Complex forward substitution L*z = b +/** Complex forward substitution L*z = HHy * - * z_i = (b_i - \sum_{k=0}^{i-1} L_{ik} z_k) / L_{ii} + * z_i = (HHy_i - \sum_{k=0}^{i-1} L_{ik} z_k) / L_{ii} * - * \param L lower triangular matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] - * \param b vector. 
Shape [NUM_TX_ANT][NUM_SC] - * \param result output vector. Shape [NUM_TX_ANT][NUM_SC] + * \global g_L lower triangular matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] + * \global g_HHy vector. Shape [NUM_TX_ANT][NUM_SC] + * \global g_z output vector. Shape [NUM_TX_ANT][NUM_SC] */ -void cforwardsub_TxTx( - IN vcomplex *L, - IN vcomplex *b, - OUT vcomplex *result) +void cforwardsub_TxTx() { size_t i, j; size_t sz, vl; size_t off_sc; for (i = 0; i != NUM_TX_ANT; ++i) { - printf("i: %d\n", i); off_sc = 0; sz = NUM_SC; while (sz > 0) { - printf("sz: %d\n", sz); + // printf("sz: %lu\n", sz); + // printf("vl: %lu\n", vl); /* Initialize result registers as b */ /* v0 - result real part */ /* v1 - result imaginary part */ - asm volatile ( + __asm__ volatile ( "vsetvli %0, %1, e32, m1, ta, ma\n" - "vle32.v v0, (%2)\n" - "vle32.v v1, (%3)\n" : "=r"(vl) - : "r"(sz), - "r"(&b->re[i * NUM_SC + off_sc]), - "r"(&b->im[i * NUM_SC + off_sc]) - ); + : "r"(sz)); + __asm__ volatile ( + "vle32.v v0, (%0)\n" + "vle32.v v1, (%1)\n" + : + : "r"(&g_HHy.re[i * NUM_SC + off_sc]), + "r"(&g_HHy.im[i * NUM_SC + off_sc])); + // printf("vl: %lu\n", vl); for (j = 0; j != i; ++j) { - printf("j: %d\n", j); /* b - sum L_ij * z_j */ /* v2 - L real part */ /* v3 - L imaginary part */ /* v4 - result_j real part */ /* v5 - result_j imaginary part */ - asm volatile ( + __asm__ volatile ( "vle32.v v2, (%0)\n" "vle32.v v3, (%1)\n" "vle32.v v4, (%2)\n" @@ -462,17 +463,17 @@ void cforwardsub_TxTx( "vfnmsac.vv v1, v3, v4\n" "vfnmsac.vv v1, v2, v5\n" : - : "r"(&L->re[i * NUM_TX_ANT * NUM_SC + j * NUM_SC + off_sc]), - "r"(&L->im[i * NUM_TX_ANT * NUM_SC + j * NUM_SC + off_sc]), - "r"(&result->re[j * NUM_SC + off_sc]), - "r"(&result->im[j * NUM_SC + off_sc]) + : "r"(&g_L.re[i * NUM_TX_ANT * NUM_SC + j * NUM_SC + off_sc]), + "r"(&g_L.im[i * NUM_TX_ANT * NUM_SC + j * NUM_SC + off_sc]), + "r"(&g_z.re[j * NUM_SC + off_sc]), + "r"(&g_z.im[j * NUM_SC + off_sc]) ); } /* Divide by L_ii */ /* v2 - L_ii real part */ /* v3 - 
L_ii imaginary part */ - asm volatile ( + __asm__ volatile ( "vle32.v v2, (%0)\n" "vle32.v v3, (%1)\n" /* calculate L_ii_re^2 + L_ii_im^2 -> v4 */ @@ -490,10 +491,10 @@ void cforwardsub_TxTx( "vse32.v v0, (%2)\n" "vse32.v v1, (%3)\n" : - : "r"(&L->re[i * NUM_TX_ANT * NUM_SC + i * NUM_SC + off_sc]), - "r"(&L->im[i * NUM_TX_ANT * NUM_SC + i * NUM_SC + off_sc]), - "r"(&result->re[i * NUM_SC + off_sc]), - "r"(&result->im[i * NUM_SC + off_sc]) + : "r"(&g_L.re[i * NUM_TX_ANT * NUM_SC + i * NUM_SC + off_sc]), + "r"(&g_L.im[i * NUM_TX_ANT * NUM_SC + i * NUM_SC + off_sc]), + "r"(&g_z.re[i * NUM_SC + off_sc]), + "r"(&g_z.im[i * NUM_SC + off_sc]) ); off_sc += vl; @@ -502,18 +503,15 @@ void cforwardsub_TxTx( } } -/** Complex backward substitution L^H*z = b +/** Complex backward substitution L^H*x_MMSE = z * - * z_i = (b_i - \sum_{j=i+1}^{n-1} L_{ji} z_k) / L_{ii} + * x_i = (z_i - \sum_{j=i+1}^{n-1} L_{ji} x_k) / L_{ii} * - * \param L lower triangular matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] - * \param b vector. Shape [NUM_TX_ANT][NUM_SC] - * \param result output vector. Shape [NUM_TX_ANT][NUM_SC] + * \global g_L lower triangular matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] + * \global g_z rhs vector. Shape [NUM_TX_ANT][NUM_SC] + * \global g_x_MMSE output vector. 
Shape [NUM_TX_ANT][NUM_SC] */ -void cbackwardsub_TxTx( - IN vcomplex *L, - IN vcomplex *b, - OUT vcomplex *result) +void cbackwardsub_TxTx() { size_t i, j; size_t sz, vl; @@ -524,26 +522,27 @@ void cbackwardsub_TxTx( sz = NUM_SC; while (sz > 0){ - /* Initialize result registers as b */ + /* Initialize result registers as z */ /* v0 - result real part */ /* v1 - result imaginary part */ - asm volatile( + __asm__ volatile( "vsetvli %0, %1, e32, m1, ta, ma\n" - "vle32.v v0, (%2)\n" - "vle32.v v1, (%3)\n" : "=r"(vl) - : "r"(sz), - "r"(&b->re[i * NUM_SC + off_sc]), - "r"(&b->im[i * NUM_SC + off_sc]) - ); + : "r"(sz)); + __asm__ volatile( + "vle32.v v0, (%0)\n" + "vle32.v v1, (%1)\n" + : + : "r"(&g_z.re[i * NUM_SC + off_sc]), + "r"(&g_z.im[i * NUM_SC + off_sc])); for (j = i + 1; j < NUM_TX_ANT; ++j) { /* b - sum L_ji * z_j */ /* v2 - L real part */ /* v3 - L imaginary part */ - /* v4 - result_j real part */ - /* v5 - result_j imaginary part */ - asm volatile( + /* v4 - x_MMSE_j real part */ + /* v5 - x_MMSE_j imaginary part */ + __asm__ volatile( "vle32.v v2, (%0)\n" "vle32.v v3, (%1)\n" "vle32.v v4, (%2)\n" @@ -555,17 +554,17 @@ void cbackwardsub_TxTx( "vfnmsac.vv v1, v3, v4\n" "vfnmsac.vv v1, v2, v5\n" : - : "r"(&L->re[j * NUM_TX_ANT * NUM_SC + i * NUM_SC + off_sc]), - "r"(&L->im[j * NUM_TX_ANT * NUM_SC + i * NUM_SC + off_sc]), - "r"(&result->re[j * NUM_SC + off_sc]), - "r"(&result->im[j * NUM_SC + off_sc]) + : "r"(&g_L.re[j * NUM_TX_ANT * NUM_SC + i * NUM_SC + off_sc]), + "r"(&g_L.im[j * NUM_TX_ANT * NUM_SC + i * NUM_SC + off_sc]), + "r"(&g_x_MMSE.re[j * NUM_SC + off_sc]), + "r"(&g_x_MMSE.im[j * NUM_SC + off_sc]) ); } /* Divide by L_ii */ /* v2 - L_ii real part */ /* v3 - L_ii imaginary part */ - asm volatile ( + __asm__ volatile ( "vle32.v v2, (%0)\n" "vle32.v v3, (%1)\n" /* calculate L_ii_re^2 + L_ii_im^2 -> v4 */ @@ -579,224 +578,57 @@ void cbackwardsub_TxTx( "vfmul.vv v6, v1, v2\n" "vfnmsac.vv v6, v0, v3\n" "vfdiv.vv v1, v6, v4\n" - /* store result */ + /* store 
HH_H */ "vse32.v v0, (%2)\n" "vse32.v v1, (%3)\n" : - : "r"(&L->re[i * NUM_TX_ANT * NUM_SC + i * NUM_SC + off_sc]), - "r"(&L->im[i * NUM_TX_ANT * NUM_SC + i * NUM_SC + off_sc]), - "r"(&result->re[i * NUM_SC + off_sc]), - "r"(&result->im[i * NUM_SC + off_sc]) + : "r"(&g_L.re[i * NUM_TX_ANT * NUM_SC + i * NUM_SC + off_sc]), + "r"(&g_L.im[i * NUM_TX_ANT * NUM_SC + i * NUM_SC + off_sc]), + "r"(&g_x_MMSE.re[i * NUM_SC + off_sc]), + "r"(&g_x_MMSE.im[i * NUM_SC + off_sc]) ); sz -= vl; off_sc += vl; } } - - - /* TODO optimize offsets */ - - size_t i, j, k; - size_t off_ij, off_ikj, off_kj, off_iij; - data_t U_re, U_im, result_re, result_im; - - for (i = NUM_TX_ANT - 1; i != (size_t)-1; --i) { - for (j = 0; j < NUM_SC; ++j){ - off_ij = i * NUM_SC + j; - off_iij = i * NUM_TX_ANT * NUM_SC + i * NUM_SC + j; - result->re[off_ij] = b->re[off_ij]; - result->im[off_ij] = b->im[off_ij]; - - for (k = i+1; k < NUM_TX_ANT; ++k){ - off_ikj = i * NUM_TX_ANT * NUM_SC + k * NUM_SC + j; - off_kj = k * NUM_SC + j; - U_re = U->re[off_ikj]; - U_im = U->im[off_ikj]; - result_re = result->re[off_kj]; - result_im = result->im[off_kj]; - result->re[off_ij] -= U_re * result_re - U_im * result_im; - result->im[off_ij] -= U_re * result_im + U_im * result_re; - } - - result_re = result->re[off_ij]; - result_im = result->im[off_ij]; - cdiv( - result_re, result_im, - U->re[off_iij], U->im[off_iij], - &result->re[off_ij], - &result->im[off_ij]); - } - } } /* * MMSE */ -void mmse( - IN vcomplex *H, - IN vcomplex *y, - IN vcomplex *R, - OUT vcomplex *x_MMSE) +void mmse() { - vcomplex HH, HH_H, L, HHy, z, LH; - - data_t HH_re[NUM_TX_ANT][NUM_RX_ANT][NUM_SC]; - data_t HH_im[NUM_TX_ANT][NUM_RX_ANT][NUM_SC]; - data_t HH_H_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; - data_t HH_H_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; - data_t L_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; - data_t L_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; - data_t HHy_re[NUM_TX_ANT][NUM_SC]; - data_t HHy_im[NUM_TX_ANT][NUM_SC]; - data_t z_re[NUM_TX_ANT][NUM_SC]; - 
data_t z_im[NUM_TX_ANT][NUM_SC]; - data_t LH_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; - data_t LH_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; - - HH.re = (data_t *)HH_re; - HH.im = (data_t *)HH_im; - HH_H.re = (data_t *)HH_H_re; - HH_H.im = (data_t *)HH_H_im; - L.re = (data_t *)L_re; - L.im = (data_t *)L_im; - HHy.re = (data_t *)HHy_re; - HHy.im = (data_t *)HHy_im; - z.re = (data_t *)z_re; - z.im = (data_t *)z_im; - LH.re = (data_t *)LH_re; - LH.im = (data_t *)LH_im; - /* H^H*H + R */ TIME( "Gram and add (RxTx x TxRx + TxTx): %ld\n", - cmatgram_TxRx_cadd, - H, R, &HH_H); + cmatgram_TxRx_cadd); /* L: (H^H*H + R) = L*L^H */ TIME( "Cholesky (TxTx): %ld\n", - ccholesky_TxTx, - &HH_H, &L); + ccholesky_TxTx); /* z: L*z = H^H*y */ TIME( "Matrix-vector multiplication (TxRx x Rx): %ld\n", - cmatvecmul_TxRx, - &HH, y, &HHy); + cmatvecmul_TxRx); TIME( "Forward substitution (TxTx): %ld\n", - cforwardsub_TxTx, - &L, &HHy, &z); + cforwardsub_TxTx); /* x_MMSE: L^H*x_MMSE = z */ TIME( "Backward substitution (TxTx): %ld\n", - cbackwardsub_TxTx, - &L, &z, x_MMSE); -} - -void mmse_nosqrt( - IN vcomplex *H, - IN vcomplex *y, - IN vcomplex *R, - OUT vcomplex *x_MMSE) -{ - vcomplex HH, HH_H, L, D, HHy, z, LH; - - data_t HH_re[NUM_TX_ANT][NUM_RX_ANT][NUM_SC]; - data_t HH_im[NUM_TX_ANT][NUM_RX_ANT][NUM_SC]; - data_t HH_H_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; - data_t HH_H_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; - data_t L_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; - data_t L_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; - data_t D_re[NUM_TX_ANT][NUM_SC]; - data_t D_im[NUM_TX_ANT][NUM_SC]; - data_t HHy_re[NUM_TX_ANT][NUM_SC]; - data_t HHy_im[NUM_TX_ANT][NUM_SC]; - data_t z_re[NUM_TX_ANT][NUM_SC]; - data_t z_im[NUM_TX_ANT][NUM_SC]; - data_t LH_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; - data_t LH_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; - - HH.re = (data_t *)HH_re; - HH.im = (data_t *)HH_im; - HH_H.re = (data_t *)HH_H_re; - HH_H.im = (data_t *)HH_H_im; - L.re = (data_t *)L_re; - L.im = (data_t *)L_im; - D.re = (data_t *)D_re; - D.im = 
(data_t *)D_im; - HHy.re = (data_t *)HHy_re; - HHy.im = (data_t *)HHy_im; - z.re = (data_t *)z_re; - z.im = (data_t *)z_im; - LH.re = (data_t *)LH_re; - LH.im = (data_t *)LH_im; - - /* NOT IMPLEMENTED */ - - #if 0 - TIME( - "Hermitian transpose (RxTx): %ld\n", - cmat_hermitian_transpose_RxTx, - H, &HH); - TIME( - "Matmul (TxRx x RxTx): %ld\n", - cmatmul_TxRx_RxTx, - &HH, H, &HH_H); - TIME( - "Matadd (TxTx + TxTx): %ld\n", - cmatadd_TxTx, - &HH_H, R, &HH_H); - /* L, D: (H^H*H + R) = L*D*L^H */ - TIME( - "Cholesky nosqrt (TxTx): %ld\n", - ccholesky_nosqrt_TxTx, - &HH_H, &L, &D); - /* z: L*z = H^H*y */ - TIME( - "Matrix-vector multiplication (TxRx x Rx): %ld\n", - cmatvecmul_TxRx, - &HH, y, &HHy); - TIME( - "Forward substitution (TxTx): %ld\n", - cforwardsub_TxTx, - &L, &HHy, &z); - /* x_MMSE: L^H*x_MMSE = D^-1*z */ - start_timer(); - data_t z_t_re, z_t_im; - for (size_t off_ij = 0; off_ij < NUM_TX_ANT * NUM_SC; ++off_ij) { - z_t_re = z.re[off_ij]; - z_t_im = z.im[off_ij]; - cdiv( - z_t_re, z_t_im, - D.re[off_ij], - D.im[off_ij], - &z.re[off_ij], - &z.im[off_ij]); - } - stop_timer(); - printf("Division (Tx): %ld\n", get_timer()); - TIME( - "Hermitian transpose (TxTx): %ld\n", - cmat_hermitian_transpose_TxTx, - &L, &LH); - TIME( - "Backward substitution (TxTx): %ld\n", - cbackwardsub_TxTx, - &LH, &z, x_MMSE); - #endif + cbackwardsub_TxTx); } -acc_t mse( - IN vcomplex *x, - IN vcomplex *x_MMSE) +acc_t mse() { acc_t sum = 0.; size_t off_ij = 0; data_t sub1, sub2; for (; off_ij != NUM_TX_ANT * NUM_SC; ++off_ij) { - sub1 = x->re[off_ij] - x_MMSE->re[off_ij]; - sub2 = x->im[off_ij] - x_MMSE->im[off_ij]; + sub1 = g_x.re[off_ij] - g_x_MMSE.re[off_ij]; + sub2 = g_x.im[off_ij] - g_x_MMSE.im[off_ij]; sum += (sub1 * sub1 + sub2 * sub2) / (data_t)(NUM_TX_ANT * NUM_SC); } return sum;