mmserv

Minimum Mean Square Error detection on RISC-V Vector Extension
git clone https://git.ea.contact/mmserv
Log | Files | Refs | README

commit a5854e1a8f800fb018e07037e74f77536fefefdd
parent 006841ff46ec83e606d11a7f4ec1d0eb9414c11b
Author: Egor Achkasov <eaachkasov@gmail.com>
Date:   Fri, 23 May 2025 01:42:32 +0200

Reimplement the rvv solutions lost in merge

Diffstat:
Msrc/cbackwardsub.c | 136+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
Msrc/ccholesky.c | 205++++++++++++++++++++++++++++++++++++++++---------------------------------------
Msrc/cforwardsub.c | 125+++++++++++++++++++++++++++++++++++++++++--------------------------------------
Msrc/cmatgram.c | 174+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
Msrc/cmatvecmul.c | 68+++++++++++++++++++++++++++++++++++++++++---------------------------
5 files changed, 392 insertions(+), 316 deletions(-)

diff --git a/src/cbackwardsub.c b/src/cbackwardsub.c @@ -4,8 +4,8 @@ /** Complex backward substitution L^H*x_MMSE = z * - * x_MMSE_t = (z_t - \sum_{tt=t+1}^{NUM_TX-1} L_{tt t} x_tt) / L_{t t} (for LL / float solution) - * x_MMSE_t = (z_t / D_t - \sum_{tt=t+1}^{NUM_TX-1} L_{tt t} x_tt) (for LDL / fixed solution) + * x_{MMSE}_t = (z_t - \sum_{tt=t+1}^{NUM_TX-1} L_{tt t} x_{MMSE}_{tt}) / L_{t t} (for LL / float solution) + * x_{MMSE}_t = (z_t / D_t - \sum_{tt=t+1}^{NUM_TX-1} L_{tt t} x_{MMSE}_{tt}) (for LDL / fixed solution) * * \global g_L lower triangular matrix. Shape [NUM_TX][NUM_TX][NUM_SC] * \global g_D diagonal matrix (only if DATA_TYPE_fixed is defined). Shape [NUM_TX][NUM_SC] @@ -14,10 +14,12 @@ */ void cbackwardsub() { -#if defined(ARCH_x86) || defined(ARCH_rv) size_t t, tt, s; size_t off_L, off_z, off_x_MMSE, off_D; + +#if defined(ARCH_x86) || defined(ARCH_rv) acc_t sum_re, sum_im; + for (t = NUM_TX - 1; t != (size_t)-1; --t) { for (s = 0; s < NUM_SC; ++s) { sum_re = sum_im = 0; @@ -46,86 +48,102 @@ void cbackwardsub() #endif } } + #elif defined(ARCH_rvv) - size_t i, j; size_t sz, vl; - size_t off_sc; - for (i = NUM_TX - 1; i != (size_t)-1; --i) { - off_sc = 0; + for (t = NUM_TX - 1; t != (size_t)-1; --t) { sz = NUM_SC; + s = 0; while (sz > 0){ - /* Initialize result registers as z */ - /* v0 - result real part */ - /* v1 - result imaginary part */ + /* Initialize x_MMSE as z */ + /* v0 - x_MMSE real part */ + /* v4 - x_MMSE imaginary part */ + off_z = t * NUM_SC + s; __asm__ volatile( - "vsetvli %0, %1, e32, m1, ta, ma\n" + "vsetvli %0, %1, e32, m4, ta, ma\n" + "vle32.v v0, (%2)\n" + "vle32.v v4, (%3)\n" : "=r"(vl) - : "r"(sz)); - __asm__ volatile( - "vle32.v v0, (%0)\n" - "vle32.v v1, (%1)\n" + : "r"(sz), + "r"(&g_z.re[off_z]), "r"(&g_z.im[off_z]) + ); + +#if defined(DATA_TYPE_fixed) + /* Divide by D_t */ + __asm__ volatile ( + "vle32.v v8, (%0)\n" + "vfdiv.vv v0, v0, v8\n" + "vfdiv.vv v4, v4, v8\n" : - : "r"(&g_z.re[i * NUM_SC + off_sc]), - "r"(&g_z.im[i 
* NUM_SC + off_sc])); + : "r"(&g_D[t * NUM_SC + s]) + ); +#endif - for (j = i + 1; j < NUM_TX; ++j) { - /* b - sum L_ji * z_j */ - /* v2 - L real part */ - /* v3 - L imaginary part */ - /* v4 - x_MMSE_j real part */ - /* v5 - x_MMSE_j imaginary part */ + for (tt = tt + 1; tt < NUM_TX; ++tt) { + /* z - sum L_{tt t} * x_{MMSE}_{tt} or + * z / D_t - sum L_{tt t} * x_{MMSE}_{tt} */ + off_L = tt * NUM_TX * NUM_SC + t * NUM_SC + s; + off_x_MMSE = tt * NUM_SC + s; __asm__ volatile( - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - "vle32.v v4, (%2)\n" - "vle32.v v5, (%3)\n" + "vle32.v v8, (%0)\n" + "vle32.v v12, (%1)\n" + "vle32.v v16, (%2)\n" + "vle32.v v20, (%3)\n" +#if defined(DATA_TYPE_float) /* real part */ - "vfnmsac.vv v0, v2, v4\n" - "vfmacc.vv v0, v3, v5\n" + "vfnmsac.vv v0, v8, v16\n" + "vfmacc.vv v0, v12, v20\n" /* imaginary part */ - "vfnmsac.vv v1, v3, v4\n" - "vfnmsac.vv v1, v2, v5\n" + "vfnmsac.vv v4, v12, v16\n" + "vfnmsac.vv v4, v8, v20\n" +#elif defined(DATA_TYPE_fixed) + /* real part */ + "vsmul.vv v24, v8, v16\n" + "vssub.vv v0, v0, v24\n" + "vsmul.vv v24, v12, v20\n" + "vsadd.vv v0, v0, v24\n" + /* imaginary part */ + "vsmul.vv v24, v12, v16\n" + "vssub.vv v4, v4, v24\n" + "vsmul.vv v24, v8, v20\n" + "vssub.vv v4, v4, v24\n" +#else +#error "Unknown data type" +#endif : - : "r"(&g_L.re[j * NUM_TX * NUM_SC + i * NUM_SC + off_sc]), - "r"(&g_L.im[j * NUM_TX * NUM_SC + i * NUM_SC + off_sc]), - "r"(&g_x_MMSE.re[j * NUM_SC + off_sc]), - "r"(&g_x_MMSE.im[j * NUM_SC + off_sc]) + : "r"(&g_L.re[off_L]), "r"(&g_L.im[off_L]), + "r"(&g_x_MMSE.re[off_x_MMSE]), "r"(&g_x_MMSE.im[off_x_MMSE]) ); } +#if defined(DATA_TYPE_float) /* Divide by L_ii */ - /* v2 - L_ii real part */ - /* v3 - L_ii imaginary part */ + off_L = t * NUM_TX * NUM_SC + t * NUM_SC + s; __asm__ volatile ( - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - /* calculate L_ii_re^2 + L_ii_im^2 -> v4 */ - "vfmul.vv v4, v2, v2\n" - "vfmacc.vv v4, v3, v3\n" - /* real part */ - "vfmul.vv v5, v0, v2\n" - "vfmacc.vv 
v5, v1, v3\n" - "vdiv.vv v0, v5, v4\n" - /* imaginary part */ - "vfmul.vv v6, v1, v2\n" - "vfnmsac.vv v6, v0, v3\n" - "vfdiv.vv v1, v6, v4\n" - /* store HH_H */ - "vse32.v v0, (%2)\n" - "vse32.v v1, (%3)\n" - : - : "r"(&g_L.re[i * NUM_TX * NUM_SC + i * NUM_SC + off_sc]), - "r"(&g_L.im[i * NUM_TX * NUM_SC + i * NUM_SC + off_sc]), - "r"(&g_x_MMSE.re[i * NUM_SC + off_sc]), - "r"(&g_x_MMSE.im[i * NUM_SC + off_sc]) + "vle32.v v8, (%0)\n" + "vfdiv.vv v0, v0, v8\n" + "vfdiv.vv v4, v4, v8\n" + : + : "r"(&g_L.re[off_L]) + ); +#endif + + /* Store result */ + off_x_MMSE = t * NUM_SC + s; + __asm__ volatile( + "vse32.v v0, (%0)\n" + "vse32.v v4, (%1)\n" + : + : "r"(&g_x_MMSE.re[off_x_MMSE]), "r"(&g_x_MMSE.im[off_x_MMSE]) ); sz -= vl; - off_sc += vl; + s += vl; } } + #else #error "Unknown architecture" #endif diff --git a/src/ccholesky.c b/src/ccholesky.c @@ -20,13 +20,15 @@ */ void ccholesky() { -#if defined(ARCH_x86) || defined(ARCH_rv) size_t i, j, k, s; size_t off_ij, off_jj, off_ii; size_t off_ik, off_jk; size_t off_i, off_j, off_k; + +#if defined(ARCH_x86) || defined(ARCH_rv) data_t tmp; /* Temporary variable for sqrt */ acc_t sum_re, sum_im; + for (i = 0; i < NUM_TX; ++i) { for (j = 0; j <= i; ++j) { for (s = 0; s < NUM_SC; ++s) { @@ -82,6 +84,7 @@ void ccholesky() #else #error "Unknown data type" #endif + } else { /* i != j */ #if defined(DATA_TYPE_float) /* Calculate L_ij = (G_ij - sum) / L_jj */ @@ -91,13 +94,9 @@ void ccholesky() #elif defined(DATA_TYPE_fixed) /* Calculate L_ij = (G_ij - sum) / D_j */ off_j = j * NUM_SC + s; - /* real */ sum_re = ((acc_t)g_G.re[off_ij] << FP_Q) - sum_re; - /* TODO roubding? */ g_L.re[off_ij] = (data_t)(sum_re / (acc_t)g_D[off_j]); - /* imaginary */ sum_im = ((acc_t)g_G.im[off_ij] << FP_Q) - sum_im; - /* TODO roubding? 
*/ g_L.im[off_ij] = (data_t)(sum_im / (acc_t)g_D[off_j]); #else #error "Unknown data type" @@ -107,127 +106,133 @@ void ccholesky() } } #elif defined(ARCH_rvv) - size_t i, j, k; size_t sz, vl; - size_t off_sc; - - /* Init float registers */ - register float f0 __asm__("f0") = 2.0f; for (i = 0; i < NUM_TX; ++i) for (j = 0; j <= i; ++j) { sz = NUM_SC; - off_sc = 0; + s = 0; while (sz > 0) { - /* Initialize L registers */ - /* v0 - L_ij real part = G_ij.re */ - /* v1 - L_ij imaginary part = G_ij.im */ - __asm__ volatile( - "vsetvli %0, %1, e32, m1, ta, ma\n" - : "=r"(vl) : "r"(sz)); + /* Calculate + * sum_{k=0}^{j-1} L_ik L_jk^* or + * sum_{k=0}^{j-1} L_ik D_k L_jk^* + * + * v0 - sum real part + * v4 - sum imaginary part */ __asm__ volatile( - "vle32.v v0, (%0)\n" - "vle32.v v1, (%1)\n" - : - : "r"(&g_G.re[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]), - "r"(&g_G.im[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]) + "vsetvli %0, %1, e32, m4, ta, ma\n" + "vmv.v.i v0, 0\n" + "vmv.v.i v4, 0\n" + : "=r"(vl) : "r"(sz) ); - /* Calculate sum_{k=0}^{j-1} L_ik L_jk^* */ - /* v2 - sum real part */ - /* v3 - sum imaginary part */ - __asm__ volatile( - "vmv.v.i v2, 0\n" - "vmv.v.i v3, 0\n" - ); for (k = 0; k < j; ++k) { + off_ik = i * NUM_TX * NUM_SC + k * NUM_SC + s; + off_jk = j * NUM_TX * NUM_SC + k * NUM_SC + s; __asm__ volatile( - "vle32.v v4, (%0)\n" - "vle32.v v5, (%1)\n" - "vle32.v v6, (%2)\n" - "vle32.v v7, (%3)\n" + "vle32.v v8, (%0)\n" + "vle32.v v12, (%1)\n" + "vle32.v v16, (%2)\n" + "vle32.v v20, (%3)\n" +#if defined(DATA_TYPE_float) /* real part */ - "vfmacc.vv v2, v4, v6\n" - "vfmacc.vv v2, v5, v7\n" + "vfmacc.vv v0, v8, v16\n" + "vfmacc.vv v0, v12, v20\n" /* imaginary part */ - "vfmacc.vv v3, v5, v6\n" - "vfnmsac.vv v3, v4, v7\n" + "vfmacc.vv v4, v12, v16\n" + "vfnmsac.vv v4, v8, v20\n" +#elif defined(DATA_TYPE_fixed) + "vle32.v v24, (%4)\n" + /* real part */ + "vsmul.vv v28, v8, v24\n" + "vsmul.vv v28, v28, v16\n" + "vsadd.vv v0, v0, v28\n" + "vsmul.vv v28, v12, v24\n" 
+ "vsmul.vv v28, v28, v20\n" + "vsadd.vv v0, v0, v28\n" + /* imaginary part */ + "vsmul.vv v28, v12, v16\n" + "vsmul.vv v28, v28, v20\n" + "vsadd.vv v4, v4, v28\n" + "vsmul.vv v28, v8, v24\n" + "vsmul.vv v28, v28, v20\n" + "vssub.vv v4, v4, v28\n" +#else +#error "Unknown data type" +#endif : - : "r"(&g_L.re[i * NUM_TX * NUM_SC + k * NUM_SC + off_sc]), - "r"(&g_L.im[i * NUM_TX * NUM_SC + k * NUM_SC + off_sc]), - "r"(&g_L.re[j * NUM_TX * NUM_SC + k * NUM_SC + off_sc]), - "r"(&g_L.im[j * NUM_TX * NUM_SC + k * NUM_SC + off_sc]) + : "r"(&g_L.re[off_ik]), "r"(&g_L.im[off_ik]), + "r"(&g_L.re[off_jk]), "r"(&g_L.im[off_jk]) +#if defined(DATA_TYPE_fixed) + , "r"(&g_D[k * NUM_SC + s]) +#endif ); } - /* G_ii - sum */ - __asm__ volatile( - "vfsub.vv v0, v0, v2\n" - "vfsub.vv v1, v1, v3\n" - ); - if (i == j) { - /* Calculate L_ii = sqrt(G_ii - sum_{k=0}^{i-1} L_ik L_ik^*) */ + off_ii = i * NUM_TX * NUM_SC + i * NUM_SC + s; + /* L_ii = sqrt(G_ii - sum) or D_i = (G_ii - sum) */ + /* G_ii imaginary part is 0, so we can ignore it */ __asm__ volatile( - /* Complex sqrt */ - - /* v2 = r = sqrt(re^2 + im^2) */ - "vfmul.vv v2, v0, v0\n" - "vfmacc.vv v2, v1, v1\n" - "vfsqrt.v v2, v2\n" - - /* v3 - real part */ - "vfadd.vv v3, v2, v0\n" /* r + re */ - "vfdiv.vf v3, v3, f0\n" /* (r + re) / 2 */ - "vfsqrt.v v3, v3\n" /* sqrt((r + re) / 2) */ - /* v4 - imaginary part */ - "vfsub.vv v4, v2, v0\n" /* r - re */ - "vfdiv.vf v4, v4, f0\n" /* (r - re) / 2 */ - "vfsqrt.v v4, v4\n" /* sqrt((r - re) / 2) */ - "vfsgnj.vv v4, v4, v1\n" /* sgn(im) * sqrt((r - re) / 2) */ - - /* TODO handle im == 0 */ - - /* Move the result to v0 and v1 */ - "vmv.v.v v0, v3\n" - "vmv.v.v v1, v4\n" + "vle32.v v8, (%0)\n" +#if defined(DATA_TYPE_float) + "vfsub.vv v0, v8, v0\n" + "vfsqrt.v v0, v0\n" + "vfneg.v v4, v4\n" +#elif defined(DATA_TYPE_fixed) + "vssub.vv v0, v8, v0\n" + "vneg.v v4, v4\n" +#else +#error "Unknown data type" +#endif + "vse32.v v0, (%1)\n" + : : "r"(&g_G.re[off_ii]), +#if defined(DATA_TYPE_float) + 
"r"(&g_L.re[off_ii]) +#elif defined(DATA_TYPE_fixed) + "r"(&g_D[i * NUM_SC + s]) +#else +#error "Unknown data type" +#endif ); - } else { - /* Calculate L_ij = (G_ij - sum) / L_jj */ + } else { /* i != j */ + off_ij = i * NUM_TX * NUM_SC + j * NUM_SC + s; + off_jj = j * NUM_TX * NUM_SC + j * NUM_SC + s; + off_j = j * NUM_SC + s; + /* Calculate L_ij = (G_ij - sum) / L_jj or L_ij = (G_ij - sum) / D_j */ + /* L_jj is always real, so we can ignore the imaginary part */ __asm__ volatile( - /* L_jj */ - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - /* calculate L_jj_re^2 + L_jj_im^2 -> v4 */ - "vfmul.vv v4, v2, v2\n" - "vfmacc.vv v4, v3, v3\n" - /* real part */ - "vfmul.vv v5, v0, v2\n" - "vfmacc.vv v5, v1, v3\n" - /* imaginary part */ - "vfmul.vv v6, v1, v2\n" - "vfnmsac.vv v6, v0, v3\n" - /* divide and store at v0 and v1 */ - "vfdiv.vv v0, v5, v4\n" - "vfdiv.vv v1, v6, v4\n" - : - : "r"(&g_L.re[j * NUM_TX * NUM_SC + j * NUM_SC + off_sc]), - "r"(&g_L.im[j * NUM_TX * NUM_SC + j * NUM_SC + off_sc]) + "vle32.v v8, (%0)\n" /* G_ij.re */ + "vle32.v v12, (%1)\n" /* G_ij.im */ + "vle32.v v16, (%2)\n" /* L_jj or D_j */ +#if defined(DATA_TYPE_float) + "vfsub.vv v0, v8, v0\n" + "vfsub.vv v4, v12, v4\n" + "vfdiv.vv v0, v0, v16\n" + "vfdiv.vv v4, v4, v16\n" +#elif defined(DATA_TYPE_fixed) + "vssub.vv v0, v8, v0\n" + "vssub.vv v4, v12, v4\n" + "vdiv.vv v0, v0, v16\n" + "vdiv.vv v4, v4, v16\n" + "vsll.vi v0, v0, 1\n" + "vsll.vi v4, v4, 1\n" +#else +#error "Unknown data type" +#endif + /* Store result */ + "vse32.v v0, (%3)\n" + "vse32.v v4, (%4)\n" + : + : "r"(&g_G.re[off_ij]), "r"(&g_G.im[off_ij]), + "r"(&g_L.re[off_jj]), + "r"(&g_L.re[off_ij]), "r"(&g_L.im[off_ij]) ); } - /* Store result */ - __asm__ volatile( - "vse32.v v0, (%0)\n" - "vse32.v v1, (%1)\n" - : - : "r"(&g_L.re[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]), - "r"(&g_L.im[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]) - ); - sz -= vl; - off_sc += vl; + s += vl; } } #else diff --git a/src/cforwardsub.c b/src/cforwardsub.c @@ 
-13,10 +13,12 @@ */ void cforwardsub() { -#if defined(ARCH_x86) || defined(ARCH_rv) size_t t, tt, s; size_t off_L, off_HHy, off_z; + +#if defined(ARCH_x86) || defined(ARCH_rv) acc_t sum_re, sum_im; + for (t = 0; t < NUM_TX; ++t) { for (s = 0; s < NUM_SC; ++s) { sum_re = sum_im = 0; @@ -43,88 +45,91 @@ void cforwardsub() } } #elif defined(ARCH_rvv) - size_t i, j; size_t sz, vl; - size_t off_sc; - for (i = 0; i < NUM_TX; ++i) { - off_sc = 0; + for (t = 0; t < NUM_TX; ++t) { sz = NUM_SC; + s = 0; while (sz > 0) { - // printf("sz: %lu\n", sz); - // printf("vl: %lu\n", vl); - /* Initialize result registers as b */ - /* v0 - result real part */ - /* v1 - result imaginary part */ + off_HHy = t * NUM_SC + s; + + /* Initialize z as HHy */ + /* v0 - z real part */ + /* v4 - z imaginary part */ __asm__ volatile ( - "vsetvli %0, %1, e32, m1, ta, ma\n" + "vsetvli %0, %1, e32, m4, ta, ma\n" + "vle32.v v0, (%2)\n" + "vle32.v v4, (%3)\n" : "=r"(vl) - : "r"(sz)); - __asm__ volatile ( - "vle32.v v0, (%0)\n" - "vle32.v v1, (%1)\n" - : - : "r"(&g_HHy.re[i * NUM_SC + off_sc]), - "r"(&g_HHy.im[i * NUM_SC + off_sc])); - // printf("vl: %lu\n", vl); + : "r"(sz), + "r"(&g_HHy.re[off_HHy]), "r"(&g_HHy.im[off_HHy]) + ); - for (j = 0; j != i; ++j) { - /* b - sum L_ij * z_j */ - /* v2 - L real part */ - /* v3 - L imaginary part */ - /* v4 - result_j real part */ - /* v5 - result_j imaginary part */ + for (tt = 0; tt != t; ++tt) { + off_L = t * NUM_TX * NUM_SC + tt * NUM_SC + s; + off_z = tt * NUM_SC + s; + + /* HHy_t - sum L_{t tt} * z_{tt} */ __asm__ volatile ( - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - "vle32.v v4, (%2)\n" - "vle32.v v5, (%3)\n" + "vle32.v v8, (%0)\n" + "vle32.v v12, (%1)\n" + "vle32.v v16, (%2)\n" + "vle32.v v20, (%3)\n" +#if defined(DATA_TYPE_float) /* real part */ - "vfnmsac.vv v0, v2, v4\n" - "vfmacc.vv v0, v3, v5\n" + "vfnmsac.vv v0, v8, v16\n" + "vfmacc.vv v0, v12, v20\n" /* imaginary part */ - "vfnmsac.vv v1, v3, v4\n" - "vfnmsac.vv v1, v2, v5\n" + "vfnmsac.vv v4, 
v12, v16\n" + "vfnmsac.vv v4, v8, v20\n" +#elif defined(DATA_TYPE_fixed) + /* real part */ + "vsmul.vv v24, v8, v16\n" + "vssub.vv v0, v0, v24\n" + "vsmul.vv v24, v12, v20\n" + "vsadd.vv v0, v0, v24\n" + /* imaginary part */ + "vsmul.vv v24, v12, v16\n" + "vssub.vv v4, v4, v24\n" + "vsmul.vv v24, v8, v20\n" + "vssub.vv v4, v4, v24\n" +#else +#error "Unknown data type" +#endif : - : "r"(&g_L.re[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]), - "r"(&g_L.im[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]), - "r"(&g_z.re[j * NUM_SC + off_sc]), - "r"(&g_z.im[j * NUM_SC + off_sc]) + : "r"(&g_L.re[off_L]), "r"(&g_L.im[off_L]), + "r"(&g_z.re[off_z]), "r"(&g_z.im[off_z]) ); } +#if defined(DATA_TYPE_float) /* Divide by L_ii */ - /* v2 - L_ii real part */ - /* v3 - L_ii imaginary part */ + /* L_ii imaginary part is zero, so we ignore it */ + off_L = t * NUM_TX * NUM_SC + t * NUM_SC + s; __asm__ volatile ( - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - /* calculate L_ii_re^2 + L_ii_im^2 -> v4 */ - "vfmul.vv v4, v2, v2\n" - "vfmacc.vv v4, v3, v3\n" - /* real part */ - "vfmul.vv v5, v0, v2\n" - "vfmacc.vv v5, v1, v3\n" - "vdiv.vv v0, v5, v4\n" - /* imaginary part */ - "vfmul.vv v6, v1, v2\n" - "vfnmsac.vv v6, v0, v3\n" - "vfdiv.vv v1, v6, v4\n" - /* store result */ - "vse32.v v0, (%2)\n" - "vse32.v v1, (%3)\n" + "vle32.v v8, (%0)\n" + "vfdiv.vv v0, v0, v8\n" + "vfdiv.vv v4, v4, v8\n" : - : "r"(&g_L.re[i * NUM_TX * NUM_SC + i * NUM_SC + off_sc]), - "r"(&g_L.im[i * NUM_TX * NUM_SC + i * NUM_SC + off_sc]), - "r"(&g_z.re[i * NUM_SC + off_sc]), - "r"(&g_z.im[i * NUM_SC + off_sc]) + : "r"(&g_L.re[off_L]) ); +#endif - off_sc += vl; + /* Store result */ + off_z = t * NUM_SC + s; + __asm__ volatile ( + "vse32.v v0, (%0)\n" + "vse32.v v4, (%1)\n" + : + : "r"(&g_z.re[off_z]), "r"(&g_z.im[off_z]) + ); + + s += vl; sz -= vl; } } + #else #error "Unknown architecture" #endif diff --git a/src/cmatgram.c b/src/cmatgram.c @@ -13,118 +13,152 @@ */ void cmatgram() { -#if defined(ARCH_x86) || 
defined(ARCH_rv) size_t r, t1, t2, s; - size_t off_G, off_R, off_H1, off_H2; + size_t off_G, off_GH, off_H1, off_H2; + +#if defined(ARCH_x86) || defined(ARCH_rv) + acc_t sum_re, sum_im; + for (t1 = 0; t1 < NUM_TX; ++t1) { - for (t2 = 0; t2 < NUM_TX; ++t2) { + for (t2 = 0; t2 <= t1; ++t2) { /* t2 <= t1 since G is hermitian and R is symmetric */ for (s = 0; s < NUM_SC; ++s) { - off_R = off_G = t1 * NUM_TX * NUM_SC + t2 * NUM_SC + s; - g_G.re[off_G] = g_R.re[off_R]; - g_G.im[off_G] = g_R.im[off_R]; + /* Calculate the sum */ + sum_re = sum_im = 0; for (r = 0; r < NUM_RX; ++r) { off_H1 = r * NUM_TX * NUM_SC + t1 * NUM_SC + s; off_H2 = r * NUM_TX * NUM_SC + t2 * NUM_SC + s; - g_G.re[off_G] += g_H.re[off_H1] * g_H.re[off_H2] - + g_H.im[off_H1] * g_H.im[off_H2]; - g_G.im[off_G] += g_H.re[off_H1] * g_H.im[off_H2] - - g_H.im[off_H1] * g_H.re[off_H2]; + sum_re += (acc_t)g_H.re[off_H1] * (acc_t)g_H.re[off_H2] + + (acc_t)g_H.im[off_H1] * (acc_t)g_H.im[off_H2]; + sum_im += (acc_t)g_H.re[off_H1] * (acc_t)g_H.im[off_H2] + - (acc_t)g_H.im[off_H1] * (acc_t)g_H.re[off_H2]; + } + + /* Add R */ + off_G = t1 * NUM_TX * NUM_SC + t2 * NUM_SC + s; +#if defined(DATA_TYPE_float) + g_G.re[off_G] = (data_t)sum_re + g_R.re[off_G]; + g_G.im[off_G] = (data_t)sum_im + g_R.im[off_G]; +#elif defined(DATA_TYPE_fixed) + g_G.re[off_G] = (data_t)(sum_re >> FP_Q) + g_R.re[off_G]; + g_G.im[off_G] = (data_t)(sum_im >> FP_Q) + g_R.im[off_G]; +#else +#error "Unknown data type" +#endif + + /* Fill the upper triangle */ + /* G_{t2t1} = G_{t1t2}^* */ + if (t1 != t2) { + off_GH = t2 * NUM_TX * NUM_SC + t1 * NUM_SC + s; + g_G.re[off_GH] = g_G.re[off_G]; + g_G.im[off_GH] = -g_G.im[off_G]; } } } } + #elif defined(ARCH_rvv) - size_t t1, t2, r; size_t sz, vl; - size_t off_sc, off_A, off_AH; - size_t off_G_L, off_G_U; - data_t *A_re, *A_im, *AH_re, *AH_im; - data_t *R_L_re, *R_L_im, *R_U_re, *R_U_im; - data_t *G_L_re, *G_L_im, *G_U_re, *G_U_im; for (t1 = 0; t1 != NUM_TX; ++t1) - for (t2 = t1; t2 != NUM_TX; ++t2) { - 
off_sc = 0; - off_G_L = t1 * NUM_TX * NUM_SC + t2 * NUM_SC; - off_G_U = t2 * NUM_TX * NUM_SC + t1 * NUM_SC; - G_L_re = &g_G.re[off_G_L]; - G_L_im = &g_G.im[off_G_L]; - G_U_re = &g_G.re[off_G_U]; - G_U_im = &g_G.im[off_G_U]; + for (t2 = t1; t2 != t1; ++t2) { sz = NUM_SC; + s = 0; while (sz > 0) { /* Initialize G registers */ /* v0 - G real part */ - /* v1 - G imaginary part */ + /* v4 - G imaginary part */ __asm__ volatile( - "vsetvli %0, %1, e32, m1, ta, ma\n" + "vsetvli %0, %1, e32, m4, ta, ma\n" "vmv.v.i v0, 0\n" - "vmv.v.i v1, 0\n" - : "=r"(vl) : "r"(sz)); + "vmv.v.i v4, 0\n" + : "=r"(vl) : "r"(sz) + ); for (r = 0; r != NUM_RX; ++r) { - off_A = r * NUM_TX * NUM_SC + t1 * NUM_SC + off_sc; - off_AH = r * NUM_TX * NUM_SC + t2 * NUM_SC + off_sc; - A_re = &g_H.re[off_A]; - A_im = &g_H.im[off_A]; - AH_re = &g_H.re[off_AH]; - AH_im = &g_H.im[off_AH]; + off_H1 = r * NUM_TX * NUM_SC + t1 * NUM_SC + s; + off_H2 = r * NUM_TX * NUM_SC + t2 * NUM_SC + s; - /* Calculate A^H*A */ - /* v2 - A real part */ - /* v3 - A imaginary part */ - /* v4 - A^H real part */ - /* v5 - A^H imaginary part */ + /* Calculate H^H*H */ + /* v8 - H_{rt1} real part */ + /* v12 - H_{rt2} imaginary part */ + /* v16 - H_{rt2} real part */ + /* v20 - H_{rt2} imaginary part */ __asm__ volatile( - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - "vle32.v v4, (%2)\n" - "vle32.v v5, (%3)\n" + "vle32.v v8, (%0)\n" + "vle32.v v12, (%1)\n" + "vle32.v v16, (%2)\n" + "vle32.v v20, (%3)\n" +#if defined(DATA_TYPE_float) + /* real part */ + "vfmacc.vv v0, v8, v16\n" + "vfmacc.vv v0, v12, v20\n" + /* imaginary part */ + "vfmacc.vv v4, v12, v16\n" + "vfnmsac.vv v4, v8, v20\n" +#elif defined(DATA_TYPE_fixed) /* real part */ - "vfmacc.vv v0, v2, v4\n" - "vfmacc.vv v0, v3, v5\n" + "vsmul.vv v24, v8, v16\n" + "vsadd.vv v0, v0, v24\n" + "vsmul.vv v24, v12, v20\n" + "vsadd.vv v0, v0, v24\n" /* imaginary part */ - "vfmacc.vv v1, v3, v4\n" - "vfnmsac.vv v1, v2, v5\n" + "vsmul.vv v24, v12, v16\n" + "vsadd.vv v4, v4, v24\n" + 
"vsmul.vv v24, v8, v20\n" + "vssub.vv v4, v4, v24\n" +#else +#error "Unknown data type" +#endif : - : "r"(A_re), "r"(A_im), "r"(AH_re), "r"(AH_im) + : "r"(g_H.re[off_H1]), "r"(g_H.im[off_H1]), + "r"(g_H.re[off_H2]), "r"(g_H.im[off_H2]) ); } /* Add R */ - /* v2 - R real part */ - /* v3 - R imaginary part */ - R_U_re = &g_R.re[off_G_U]; - R_U_im = &g_R.im[off_G_U]; + off_G = t1 * NUM_TX * NUM_SC + t2 * NUM_SC + s; __asm__ volatile( - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - "vfadd.vv v2, v2, v0\n" - "vfadd.vv v3, v3, v1\n" - "vse32.v v2, (%2)\n" - "vse32.v v3, (%3)\n" + "vle32.v v8, (%0)\n" + "vle32.v v12, (%1)\n" +#if defined(DATA_TYPE_float) + "vfadd.vv v0, v0, v8\n" + "vfadd.vv v4, v4, v12\n" +#elif defined(DATA_TYPE_fixed) + "vsadd.vv v0, v0, v8\n" + "vsadd.vv v4, v4, v12\n" +#else +#error "Unknown data type" +#endif + "vse32.v v0, (%2)\n" + "vse32.v v4, (%3)\n" : - : "r"(R_U_re), "r"(R_U_im), "r"(G_U_re), "r"(G_U_im)); + : "r"(&g_R.re[off_G]), "r"(&g_R.im[off_G]), + "r"(&g_G.re[off_G]), "r"(&g_G.im[off_G]) + ); + + /* Fill the upper triangle */ + /* G_{t2t1} = G_{t1t2}^* */ if (t1 != t2) { - R_L_re = &g_R.re[off_G_L]; - R_L_im = &g_R.im[off_G_L]; + off_GH = t2 * NUM_TX * NUM_SC + t1 * NUM_SC + s; __asm__ volatile( - /* Lower triangle */ - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - "vfadd.vv v2, v2, v0\n" - "vfsub.vv v3, v3, v1\n" - "vse32.v v2, (%2)\n" - "vse32.v v3, (%3)\n" +#if defined(DATA_TYPE_float) + "vfneg.v v4, v4\n" +#elif defined(DATA_TYPE_fixed) + "vneg.v v4, v4\n" +#else +#error "Unknown data type" +#endif + "vse32.v v0, (%0)\n" + "vse32.v v4, (%1)\n" : - : "r"(R_L_re), "r"(R_L_im), "r"(G_L_re), "r"(G_L_im) + : "r"(&g_G.re[off_GH]), "r"(&g_G.im[off_GH]) ); } sz -= vl; - off_sc += vl; + s += vl; } } #endif diff --git a/src/cmatvecmul.c b/src/cmatvecmul.c @@ -12,10 +12,12 @@ */ void cmatvecmul() { -#if defined(ARCH_x86) || defined(ARCH_rv) size_t t, r, s; size_t off_H, off_y, off_HHy; + +#if defined(ARCH_x86) || defined(ARCH_rv) acc_t sum_re, 
sum_im; + for (t = 0; t < NUM_TX; ++t) { for (s = 0; s < NUM_SC; ++s) { sum_re = sum_im = 0; @@ -39,59 +41,71 @@ void cmatvecmul() #endif } } + #elif defined(ARCH_rvv) - size_t i, j; - size_t off_HH, off_y, off_HHy, off_sc; size_t sz, vl; - for (i = 0; i < NUM_TX; ++i) { - off_HHy = i * NUM_SC; - off_sc = 0; + for (t = 0; t < NUM_TX; ++t) { sz = NUM_SC; - + s = 0; while (sz > 0) { - /* Initialize result registers */ + /* Initialize HHy with 0 */ /* v0 - HHy real part */ - /* v1 - HHy imaginary part */ + /* v4 - HHy imaginary part */ __asm__ volatile( - "vsetvli %0, %1, e32, m1, ta, ma\n" + "vsetvli %0, %1, e32, m4, ta, ma\n" "vmv.v.i v0, 0\n" - "vmv.v.i v1, 0\n" + "vmv.v.i v4, 0\n" : "=r"(vl) : "r"(sz) ); - for (j = 0; j < NUM_RX; ++j) { - off_HH = i * NUM_RX * NUM_SC + j * NUM_SC + off_sc; - off_y = j * NUM_SC + off_sc; + for (r = 0; r < NUM_RX; ++r) { + off_H = r * NUM_TX * NUM_SC + t * NUM_SC + s; + off_y = r * NUM_SC + s; __asm__ volatile( - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - "vle32.v v4, (%2)\n" - "vle32.v v5, (%3)\n" + "vle32.v v8, (%0)\n" + "vle32.v v12, (%1)\n" + "vle32.v v16, (%2)\n" + "vle32.v v20, (%3)\n" +#if defined(DATA_TYPE_float) + /* real part */ + "vfmacc.vv v0, v8, v16\n" + "vfnmsac.vv v0, v12, v20\n" + /* imaginary part */ + "vfmacc.vv v4, v12, v16\n" + "vfmacc.vv v4, v8, v20\n" +#elif defined(DATA_TYPE_fixed) /* real part */ - "vfmacc.vv v0, v2, v4\n" - "vfnmsac.vv v0, v3, v5\n" + "vsmul.vv v24, v8, v16\n" + "vsadd.vv v0, v0, v24\n" + "vsmul.vv v24, v12, v20\n" + "vssub.vv v0, v0, v24\n" /* imaginary part */ - "vfmacc.vv v1, v3, v4\n" - "vfmacc.vv v1, v2, v5\n" + "vsmul.vv v24, v12, v16\n" + "vsadd.vv v4, v4, v24\n" + "vsmul.vv v24, v8, v20\n" + "vsadd.vv v4, v4, v24\n" +#else +#error "Unknown data type" +#endif : - : "r"(&g_HH.re[off_HH]), "r"(&g_HH.im[off_HH]), + : "r"(&g_H.re[off_H]), "r"(&g_H.im[off_H]), "r"(&g_y.re[off_y]), "r"(&g_y.im[off_y]) - ); + ); } /* Store result */ + off_HHy = t * NUM_SC + s; __asm__ volatile( "vse32.v 
v0, (%0)\n" - "vse32.v v1, (%1)\n" + "vse32.v v4, (%1)\n" : : "r"(&g_HHy.re[off_HHy]), "r"(&g_HHy.im[off_HHy]) ); sz -= vl; - off_HHy += vl; - off_sc += vl; + s += vl; } } #else