commit 1476d43bf060a4f72c82a1471dfe0fdbf0b65552
parent b09c0cd9f882acd2b75502bae749553131418605
Author: Egor Achkasov <eaachkasov@edu.hse.ru>
Date: Tue, 18 Feb 2025 21:16:15 +0100
Vectorize cholesky; TEMPORARILY make dimensions runtime variable
Diffstat:
M main.c       |  42 +++++++++++++++++++++++++++++++++++++-----
M src/mmserv.c | 374 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------------
2 files changed, 209 insertions(+), 207 deletions(-)
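
Judging from the doc comments and globals in src/mmserv.c (not text from the commit itself), the five timed kernels form the usual Cholesky-based MMSE solve, applied independently per subcarrier:

    HH_H = H^H H                  (cmatgram_TxRx_cadd)
    HH_H = L L^H                  (ccholesky_TxTx)
    HHy  = H^H y                  (cmatvecmul_TxRx)
    solve L z = HHy               (cforwardsub_TxTx, forward substitution)
    solve L^H x_MMSE = z          (cbackwardsub_TxTx, backward substitution)
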
diff --git a/main.c b/main.c
@@ -3,7 +3,11 @@
#include "../common/printf.h"
/* extern functions */
-extern void mmse();
+extern void cmatgram_TxRx_cadd();
+extern void ccholesky_TxTx();
+extern void cmatvecmul_TxRx();
+extern void cforwardsub_TxTx();
+extern void cbackwardsub_TxTx();
extern acc_t mse();
/* Raw data */
@@ -31,10 +35,38 @@ data_t x_MMSE_re[NUM_TX_ANT][NUM_SC];
data_t x_MMSE_im[NUM_TX_ANT][NUM_SC];
vcomplex g_x_MMSE = { .re = (data_t *)x_MMSE_re, .im = (data_t *)x_MMSE_im };
+extern vcomplex g_HH, h_y, g_HHy;
+size_t num_rx_cur=1, num_tx_cur=1, num_sc_cur=1;
+
int main() {
- /* Calculate the MMSE approximation */
- mmse();
- printf("MSE: %f\n", mse());
+ unsigned long long start, end;
+
+ unsigned long long t1, t2, t3, t4, t5;
+ for (num_rx_cur = 1; num_rx_cur <= NUM_RX_ANT; ++num_rx_cur)
+ for (num_tx_cur = 1; num_tx_cur <= NUM_TX_ANT; ++num_tx_cur)
+ for (num_sc_cur = 1; num_sc_cur <= NUM_SC; num_sc_cur += 1) {
+ __asm__ volatile("rdcycle %0" : "=r"(start));
+ cmatgram_TxRx_cadd();
+ __asm__ volatile("rdcycle %0" : "=r"(end));
+ t1 = end - start;
+ __asm__ volatile("rdcycle %0" : "=r"(start));
+ ccholesky_TxTx();
+ __asm__ volatile("rdcycle %0" : "=r"(end));
+ t2 = end - start;
+ __asm__ volatile("rdcycle %0" : "=r"(start));
+ cmatvecmul_TxRx();
+ __asm__ volatile("rdcycle %0" : "=r"(end));
+ t3 = end - start;
+ __asm__ volatile("rdcycle %0" : "=r"(start));
+ cforwardsub_TxTx();
+ __asm__ volatile("rdcycle %0" : "=r"(end));
+ t4 = end - start;
+ __asm__ volatile("rdcycle %0" : "=r"(start));
+ cbackwardsub_TxTx();
+ __asm__ volatile("rdcycle %0" : "=r"(end));
+ t5 = end - start;
+ printf("%llu,%llu,%llu,%llu,%llu,", t1, t2, t3, t4, t5);
+ }
- printf("Shutting down\n");
+ return 0;
}
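
The rdcycle bracketing in main() is repeated verbatim for each of the five kernels. A hedged refactoring sketch, using a hypothetical TIME_CYCLES macro that is not part of this commit, shows how the same measurement could be written once:

    /* Hypothetical helper (not in this commit): measure one call in cycles. */
    #define TIME_CYCLES(out, stmt)                           \
        do {                                                 \
            unsigned long long _s, _e;                       \
            __asm__ volatile("rdcycle %0" : "=r"(_s));       \
            stmt;                                            \
            __asm__ volatile("rdcycle %0" : "=r"(_e));       \
            (out) = _e - _s;                                 \
        } while (0)

    /* Usage equivalent to the unrolled sequence above: */
    TIME_CYCLES(t1, cmatgram_TxRx_cadd());
    TIME_CYCLES(t2, ccholesky_TxTx());
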
diff --git a/src/mmserv.c b/src/mmserv.c
@@ -46,6 +46,8 @@ uint64_t get_timer()
/* Externs */
extern vcomplex g_x, g_H, g_R, g_y;
extern vcomplex g_x_MMSE;
+extern size_t num_rx_cur, num_tx_cur, num_sc_cur;
+
/* Raw data */
data_t HH_re[NUM_TX_ANT][NUM_RX_ANT][NUM_SC];
@@ -70,48 +72,6 @@ vcomplex g_z = { .re = (data_t *)z_re, .im = (data_t *)z_im };
vcomplex g_LH = { .re = (data_t *)LH_re, .im = (data_t *)LH_im };
/*
- * Complex functions
- */
-
-void cdiv(
- IN data_t a_re, IN data_t a_im,
- IN data_t b_re, IN data_t b_im,
- OUT data_t *res_re, OUT data_t *res_im)
-{
- *res_im = b_re * b_re + b_im * b_im;
- *res_re = (a_re * b_re + a_im * b_im) / *res_im;
- *res_im = (a_im * b_re - a_re * b_im) / *res_im;
-}
-
-data_t sqrt(IN data_t x)
-{
- __asm__ volatile("fsqrt.s %0, %1" : "=f"(x) : "f"(x));
- return x;
-}
-void csqrt(
- IN data_t re, IN data_t im,
- OUT data_t *res_re, OUT data_t *res_im)
-{
- if (im == 0){
- if (re < 0) {
- *res_re = 0;
- *res_im = sqrt(-re);
- return;
- }
- else {
- *res_re = sqrt(re);
- *res_im = 0;
- return;
- }
- }
- data_t length = sqrt(re * re + im * im);
- *res_re = sqrt((length + re) / 2.f);
- *res_im = sqrt((length - re) / 2.f);
- if (im < 0.f)
- *res_re = -*res_re;
-}
-
-/*
* Complex matrix operations
*/
@@ -133,16 +93,16 @@ void cmatgram_TxRx_cadd()
data_t *R_L_re, *R_L_im, *R_U_re, *R_U_im;
data_t *HH_H_L_re, *HH_H_L_im, *HH_H_U_re, *HH_H_U_im;
- for (t1 = 0; t1 != NUM_TX_ANT; ++t1)
- for (t2 = t1; t2 != NUM_TX_ANT; ++t2) {
+ for (t1 = 0; t1 != num_tx_cur; ++t1)
+ for (t2 = t1; t2 != num_tx_cur; ++t2) {
off_sc = 0;
- off_HH_H_L = t1 * NUM_TX_ANT * NUM_SC + t2 * NUM_SC;
- off_HH_H_U = t2 * NUM_TX_ANT * NUM_SC + t1 * NUM_SC;
+ off_HH_H_L = t1 * num_tx_cur * num_sc_cur + t2 * num_sc_cur;
+ off_HH_H_U = t2 * num_tx_cur * num_sc_cur + t1 * num_sc_cur;
HH_H_L_re = &g_HH_H.re[off_HH_H_L];
HH_H_L_im = &g_HH_H.im[off_HH_H_L];
HH_H_U_re = &g_HH_H.re[off_HH_H_U];
HH_H_U_im = &g_HH_H.im[off_HH_H_U];
- sz = NUM_SC;
+ sz = num_sc_cur;
while (sz > 0) {
/* Initialize HH_H registers */
@@ -154,9 +114,9 @@ void cmatgram_TxRx_cadd()
"vmv.v.i v1, 0\n"
: "=r"(vl) : "r"(sz));
- for (r = 0; r != NUM_RX_ANT; ++r) {
- off_A = r * NUM_TX_ANT * NUM_SC + t1 * NUM_SC + off_sc;
- off_AH = r * NUM_TX_ANT * NUM_SC + t2 * NUM_SC + off_sc;
+ for (r = 0; r != num_rx_cur; ++r) {
+ off_A = r * num_tx_cur * num_sc_cur + t1 * num_sc_cur + off_sc;
+ off_AH = r * num_tx_cur * num_sc_cur + t2 * num_sc_cur + off_sc;
A_re = &g_H.re[off_A];
A_im = &g_H.im[off_A];
AH_re = &g_H.re[off_AH];
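
The same strip-mined structure recurs in every kernel in this file: vsetvli caps the chunk length vl, the chunk of subcarriers is processed, then sz and the subcarrier offset advance by vl. Below is a scalar model of one such accumulation pass, assuming the a * conj(b) split that the vfmacc.vv / vfnmsac.vv pairs implement in the Cholesky inner loop further down (an illustrative sketch, not the committed code):

    #include <stddef.h>

    /* acc += a * conj(b), elementwise over num_sc subcarriers, processed in
     * chunks of at most max_vl elements -- the role vsetvli plays in the
     * vector code. float stands in for data_t. */
    static void cmacc_conj_ref(float *acc_re, float *acc_im,
                               const float *a_re, const float *a_im,
                               const float *b_re, const float *b_im,
                               size_t num_sc, size_t max_vl)
    {
        size_t off = 0, sz = num_sc;
        while (sz > 0) {
            size_t vl = sz < max_vl ? sz : max_vl;   /* vsetvli result */
            for (size_t k = off; k < off + vl; ++k) {
                acc_re[k] += a_re[k] * b_re[k] + a_im[k] * b_im[k]; /* two vfmacc.vv    */
                acc_im[k] += a_im[k] * b_re[k] - a_re[k] * b_im[k]; /* vfmacc + vfnmsac */
            }
            sz  -= vl;
            off += vl;
        }
    }
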
@@ -222,127 +182,137 @@ void cmatgram_TxRx_cadd()
/** Complex Cholesky decomposition L of a Hermitian positive-definite matrix HH_H
*
* HH_H = L*L^H
+ * L_ij = (HH_H_ij - \sum_{k=0}^{j-1} L_ik L_jk^*) / L_jj
+ * L_ii = sqrt(HH_H_ii - \sum_{k=0}^{i-1} L_ik L_ik^*)
*
* \global g_HH_H matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
* \global g_L output lower triangular matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
*/
void ccholesky_TxTx()
{
- size_t i, j, k, l;
- size_t off_ijk, off_ilk, off_jlk, off_jjk;
- data_t sum_re, sum_im;
- data_t t1_re, t1_im, t2_re, t2_im;
-
- /* Init upper triangle with zeros */
- for (i = 0; i < NUM_TX_ANT; ++i)
- for (j = i+1; j < NUM_TX_ANT; ++j) {
- off_ijk = i * NUM_TX_ANT * NUM_SC + j * NUM_SC;
- for (k = 0; k < NUM_SC; ++k) {
- g_L.re[off_ijk] = g_L.im[off_ijk] = 0;
- ++off_ijk;
- }
- }
+ size_t i, j, k;
+ size_t sz, vl;
+ size_t off_sc;
- /* Calculate diagonal and lower triangular entries */
- for (i = 0; i < NUM_TX_ANT; ++i)
- for (j = 0; j < i+1; ++j)
- for (k = 0; k < NUM_SC; ++k){
- off_ijk = i * NUM_TX_ANT * NUM_SC + j * NUM_SC + k;
- sum_re = g_HH_H.re[off_ijk];
- sum_im = g_HH_H.im[off_ijk];
- for (l = 0; l < j; ++l) {
- off_ilk = i * NUM_TX_ANT * NUM_SC + l * NUM_SC + k;
- off_jlk = j * NUM_TX_ANT * NUM_SC + l * NUM_SC + k;
- t1_re = g_L.re[off_ilk];
- t1_im = g_L.im[off_ilk];
- t2_re = g_L.re[off_jlk];
- t2_im = g_L.im[off_jlk];
- sum_re -= t1_re * t2_re + t1_im * t2_im;
- sum_im -= t1_im * t2_re - t1_re * t2_im;
- }
- if (i == j)
- csqrt(
- sum_re, sum_im,
- &g_L.re[off_ijk],
- &g_L.im[off_ijk]);
- else {
- off_jjk = j * NUM_TX_ANT * NUM_SC + j * NUM_SC + k;
- cdiv(
- sum_re, sum_im,
- g_L.re[off_jjk],
- g_L.im[off_jjk],
- &g_L.re[off_ijk],
- &g_L.im[off_ijk]);
+ /* Init float registers */
+ register float f0 __asm__("f0") = 2.0f;
+
+ for (i = 0; i < num_tx_cur; ++i)
+ for (j = 0; j <= i; ++j) {
+ sz = num_sc_cur;
+ off_sc = 0;
+
+ while (sz > 0) {
+ /* Initialize L registers */
+ /* v0 - L_ij real part = HH_H_ij.re */
+ /* v1 - L_ij imaginary part = HH_H_ij.im */
+ __asm__ volatile(
+ "vsetvli %0, %1, e32, m1, ta, ma\n"
+ : "=r"(vl) : "r"(sz));
+ __asm__ volatile(
+ "vle32.v v0, (%0)\n"
+ "vle32.v v1, (%1)\n"
+ :
+ : "r"(&g_HH_H.re[i * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]),
+ "r"(&g_HH_H.im[i * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc])
+ );
+
+ /* Calculate sum_{k=0}^{j-1} L_ik L_jk^* */
+ /* v2 - sum real part */
+ /* v3 - sum imaginary part */
+ __asm__ volatile(
+ "vmv.v.i v2, 0\n"
+ "vmv.v.i v3, 0\n"
+ );
+ for (k = 0; k < j; ++k) {
+ __asm__ volatile(
+ "vle32.v v4, (%0)\n"
+ "vle32.v v5, (%1)\n"
+ "vle32.v v6, (%2)\n"
+ "vle32.v v7, (%3)\n"
+ /* real part */
+ "vfmacc.vv v2, v4, v6\n"
+ "vfmacc.vv v2, v5, v7\n"
+ /* imaginary part */
+ "vfmacc.vv v3, v5, v6\n"
+ "vfnmsac.vv v3, v4, v7\n"
+ :
+ : "r"(&g_L.re[i * num_tx_cur * num_sc_cur + k * num_sc_cur + off_sc]),
+ "r"(&g_L.im[i * num_tx_cur * num_sc_cur + k * num_sc_cur + off_sc]),
+ "r"(&g_L.re[j * num_tx_cur * num_sc_cur + k * num_sc_cur + off_sc]),
+ "r"(&g_L.im[j * num_tx_cur * num_sc_cur + k * num_sc_cur + off_sc])
+ );
}
- }
-}
-void ccholesky_nosqrt_TxTx(
- IN vcomplex *A,
- OUT vcomplex *L,
- OUT vcomplex *D)
-{
- size_t i, j, k, l;
- size_t off_iik, off_ijk, off_ilk, off_jlk;
- size_t off_ik, off_jk;
- data_t t1_re, t1_im, t2_re, t2_im;
- data_t sum_re, sum_im;
-
- /* Init zeros in upper triangles of L and D */
- for (off_ijk = 0; off_ijk < NUM_TX_ANT * NUM_TX_ANT * NUM_SC; ++off_ijk)
- L->re[off_ijk] = L->im[off_ijk] = D->re[off_ijk] = D->im[off_ijk] = 0;
- /* Init zeros in lower triangle of D */
- /* Init ones in the diagonal of the result */
- for (off_ik = 0; off_ik < NUM_TX_ANT * NUM_SC; ++off_ik) {
- L->im[off_ik] = D->re[off_ik] = D->im[off_ik] = 0;
- L->re[off_ik] = 1;
- }
+ /* HH_H_ii - sum */
+ __asm__ volatile(
+ "vfsub.vv v0, v0, v2\n"
+ "vfsub.vv v1, v1, v3\n"
+ );
- /* Calculate the lower triangle of the result */
- for (k = 0; k != NUM_SC; ++k) {
- for (i = 0; i != NUM_TX_ANT; ++i) {
- off_iik = i * NUM_TX_ANT * NUM_SC + i * NUM_SC + k;
- /* L_ij = (A_ij - \sum_{l=0}^{j-1} L_il L_jl^* D_k) / D_j */
- for (j = 0; j != i; ++j) {
- for (l = 0; l != j; ++l) {
- off_ilk = i * NUM_TX_ANT * NUM_SC + l * NUM_SC + k;
- off_jlk = j * NUM_TX_ANT * NUM_SC + l * NUM_SC + k;
- t1_re = L->re[off_ilk];
- t1_im = L->im[off_ilk];
- t2_re = L->re[off_jlk];
- t2_im = L->im[off_jlk];
- sum_re += t1_re * t2_re + t1_im * t2_im;
- sum_im += t1_im * t2_re - t1_re * t2_im;
+ if (i == j) {
+ /* Calculate L_ii = sqrt(HH_H_ii - sum_{k=0}^{i-1} L_ik L_ik^*) */
+ __asm__ volatile(
+ /* Complex sqrt */
+
+ /* v2 = r = sqrt(re^2 + im^2) */
+ "vfmul.vv v2, v0, v0\n"
+ "vfmacc.vv v2, v1, v1\n"
+ "vfsqrt.v v2, v2\n"
+
+ /* v3 - real part */
+ "vfadd.vv v3, v2, v0\n" /* r + re */
+ "vfdiv.vf v3, v3, f0\n" /* (r + re) / 2 */
+ "vfsqrt.v v3, v3\n" /* sqrt((r + re) / 2) */
+ /* v4 - imaginary part */
+ "vfsub.vv v4, v2, v0\n" /* r - re */
+ "vfdiv.vf v4, v4, f0\n" /* (r - re) / 2 */
+ "vfsqrt.v v4, v4\n" /* sqrt((r - re) / 2) */
+ "vfsgnj.vv v4, v4, v1\n" /* sgn(im) * sqrt((r - re) / 2) */
+
+ /* TODO handle im == 0 */
+
+ /* Move the result to v0 and v1 */
+ "vmv.v.v v0, v3\n"
+ "vmv.v.v v1, v4\n"
+ );
+ } else {
+ /* Calculate L_ij = (HH_H_ij - sum) / L_jj */
+ __asm__ volatile(
+ /* L_jj */
+ "vle32.v v2, (%0)\n"
+ "vle32.v v3, (%1)\n"
+ /* calculate L_jj_re^2 + L_jj_im^2 -> v4 */
+ "vfmul.vv v4, v2, v2\n"
+ "vfmacc.vv v4, v3, v3\n"
+ /* real part */
+ "vfmul.vv v5, v0, v2\n"
+ "vfmacc.vv v5, v1, v3\n"
+ /* imaginary part */
+ "vfmul.vv v6, v1, v2\n"
+ "vfnmsac.vv v6, v0, v3\n"
+ /* divide and store at v0 and v1 */
+ "vfdiv.vv v0, v5, v4\n"
+ "vfdiv.vv v1, v6, v4\n"
+ :
+ : "r"(&g_L.re[j * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]),
+ "r"(&g_L.im[j * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc])
+ );
}
- off_ijk = i * NUM_TX_ANT * NUM_SC + j * NUM_SC + k;
- off_jk = j * NUM_SC + k;
- t1_re = A->re[off_ijk] - sum_re;
- t1_im = A->im[off_ijk] - sum_im;
- cdiv(
- t1_re, t1_im,
- D->re[off_jk], D->im[off_jk],
- &L->re[off_ijk],
- &L->im[off_ijk]);
- }
- /* D_i = A_ii - \sum_{j=0}^{i-1} L_ij L_ij^* D_j */
- sum_re = sum_im = 0;
- off_ik = i * NUM_SC + k;
- for (j = 0; j != i; ++j) {
- off_ijk = i * NUM_TX_ANT * NUM_SC + j * NUM_SC + k;
- off_jk = j * NUM_SC + k;
- t1_re = L->re[off_ijk];
- t1_im = L->im[off_ijk];
- t2_re = t1_re * t1_re + t1_im * t1_im;
- t2_im = t1_re * t1_im - t1_im * t1_re;
- t1_re = D->re[off_jk];
- t1_im = D->im[off_jk];
- sum_re += t2_re * t1_re - t2_im * t1_im;
- sum_im += t2_re * t1_im + t2_im * t1_re;
+
+ /* Store result */
+ __asm__ volatile(
+ "vse32.v v0, (%0)\n"
+ "vse32.v v1, (%1)\n"
+ :
+ : "r"(&g_L.re[i * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]),
+ "r"(&g_L.im[i * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc])
+ );
+
+ sz -= vl;
+ off_sc += vl;
}
- D->re[off_ik] = A->re[off_iik] - sum_re;
- D->im[off_ik] = A->im[off_iik] - sum_im;
}
- }
}
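
For checking the vectorized loop above, here is a scalar reference of the recurrence from the doc comment, written for a single subcarrier and an n x n matrix (a sketch, with float standing in for data_t; the complex square root uses the same principal-branch formula as the vfsqrt/vfsgnj sequence):

    #include <math.h>
    #include <stddef.h>

    /* A = L * L^H for a Hermitian positive-definite A; row-major re/im planes. */
    static void ccholesky_ref(const float *A_re, const float *A_im,
                              float *L_re, float *L_im, size_t n)
    {
        for (size_t i = 0; i < n; ++i)
            for (size_t j = 0; j <= i; ++j) {
                /* s = A_ij - sum_{k<j} L_ik * conj(L_jk) */
                float s_re = A_re[i*n + j], s_im = A_im[i*n + j];
                for (size_t k = 0; k < j; ++k) {
                    s_re -= L_re[i*n + k] * L_re[j*n + k] + L_im[i*n + k] * L_im[j*n + k];
                    s_im -= L_im[i*n + k] * L_re[j*n + k] - L_re[i*n + k] * L_im[j*n + k];
                }
                if (i == j) {
                    /* L_ii = sqrt(s), principal branch */
                    float r = sqrtf(s_re * s_re + s_im * s_im);
                    L_re[i*n + i] = sqrtf((r + s_re) / 2.0f);
                    L_im[i*n + i] = copysignf(sqrtf((r - s_re) / 2.0f), s_im);
                } else {
                    /* L_ij = s / L_jj, via multiplication by conj(L_jj) / |L_jj|^2 */
                    float d_re = L_re[j*n + j], d_im = L_im[j*n + j];
                    float den  = d_re * d_re + d_im * d_im;
                    L_re[i*n + j] = (s_re * d_re + s_im * d_im) / den;
                    L_im[i*n + j] = (s_im * d_re - s_re * d_im) / den;
                }
            }
    }
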
/** Complex matrix-vector multiplication HHy = HH*y
@@ -357,26 +327,26 @@ void cmatvecmul_TxRx()
size_t off_HH, off_y, off_HHy, off_sc;
size_t sz, vl;
- for (i = 0; i < NUM_TX_ANT; ++i) {
- off_HHy = i * NUM_SC;
+ for (i = 0; i < num_tx_cur; ++i) {
+ off_HHy = i * num_sc_cur;
off_sc = 0;
- sz = NUM_SC;
-
- /* Initialize result registers */
- /* v0 - HHy real part */
- /* v1 - HHy imaginary part */
- __asm__ volatile(
- "vsetvli %0, %1, e32, m1, ta, ma\n"
- "vmv.v.i v0, 0\n"
- "vmv.v.i v1, 0\n"
- : "=r"(vl)
- : "r"(sz)
- );
+ sz = num_sc_cur;
while (sz > 0) {
- for (j = 0; j < NUM_RX_ANT; ++j) {
- off_HH = i * NUM_RX_ANT * NUM_SC + j * NUM_SC + off_sc;
- off_y = j * NUM_SC + off_sc;
+ /* Initialize result registers */
+ /* v0 - HHy real part */
+ /* v1 - HHy imaginary part */
+ __asm__ volatile(
+ "vsetvli %0, %1, e32, m1, ta, ma\n"
+ "vmv.v.i v0, 0\n"
+ "vmv.v.i v1, 0\n"
+ : "=r"(vl)
+ : "r"(sz)
+ );
+
+ for (j = 0; j < num_rx_cur; ++j) {
+ off_HH = i * num_rx_cur * num_sc_cur + j * num_sc_cur + off_sc;
+ off_y = j * num_sc_cur + off_sc;
__asm__ volatile(
"vle32.v v2, (%0)\n"
"vle32.v v3, (%1)\n"
@@ -423,9 +393,9 @@ void cforwardsub_TxTx()
size_t sz, vl;
size_t off_sc;
- for (i = 0; i != NUM_TX_ANT; ++i) {
+ for (i = 0; i < num_tx_cur; ++i) {
off_sc = 0;
- sz = NUM_SC;
+ sz = num_sc_cur;
while (sz > 0) {
// printf("sz: %lu\n", sz);
@@ -441,8 +411,8 @@ void cforwardsub_TxTx()
"vle32.v v0, (%0)\n"
"vle32.v v1, (%1)\n"
:
- : "r"(&g_HHy.re[i * NUM_SC + off_sc]),
- "r"(&g_HHy.im[i * NUM_SC + off_sc]));
+ : "r"(&g_HHy.re[i * num_sc_cur + off_sc]),
+ "r"(&g_HHy.im[i * num_sc_cur + off_sc]));
// printf("vl: %lu\n", vl);
for (j = 0; j != i; ++j) {
@@ -463,10 +433,10 @@ void cforwardsub_TxTx()
"vfnmsac.vv v1, v3, v4\n"
"vfnmsac.vv v1, v2, v5\n"
:
- : "r"(&g_L.re[i * NUM_TX_ANT * NUM_SC + j * NUM_SC + off_sc]),
- "r"(&g_L.im[i * NUM_TX_ANT * NUM_SC + j * NUM_SC + off_sc]),
- "r"(&g_z.re[j * NUM_SC + off_sc]),
- "r"(&g_z.im[j * NUM_SC + off_sc])
+ : "r"(&g_L.re[i * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]),
+ "r"(&g_L.im[i * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]),
+ "r"(&g_z.re[j * num_sc_cur + off_sc]),
+ "r"(&g_z.im[j * num_sc_cur + off_sc])
);
}
@@ -491,10 +461,10 @@ void cforwardsub_TxTx()
"vse32.v v0, (%2)\n"
"vse32.v v1, (%3)\n"
:
- : "r"(&g_L.re[i * NUM_TX_ANT * NUM_SC + i * NUM_SC + off_sc]),
- "r"(&g_L.im[i * NUM_TX_ANT * NUM_SC + i * NUM_SC + off_sc]),
- "r"(&g_z.re[i * NUM_SC + off_sc]),
- "r"(&g_z.im[i * NUM_SC + off_sc])
+ : "r"(&g_L.re[i * num_tx_cur * num_sc_cur + i * num_sc_cur + off_sc]),
+ "r"(&g_L.im[i * num_tx_cur * num_sc_cur + i * num_sc_cur + off_sc]),
+ "r"(&g_z.re[i * num_sc_cur + off_sc]),
+ "r"(&g_z.im[i * num_sc_cur + off_sc])
);
off_sc += vl;
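
Written out, cforwardsub_TxTx solves L z = HHy per subcarrier by forward substitution, with the final division by L_ii expanded into a multiply by the conjugate over the squared magnitude, as in the off-diagonal Cholesky step:

    z_i = (HHy_i - \sum_{j=0}^{i-1} L_ij z_j) / L_ii
        = (HHy_i - \sum_{j<i} L_ij z_j) L_ii^* / (L_ii_re^2 + L_ii_im^2)
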
@@ -517,9 +487,9 @@ void cbackwardsub_TxTx()
size_t sz, vl;
size_t off_sc;
- for (i = NUM_TX_ANT - 1; i != 0; --i) {
+ for (i = num_tx_cur - 1; i != (size_t)-1; --i) {
off_sc = 0;
- sz = NUM_SC;
+ sz = num_sc_cur;
while (sz > 0){
/* Initialize result registers as z */
@@ -533,10 +503,10 @@ void cbackwardsub_TxTx()
"vle32.v v0, (%0)\n"
"vle32.v v1, (%1)\n"
:
- : "r"(&g_z.re[i * NUM_SC + off_sc]),
- "r"(&g_z.im[i * NUM_SC + off_sc]));
+ : "r"(&g_z.re[i * num_sc_cur + off_sc]),
+ "r"(&g_z.im[i * num_sc_cur + off_sc]));
- for (j = i + 1; j < NUM_TX_ANT; ++j) {
+ for (j = i + 1; j < num_tx_cur; ++j) {
/* b - sum L_ji * z_j */
/* v2 - L real part */
/* v3 - L imaginary part */
@@ -554,10 +524,10 @@ void cbackwardsub_TxTx()
"vfnmsac.vv v1, v3, v4\n"
"vfnmsac.vv v1, v2, v5\n"
:
- : "r"(&g_L.re[j * NUM_TX_ANT * NUM_SC + i * NUM_SC + off_sc]),
- "r"(&g_L.im[j * NUM_TX_ANT * NUM_SC + i * NUM_SC + off_sc]),
- "r"(&g_x_MMSE.re[j * NUM_SC + off_sc]),
- "r"(&g_x_MMSE.im[j * NUM_SC + off_sc])
+ : "r"(&g_L.re[j * num_tx_cur * num_sc_cur + i * num_sc_cur + off_sc]),
+ "r"(&g_L.im[j * num_tx_cur * num_sc_cur + i * num_sc_cur + off_sc]),
+ "r"(&g_x_MMSE.re[j * num_sc_cur + off_sc]),
+ "r"(&g_x_MMSE.im[j * num_sc_cur + off_sc])
);
}
@@ -582,10 +552,10 @@ void cbackwardsub_TxTx()
"vse32.v v0, (%2)\n"
"vse32.v v1, (%3)\n"
:
- : "r"(&g_L.re[i * NUM_TX_ANT * NUM_SC + i * NUM_SC + off_sc]),
- "r"(&g_L.im[i * NUM_TX_ANT * NUM_SC + i * NUM_SC + off_sc]),
- "r"(&g_x_MMSE.re[i * NUM_SC + off_sc]),
- "r"(&g_x_MMSE.im[i * NUM_SC + off_sc])
+ : "r"(&g_L.re[i * num_tx_cur * num_sc_cur + i * num_sc_cur + off_sc]),
+ "r"(&g_L.im[i * num_tx_cur * num_sc_cur + i * num_sc_cur + off_sc]),
+ "r"(&g_x_MMSE.re[i * num_sc_cur + off_sc]),
+ "r"(&g_x_MMSE.im[i * num_sc_cur + off_sc])
);
sz -= vl;
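
cbackwardsub_TxTx then runs backward substitution over rows i = num_tx_cur-1 down to 0 (hence the unsigned wrap-around test against (size_t)-1), reading g_L at index [j][i], i.e. column i of L. In exact arithmetic the upper-triangular system L^H x_MMSE = z it targets satisfies:

    x_i = (z_i - \sum_{j=i+1}^{num_tx-1} L_ji^* x_j) / L_ii^*
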
@@ -626,8 +596,8 @@ acc_t mse()
acc_t sum = 0.;
size_t off = 0;
data_t sub1, sub2;
- size_t sz = NUM_TX_ANT * NUM_SC, vl;
- register data_t num_tx_num_sc_reg __asm__("f0") = (data_t)(NUM_TX_ANT * NUM_SC);
+ size_t sz = num_tx_cur * num_sc_cur, vl;
+ register data_t num_tx_num_sc_reg __asm__("f0") = (data_t)(num_tx_cur * num_sc_cur);
while (sz > 0) {
__asm__ volatile (