commit a5854e1a8f800fb018e07037e74f77536fefefdd
parent 006841ff46ec83e606d11a7f4ec1d0eb9414c11b
Author: Egor Achkasov <eaachkasov@gmail.com>
Date: Fri, 23 May 2025 01:42:32 +0200
Reimplement the rvv solutions lost in merge
Diffstat:
| M | src/cbackwardsub.c | | | 136 | +++++++++++++++++++++++++++++++++++++++++++++---------------------------------- |
| M | src/ccholesky.c | | | 205 | ++++++++++++++++++++++++++++++++++++++++--------------------------------------- |
| M | src/cforwardsub.c | | | 125 | +++++++++++++++++++++++++++++++++++++++++-------------------------------------- |
| M | src/cmatgram.c | | | 174 | +++++++++++++++++++++++++++++++++++++++++++++++-------------------------------- |
| M | src/cmatvecmul.c | | | 68 | +++++++++++++++++++++++++++++++++++++++++--------------------------- |
5 files changed, 392 insertions(+), 316 deletions(-)
diff --git a/src/cbackwardsub.c b/src/cbackwardsub.c
@@ -4,8 +4,8 @@
/** Complex backward substitution L^H*x_MMSE = z
*
- * x_MMSE_t = (z_t - \sum_{tt=t+1}^{NUM_TX-1} L_{tt t} x_tt) / L_{t t} (for LL / float solution)
- * x_MMSE_t = (z_t / D_t - \sum_{tt=t+1}^{NUM_TX-1} L_{tt t} x_tt) (for LDL / fixed solution)
+ * x_{MMSE}_t = (z_t - \sum_{tt=t+1}^{NUM_TX-1} L_{tt t} x_{MMSE}_{tt}) / L_{t t} (for LL / float solution)
+ * x_{MMSE}_t = (z_t / D_t - \sum_{tt=t+1}^{NUM_TX-1} L_{tt t} x_{MMSE}_{tt}) (for LDL / fixed solution)
*
* \global g_L lower triangular matrix. Shape [NUM_TX][NUM_TX][NUM_SC]
* \global g_D diagonal matrix (only if DATA_TYPE_fixed is defined). Shape [NUM_TX][NUM_SC]
@@ -14,10 +14,12 @@
*/
void cbackwardsub()
{
-#if defined(ARCH_x86) || defined(ARCH_rv)
size_t t, tt, s;
size_t off_L, off_z, off_x_MMSE, off_D;
+
+#if defined(ARCH_x86) || defined(ARCH_rv)
acc_t sum_re, sum_im;
+
for (t = NUM_TX - 1; t != (size_t)-1; --t) {
for (s = 0; s < NUM_SC; ++s) {
sum_re = sum_im = 0;
@@ -46,86 +48,102 @@ void cbackwardsub()
#endif
}
}
+
#elif defined(ARCH_rvv)
- size_t i, j;
size_t sz, vl;
- size_t off_sc;
- for (i = NUM_TX - 1; i != (size_t)-1; --i) {
- off_sc = 0;
+ for (t = NUM_TX - 1; t != (size_t)-1; --t) {
sz = NUM_SC;
+ s = 0;
while (sz > 0){
- /* Initialize result registers as z */
- /* v0 - result real part */
- /* v1 - result imaginary part */
+ /* Initialize x_MMSE as z */
+ /* v0 - x_MMSE real part */
+ /* v4 - x_MMSE imaginary part */
+ off_z = t * NUM_SC + s;
__asm__ volatile(
- "vsetvli %0, %1, e32, m1, ta, ma\n"
+ "vsetvli %0, %1, e32, m4, ta, ma\n"
+ "vle32.v v0, (%2)\n"
+ "vle32.v v4, (%3)\n"
: "=r"(vl)
- : "r"(sz));
- __asm__ volatile(
- "vle32.v v0, (%0)\n"
- "vle32.v v1, (%1)\n"
+ : "r"(sz),
+ "r"(&g_z.re[off_z]), "r"(&g_z.im[off_z])
+ );
+
+#if defined(DATA_TYPE_fixed)
+ /* Divide by D_t */
+ __asm__ volatile (
+ "vle32.v v8, (%0)\n"
+ "vfdiv.vv v0, v0, v8\n"
+ "vfdiv.vv v4, v4, v8\n"
:
- : "r"(&g_z.re[i * NUM_SC + off_sc]),
- "r"(&g_z.im[i * NUM_SC + off_sc]));
+ : "r"(&g_D[t * NUM_SC + s])
+ );
+#endif
- for (j = i + 1; j < NUM_TX; ++j) {
- /* b - sum L_ji * z_j */
- /* v2 - L real part */
- /* v3 - L imaginary part */
- /* v4 - x_MMSE_j real part */
- /* v5 - x_MMSE_j imaginary part */
+ for (tt = t + 1; tt < NUM_TX; ++tt) {
+ /* z - sum L_{tt t} * x_{MMSE}_{tt} or
+ * z / D_t - sum L_{tt t} * x_{MMSE}_{tt} */
+ off_L = tt * NUM_TX * NUM_SC + t * NUM_SC + s;
+ off_x_MMSE = tt * NUM_SC + s;
__asm__ volatile(
- "vle32.v v2, (%0)\n"
- "vle32.v v3, (%1)\n"
- "vle32.v v4, (%2)\n"
- "vle32.v v5, (%3)\n"
+ "vle32.v v8, (%0)\n"
+ "vle32.v v12, (%1)\n"
+ "vle32.v v16, (%2)\n"
+ "vle32.v v20, (%3)\n"
+#if defined(DATA_TYPE_float)
/* real part */
- "vfnmsac.vv v0, v2, v4\n"
- "vfmacc.vv v0, v3, v5\n"
+ "vfnmsac.vv v0, v8, v16\n"
+ "vfmacc.vv v0, v12, v20\n"
/* imaginary part */
- "vfnmsac.vv v1, v3, v4\n"
- "vfnmsac.vv v1, v2, v5\n"
+ "vfnmsac.vv v4, v12, v16\n"
+ "vfnmsac.vv v4, v8, v20\n"
+#elif defined(DATA_TYPE_fixed)
+ /* real part */
+ "vsmul.vv v24, v8, v16\n"
+ "vssub.vv v0, v0, v24\n"
+ "vsmul.vv v24, v12, v20\n"
+ "vsadd.vv v0, v0, v24\n"
+ /* imaginary part */
+ "vsmul.vv v24, v12, v16\n"
+ "vssub.vv v4, v4, v24\n"
+ "vsmul.vv v24, v8, v20\n"
+ "vssub.vv v4, v4, v24\n"
+#else
+#error "Unknown data type"
+#endif
:
- : "r"(&g_L.re[j * NUM_TX * NUM_SC + i * NUM_SC + off_sc]),
- "r"(&g_L.im[j * NUM_TX * NUM_SC + i * NUM_SC + off_sc]),
- "r"(&g_x_MMSE.re[j * NUM_SC + off_sc]),
- "r"(&g_x_MMSE.im[j * NUM_SC + off_sc])
+ : "r"(&g_L.re[off_L]), "r"(&g_L.im[off_L]),
+ "r"(&g_x_MMSE.re[off_x_MMSE]), "r"(&g_x_MMSE.im[off_x_MMSE])
);
}
+#if defined(DATA_TYPE_float)
/* Divide by L_ii */
- /* v2 - L_ii real part */
- /* v3 - L_ii imaginary part */
+ off_L = t * NUM_TX * NUM_SC + t * NUM_SC + s;
__asm__ volatile (
- "vle32.v v2, (%0)\n"
- "vle32.v v3, (%1)\n"
- /* calculate L_ii_re^2 + L_ii_im^2 -> v4 */
- "vfmul.vv v4, v2, v2\n"
- "vfmacc.vv v4, v3, v3\n"
- /* real part */
- "vfmul.vv v5, v0, v2\n"
- "vfmacc.vv v5, v1, v3\n"
- "vdiv.vv v0, v5, v4\n"
- /* imaginary part */
- "vfmul.vv v6, v1, v2\n"
- "vfnmsac.vv v6, v0, v3\n"
- "vfdiv.vv v1, v6, v4\n"
- /* store HH_H */
- "vse32.v v0, (%2)\n"
- "vse32.v v1, (%3)\n"
- :
- : "r"(&g_L.re[i * NUM_TX * NUM_SC + i * NUM_SC + off_sc]),
- "r"(&g_L.im[i * NUM_TX * NUM_SC + i * NUM_SC + off_sc]),
- "r"(&g_x_MMSE.re[i * NUM_SC + off_sc]),
- "r"(&g_x_MMSE.im[i * NUM_SC + off_sc])
+ "vle32.v v8, (%0)\n"
+ "vfdiv.vv v0, v0, v8\n"
+ "vfdiv.vv v4, v4, v8\n"
+ :
+ : "r"(&g_L.re[off_L])
+ );
+#endif
+
+ /* Store result */
+ off_x_MMSE = t * NUM_SC + s;
+ __asm__ volatile(
+ "vse32.v v0, (%0)\n"
+ "vse32.v v4, (%1)\n"
+ :
+ : "r"(&g_x_MMSE.re[off_x_MMSE]), "r"(&g_x_MMSE.im[off_x_MMSE])
);
sz -= vl;
- off_sc += vl;
+ s += vl;
}
}
+
#else
#error "Unknown architecture"
#endif
diff --git a/src/ccholesky.c b/src/ccholesky.c
@@ -20,13 +20,15 @@
*/
void ccholesky()
{
-#if defined(ARCH_x86) || defined(ARCH_rv)
size_t i, j, k, s;
size_t off_ij, off_jj, off_ii;
size_t off_ik, off_jk;
size_t off_i, off_j, off_k;
+
+#if defined(ARCH_x86) || defined(ARCH_rv)
data_t tmp; /* Temporary variable for sqrt */
acc_t sum_re, sum_im;
+
for (i = 0; i < NUM_TX; ++i) {
for (j = 0; j <= i; ++j) {
for (s = 0; s < NUM_SC; ++s) {
@@ -82,6 +84,7 @@ void ccholesky()
#else
#error "Unknown data type"
#endif
+
} else { /* i != j */
#if defined(DATA_TYPE_float)
/* Calculate L_ij = (G_ij - sum) / L_jj */
@@ -91,13 +94,9 @@ void ccholesky()
#elif defined(DATA_TYPE_fixed)
/* Calculate L_ij = (G_ij - sum) / D_j */
off_j = j * NUM_SC + s;
- /* real */
sum_re = ((acc_t)g_G.re[off_ij] << FP_Q) - sum_re;
- /* TODO roubding? */
g_L.re[off_ij] = (data_t)(sum_re / (acc_t)g_D[off_j]);
- /* imaginary */
sum_im = ((acc_t)g_G.im[off_ij] << FP_Q) - sum_im;
- /* TODO roubding? */
g_L.im[off_ij] = (data_t)(sum_im / (acc_t)g_D[off_j]);
#else
#error "Unknown data type"
@@ -107,127 +106,133 @@ void ccholesky()
}
}
#elif defined(ARCH_rvv)
- size_t i, j, k;
size_t sz, vl;
- size_t off_sc;
-
- /* Init float registers */
- register float f0 __asm__("f0") = 2.0f;
for (i = 0; i < NUM_TX; ++i)
for (j = 0; j <= i; ++j) {
sz = NUM_SC;
- off_sc = 0;
+ s = 0;
while (sz > 0) {
- /* Initialize L registers */
- /* v0 - L_ij real part = G_ij.re */
- /* v1 - L_ij imaginary part = G_ij.im */
- __asm__ volatile(
- "vsetvli %0, %1, e32, m1, ta, ma\n"
- : "=r"(vl) : "r"(sz));
+ /* Calculate
+ * sum_{k=0}^{j-1} L_ik L_jk^* or
+ * sum_{k=0}^{j-1} L_ik D_k L_jk^*
+ *
+ * v0 - sum real part
+ * v4 - sum imaginary part */
__asm__ volatile(
- "vle32.v v0, (%0)\n"
- "vle32.v v1, (%1)\n"
- :
- : "r"(&g_G.re[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]),
- "r"(&g_G.im[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc])
+ "vsetvli %0, %1, e32, m4, ta, ma\n"
+ "vmv.v.i v0, 0\n"
+ "vmv.v.i v4, 0\n"
+ : "=r"(vl) : "r"(sz)
);
- /* Calculate sum_{k=0}^{j-1} L_ik L_jk^* */
- /* v2 - sum real part */
- /* v3 - sum imaginary part */
- __asm__ volatile(
- "vmv.v.i v2, 0\n"
- "vmv.v.i v3, 0\n"
- );
for (k = 0; k < j; ++k) {
+ off_ik = i * NUM_TX * NUM_SC + k * NUM_SC + s;
+ off_jk = j * NUM_TX * NUM_SC + k * NUM_SC + s;
__asm__ volatile(
- "vle32.v v4, (%0)\n"
- "vle32.v v5, (%1)\n"
- "vle32.v v6, (%2)\n"
- "vle32.v v7, (%3)\n"
+ "vle32.v v8, (%0)\n"
+ "vle32.v v12, (%1)\n"
+ "vle32.v v16, (%2)\n"
+ "vle32.v v20, (%3)\n"
+#if defined(DATA_TYPE_float)
/* real part */
- "vfmacc.vv v2, v4, v6\n"
- "vfmacc.vv v2, v5, v7\n"
+ "vfmacc.vv v0, v8, v16\n"
+ "vfmacc.vv v0, v12, v20\n"
/* imaginary part */
- "vfmacc.vv v3, v5, v6\n"
- "vfnmsac.vv v3, v4, v7\n"
+ "vfmacc.vv v4, v12, v16\n"
+ "vfnmsac.vv v4, v8, v20\n"
+#elif defined(DATA_TYPE_fixed)
+ "vle32.v v24, (%4)\n"
+ /* real part */
+ "vsmul.vv v28, v8, v24\n"
+ "vsmul.vv v28, v28, v16\n"
+ "vsadd.vv v0, v0, v28\n"
+ "vsmul.vv v28, v12, v24\n"
+ "vsmul.vv v28, v28, v20\n"
+ "vsadd.vv v0, v0, v28\n"
+ /* imaginary part */
+ "vsmul.vv v28, v12, v16\n"
+ "vsmul.vv v28, v28, v20\n"
+ "vsadd.vv v4, v4, v28\n"
+ "vsmul.vv v28, v8, v24\n"
+ "vsmul.vv v28, v28, v20\n"
+ "vssub.vv v4, v4, v28\n"
+#else
+#error "Unknown data type"
+#endif
:
- : "r"(&g_L.re[i * NUM_TX * NUM_SC + k * NUM_SC + off_sc]),
- "r"(&g_L.im[i * NUM_TX * NUM_SC + k * NUM_SC + off_sc]),
- "r"(&g_L.re[j * NUM_TX * NUM_SC + k * NUM_SC + off_sc]),
- "r"(&g_L.im[j * NUM_TX * NUM_SC + k * NUM_SC + off_sc])
+ : "r"(&g_L.re[off_ik]), "r"(&g_L.im[off_ik]),
+ "r"(&g_L.re[off_jk]), "r"(&g_L.im[off_jk])
+#if defined(DATA_TYPE_fixed)
+ , "r"(&g_D[k * NUM_SC + s])
+#endif
);
}
- /* G_ii - sum */
- __asm__ volatile(
- "vfsub.vv v0, v0, v2\n"
- "vfsub.vv v1, v1, v3\n"
- );
-
if (i == j) {
- /* Calculate L_ii = sqrt(G_ii - sum_{k=0}^{i-1} L_ik L_ik^*) */
+ off_ii = i * NUM_TX * NUM_SC + i * NUM_SC + s;
+ /* L_ii = sqrt(G_ii - sum) or D_i = (G_ii - sum) */
+ /* G_ii imaginary part is 0, so we can ignore it */
__asm__ volatile(
- /* Complex sqrt */
-
- /* v2 = r = sqrt(re^2 + im^2) */
- "vfmul.vv v2, v0, v0\n"
- "vfmacc.vv v2, v1, v1\n"
- "vfsqrt.v v2, v2\n"
-
- /* v3 - real part */
- "vfadd.vv v3, v2, v0\n" /* r + re */
- "vfdiv.vf v3, v3, f0\n" /* (r + re) / 2 */
- "vfsqrt.v v3, v3\n" /* sqrt((r + re) / 2) */
- /* v4 - imaginary part */
- "vfsub.vv v4, v2, v0\n" /* r - re */
- "vfdiv.vf v4, v4, f0\n" /* (r - re) / 2 */
- "vfsqrt.v v4, v4\n" /* sqrt((r - re) / 2) */
- "vfsgnj.vv v4, v4, v1\n" /* sgn(im) * sqrt((r - re) / 2) */
-
- /* TODO handle im == 0 */
-
- /* Move the result to v0 and v1 */
- "vmv.v.v v0, v3\n"
- "vmv.v.v v1, v4\n"
+ "vle32.v v8, (%0)\n"
+#if defined(DATA_TYPE_float)
+ "vfsub.vv v0, v8, v0\n"
+ "vfsqrt.v v0, v0\n"
+ "vfneg.v v4, v4\n"
+#elif defined(DATA_TYPE_fixed)
+ "vssub.vv v0, v8, v0\n"
+ "vneg.v v4, v4\n"
+#else
+#error "Unknown data type"
+#endif
+ "vse32.v v0, (%1)\n"
+ : : "r"(&g_G.re[off_ii]),
+#if defined(DATA_TYPE_float)
+ "r"(&g_L.re[off_ii])
+#elif defined(DATA_TYPE_fixed)
+ "r"(&g_D[i * NUM_SC + s])
+#else
+#error "Unknown data type"
+#endif
);
- } else {
- /* Calculate L_ij = (G_ij - sum) / L_jj */
+ } else { /* i != j */
+ off_ij = i * NUM_TX * NUM_SC + j * NUM_SC + s;
+ off_jj = j * NUM_TX * NUM_SC + j * NUM_SC + s;
+ off_j = j * NUM_SC + s;
+ /* Calculate L_ij = (G_ij - sum) / L_jj or L_ij = (G_ij - sum) / D_j */
+ /* L_jj is always real, so we can ignore the imaginary part */
__asm__ volatile(
- /* L_jj */
- "vle32.v v2, (%0)\n"
- "vle32.v v3, (%1)\n"
- /* calculate L_jj_re^2 + L_jj_im^2 -> v4 */
- "vfmul.vv v4, v2, v2\n"
- "vfmacc.vv v4, v3, v3\n"
- /* real part */
- "vfmul.vv v5, v0, v2\n"
- "vfmacc.vv v5, v1, v3\n"
- /* imaginary part */
- "vfmul.vv v6, v1, v2\n"
- "vfnmsac.vv v6, v0, v3\n"
- /* divide and store at v0 and v1 */
- "vfdiv.vv v0, v5, v4\n"
- "vfdiv.vv v1, v6, v4\n"
- :
- : "r"(&g_L.re[j * NUM_TX * NUM_SC + j * NUM_SC + off_sc]),
- "r"(&g_L.im[j * NUM_TX * NUM_SC + j * NUM_SC + off_sc])
+ "vle32.v v8, (%0)\n" /* G_ij.re */
+ "vle32.v v12, (%1)\n" /* G_ij.im */
+ "vle32.v v16, (%2)\n" /* L_jj or D_j */
+#if defined(DATA_TYPE_float)
+ "vfsub.vv v0, v8, v0\n"
+ "vfsub.vv v4, v12, v4\n"
+ "vfdiv.vv v0, v0, v16\n"
+ "vfdiv.vv v4, v4, v16\n"
+#elif defined(DATA_TYPE_fixed)
+ "vssub.vv v0, v8, v0\n"
+ "vssub.vv v4, v12, v4\n"
+ "vdiv.vv v0, v0, v16\n"
+ "vdiv.vv v4, v4, v16\n"
+ "vsll.vi v0, v0, 1\n"
+ "vsll.vi v4, v4, 1\n"
+#else
+#error "Unknown data type"
+#endif
+ /* Store result */
+ "vse32.v v0, (%3)\n"
+ "vse32.v v4, (%4)\n"
+ :
+ : "r"(&g_G.re[off_ij]), "r"(&g_G.im[off_ij]),
+ "r"(&g_L.re[off_jj]),
+ "r"(&g_L.re[off_ij]), "r"(&g_L.im[off_ij])
);
}
- /* Store result */
- __asm__ volatile(
- "vse32.v v0, (%0)\n"
- "vse32.v v1, (%1)\n"
- :
- : "r"(&g_L.re[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]),
- "r"(&g_L.im[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc])
- );
-
sz -= vl;
- off_sc += vl;
+ s += vl;
}
}
#else
diff --git a/src/cforwardsub.c b/src/cforwardsub.c
@@ -13,10 +13,12 @@
*/
void cforwardsub()
{
-#if defined(ARCH_x86) || defined(ARCH_rv)
size_t t, tt, s;
size_t off_L, off_HHy, off_z;
+
+#if defined(ARCH_x86) || defined(ARCH_rv)
acc_t sum_re, sum_im;
+
for (t = 0; t < NUM_TX; ++t) {
for (s = 0; s < NUM_SC; ++s) {
sum_re = sum_im = 0;
@@ -43,88 +45,91 @@ void cforwardsub()
}
}
#elif defined(ARCH_rvv)
- size_t i, j;
size_t sz, vl;
- size_t off_sc;
- for (i = 0; i < NUM_TX; ++i) {
- off_sc = 0;
+ for (t = 0; t < NUM_TX; ++t) {
sz = NUM_SC;
+ s = 0;
while (sz > 0) {
- // printf("sz: %lu\n", sz);
- // printf("vl: %lu\n", vl);
- /* Initialize result registers as b */
- /* v0 - result real part */
- /* v1 - result imaginary part */
+ off_HHy = t * NUM_SC + s;
+
+ /* Initialize z as HHy */
+ /* v0 - z real part */
+ /* v4 - z imaginary part */
__asm__ volatile (
- "vsetvli %0, %1, e32, m1, ta, ma\n"
+ "vsetvli %0, %1, e32, m4, ta, ma\n"
+ "vle32.v v0, (%2)\n"
+ "vle32.v v4, (%3)\n"
: "=r"(vl)
- : "r"(sz));
- __asm__ volatile (
- "vle32.v v0, (%0)\n"
- "vle32.v v1, (%1)\n"
- :
- : "r"(&g_HHy.re[i * NUM_SC + off_sc]),
- "r"(&g_HHy.im[i * NUM_SC + off_sc]));
- // printf("vl: %lu\n", vl);
+ : "r"(sz),
+ "r"(&g_HHy.re[off_HHy]), "r"(&g_HHy.im[off_HHy])
+ );
- for (j = 0; j != i; ++j) {
- /* b - sum L_ij * z_j */
- /* v2 - L real part */
- /* v3 - L imaginary part */
- /* v4 - result_j real part */
- /* v5 - result_j imaginary part */
+ for (tt = 0; tt != t; ++tt) {
+ off_L = t * NUM_TX * NUM_SC + tt * NUM_SC + s;
+ off_z = tt * NUM_SC + s;
+
+ /* HHy_t - sum L_{t tt} * z_{tt} */
__asm__ volatile (
- "vle32.v v2, (%0)\n"
- "vle32.v v3, (%1)\n"
- "vle32.v v4, (%2)\n"
- "vle32.v v5, (%3)\n"
+ "vle32.v v8, (%0)\n"
+ "vle32.v v12, (%1)\n"
+ "vle32.v v16, (%2)\n"
+ "vle32.v v20, (%3)\n"
+#if defined(DATA_TYPE_float)
/* real part */
- "vfnmsac.vv v0, v2, v4\n"
- "vfmacc.vv v0, v3, v5\n"
+ "vfnmsac.vv v0, v8, v16\n"
+ "vfmacc.vv v0, v12, v20\n"
/* imaginary part */
- "vfnmsac.vv v1, v3, v4\n"
- "vfnmsac.vv v1, v2, v5\n"
+ "vfnmsac.vv v4, v12, v16\n"
+ "vfnmsac.vv v4, v8, v20\n"
+#elif defined(DATA_TYPE_fixed)
+ /* real part */
+ "vsmul.vv v24, v8, v16\n"
+ "vssub.vv v0, v0, v24\n"
+ "vsmul.vv v24, v12, v20\n"
+ "vsadd.vv v0, v0, v24\n"
+ /* imaginary part */
+ "vsmul.vv v24, v12, v16\n"
+ "vssub.vv v4, v4, v24\n"
+ "vsmul.vv v24, v8, v20\n"
+ "vssub.vv v4, v4, v24\n"
+#else
+#error "Unknown data type"
+#endif
:
- : "r"(&g_L.re[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]),
- "r"(&g_L.im[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]),
- "r"(&g_z.re[j * NUM_SC + off_sc]),
- "r"(&g_z.im[j * NUM_SC + off_sc])
+ : "r"(&g_L.re[off_L]), "r"(&g_L.im[off_L]),
+ "r"(&g_z.re[off_z]), "r"(&g_z.im[off_z])
);
}
+#if defined(DATA_TYPE_float)
/* Divide by L_ii */
- /* v2 - L_ii real part */
- /* v3 - L_ii imaginary part */
+ /* L_ii imaginary part is zero, so we ignore it */
+ off_L = t * NUM_TX * NUM_SC + t * NUM_SC + s;
__asm__ volatile (
- "vle32.v v2, (%0)\n"
- "vle32.v v3, (%1)\n"
- /* calculate L_ii_re^2 + L_ii_im^2 -> v4 */
- "vfmul.vv v4, v2, v2\n"
- "vfmacc.vv v4, v3, v3\n"
- /* real part */
- "vfmul.vv v5, v0, v2\n"
- "vfmacc.vv v5, v1, v3\n"
- "vdiv.vv v0, v5, v4\n"
- /* imaginary part */
- "vfmul.vv v6, v1, v2\n"
- "vfnmsac.vv v6, v0, v3\n"
- "vfdiv.vv v1, v6, v4\n"
- /* store result */
- "vse32.v v0, (%2)\n"
- "vse32.v v1, (%3)\n"
+ "vle32.v v8, (%0)\n"
+ "vfdiv.vv v0, v0, v8\n"
+ "vfdiv.vv v4, v4, v8\n"
:
- : "r"(&g_L.re[i * NUM_TX * NUM_SC + i * NUM_SC + off_sc]),
- "r"(&g_L.im[i * NUM_TX * NUM_SC + i * NUM_SC + off_sc]),
- "r"(&g_z.re[i * NUM_SC + off_sc]),
- "r"(&g_z.im[i * NUM_SC + off_sc])
+ : "r"(&g_L.re[off_L])
);
+#endif
- off_sc += vl;
+ /* Store result */
+ off_z = t * NUM_SC + s;
+ __asm__ volatile (
+ "vse32.v v0, (%0)\n"
+ "vse32.v v4, (%1)\n"
+ :
+ : "r"(&g_z.re[off_z]), "r"(&g_z.im[off_z])
+ );
+
+ s += vl;
sz -= vl;
}
}
+
#else
#error "Unknown architecture"
#endif
diff --git a/src/cmatgram.c b/src/cmatgram.c
@@ -13,118 +13,152 @@
*/
void cmatgram()
{
-#if defined(ARCH_x86) || defined(ARCH_rv)
size_t r, t1, t2, s;
- size_t off_G, off_R, off_H1, off_H2;
+ size_t off_G, off_GH, off_H1, off_H2;
+
+#if defined(ARCH_x86) || defined(ARCH_rv)
+ acc_t sum_re, sum_im;
+
for (t1 = 0; t1 < NUM_TX; ++t1) {
- for (t2 = 0; t2 < NUM_TX; ++t2) {
+ for (t2 = 0; t2 <= t1; ++t2) { /* t2 <= t1 since G is hermitian and R is symmetric */
for (s = 0; s < NUM_SC; ++s) {
- off_R = off_G = t1 * NUM_TX * NUM_SC + t2 * NUM_SC + s;
- g_G.re[off_G] = g_R.re[off_R];
- g_G.im[off_G] = g_R.im[off_R];
+ /* Calculate the sum */
+ sum_re = sum_im = 0;
for (r = 0; r < NUM_RX; ++r) {
off_H1 = r * NUM_TX * NUM_SC + t1 * NUM_SC + s;
off_H2 = r * NUM_TX * NUM_SC + t2 * NUM_SC + s;
- g_G.re[off_G] += g_H.re[off_H1] * g_H.re[off_H2]
- + g_H.im[off_H1] * g_H.im[off_H2];
- g_G.im[off_G] += g_H.re[off_H1] * g_H.im[off_H2]
- - g_H.im[off_H1] * g_H.re[off_H2];
+ sum_re += (acc_t)g_H.re[off_H1] * (acc_t)g_H.re[off_H2]
+ + (acc_t)g_H.im[off_H1] * (acc_t)g_H.im[off_H2];
+ sum_im += (acc_t)g_H.re[off_H1] * (acc_t)g_H.im[off_H2]
+ - (acc_t)g_H.im[off_H1] * (acc_t)g_H.re[off_H2];
+ }
+
+ /* Add R */
+ off_G = t1 * NUM_TX * NUM_SC + t2 * NUM_SC + s;
+#if defined(DATA_TYPE_float)
+ g_G.re[off_G] = (data_t)sum_re + g_R.re[off_G];
+ g_G.im[off_G] = (data_t)sum_im + g_R.im[off_G];
+#elif defined(DATA_TYPE_fixed)
+ g_G.re[off_G] = (data_t)(sum_re >> FP_Q) + g_R.re[off_G];
+ g_G.im[off_G] = (data_t)(sum_im >> FP_Q) + g_R.im[off_G];
+#else
+#error "Unknown data type"
+#endif
+
+ /* Fill the upper triangle */
+ /* G_{t2t1} = G_{t1t2}^* */
+ if (t1 != t2) {
+ off_GH = t2 * NUM_TX * NUM_SC + t1 * NUM_SC + s;
+ g_G.re[off_GH] = g_G.re[off_G];
+ g_G.im[off_GH] = -g_G.im[off_G];
}
}
}
}
+
#elif defined(ARCH_rvv)
- size_t t1, t2, r;
size_t sz, vl;
- size_t off_sc, off_A, off_AH;
- size_t off_G_L, off_G_U;
- data_t *A_re, *A_im, *AH_re, *AH_im;
- data_t *R_L_re, *R_L_im, *R_U_re, *R_U_im;
- data_t *G_L_re, *G_L_im, *G_U_re, *G_U_im;
for (t1 = 0; t1 != NUM_TX; ++t1)
- for (t2 = t1; t2 != NUM_TX; ++t2) {
- off_sc = 0;
- off_G_L = t1 * NUM_TX * NUM_SC + t2 * NUM_SC;
- off_G_U = t2 * NUM_TX * NUM_SC + t1 * NUM_SC;
- G_L_re = &g_G.re[off_G_L];
- G_L_im = &g_G.im[off_G_L];
- G_U_re = &g_G.re[off_G_U];
- G_U_im = &g_G.im[off_G_U];
+ for (t2 = t1; t2 != NUM_TX; ++t2) {
sz = NUM_SC;
+ s = 0;
while (sz > 0) {
/* Initialize G registers */
/* v0 - G real part */
- /* v1 - G imaginary part */
+ /* v4 - G imaginary part */
__asm__ volatile(
- "vsetvli %0, %1, e32, m1, ta, ma\n"
+ "vsetvli %0, %1, e32, m4, ta, ma\n"
"vmv.v.i v0, 0\n"
- "vmv.v.i v1, 0\n"
- : "=r"(vl) : "r"(sz));
+ "vmv.v.i v4, 0\n"
+ : "=r"(vl) : "r"(sz)
+ );
for (r = 0; r != NUM_RX; ++r) {
- off_A = r * NUM_TX * NUM_SC + t1 * NUM_SC + off_sc;
- off_AH = r * NUM_TX * NUM_SC + t2 * NUM_SC + off_sc;
- A_re = &g_H.re[off_A];
- A_im = &g_H.im[off_A];
- AH_re = &g_H.re[off_AH];
- AH_im = &g_H.im[off_AH];
+ off_H1 = r * NUM_TX * NUM_SC + t1 * NUM_SC + s;
+ off_H2 = r * NUM_TX * NUM_SC + t2 * NUM_SC + s;
- /* Calculate A^H*A */
- /* v2 - A real part */
- /* v3 - A imaginary part */
- /* v4 - A^H real part */
- /* v5 - A^H imaginary part */
+ /* Calculate H^H*H */
+ /* v8 - H_{rt1} real part */
+ /* v12 - H_{rt2} imaginary part */
+ /* v16 - H_{rt2} real part */
+ /* v20 - H_{rt2} imaginary part */
__asm__ volatile(
- "vle32.v v2, (%0)\n"
- "vle32.v v3, (%1)\n"
- "vle32.v v4, (%2)\n"
- "vle32.v v5, (%3)\n"
+ "vle32.v v8, (%0)\n"
+ "vle32.v v12, (%1)\n"
+ "vle32.v v16, (%2)\n"
+ "vle32.v v20, (%3)\n"
+#if defined(DATA_TYPE_float)
+ /* real part */
+ "vfmacc.vv v0, v8, v16\n"
+ "vfmacc.vv v0, v12, v20\n"
+ /* imaginary part */
+ "vfmacc.vv v4, v12, v16\n"
+ "vfnmsac.vv v4, v8, v20\n"
+#elif defined(DATA_TYPE_fixed)
/* real part */
- "vfmacc.vv v0, v2, v4\n"
- "vfmacc.vv v0, v3, v5\n"
+ "vsmul.vv v24, v8, v16\n"
+ "vsadd.vv v0, v0, v24\n"
+ "vsmul.vv v24, v12, v20\n"
+ "vsadd.vv v0, v0, v24\n"
/* imaginary part */
- "vfmacc.vv v1, v3, v4\n"
- "vfnmsac.vv v1, v2, v5\n"
+ "vsmul.vv v24, v12, v16\n"
+ "vsadd.vv v4, v4, v24\n"
+ "vsmul.vv v24, v8, v20\n"
+ "vssub.vv v4, v4, v24\n"
+#else
+#error "Unknown data type"
+#endif
:
- : "r"(A_re), "r"(A_im), "r"(AH_re), "r"(AH_im)
+ : "r"(&g_H.re[off_H1]), "r"(&g_H.im[off_H1]),
+ "r"(&g_H.re[off_H2]), "r"(&g_H.im[off_H2])
);
}
/* Add R */
- /* v2 - R real part */
- /* v3 - R imaginary part */
- R_U_re = &g_R.re[off_G_U];
- R_U_im = &g_R.im[off_G_U];
+ off_G = t1 * NUM_TX * NUM_SC + t2 * NUM_SC + s;
__asm__ volatile(
- "vle32.v v2, (%0)\n"
- "vle32.v v3, (%1)\n"
- "vfadd.vv v2, v2, v0\n"
- "vfadd.vv v3, v3, v1\n"
- "vse32.v v2, (%2)\n"
- "vse32.v v3, (%3)\n"
+ "vle32.v v8, (%0)\n"
+ "vle32.v v12, (%1)\n"
+#if defined(DATA_TYPE_float)
+ "vfadd.vv v0, v0, v8\n"
+ "vfadd.vv v4, v4, v12\n"
+#elif defined(DATA_TYPE_fixed)
+ "vsadd.vv v0, v0, v8\n"
+ "vsadd.vv v4, v4, v12\n"
+#else
+#error "Unknown data type"
+#endif
+ "vse32.v v0, (%2)\n"
+ "vse32.v v4, (%3)\n"
:
- : "r"(R_U_re), "r"(R_U_im), "r"(G_U_re), "r"(G_U_im));
+ : "r"(&g_R.re[off_G]), "r"(&g_R.im[off_G]),
+ "r"(&g_G.re[off_G]), "r"(&g_G.im[off_G])
+ );
+
+ /* Fill the upper triangle */
+ /* G_{t2t1} = G_{t1t2}^* */
if (t1 != t2) {
- R_L_re = &g_R.re[off_G_L];
- R_L_im = &g_R.im[off_G_L];
+ off_GH = t2 * NUM_TX * NUM_SC + t1 * NUM_SC + s;
__asm__ volatile(
- /* Lower triangle */
- "vle32.v v2, (%0)\n"
- "vle32.v v3, (%1)\n"
- "vfadd.vv v2, v2, v0\n"
- "vfsub.vv v3, v3, v1\n"
- "vse32.v v2, (%2)\n"
- "vse32.v v3, (%3)\n"
+#if defined(DATA_TYPE_float)
+ "vfneg.v v4, v4\n"
+#elif defined(DATA_TYPE_fixed)
+ "vneg.v v4, v4\n"
+#else
+#error "Unknown data type"
+#endif
+ "vse32.v v0, (%0)\n"
+ "vse32.v v4, (%1)\n"
:
- : "r"(R_L_re), "r"(R_L_im), "r"(G_L_re), "r"(G_L_im)
+ : "r"(&g_G.re[off_GH]), "r"(&g_G.im[off_GH])
);
}
sz -= vl;
- off_sc += vl;
+ s += vl;
}
}
#endif
diff --git a/src/cmatvecmul.c b/src/cmatvecmul.c
@@ -12,10 +12,12 @@
*/
void cmatvecmul()
{
-#if defined(ARCH_x86) || defined(ARCH_rv)
size_t t, r, s;
size_t off_H, off_y, off_HHy;
+
+#if defined(ARCH_x86) || defined(ARCH_rv)
acc_t sum_re, sum_im;
+
for (t = 0; t < NUM_TX; ++t) {
for (s = 0; s < NUM_SC; ++s) {
sum_re = sum_im = 0;
@@ -39,59 +41,71 @@ void cmatvecmul()
#endif
}
}
+
#elif defined(ARCH_rvv)
- size_t i, j;
- size_t off_HH, off_y, off_HHy, off_sc;
size_t sz, vl;
- for (i = 0; i < NUM_TX; ++i) {
- off_HHy = i * NUM_SC;
- off_sc = 0;
+ for (t = 0; t < NUM_TX; ++t) {
sz = NUM_SC;
-
+ s = 0;
while (sz > 0) {
- /* Initialize result registers */
+ /* Initialize HHy with 0 */
/* v0 - HHy real part */
- /* v1 - HHy imaginary part */
+ /* v4 - HHy imaginary part */
__asm__ volatile(
- "vsetvli %0, %1, e32, m1, ta, ma\n"
+ "vsetvli %0, %1, e32, m4, ta, ma\n"
"vmv.v.i v0, 0\n"
- "vmv.v.i v1, 0\n"
+ "vmv.v.i v4, 0\n"
: "=r"(vl)
: "r"(sz)
);
- for (j = 0; j < NUM_RX; ++j) {
- off_HH = i * NUM_RX * NUM_SC + j * NUM_SC + off_sc;
- off_y = j * NUM_SC + off_sc;
+ for (r = 0; r < NUM_RX; ++r) {
+ off_H = r * NUM_TX * NUM_SC + t * NUM_SC + s;
+ off_y = r * NUM_SC + s;
__asm__ volatile(
- "vle32.v v2, (%0)\n"
- "vle32.v v3, (%1)\n"
- "vle32.v v4, (%2)\n"
- "vle32.v v5, (%3)\n"
+ "vle32.v v8, (%0)\n"
+ "vle32.v v12, (%1)\n"
+ "vle32.v v16, (%2)\n"
+ "vle32.v v20, (%3)\n"
+#if defined(DATA_TYPE_float)
+ /* real part */
+ "vfmacc.vv v0, v8, v16\n"
+ "vfnmsac.vv v0, v12, v20\n"
+ /* imaginary part */
+ "vfmacc.vv v4, v12, v16\n"
+ "vfmacc.vv v4, v8, v20\n"
+#elif defined(DATA_TYPE_fixed)
/* real part */
- "vfmacc.vv v0, v2, v4\n"
- "vfnmsac.vv v0, v3, v5\n"
+ "vsmul.vv v24, v8, v16\n"
+ "vsadd.vv v0, v0, v24\n"
+ "vsmul.vv v24, v12, v20\n"
+ "vssub.vv v0, v0, v24\n"
/* imaginary part */
- "vfmacc.vv v1, v3, v4\n"
- "vfmacc.vv v1, v2, v5\n"
+ "vsmul.vv v24, v12, v16\n"
+ "vsadd.vv v4, v4, v24\n"
+ "vsmul.vv v24, v8, v20\n"
+ "vsadd.vv v4, v4, v24\n"
+#else
+#error "Unknown data type"
+#endif
:
- : "r"(&g_HH.re[off_HH]), "r"(&g_HH.im[off_HH]),
+ : "r"(&g_H.re[off_H]), "r"(&g_H.im[off_H]),
"r"(&g_y.re[off_y]), "r"(&g_y.im[off_y])
- );
+ );
}
/* Store result */
+ off_HHy = t * NUM_SC + s;
__asm__ volatile(
"vse32.v v0, (%0)\n"
- "vse32.v v1, (%1)\n"
+ "vse32.v v4, (%1)\n"
:
: "r"(&g_HHy.re[off_HHy]), "r"(&g_HHy.im[off_HHy])
);
sz -= vl;
- off_HHy += vl;
- off_sc += vl;
+ s += vl;
}
}
#else