Change data type to float - mmserv - Minimum Mean Square Error detection on RISC-V Vector Extention

commit bf6a94e5a9d53997aaf89d915327aba66411079c
parent 9362e7014154afd7772f908b36aee6cada2b8b7b
Author: Egor Achkasov <eaachkasov@edu.hse.ru>
Date:   Fri,  8 Nov 2024 00:01:47 +0100

Change data type to float

Diffstat:
M inc/define.h  | 7 +++++--
M inc/mmserv.h  | 25 ++++++++++++++++---------
M main.c  | 46 ++++++++++++++++++++++++----------------------
M scripts/gen_data.py  | 10 +++++-----
M scripts/mmse.py  | 57 +++++++++++++++++++++++++--------------------------------
M scripts/util.py  | 18 +++++++++---------
M scripts/viz_compare_x.py  | 4 ++--
M src/mmserv.c  | 170 +++++++++++++++++++++++++++++++++++++++++++++++--------------------------------

8 files changed, 187 insertions(+), 150 deletions(-)
diff --git a/inc/define.h b/inc/define.h
@@ -1,8 +1,8 @@
 #ifndef __DEFINE_H
 #define __DEFINE_H
 
-#define NUM_RX_ANT 12
-#define NUM_TX_ANT 12
+#define NUM_RX_ANT 4
+#define NUM_TX_ANT 4
 #define NUM_SC 1
 
 #define NOSQRT 1
@@ -10,4 +10,7 @@
 #define IN
 #define OUT
 
+typedef float data_t;
+typedef float acc_t;
+
 #endif
diff --git a/inc/mmserv.h b/inc/mmserv.h
@@ -11,12 +11,9 @@
  * Typedefs
  */
 
-typedef int16_t data_t;
-typedef int32_t acc_t;
-
 typedef struct {
-    acc_t re;
-    acc_t im;
+    data_t re;
+    data_t im;
 } complex;
 
 /** Calculate MMSE estimation of x in y = H*x + n
@@ -54,7 +51,7 @@ complex cmake(IN data_t re, IN data_t im);
 complex cmul(IN complex a, IN complex b);
 
 /** Complex absolute value squared */
-acc_t cabs2(IN complex a);
+data_t cabs2(IN complex a);
 
 /** Complex addition */
 complex cadd(IN complex a, IN complex b);
@@ -70,7 +67,7 @@ complex cconj(IN complex a);
 complex cdiv(IN complex a, IN complex b);
 
 /** Square root of a natural number */
-data_t sqrt(IN acc_t x);
+data_t sqrt(IN data_t x);
 
 /** Complex root */
 complex csqrt(IN complex a);
@@ -105,6 +102,16 @@ void cmatmul_TxRx_RxTx(
   IN complex B[NUM_RX_ANT][NUM_TX_ANT][NUM_SC],
   OUT complex result[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]);
 
+/** Multiply two matrices A and B
+ * \param A input matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
+ * \param B input matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
+ * \param result output matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
+ */
+void cmatmul_TxTx_TxTx(
+  IN complex A[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
+  IN complex B[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
+  OUT complex result[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]);
+
 /** Add two matrices A and B
  * \param A input matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
  * \param B input matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
@@ -180,7 +187,7 @@ void cbackwardsub_TxTx(
 void load_data(
   IN const char *file,
   IN size_t size,
-  OUT uint16_t *out);
+  OUT data_t *out);
 
 /** Save data to a binary file
  * \param file path to the file
@@ -188,7 +195,7 @@ void load_data(
  */
 void save_data(
   IN const char* file,
-  IN uint16_t x_mmse[NUM_TX_ANT][NUM_SC][2]);
+  IN data_t x_mmse[NUM_TX_ANT][NUM_SC][2]);
 
 
 #endif /* __MMSERV_H */
diff --git a/main.c b/main.c
@@ -1,52 +1,54 @@
 #include "inc/mmserv.h"
 
 int main() {
+  uint32_t i, j, k;
+
   /* Load the data */
-  uint16_t x_raw[NUM_TX_ANT][NUM_SC][2];              /* Transmitted signal */
-  uint16_t H_raw[NUM_RX_ANT][NUM_TX_ANT][NUM_SC][2];  /* Channel */
-  uint16_t R_raw[NUM_TX_ANT][NUM_TX_ANT][NUM_SC][2];  /* Noise covariance matrix */
-  uint16_t y_raw[NUM_RX_ANT][NUM_SC][2];              /* Received signal */
-  load_data("data/x.bin", NUM_TX_ANT*NUM_SC*2, x_raw);
-  load_data("data/H.bin", NUM_RX_ANT*NUM_TX_ANT*NUM_SC*2, H_raw);
-  load_data("data/y.bin", NUM_RX_ANT*NUM_SC*2, y_raw);
-  load_data("data/R.bin", NUM_RX_ANT*NUM_SC*2, R_raw);
+  data_t x_raw[NUM_TX_ANT][NUM_SC][2];              /* Transmitted signal */
+  data_t H_raw[NUM_RX_ANT][NUM_TX_ANT][NUM_SC][2];  /* Channel */
+  data_t R_raw[NUM_TX_ANT][NUM_TX_ANT][NUM_SC][2];  /* Noise covariance matrix */
+  data_t y_raw[NUM_RX_ANT][NUM_SC][2];              /* Received signal */
+  load_data("data\\x.bin", NUM_TX_ANT*NUM_SC*2, x_raw);
+  load_data("data\\H.bin", NUM_RX_ANT*NUM_TX_ANT*NUM_SC*2, H_raw);
+  load_data("data\\R.bin", NUM_TX_ANT*NUM_RX_ANT*NUM_SC*2, R_raw);
+  load_data("data\\y.bin", NUM_RX_ANT*NUM_SC*2, y_raw);
 
   /* Cast the data into complex data structures */
   complex x[NUM_TX_ANT][NUM_SC];
   complex H[NUM_RX_ANT][NUM_TX_ANT][NUM_SC];
   complex y[NUM_RX_ANT][NUM_SC];
   complex R[NUM_TX_ANT][NUM_TX_ANT][NUM_SC];
-  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
-    for (uint32_t j = 0; j < NUM_SC; ++j){
+  for (i = 0; i < NUM_TX_ANT; ++i)
+    for (j = 0; j < NUM_SC; ++j){
       x[i][j].re = x_raw[i][j][0];
       x[i][j].im = x_raw[i][j][1];
     }
-  for (uint32_t i = 0; i < NUM_RX_ANT; ++i)
-    for (uint32_t j = 0; j < NUM_TX_ANT; ++j)
-      for (uint32_t k = 0; k < NUM_SC; ++k){
+  for (i = 0; i < NUM_RX_ANT; ++i)
+    for (j = 0; j < NUM_TX_ANT; ++j)
+      for (k = 0; k < NUM_SC; ++k){
         H[i][j][k].re = H_raw[i][j][k][0];
         H[i][j][k].im = H_raw[i][j][k][1];
       }
-  for (uint32_t i = 0; i < NUM_RX_ANT; ++i)
-    for (uint32_t j = 0; j < NUM_SC; ++j){
+  for (i = 0; i < NUM_RX_ANT; ++i)
+    for (j = 0; j < NUM_SC; ++j){
       y[i][j].re = y_raw[i][j][0];
       y[i][j].im = y_raw[i][j][1];
     }
-  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
-    for (uint32_t j = 0; j < NUM_TX_ANT; ++j)
-      for (uint32_t k = 0; k < NUM_SC; ++k){
+  for (i = 0; i < NUM_TX_ANT; ++i)
+    for (j = 0; j < NUM_TX_ANT; ++j)
+      for (k = 0; k < NUM_SC; ++k){
         R[i][j][k].re = R_raw[i][j][k][0];
         R[i][j][k].im = R_raw[i][j][k][1];
       }
 
   /* Calculate the MMSE approximation */
   complex x_MMSE[NUM_TX_ANT][NUM_SC];
-  mmse_nosqrt(H, y, R, x_MMSE);
+  mmse(H, y, R, x_MMSE);
 
   /* Interleave the result */
-  uint16_t res[NUM_TX_ANT][NUM_SC][2];
-  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
-    for (uint32_t j = 0; j < NUM_SC; ++j){
+  data_t res[NUM_TX_ANT][NUM_SC][2];
+  for (i = 0; i < NUM_TX_ANT; ++i)
+    for (j = 0; j < NUM_SC; ++j){
       res[i][j][0] = x_MMSE[i][j].re;
       res[i][j][1] = x_MMSE[i][j].im;
     }
diff --git a/scripts/gen_data.py b/scripts/gen_data.py
@@ -20,7 +20,7 @@ WRITE_DATA_BIN = "--bin" in argv
 WRITE_DATA_S = "--s" in argv
 
 NUM_RX_ANT, NUM_TX_ANT, NUM_SC = read_defines()
-NOISE_STD_DEVIATION = np.sqrt(.5) / 10  # noise standard deviation
+NOISE_STD_DEVIATION = np.sqrt(.5) / 100  # noise standard deviation
 
 # Transmitter signal
 x = random((NUM_TX_ANT, NUM_SC)) \
@@ -58,10 +58,10 @@ class Section:
 
 
 sections = [
-    Section("x", "data/x.txt", "3", 16, x.size * 2),
-    Section("H", "data/H.txt", "3", 16, H.size * 2),
-    Section("R", "data/R.txt", "3", 16, R.size * 2),
-    Section("y", "data/y.txt", "3", 16, y.size * 2),
+    Section("x", "data/x.txt", "3", 32, x.size * 2),
+    Section("H", "data/H.txt", "3", 32, H.size * 2),
+    Section("R", "data/R.txt", "3", 32, R.size * 2),
+    Section("y", "data/y.txt", "3", 32, y.size * 2),
 ]
 
 # Create "data" directory if it does not exist
diff --git a/scripts/mmse.py b/scripts/mmse.py
@@ -8,38 +8,31 @@ from os import path, makedirs
 NUM_RX_ANT, NUM_TX_ANT, NUM_SC = read_defines()
 
 # Read the data from the files
-x = deinterleave(np.fromfile("data/x.bin", dtype=np.int16), (NUM_TX_ANT, NUM_SC))
-H = deinterleave(np.fromfile("data/H.bin", dtype=np.int16), (NUM_RX_ANT, NUM_TX_ANT, NUM_SC))
-R = deinterleave(np.fromfile("data/R.bin", dtype=np.int16), (NUM_TX_ANT, NUM_TX_ANT, NUM_SC))
-y = deinterleave(np.fromfile("data/y.bin", dtype=np.int16), (NUM_RX_ANT, NUM_SC))
-
-# Move the SC dimension to the beginning
-x = np.moveaxis(x, -1, 0)
-H = np.moveaxis(H, -1, 0)
-R = np.moveaxis(R, -1, 0)
-y = np.moveaxis(y, -1, 0)
-
-HH = H.conj().transpose(0, 2, 1)
-L = np.linalg.cholesky(HH @ H + R)
-HHy = np.einsum("...ij,...j->...i", HH, y)
-
-z = np.empty((NUM_SC, NUM_TX_ANT), dtype=np.complex64)
-for i in range(NUM_TX_ANT):
-    z[:, i] = (HHy[:, i] - np.sum([L[:, i, j]*z[:, j] for j in range(i)], 0)) / L[:, i, i]
-
-LH = L.conj().transpose(0, 2, 1)
-x_mmse = np.empty((NUM_SC, NUM_TX_ANT), dtype=np.complex64)
-for i in range(NUM_TX_ANT-1, -1, -1):
-    x_mmse[:, i] = (z[:, i] - np.sum([LH[:, i, j]*x_mmse[:, j] for j in range(i+1, NUM_TX_ANT)], 0)) / LH[:, i, i]
-
-# Move the SC dimension back to the end
-x_mmse = np.moveaxis(x_mmse, 0, -1)
-x = np.moveaxis(x, 0, -1)
-H = np.moveaxis(H, 0, -1)
-R = np.moveaxis(R, 0, -1)
-y = np.moveaxis(y, 0, -1)
-
-# Cast the complex data to int16
+x = deinterleave(np.fromfile("data/x.bin", dtype=np.float32), (NUM_TX_ANT, NUM_SC))
+H = deinterleave(np.fromfile("data/H.bin", dtype=np.float32), (NUM_RX_ANT, NUM_TX_ANT, NUM_SC))
+R = deinterleave(np.fromfile("data/R.bin", dtype=np.float32), (NUM_TX_ANT, NUM_TX_ANT, NUM_SC))
+y = deinterleave(np.fromfile("data/y.bin", dtype=np.float32), (NUM_RX_ANT, NUM_SC))
+
+x_mmse = np.empty((NUM_TX_ANT, NUM_SC), np.complex64)
+
+for sc in range(NUM_SC):
+    HH = H[..., sc].conj().T
+    L = np.linalg.cholesky(HH @ H[..., sc] + R[..., sc])
+    HHy = np.einsum("ij,j->i", HH, y[:, sc])
+    print("\nHHy:\n", HHy)
+    print("\nL:\n", L)
+
+    z = np.empty((NUM_TX_ANT,), dtype=np.complex64)
+    for i in range(NUM_TX_ANT):
+        z[i] = (HHy[i] - np.sum([L[i, j]*z[j] for j in range(i)])) / L[i, i]
+    print("\nz:\n", z)
+
+    LH = L.conj().T
+    for i in range(NUM_TX_ANT-1, -1, -1):
+        x_mmse[i, sc] = (z[i] - np.sum([LH[i, j]*x_mmse[j, sc] for j in range(i+1, NUM_TX_ANT)], 0)) / LH[i, i]
+    print("\nx_mmse:\n", x_mmse)
+
+# Output the data
 x_mmse_data = interleave(x_mmse)
 out_dir = path.join(path.dirname(__file__), "..", "out")
 if not path.exists(out_dir):
diff --git a/scripts/util.py b/scripts/util.py
@@ -29,31 +29,31 @@ def read_defines():
 
 
 def interleave(data: np.ndarray) -> np.ndarray:
-    """Cast a np.complex64 array to np.int16 array.
+    """Cast a np.complex64 array to np.float32 array.
 
     Args:
         data (np.ndarray): The complex data to be casted. Dtype: np.complex64
 
     Returns:
-        np.ndarray: The interleaved data. Dtype: np.int16
+        np.ndarray: The interleaved data. Dtype: np.float32
     """
-    res = np.empty(2 * data.size, dtype=np.int16)
-    res[0::2] = data.flatten().real.astype(np.float16).view(np.int16)
-    res[1::2] = data.flatten().imag.astype(np.float16).view(np.int16)
+    res = np.empty(2 * data.size, dtype=np.float32)
+    res[0::2] = data.flatten().real.astype(np.float32)
+    res[1::2] = data.flatten().imag.astype(np.float32)
     return res
 
 
 def deinterleave(data: np.ndarray, shape: tuple) -> np.ndarray:
-    """Cast a np.int16 array to np.complex64 array.
+    """Cast a np.float32 array to np.complex64 array.
 
     Args:
-        data (np.ndarray): The interleaved data to be casted. Dtype: np.int16
+        data (np.ndarray): The interleaved data to be casted. Dtype: np.float32
         shape (tuple): The shape of the deinterleaved data.
 
     Returns:
         np.ndarray: The deinterleaved data. Dtype: np.complex64
     """
     res = np.empty(shape, np.complex64)
-    res.real = data[0::2].view(np.float16).astype(np.float32).reshape(shape)
-    res.imag = data[1::2].view(np.float16).astype(np.float32).reshape(shape)
+    res.real = data[0::2].reshape(shape)
+    res.imag = data[1::2].reshape(shape)
     return res
diff --git a/scripts/viz_compare_x.py b/scripts/viz_compare_x.py
@@ -10,9 +10,9 @@ from matplotlib import pyplot as plt
 NUM_RX_ANT, NUM_TX_ANT, NUM_SC = read_defines()
 
 with open("out/x_mmse.bin", "rb") as f:
-    x_mmse = deinterleave(np.fromfile(f, dtype=np.int16), (NUM_TX_ANT, NUM_SC))
+    x_mmse = deinterleave(np.fromfile(f, dtype=np.float32), (NUM_TX_ANT, NUM_SC))
 with open("data/x.bin", "rb") as f:
-    x = deinterleave(np.fromfile(f, dtype=np.int16), (NUM_TX_ANT, NUM_SC))
+    x = deinterleave(np.fromfile(f, dtype=np.float32), (NUM_TX_ANT, NUM_SC))
 
 for tx in range(NUM_TX_ANT):
     plt.scatter(x_mmse[tx].real,
diff --git a/src/mmserv.c b/src/mmserv.c
@@ -20,7 +20,7 @@ complex cmul(IN complex a, IN complex b)
   t.im = a.im * b.re + a.re * b.im;
   return t;	
 }
-acc_t cabs2(IN complex a)
+data_t cabs2(IN complex a)
 {
   return a.re * a.re + a.im * a.im;
 }
@@ -59,10 +59,10 @@ complex cdiv(IN complex a, IN complex b)
   return t;
 }
 
-data_t sqrt(IN acc_t x)
+data_t sqrt(IN data_t x)
 {
   if (x < 2) return x;
-  acc_t lo = 1, hi = x, mid;
+  data_t lo = 1, hi = x, mid;
   while (100 * lo * lo < x) lo *= 10;
   while (hi * hi / 100 > x) hi /= 10;
   for (int i = 0; i < 100; i++) {
@@ -91,9 +91,10 @@ void cmat_hermitian_transpose_RxTx(
   IN complex A[NUM_RX_ANT][NUM_TX_ANT][NUM_SC],
   OUT complex AH[NUM_TX_ANT][NUM_RX_ANT][NUM_SC])
 {
-  for (uint32_t i = 0; i < NUM_RX_ANT; ++i)
-    for (uint32_t j = 0; j < NUM_TX_ANT; ++j)
-      for (uint32_t k = 0; k < NUM_SC; ++k)
+  uint32_t i, j, k;
+  for (i = 0; i < NUM_RX_ANT; ++i)
+    for (j = 0; j < NUM_TX_ANT; ++j)
+      for (k = 0; k < NUM_SC; ++k)
         AH[i][j][k] = cconj(A[j][i][k]);
 }
 
@@ -101,9 +102,10 @@ void cmat_hermitian_transpose_TxTx(
   IN complex A[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
   OUT complex AH[NUM_TX_ANT][NUM_TX_ANT][NUM_SC])
 {
-  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
-    for (uint32_t j = 0; j < NUM_TX_ANT; ++j)
-      for (uint32_t k = 0; k < NUM_SC; ++k)
+  uint32_t i, j, k;
+  for (i = 0; i < NUM_TX_ANT; ++i)
+    for (j = 0; j < NUM_TX_ANT; ++j)
+      for (k = 0; k < NUM_SC; ++k)
         AH[i][j][k] = cconj(A[j][i][k]);
 }
 
@@ -112,10 +114,32 @@ void cmatmul_TxRx_RxTx(
   IN complex B[NUM_RX_ANT][NUM_TX_ANT][NUM_SC],
   OUT complex result[NUM_TX_ANT][NUM_TX_ANT][NUM_SC])
 {
-  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
-    for (uint32_t j = 0; j < NUM_TX_ANT; ++j)
-      for (uint32_t k = 0; k < NUM_RX_ANT; ++k)
-        for (uint32_t l = 0; l < NUM_SC; ++l)
+  uint32_t i, j, k, l;
+  for (i = 0; i < NUM_TX_ANT; ++i)
+    for (j = 0; j < NUM_TX_ANT; ++j)
+      for (k = 0; k < NUM_SC; ++k)
+        result[i][j][k] = cmake(0, 0);
+  for (i = 0; i < NUM_TX_ANT; ++i)
+    for (j = 0; j < NUM_TX_ANT; ++j)
+      for (k = 0; k < NUM_RX_ANT; ++k)
+        for (l = 0; l < NUM_SC; ++l)
+          result[i][j][l] = cadd(result[i][j][l], cmul(A[i][k][l], B[k][j][l]));
+}
+
+void cmatmul_TxTx_TxTx(
+  IN complex A[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
+  IN complex B[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
+  OUT complex result[NUM_TX_ANT][NUM_TX_ANT][NUM_SC])
+{
+  uint32_t i, j, k, l;
+  for (i = 0; i < NUM_TX_ANT; ++i)
+    for (j = 0; j < NUM_TX_ANT; ++j)
+      for (k = 0; k < NUM_SC; ++k)
+        result[i][j][k] = cmake(0, 0);
+  for (i = 0; i < NUM_TX_ANT; ++i)
+    for (j = 0; j < NUM_TX_ANT; ++j)
+      for (k = 0; k < NUM_TX_ANT; ++k)
+        for (l = 0; l < NUM_SC; ++l)
           result[i][j][l] = cadd(result[i][j][l], cmul(A[i][k][l], B[k][j][l]));
 }
 
@@ -124,9 +148,10 @@ void cmatadd_TxTx(
   IN complex B[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
   OUT complex result[NUM_TX_ANT][NUM_TX_ANT][NUM_SC])
 {
-  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
-    for (uint32_t j = 0; j < NUM_TX_ANT; ++j)
-      for (uint32_t k = 0; k < NUM_SC; ++k)
+  uint32_t i, j, k;
+  for (i = 0; i < NUM_TX_ANT; ++i)
+    for (j = 0; j < NUM_TX_ANT; ++j)
+      for (k = 0; k < NUM_SC; ++k)
         result[i][j][k] = cadd(A[i][j][k], B[i][j][k]);
 }
 
@@ -134,32 +159,25 @@ void ccholesky_TxTx(
   IN complex A[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
   OUT complex L[NUM_TX_ANT][NUM_TX_ANT][NUM_SC])
 {
+  uint32_t i, j, k, l;
+  complex sum;
   /* Init upper triangle with zeros */
-  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
-    for (uint32_t j = i+1; j < NUM_TX_ANT; ++j)
-      for (uint32_t k = 0; k < NUM_SC; ++k)
+  for (i = 0; i < NUM_TX_ANT; ++i)
+    for (j = i+1; j < NUM_TX_ANT; ++j)
+      for (k = 0; k < NUM_SC; ++k)
         L[i][j][k] = cmake(0, 0);
   /* Calculate diagonal and lower triangular entries */
-  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
-    for (uint32_t j = 0; j < i+1; ++j)
-      for (uint32_t k = 0; k < NUM_SC; ++k){
-        complex sum = {0, 0};
-        for (uint32_t l = 0; l < j; ++l)
+  for (i = 0; i < NUM_TX_ANT; ++i)
+    for (j = 0; j < i+1; ++j)
+      for (k = 0; k < NUM_SC; ++k){
+        sum.re = 0;
+        sum.im = 0;
+        for (l = 0; l < j; ++l)
           cadd_acc(sum, cmul(L[i][l][k], cconj(L[j][l][k])));
         if (i == j)
           L[i][j][k] = csqrt(csub(A[i][i][k], sum));
         else
           L[i][j][k] = cdiv(csub(A[i][j][k], sum), L[j][j][k]);
-
-        // TODO delete
-        #if 0
-        for (uint32_t i = 0; i < NUM_TX_ANT; ++i){
-          for (uint32_t j = 0; j < NUM_TX_ANT; ++j)
-            printf("(%6i, %6i) ", L[i][j][0].re, L[i][j][0].im);
-          printf("\n");
-        }
-        printf("\n");
-        #endif
       }
 }
 
@@ -168,47 +186,53 @@ void ccholesky_nosqrt_TxTx(
   OUT complex L[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
   OUT complex D[NUM_TX_ANT][NUM_TX_ANT][NUM_SC])
 {
+  uint32_t i, j, k, l;
+  complex sum, t;
   /* Init zeros in upper triangles of L and D */
-  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
-    for (uint32_t j = i+1; j < NUM_TX_ANT; ++j)
-      for (uint32_t k = 0; k < NUM_SC; ++k)
+  for (i = 0; i < NUM_TX_ANT; ++i)
+    for (j = i+1; j < NUM_TX_ANT; ++j)
+      for (k = 0; k < NUM_SC; ++k)
         D[i][j][k] = L[i][j][k] = cmake(0, 0);
   /* Init zeros in lower triangle of D */
-  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
-    for (uint32_t j = 0; j < i; ++j)
-      for (uint32_t k = 0; k < NUM_SC; ++k)
+  for (i = 0; i < NUM_TX_ANT; ++i)
+    for (j = 0; j < i; ++j)
+      for (k = 0; k < NUM_SC; ++k)
         D[i][j][k] = cmake(0, 0);
   /* Init ones in the diagonal of the result */
-  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
-    for (uint32_t k = 0; k < NUM_SC; ++k)
+  for (i = 0; i < NUM_TX_ANT; ++i)
+    for (k = 0; k < NUM_SC; ++k)
       L[i][i][k] = cmake(1, 0);
 
 
   /* Calculate the lower triangle of the result */
-  for (uint32_t i = 0; i < NUM_TX_ANT; ++i){
+  for (i = 0; i < NUM_TX_ANT; ++i){
     /* L_ij = (A_ij - \sum_{l=0}^{j-1} L_il L_jl^* D_k) / D_j */
-    for (uint32_t j = 0; j < i; ++j){
-      complex sum = {0, 0};
-      for (uint32_t l = 0; l < j; ++l)
-        for (uint32_t k = 0; k < NUM_SC; ++k){
+    for (j = 0; j < i; ++j){
+      sum.re = 0;
+      sum.im = 0;
+      for (l = 0; l < j; ++l)
+        for (k = 0; k < NUM_SC; ++k){
+          /* TODO there is one sum for all scs what leads to contamination. Fix that */
           complex t = cmul(L[i][l][k], cconj(L[j][l][k]));
           t = cmul(t, D[l][l][k]);
           cadd_acc(sum, t);
         }
-      for (uint32_t k = 0; k < NUM_SC; ++k){
+      for (k = 0; k < NUM_SC; ++k){
         L[i][j][k] = csub(A[i][j][k], sum);
         L[i][j][k] = cdiv(L[i][j][k], D[j][j][k]);
       }
     }
     /* D_i = A_ii - \sum_{j=0}^{i-1} L_ij L_ij^* D_j */
-    complex sum = {0, 0};
-    for (uint32_t j = 0; j < i; ++j)
-      for (uint32_t k = 0; k < NUM_SC; ++k){
-        complex t = cmul(L[i][j][k], cconj(L[i][j][k]));
+    sum.re = 0;
+    sum.im = 0;
+    for (j = 0; j < i; ++j)
+      for (k = 0; k < NUM_SC; ++k){
+        /* TODO there is one sum for all scs what leads to contamination. Fix that */
+        t = cmul(L[i][j][k], cconj(L[i][j][k]));
         t = cmul(t, D[j][j][k]);
         cadd_acc(sum, t);
       }
-    for (uint32_t k = 0; k < NUM_SC; ++k)
+    for (k = 0; k < NUM_SC; ++k)
       D[i][i][k] = csub(A[i][i][k], sum);
   }
 }
@@ -218,9 +242,13 @@ void cmatvecmul_TxRx(
   IN complex b[NUM_RX_ANT][NUM_SC],
   OUT complex result[NUM_TX_ANT][NUM_SC])
 {
-  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
-    for (uint32_t j = 0; j < NUM_RX_ANT; ++j)
-      for (uint32_t k = 0; k < NUM_SC; ++k)
+  uint32_t i, j, k;
+  for (i = 0; i < NUM_TX_ANT; ++i)
+    for (j = 0; j < NUM_SC; ++j)
+      result[i][j] = cmake(0., 0.);
+  for (i = 0; i < NUM_TX_ANT; ++i)
+    for (j = 0; j < NUM_RX_ANT; ++j)
+      for (k = 0; k < NUM_SC; ++k)
         result[i][k] = cadd(result[i][k], cmul(A[i][j][k], b[j][k]));
 }
 
@@ -229,10 +257,11 @@ void cforwardsub_TxTx(
   IN complex b[NUM_TX_ANT][NUM_SC],
   OUT complex result[NUM_TX_ANT][NUM_SC])
 {
-  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
-    for (uint32_t j = 0; j < NUM_SC; ++j){
+  uint32_t i, j, k;
+  for (i = 0; i < NUM_TX_ANT; ++i)
+    for (j = 0; j < NUM_SC; ++j){
       result[i][j] = b[i][j];
-      for (uint32_t k = 0; k < i; ++k)
+      for (k = 0; k < i; ++k)
         result[i][j] = csub(result[i][j], cmul(L[i][k][j], result[k][j]));
       result[i][j] = cdiv(result[i][j], L[i][i][j]);
     }
@@ -243,10 +272,11 @@ void cbackwardsub_TxTx(
   IN complex b[NUM_TX_ANT][NUM_SC],
   OUT complex result[NUM_TX_ANT][NUM_SC])
 {
-  for (uint32_t i = NUM_TX_ANT; i > 0; --i)
-    for (uint32_t j = 0; j < NUM_SC; ++j){
+  uint32_t i, j, k;
+  for (i = NUM_TX_ANT; i != (uint32_t)-1; --i)
+    for (j = 0; j < NUM_SC; ++j){
       result[i][j] = b[i][j];
-      for (uint32_t k = i+1; k < NUM_TX_ANT; ++k)
+      for (k = i+1; k < NUM_TX_ANT; ++k)
         result[i][j] = csub(result[i][j], cmul(U[i][k][j], result[k][j]));
       result[i][j] = cdiv(result[i][j], U[i][i][j]);
     }
@@ -259,7 +289,7 @@ void cbackwardsub_TxTx(
 void load_data(
   IN const char* file,
   IN size_t size,
-  OUT uint16_t *out)
+  OUT data_t *out)
 {
     FILE *fp = fopen(file, "rb");
     
@@ -267,7 +297,7 @@ void load_data(
         fprintf(stderr, "Error opening file %s.\n", file);
         exit(8);
     }
-    if ((fread(out, sizeof(uint16_t), size, fp)) != size) {
+    if ((fread(out, sizeof(data_t), size, fp)) != size) {
         fprintf(stderr, "Error reading file %s.\n", file);
         exit(8);
     }
@@ -277,7 +307,7 @@ void load_data(
 
 void save_data(
   IN const char* file,
-  IN uint16_t x_mmse[NUM_TX_ANT][NUM_SC][2])
+  IN data_t x_mmse[NUM_TX_ANT][NUM_SC][2])
 {
     FILE *fp = fopen(file, "wb");
     
@@ -285,7 +315,7 @@ void save_data(
         fprintf(stderr, "Error opening file %s.\n", file);
         exit(8);
     }
-    if ((fwrite(x_mmse, sizeof(uint16_t), NUM_TX_ANT*NUM_SC*2, fp)) != NUM_TX_ANT*NUM_SC*2) {
+    if ((fwrite(x_mmse, sizeof(data_t), NUM_TX_ANT*NUM_SC*2, fp)) != NUM_TX_ANT*NUM_SC*2) {
         fprintf(stderr, "Error writing file %s.\n", file);
         exit(8);
     }
@@ -341,7 +371,7 @@ void mmse_nosqrt(
   cmatadd_TxTx(HH_H, R, HH_H);
   /* L, D: (H^H*H + R) = L*D*L^H */
   complex L[NUM_TX_ANT][NUM_TX_ANT][NUM_SC];
-  complex D[NUM_TX_ANT][NUM_TX_ANT][NUM_SC];
+  complex D[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; /* TODO D should be a vector to save memory*/
   ccholesky_nosqrt_TxTx(HH_H, L, D);
   /* z: L*z = H^H*y */
   complex HHy[NUM_TX_ANT][NUM_SC];
@@ -349,8 +379,10 @@ void mmse_nosqrt(
   complex z[NUM_TX_ANT][NUM_SC];
   cforwardsub_TxTx(L, HHy, z);
   /* x_MMSE: L^H*x_MMSE = D^-1*z */
+  for (uint32_t i = 0; i != NUM_TX_ANT; ++i)
+    for (uint32_t j = 0; j != NUM_SC; ++j)
+      z[i][j] = cdiv(z[i][j], D[i][i][j]);
   complex LH[NUM_TX_ANT][NUM_TX_ANT][NUM_SC];
   cmat_hermitian_transpose_TxTx(L, LH);
-  cbackwardsub_TxTx(LH, z, x_MMSE);
-  cmatvecmul_TxRx(D, x_MMSE, x_MMSE);
+  cbackwardsub_TxTx(L, z, x_MMSE);
 }

	mmserv Minimum Mean Square Error detection on RISC-V Vector Extention
	git clone https://git.ea.contact/mmserv
	Log \| Files \| Refs \| README

M	inc/define.h	\|	7	+++++--
M	inc/mmserv.h	\|	25	++++++++++++++++---------
M	main.c	\|	46	++++++++++++++++++++++++----------------------
M	scripts/gen_data.py	\|	10	+++++-----
M	scripts/mmse.py	\|	57	+++++++++++++++++++++++++--------------------------------
M	scripts/util.py	\|	18	+++++++++---------
M	scripts/viz_compare_x.py	\|	4	++--
M	src/mmserv.c	\|	170	+++++++++++++++++++++++++++++++++++++++++++++++--------------------------------