Initial floating point solution - mmserv - Minimum Mean Square Error detection on RISC-V Vector Extention

commit 70e540b099b6280fa4ccd5dd11ea0b7457e21601
Author: Egor Achkasov <eaachkasov@edu.hse.ru>
Date:   Fri, 25 Oct 2024 13:59:19 +0200

Initial floating point solution

Diffstat:
A .gitignore  | 9 +++++++++
A GNUmakefile  | 37 +++++++++++++++++++++++++++++++++++++
A README.md  | 36 ++++++++++++++++++++++++++++++++++++
A inc/define.h  | 13 +++++++++++++
A inc/mmserv.h  | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A main.c  | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A scripts/gen_data.py  | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A scripts/mmse.py  | 44 ++++++++++++++++++++++++++++++++++++++++++++
A scripts/simulation-run.fish  | 39 +++++++++++++++++++++++++++++++++++++++
A scripts/util.py  | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A scripts/viz_compare_x.py  | 36 ++++++++++++++++++++++++++++++++++++
A src/mmserv.c  | 356 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

12 files changed, 961 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,8 @@
+data
+out
+data.S
+build
+obj
+.vscode
+*__pycache__*
+.gdb_history
+\ No newline at end of file
diff --git a/GNUmakefile b/GNUmakefile
@@ -0,0 +1,37 @@
+CC = gcc
+CFLAGS = -Iinc -fno-builtin -Wall -Wextra
+OBJDIR = obj
+BUILDDIR = build
+
+SRC = $(wildcard src/*.c)
+OBJ = $(SRC:src/%.c=$(OBJDIR)/%.o)
+OBJ += $(OBJDIR)/main.o
+
+TARGET = $(BUILDDIR)/mmse
+
+# Debug target
+dbg: CFLAGS += -g
+dbg: $(TARGET)
+
+all: $(TARGET)
+
+# Compile the elf
+$(TARGET): $(OBJ)
+	@mkdir -p $(BUILDDIR)
+	$(CC) -o $@ $^
+
+# Compile the src/*.c object files
+$(OBJDIR)/%.o: src/%.c
+	@mkdir -p $(OBJDIR)
+	$(CC) $(CFLAGS) -c $< -o $@
+
+# Compile the main.c object files
+$(OBJDIR)/main.o: main.c
+	@mkdir -p $(OBJDIR)
+	$(CC) $(CFLAGS) -c $< -o $@
+
+clean:
+	rm -rf $(OBJDIR)/* $(BUILDDIR)/*
+
+.PHONY: all dbg clean
+
diff --git a/README.md b/README.md
@@ -0,0 +1,36 @@
+# MMSE on RISC-V
+
+Forked from [gfdmrv](https://gitlab.vodafone-chair.org/viktor.razilov/gfdmrv) and [lte-benchmark-code](https://gitlab.vodafone-chair.org/viktor.razilov/lte-benchmark-code).
+
+## Configuration
+
+Set up defines in `inc/mmserv.conf` before using the tool.
+- NUM_RX_ANT - number of recieving antennas
+- NUM_TX_ANT - number of transmitting antennas
+- NUM_SC - number of subcarriers 
+
+## Data generation
+
+Generate data with `scripts/gen_data.py`.
+
+```
+$ python scripts/gen_data.py --help
+```
+
+There are 4 variables that are generated:
+- x - transmitted signal
+- H - channel signal
+- R - noise correlation matrix
+- y - received signal
+
+The data is generated as complex floating point. It is interleaved into int16 before output. Interleaved data contains real values on even positions and imaginary values on odd positions. Interleaving and deinterleaving functions can be found in `scripts/util.py`.
+
+- txt files are line separated entries
+- bin files are densly concatenated values
+- S file an asm data file with multiple sections for each variable. Note that running `python scripts/gen_data.py --s` outputs the file into stdout, so consider using `> data.S`.  
+
+## Data visualization
+
+The program outputs the approximated `x` into `out/x_mmse.bin`. Run `python scripts/viz_compare_x.py` to plot the actual and approximated signal samples, assuming that `data/x.bin` exists.
+
+It is possible to compare the C implementation with a simple python numpy one. Running `python scripts/mmse.py` creates `out/x_mmse_python.bin` assuming that `data/x.bin` exists. Plot it together with the C approximation by `cp out/x_mmse_python.bin bin/x.bin`. Compare it with the original signal by `cp out/x_mmse_python.bin out/x_mmse.bin`.
diff --git a/inc/define.h b/inc/define.h
@@ -0,0 +1,13 @@
+#ifndef __DEFINE_H
+#define __DEFINE_H
+
+#define NUM_RX_ANT 12
+#define NUM_TX_ANT 12
+#define NUM_SC 1
+
+#define NOSQRT 1
+
+#define IN
+#define OUT
+
+#endif
diff --git a/inc/mmserv.h b/inc/mmserv.h
@@ -0,0 +1,194 @@
+#ifndef __MMSERV_H
+#define __MMSERV_H
+
+#include "define.h"
+
+#include <stdint.h> // for int16_t, int32_t
+#include <stddef.h> // for size_t
+#include <stdio.h>  // for FILE
+
+/*
+ * Typedefs
+ */
+
+typedef float data_t;
+typedef double acc_t;
+
+typedef struct {
+    acc_t re;
+    acc_t im;
+} complex;
+
+/** Calculate MMSE estimation of x in y = H*x + n
+ * \param H matrix of channel coefficients. Shape [NUM_RX_ANT][NUM_TX_ANT][NUM_SC]
+ * \param y received signal. Shape [NUM_RX_ANT][NUM_SC]
+ * \param R noise covariance matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
+ * \param x_MMSE output MMSE estimation of x. Shape [NUM_TX_ANT][NUM_SC]
+ */
+void mmse(
+  IN complex H[NUM_RX_ANT][NUM_TX_ANT][NUM_SC],
+  IN complex y[NUM_RX_ANT][NUM_SC],
+  IN complex R[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
+  OUT complex x_MMSE[NUM_TX_ANT][NUM_SC]);
+
+/** Calculate MMSE estimation of x in y = H*x + n. Uses a sqrt-free Cholesky decomposition.
+ * \param H matrix of channel coefficients. Shape [NUM_RX_ANT][NUM_TX_ANT][NUM_SC]
+ * \param y received signal. Shape [NUM_RX_ANT][NUM_SC]
+ * \param R noise covariance matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
+ * \param x_MMSE output MMSE estimation of x. Shape [NUM_TX_ANT][NUM_SC]
+ */
+void mmse_nosqrt(
+  IN complex H[NUM_RX_ANT][NUM_TX_ANT][NUM_SC],
+  IN complex y[NUM_RX_ANT][NUM_SC],
+  IN complex R[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
+  OUT complex x_MMSE[NUM_TX_ANT][NUM_SC]);
+
+/*
+ * Complex operations
+ */
+
+/** Create a complex number from real and imaginary parts */
+complex cmake(IN data_t re, IN data_t im);
+
+/** Complex multiplication */
+complex cmul(IN complex a, IN complex b);
+
+/** Complex absolute value squared */
+acc_t cabs2(IN complex a);
+
+/** Complex addition */
+complex cadd(IN complex a, IN complex b);
+void cadd_acc(IN complex a, IN complex b);
+
+/** Complex subtraction */
+complex csub(IN complex a, IN complex b);
+
+/** Complex conjugate */
+complex cconj(IN complex a);
+
+/** Complex division */
+complex cdiv(IN complex a, IN complex b);
+
+/** Square root of a natural number */
+data_t sqrt(IN acc_t x);
+
+/** Complex root */
+complex csqrt(IN complex a);
+
+/*
+ * Complex matrix operations
+ */
+
+/** Calculate the Hermitian transpose of a matrix A
+ * \param A input matrix. Shape [NUM_RX_ANT][NUM_TX_ANT][NUM_SC]
+ * \param AH output matrix. Shape [NUM_TX_ANT][NUM_RX_ANT][NUM_SC]
+ */
+void cmat_hermitian_transpose_RxTx(
+  IN complex A[NUM_RX_ANT][NUM_TX_ANT][NUM_SC],
+  OUT complex AH[NUM_TX_ANT][NUM_RX_ANT][NUM_SC]);
+
+/** Calculate the Hermitian transpose of a matrix A
+ * \param A input matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
+ * \param AH output matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
+ */
+void cmat_hermitian_transpose_TxTx(
+  IN complex A[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
+  OUT complex AH[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]);
+
+/** Multiply two matrices A and B
+ * \param A input matrix. Shape [NUM_TX_ANT][NUM_RX_ANT][NUM_SC]
+ * \param B input matrix. Shape [NUM_RX_ANT][NUM_TX_ANT][NUM_SC]
+ * \param result output matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
+ */
+void cmatmul_TxRx_RxTx(
+  IN complex A[NUM_TX_ANT][NUM_RX_ANT][NUM_SC],
+  IN complex B[NUM_RX_ANT][NUM_TX_ANT][NUM_SC],
+  OUT complex result[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]);
+
+/** Add two matrices A and B
+ * \param A input matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
+ * \param B input matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
+ * \param result output matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
+ */
+void cmatadd_TxTx(
+  IN complex A[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
+  IN complex B[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
+  OUT complex result[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]);
+
+/** Perfom Cholesky decomposition of a square matrix A
+ * such that A = L*L^H with Cholesky–Banachiewicz algorithm
+ * \param A input square matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
+ * \param L output lower triangular matrix L. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
+ */
+void ccholesky_TxTx(
+  IN complex A[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
+  OUT complex L[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]);
+
+/** Perfom Cholesky decomposition of a square matrix A
+ * such that A = L*D*L^H with Cholesky–Banachiewicz algorithm,
+ * where D is a diagonal matrix and
+ * L is a lower triangular matrix with ones on the diagonal.
+ * This function does not calculate use csqrt and sqrt functions.
+ * \param A input square matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
+ * \param L output lower triangular matrix L. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
+ * \param D output diagonal matrix L. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
+ */
+void ccholesky_nosqrt_TxTx(
+  IN complex A[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
+  OUT complex L[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
+  OUT complex D[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]);
+
+/** Multiply a matrix A with a vector b
+ * \param A input matrix. Shape [NUM_TX_ANT][NUM_RX_ANT][NUM_SC]
+ * \param b input vector. Shape [NUM_RX_ANT][NUM_SC]
+ * \param result output vector. Shape [NUM_TX_ANT][NUM_SC]
+ */
+void cmatvecmul_TxRx(
+  IN complex A[NUM_TX_ANT][NUM_RX_ANT][NUM_SC],
+  IN complex b[NUM_RX_ANT][NUM_SC],
+  OUT complex result[NUM_TX_ANT][NUM_SC]);
+
+/** Find z from L*z = b with a forward substitution
+ * \param L lower triangular matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
+ * \param b rhs vector. Shape [NUM_TX_ANTS][NUM_SC]
+ * \param result output vector z. Shape [NUM_TX_ANT][NUM_SC]
+ */
+void cforwardsub_TxTx(
+  IN complex L[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
+  IN complex b[NUM_TX_ANT][NUM_SC],
+  OUT complex result[NUM_TX_ANT][NUM_SC]);
+
+/** Find x from U*x = b with a backward substitution
+ * \param U upper triangular matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
+ * \param b rhs vector. Shape [NUM_TX_ANTS][NUM_SC]
+ * \param result output vector x. Shape [NUM_TX_ANT][NUM_SC]
+ */
+void cbackwardsub_TxTx(
+  IN complex U[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
+  IN complex b[NUM_TX_ANT][NUM_SC],
+  OUT complex result[NUM_TX_ANT][NUM_SC]);
+
+/*
+ * IO
+ */
+
+/** Load data from a binary file
+ * \param file path to the file
+ * \param size number of elements to read
+ * \param out output array
+ */
+void load_data(
+  IN const char *file,
+  IN size_t size,
+  OUT uint16_t *out);
+
+/** Save data to a binary file
+ * \param file path to the file
+ * \param x_mmse data to save
+ */
+void save_data(
+  IN const char* file,
+  IN uint16_t x_mmse[NUM_TX_ANT][NUM_SC][2]);
+
+
+#endif /* __MMSERV_H */
diff --git a/main.c b/main.c
@@ -0,0 +1,56 @@
+#include "inc/mmserv.h"
+
+int main() {
+  /* Load the data */
+  uint16_t x_raw[NUM_TX_ANT][NUM_SC][2];              /* Transmitted signal */
+  uint16_t H_raw[NUM_RX_ANT][NUM_TX_ANT][NUM_SC][2];  /* Channel */
+  uint16_t R_raw[NUM_TX_ANT][NUM_TX_ANT][NUM_SC][2];  /* Noise covariance matrix */
+  uint16_t y_raw[NUM_RX_ANT][NUM_SC][2];              /* Received signal */
+  load_data("data/x.bin", NUM_TX_ANT*NUM_SC*2, x_raw);
+  load_data("data/H.bin", NUM_RX_ANT*NUM_TX_ANT*NUM_SC*2, H_raw);
+  load_data("data/y.bin", NUM_RX_ANT*NUM_SC*2, y_raw);
+  load_data("data/R.bin", NUM_RX_ANT*NUM_SC*2, R_raw);
+
+  /* Cast the data into complex data structures */
+  complex x[NUM_TX_ANT][NUM_SC];
+  complex H[NUM_RX_ANT][NUM_TX_ANT][NUM_SC];
+  complex y[NUM_RX_ANT][NUM_SC];
+  complex R[NUM_TX_ANT][NUM_TX_ANT][NUM_SC];
+  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
+    for (uint32_t j = 0; j < NUM_SC; ++j){
+      x[i][j].re = x_raw[i][j][0];
+      x[i][j].im = x_raw[i][j][1];
+    }
+  for (uint32_t i = 0; i < NUM_RX_ANT; ++i)
+    for (uint32_t j = 0; j < NUM_TX_ANT; ++j)
+      for (uint32_t k = 0; k < NUM_SC; ++k){
+        H[i][j][k].re = H_raw[i][j][k][0];
+        H[i][j][k].im = H_raw[i][j][k][1];
+      }
+  for (uint32_t i = 0; i < NUM_RX_ANT; ++i)
+    for (uint32_t j = 0; j < NUM_SC; ++j){
+      y[i][j].re = y_raw[i][j][0];
+      y[i][j].im = y_raw[i][j][1];
+    }
+  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
+    for (uint32_t j = 0; j < NUM_TX_ANT; ++j)
+      for (uint32_t k = 0; k < NUM_SC; ++k){
+        R[i][j][k].re = R_raw[i][j][k][0];
+        R[i][j][k].im = R_raw[i][j][k][1];
+      }
+
+  /* Calculate the MMSE approximation */
+  complex x_MMSE[NUM_TX_ANT][NUM_SC];
+  mmse_nosqrt(H, y, R, x_MMSE);
+
+  /* Interleave the result */
+  uint16_t res[NUM_TX_ANT][NUM_SC][2];
+  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
+    for (uint32_t j = 0; j < NUM_SC; ++j){
+      res[i][j][0] = x_MMSE[i][j].re;
+      res[i][j][1] = x_MMSE[i][j].im;
+    }
+
+  /* Save the result */
+  save_data("out/x_mmse.bin", res);
+}
diff --git a/scripts/gen_data.py b/scripts/gen_data.py
@@ -0,0 +1,83 @@
+#!/usr/bin/python
+
+from util import read_defines, interleave
+
+import numpy as np
+from numpy.random import random, normal
+from sys import argv
+
+if "--help" in argv:
+    print("Usage: python scripts/gen_data.py [--txt] [--bin] [--s]")
+    print("  --txt: write the data/*.txt files")
+    print("  --bin: write the data/*.bin files")
+    print("  --s: print the .S file to stdout")
+    exit(0)
+
+# Change these flags to control the output
+WRITE_DATA_TXT = "--txt" in argv
+WRITE_DATA_BIN = "--bin" in argv
+WRITE_DATA_S = "--s" in argv
+
+NUM_RX_ANT, NUM_TX_ANT, NUM_SC = read_defines()
+NOISE_STD_DEVIATION = np.sqrt(.5) / 10  # noise standard deviation
+
+# Transmitter signal
+x = random((NUM_TX_ANT, NUM_SC)) \
+    + 1.j * random((NUM_TX_ANT, NUM_SC))
+x = (x - 0.5 - 0.5j) * 2  # scale it from [0, 1] to [-1, 1]
+# Channel
+H = random((NUM_RX_ANT, NUM_TX_ANT, NUM_SC)) \
+    + 1.j * random((NUM_RX_ANT, NUM_TX_ANT, NUM_SC))
+H = (H - 0.5 - 0.5j) * 2  # scale it from [0, 1] to [-1, 1]
+# Noise
+n = normal(0, NOISE_STD_DEVIATION, (NUM_RX_ANT, NUM_SC)) \
+    + 1.j * normal(0, NOISE_STD_DEVIATION, (NUM_RX_ANT, NUM_SC))
+# Received signal
+y = np.einsum("ijk,jk->ik", H, x) + n
+
+# Noise covariance matrix
+R = np.eye(NUM_TX_ANT, NUM_TX_ANT, dtype=np.complex64) * NOISE_STD_DEVIATION**2
+
+# Cast the complex data to int16
+x_data = interleave(x)
+H_data = interleave(H)
+R_data = interleave(R)
+y_data = interleave(y)
+data_tuple = (x_data, H_data, R_data, y_data)
+
+
+class Section:
+    def __init__(self, name, source, align, sizeof, length, duplicate=False):
+        self.name = name
+        self.source = source
+        self.align = align
+        self.sizeof = sizeof
+        self.length = length
+        self.duplicate = duplicate
+
+
+sections = [
+    Section("x", "data/x.txt", "3", 16, x.size * 2),
+    Section("H", "data/H.txt", "3", 16, H.size * 2),
+    Section("R", "data/R.txt", "3", 16, R.size * 2),
+    Section("y", "data/y.txt", "3", 16, y.size * 2),
+]
+
+if WRITE_DATA_TXT:
+    for data, sec in zip(data_tuple, sections):
+        with open(sec.source, "w") as f:
+            for sample in data:
+                f.write(f"{sample}\n")
+
+if WRITE_DATA_BIN:
+    for data, sec in zip(data_tuple, sections):
+        with open(sec.source.replace(".txt", ".bin"), "wb") as f:
+            f.write(data.tobytes())
+
+if WRITE_DATA_S:
+    print(".section .data,\"aw\",@progbits")
+    for data, sec in zip(data_tuple, sections):
+        print(f".global {sec.name}")
+        print(f"{sec.name}:")
+        for sample in data:
+            print(f"    .hword 0x{sample.view(np.uint16):04x} // {sample}")
diff --git a/scripts/mmse.py b/scripts/mmse.py
@@ -0,0 +1,44 @@
+#!/usr/bin/python
+
+from util import read_defines, deinterleave, interleave
+
+import numpy as np
+
+NUM_RX_ANT, NUM_TX_ANT, NUM_SC = read_defines()
+
+# Read the data from the files
+x = deinterleave(np.fromfile("data/x.bin", dtype=np.int16), (NUM_TX_ANT, NUM_SC))
+H = deinterleave(np.fromfile("data/H.bin", dtype=np.int16), (NUM_RX_ANT, NUM_TX_ANT, NUM_SC))
+R = deinterleave(np.fromfile("data/R.bin", dtype=np.int16), (NUM_TX_ANT, NUM_TX_ANT, NUM_SC))
+y = deinterleave(np.fromfile("data/y.bin", dtype=np.int16), (NUM_RX_ANT, NUM_SC))
+
+# Move the SC dimension to the beginning
+x = np.moveaxis(x, -1, 0)
+H = np.moveaxis(H, -1, 0)
+R = np.moveaxis(R, -1, 0)
+y = np.moveaxis(y, -1, 0)
+
+HH = H.conj().transpose(0, 2, 1)
+L = np.linalg.cholesky(HH @ H + R)
+HHy = np.einsum("...ij,...j->...i", HH, y)
+
+z = np.empty((NUM_SC, NUM_TX_ANT), dtype=np.complex64)
+for i in range(NUM_TX_ANT):
+    z[:, i] = (HHy[:, i] - np.sum([L[:, i, j]*z[:, j] for j in range(i)], 0)) / L[:, i, i]
+
+LH = L.conj().transpose(0, 2, 1)
+x_mmse = np.empty((NUM_SC, NUM_TX_ANT), dtype=np.complex64)
+for i in range(NUM_TX_ANT-1, -1, -1):
+    x_mmse[:, i] = (z[:, i] - np.sum([LH[:, i, j]*x_mmse[:, j] for j in range(i+1, NUM_TX_ANT)], 0)) / LH[:, i, i]
+
+# Move the SC dimension back to the end
+x_mmse = np.moveaxis(x_mmse, 0, -1)
+x = np.moveaxis(x, 0, -1)
+H = np.moveaxis(H, 0, -1)
+R = np.moveaxis(R, 0, -1)
+y = np.moveaxis(y, 0, -1)
+
+# Cast the complex data to int16
+x_mmse_data = interleave(x_mmse)
+with open("out/x_mmse_python.bin", "wb") as f:
+    f.write(x_mmse_data.tobytes())
diff --git a/scripts/simulation-run.fish b/scripts/simulation-run.fish
@@ -0,0 +1,39 @@
+#!/bin/fish
+
+date
+module load questa/10v7a
+
+set root $PWD
+set defconf config/default.mk
+set result "$root/results.csv"
+
+rm -f $defconf
+
+echo 'NR_LANES;VLEN;datalayout;order;M;K;cycles' > $result
+
+for nrl in 2 4 8 16
+    for vl in 512 1024 2048 4096
+        # Write config
+        echo "nr_lanes ?= $nrl" > $defconf
+        echo "vlen ?= $vl" >> $defconf
+        # Compile binary
+        cd apps
+        rm -f gfdmrv/src/gfdrmv.c.o
+        rm -f gfdmrv/data.S.o
+        make bin/gfdmrv
+        make bin/gfdmrv
+        cd $root
+        #Run simulation
+        cd hardware
+        app=gfdmrv timeout -s KILL 1h make simc
+        grep "~~~~~" build/transcript
+        if set resline (grep "~~~~~" build/transcript)
+            echo $resline | sed "s/# ~~~~~/$nrl;$vl;/" >> $result
+        else
+            echo "Something went wrong for NR_LANES $nrl and VLEN $vl"
+        end
+        cd $root
+    end
+end
+
+date
diff --git a/scripts/util.py b/scripts/util.py
@@ -0,0 +1,58 @@
+import numpy as np
+
+
+def read_defines():
+    """Read the defines from the define.h file
+
+    Returns:
+        int: Number of receive antennas
+        int: Number of transmit antennas
+        int: Number of subcarriers
+    """
+    with open(__file__[:__file__.rfind("/")] + "/../inc/define.h", "r") as f:
+        lines = f.read().split("\n")
+    for line in lines:
+        if line[:19] == "#define NUM_RX_ANT ":
+            NUM_RX_ANT = int(line[19:])
+        if line[:19] == "#define NUM_TX_ANT ":
+            NUM_TX_ANT = int(line[19:])
+        if line[:15] == "#define NUM_SC ":
+            NUM_SC = int(line[15:])
+
+    # Assert that all the defines are read
+    assert NUM_RX_ANT
+    assert NUM_TX_ANT
+    assert NUM_SC
+
+    return NUM_RX_ANT, NUM_TX_ANT, NUM_SC
+
+
+def interleave(data: np.ndarray) -> np.ndarray:
+    """Cast a np.complex64 array to np.int16 array.
+
+    Args:
+        data (np.ndarray): The complex data to be casted. Dtype: np.complex64
+
+    Returns:
+        np.ndarray: The interleaved data. Dtype: np.int16
+    """
+    res = np.empty(2 * data.size, dtype=np.int16)
+    res[0::2] = data.flatten().real.astype(np.float16).view(np.int16)
+    res[1::2] = data.flatten().imag.astype(np.float16).view(np.int16)
+    return res
+
+
+def deinterleave(data: np.ndarray, shape: tuple) -> np.ndarray:
+    """Cast a np.int16 array to np.complex64 array.
+
+    Args:
+        data (np.ndarray): The interleaved data to be casted. Dtype: np.int16
+        shape (tuple): The shape of the deinterleaved data.
+
+    Returns:
+        np.ndarray: The deinterleaved data. Dtype: np.complex64
+    """
+    res = np.empty(shape, np.complex64)
+    res.real = data[0::2].view(np.float16).astype(np.float32).reshape(shape)
+    res.imag = data[1::2].view(np.float16).astype(np.float32).reshape(shape)
+    return res
diff --git a/scripts/viz_compare_x.py b/scripts/viz_compare_x.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+
+from util import read_defines, deinterleave
+
+import numpy as np
+from matplotlib import pyplot as plt
+
+# read `out/x_mmse.bin` and plot the complex signal data samples
+
+NUM_RX_ANT, NUM_TX_ANT, NUM_SC = read_defines()
+
+with open("out/x_mmse.bin", "rb") as f:
+    x_mmse = deinterleave(np.fromfile(f, dtype=np.int16), (NUM_TX_ANT, NUM_SC))
+with open("data/x.bin", "rb") as f:
+    x = deinterleave(np.fromfile(f, dtype=np.int16), (NUM_TX_ANT, NUM_SC))
+
+for tx in range(NUM_TX_ANT):
+    plt.scatter(x_mmse[tx].real,
+                x_mmse[tx].imag,
+                label=f"MMSE Tx {tx}",
+                marker="x",
+                color=plt.cm.hsv(tx / NUM_TX_ANT))
+    plt.scatter(x[tx].real,
+                x[tx].imag,
+                label=f"Orig Tx {tx}",
+                marker="o",
+                color=plt.cm.hsv(tx / NUM_TX_ANT))
+plt.axhline(0, color='black')
+plt.axvline(0, color='black')
+plt.xlim(-1.1, 1.1)
+plt.ylim(-1.1, 1.1)
+plt.title("Approximated MMSE Signal vs Original Signal Samples")
+plt.ylabel("Imaginary")
+plt.xlabel("Real")
+plt.legend()
+plt.show()
diff --git a/src/mmserv.c b/src/mmserv.c
@@ -0,0 +1,356 @@
+#include "mmserv.h"
+
+#include <stdlib.h> // for exit
+
+/*
+ * Complex functions
+ */
+
+complex cmake(IN data_t re, IN data_t im)
+{
+  complex t;
+  t.re = re;
+  t.im = im;
+  return t;
+}
+complex cmul(IN complex a, IN complex b)
+{
+  complex t;
+  t.re = a.re * b.re - a.im * b.im;
+  t.im = a.im * b.re + a.re * b.im;
+  return t;	
+}
+acc_t cabs2(IN complex a)
+{
+  return a.re * a.re + a.im * a.im;
+}
+complex cadd(IN complex a, IN complex b)
+{
+  complex t;
+  t.re = a.re + b.re;
+  t.im = a.im + b.im;
+  return t;
+}
+void cadd_acc(IN complex a, IN complex b)
+{
+  a.re += b.re;
+  a.im += b.im;
+}
+complex csub(IN complex a, IN complex b)
+{
+  complex t;
+  t.re = a.re - b.re;
+  t.im = a.im - b.im;
+  return t;
+}
+complex cconj(IN complex a)
+{
+  complex t;
+  t.re = a.re;
+  t.im = -a.im;
+  return t;
+}
+complex cdiv(IN complex a, IN complex b)
+{
+  complex t;
+  t.im = b.re * b.re + b.im * b.im;
+  t.re = (a.re * b.re + a.im * b.im) / t.im;
+  t.im = (a.im * b.re - a.re * b.im) / t.im;
+  return t;
+}
+
+data_t sqrt(IN acc_t x)
+{
+  if (x < 2) return x;
+  acc_t lo = 1, hi = x, mid;
+  while (100 * lo * lo < x) lo *= 10;
+  while (hi * hi / 100 > x) hi /= 10;
+  for (int i = 0; i < 100; i++) {
+    mid = (lo + hi) / 2;
+    if (mid * mid == x) return mid;
+    if (mid * mid > x) hi = mid;
+    else lo = mid;
+  }
+  return mid;
+}
+complex csqrt(IN complex a)
+{
+  data_t length = sqrt(cabs2(a));
+  complex res;
+  res.re = sqrt((length + a.re) / 2);
+  res.im = sqrt((length - a.re) / 2);
+  res.im *= (a.im > 0) - (a.im < 0);
+  return res;
+}
+
+/*
+ * Complex matrix operations
+ */
+
+void cmat_hermitian_transpose_RxTx(
+  IN complex A[NUM_RX_ANT][NUM_TX_ANT][NUM_SC],
+  OUT complex AH[NUM_TX_ANT][NUM_RX_ANT][NUM_SC])
+{
+  for (uint32_t i = 0; i < NUM_RX_ANT; ++i)
+    for (uint32_t j = 0; j < NUM_TX_ANT; ++j)
+      for (uint32_t k = 0; k < NUM_SC; ++k)
+        AH[i][j][k] = cconj(A[j][i][k]);
+}
+
+void cmat_hermitian_transpose_TxTx(
+  IN complex A[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
+  OUT complex AH[NUM_TX_ANT][NUM_TX_ANT][NUM_SC])
+{
+  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
+    for (uint32_t j = 0; j < NUM_TX_ANT; ++j)
+      for (uint32_t k = 0; k < NUM_SC; ++k)
+        AH[i][j][k] = cconj(A[j][i][k]);
+}
+
+void cmatmul_TxRx_RxTx(
+  IN complex A[NUM_TX_ANT][NUM_RX_ANT][NUM_SC],
+  IN complex B[NUM_RX_ANT][NUM_TX_ANT][NUM_SC],
+  OUT complex result[NUM_TX_ANT][NUM_TX_ANT][NUM_SC])
+{
+  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
+    for (uint32_t j = 0; j < NUM_TX_ANT; ++j)
+      for (uint32_t k = 0; k < NUM_RX_ANT; ++k)
+        for (uint32_t l = 0; l < NUM_SC; ++l)
+          result[i][j][l] = cadd(result[i][j][l], cmul(A[i][k][l], B[k][j][l]));
+}
+
+void cmatadd_TxTx(
+  IN complex A[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
+  IN complex B[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
+  OUT complex result[NUM_TX_ANT][NUM_TX_ANT][NUM_SC])
+{
+  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
+    for (uint32_t j = 0; j < NUM_TX_ANT; ++j)
+      for (uint32_t k = 0; k < NUM_SC; ++k)
+        result[i][j][k] = cadd(A[i][j][k], B[i][j][k]);
+}
+
+void ccholesky_TxTx(
+  IN complex A[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
+  OUT complex L[NUM_TX_ANT][NUM_TX_ANT][NUM_SC])
+{
+  /* Init upper triangle with zeros */
+  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
+    for (uint32_t j = i+1; j < NUM_TX_ANT; ++j)
+      for (uint32_t k = 0; k < NUM_SC; ++k)
+        L[i][j][k] = cmake(0, 0);
+  /* Calculate diagonal and lower triangular entries */
+  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
+    for (uint32_t j = 0; j < i+1; ++j)
+      for (uint32_t k = 0; k < NUM_SC; ++k){
+        complex sum = {0, 0};
+        for (uint32_t l = 0; l < j; ++l)
+          cadd_acc(sum, cmul(L[i][l][k], cconj(L[j][l][k])));
+        if (i == j)
+          L[i][j][k] = csqrt(csub(A[i][i][k], sum));
+        else
+          L[i][j][k] = cdiv(csub(A[i][j][k], sum), L[j][j][k]);
+
+        // TODO delete
+        #if 0
+        for (uint32_t i = 0; i < NUM_TX_ANT; ++i){
+          for (uint32_t j = 0; j < NUM_TX_ANT; ++j)
+            printf("(%6i, %6i) ", L[i][j][0].re, L[i][j][0].im);
+          printf("\n");
+        }
+        printf("\n");
+        #endif
+      }
+}
+
+void ccholesky_nosqrt_TxTx(
+  IN complex A[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
+  OUT complex L[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
+  OUT complex D[NUM_TX_ANT][NUM_TX_ANT][NUM_SC])
+{
+  /* Init zeros in upper triangles of L and D */
+  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
+    for (uint32_t j = i+1; j < NUM_TX_ANT; ++j)
+      for (uint32_t k = 0; k < NUM_SC; ++k)
+        D[i][j][k] = L[i][j][k] = cmake(0, 0);
+  /* Init zeros in lower triangle of D */
+  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
+    for (uint32_t j = 0; j < i; ++j)
+      for (uint32_t k = 0; k < NUM_SC; ++k)
+        D[i][j][k] = cmake(0, 0);
+  /* Init ones in the diagonal of the result */
+  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
+    for (uint32_t k = 0; k < NUM_SC; ++k)
+      L[i][i][k] = cmake(1, 0);
+
+
+  /* Calculate the lower triangle of the result */
+  for (uint32_t i = 0; i < NUM_TX_ANT; ++i){
+    /* L_ij = (A_ij - \sum_{l=0}^{j-1} L_il L_jl^* D_k) / D_j */
+    for (uint32_t j = 0; j < i; ++j){
+      complex sum = {0, 0};
+      for (uint32_t l = 0; l < j; ++l)
+        for (uint32_t k = 0; k < NUM_SC; ++k){
+          complex t = cmul(L[i][l][k], cconj(L[j][l][k]));
+          t = cmul(t, D[l][l][k]);
+          cadd_acc(sum, t);
+        }
+      for (uint32_t k = 0; k < NUM_SC; ++k){
+        L[i][j][k] = csub(A[i][j][k], sum);
+        L[i][j][k] = cdiv(L[i][j][k], D[j][j][k]);
+      }
+    }
+    /* D_i = A_ii - \sum_{j=0}^{i-1} L_ij L_ij^* D_j */
+    complex sum = {0, 0};
+    for (uint32_t j = 0; j < i; ++j)
+      for (uint32_t k = 0; k < NUM_SC; ++k){
+        complex t = cmul(L[i][j][k], cconj(L[i][j][k]));
+        t = cmul(t, D[j][j][k]);
+        cadd_acc(sum, t);
+      }
+    for (uint32_t k = 0; k < NUM_SC; ++k)
+      D[i][i][k] = csub(A[i][i][k], sum);
+  }
+}
+
+void cmatvecmul_TxRx(
+  IN complex A[NUM_TX_ANT][NUM_RX_ANT][NUM_SC],
+  IN complex b[NUM_RX_ANT][NUM_SC],
+  OUT complex result[NUM_TX_ANT][NUM_SC])
+{
+  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
+    for (uint32_t j = 0; j < NUM_RX_ANT; ++j)
+      for (uint32_t k = 0; k < NUM_SC; ++k)
+        result[i][k] = cadd(result[i][k], cmul(A[i][j][k], b[j][k]));
+}
+
+void cforwardsub_TxTx(
+  IN complex L[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
+  IN complex b[NUM_TX_ANT][NUM_SC],
+  OUT complex result[NUM_TX_ANT][NUM_SC])
+{
+  for (uint32_t i = 0; i < NUM_TX_ANT; ++i)
+    for (uint32_t j = 0; j < NUM_SC; ++j){
+      result[i][j] = b[i][j];
+      for (uint32_t k = 0; k < i; ++k)
+        result[i][j] = csub(result[i][j], cmul(L[i][k][j], result[k][j]));
+      result[i][j] = cdiv(result[i][j], L[i][i][j]);
+    }
+}
+
+void cbackwardsub_TxTx(
+  IN complex U[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
+  IN complex b[NUM_TX_ANT][NUM_SC],
+  OUT complex result[NUM_TX_ANT][NUM_SC])
+{
+  for (uint32_t i = NUM_TX_ANT; i > 0; --i)
+    for (uint32_t j = 0; j < NUM_SC; ++j){
+      result[i][j] = b[i][j];
+      for (uint32_t k = i+1; k < NUM_TX_ANT; ++k)
+        result[i][j] = csub(result[i][j], cmul(U[i][k][j], result[k][j]));
+      result[i][j] = cdiv(result[i][j], U[i][i][j]);
+    }
+}
+
+/*
+ * IO
+ */
+
+void load_data(
+  IN const char* file,
+  IN size_t size,
+  OUT uint16_t *out)
+{
+    FILE *fp = fopen(file, "rb");
+    
+    if (fp == NULL) {
+        fprintf(stderr, "Error opening file %s.\n", file);
+        exit(8);
+    }
+    if ((fread(out, sizeof(uint16_t), size, fp)) != size) {
+        fprintf(stderr, "Error reading file %s.\n", file);
+        exit(8);
+    }
+
+    fclose(fp);
+}
+
+void save_data(
+  IN const char* file,
+  IN uint16_t x_mmse[NUM_TX_ANT][NUM_SC][2])
+{
+    FILE *fp = fopen(file, "wb");
+    
+    if (fp == NULL) {
+        fprintf(stderr, "Error opening file %s.\n", file);
+        exit(8);
+    }
+    if ((fwrite(x_mmse, sizeof(uint16_t), NUM_TX_ANT*NUM_SC*2, fp)) != NUM_TX_ANT*NUM_SC*2) {
+        fprintf(stderr, "Error writing file %s.\n", file);
+        exit(8);
+    }
+
+    fclose(fp);
+}
+
+/*
+ * MMSE
+ */
+
+void mmse(
+  IN complex H[NUM_RX_ANT][NUM_TX_ANT][NUM_SC],
+  IN complex y[NUM_RX_ANT][NUM_SC],
+  IN complex R[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
+  OUT complex x_MMSE[NUM_TX_ANT][NUM_SC])
+{
+  /* H^H */
+  complex HH[NUM_TX_ANT][NUM_RX_ANT][NUM_SC];
+  cmat_hermitian_transpose_RxTx(H, HH);
+  /* H^H*H */
+  complex HH_H[NUM_TX_ANT][NUM_TX_ANT][NUM_SC];
+  cmatmul_TxRx_RxTx(HH, H, HH_H);
+  /* H^H*H + R */
+  cmatadd_TxTx(HH_H, R, HH_H);
+  /* L: (H^H*H + R) = L*L^H */
+  complex L[NUM_TX_ANT][NUM_TX_ANT][NUM_SC];
+  ccholesky_TxTx(HH_H, L);
+  /* z: L*z = H^H*y */
+  complex HHy[NUM_TX_ANT][NUM_SC];
+  cmatvecmul_TxRx(HH, y, HHy);
+  complex z[NUM_TX_ANT][NUM_SC];
+  cforwardsub_TxTx(L, HHy, z);
+  /* x_MMSE: L^H*x_MMSE = z */
+  complex LH[NUM_TX_ANT][NUM_TX_ANT][NUM_SC];
+  cmat_hermitian_transpose_TxTx(L, LH);
+  cbackwardsub_TxTx(LH, z, x_MMSE);
+}
+
+void mmse_nosqrt(
+  IN complex H[NUM_RX_ANT][NUM_TX_ANT][NUM_SC],
+  IN complex y[NUM_RX_ANT][NUM_SC],
+  IN complex R[NUM_TX_ANT][NUM_TX_ANT][NUM_SC],
+  OUT complex x_MMSE[NUM_TX_ANT][NUM_SC])
+{
+  /* H^H */
+  complex HH[NUM_TX_ANT][NUM_RX_ANT][NUM_SC];
+  cmat_hermitian_transpose_RxTx(H, HH);
+  /* H^H*H */
+  complex HH_H[NUM_TX_ANT][NUM_TX_ANT][NUM_SC];
+  cmatmul_TxRx_RxTx(HH, H, HH_H);
+  /* H^H*H + R */
+  cmatadd_TxTx(HH_H, R, HH_H);
+  /* L, D: (H^H*H + R) = L*D*L^H */
+  complex L[NUM_TX_ANT][NUM_TX_ANT][NUM_SC];
+  complex D[NUM_TX_ANT][NUM_TX_ANT][NUM_SC];
+  ccholesky_nosqrt_TxTx(HH_H, L, D);
+  /* z: L*z = H^H*y */
+  complex HHy[NUM_TX_ANT][NUM_SC];
+  cmatvecmul_TxRx(HH, y, HHy);
+  complex z[NUM_TX_ANT][NUM_SC];
+  cforwardsub_TxTx(L, HHy, z);
+  /* x_MMSE: L^H*x_MMSE = D^-1*z */
+  complex LH[NUM_TX_ANT][NUM_TX_ANT][NUM_SC];
+  cmat_hermitian_transpose_TxTx(L, LH);
+  cbackwardsub_TxTx(LH, z, x_MMSE);
+  cmatvecmul_TxRx(D, x_MMSE, x_MMSE);
+}

	mmserv Minimum Mean Square Error detection on RISC-V Vector Extention
	git clone https://git.ea.contact/mmserv
	Log \| Files \| Refs \| README

A	.gitignore	\|	9	+++++++++
A	GNUmakefile	\|	37	+++++++++++++++++++++++++++++++++++++
A	README.md	\|	36	++++++++++++++++++++++++++++++++++++
A	inc/define.h	\|	13	+++++++++++++
A	inc/mmserv.h	\|	194	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	main.c	\|	56	++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	scripts/gen_data.py	\|	83	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	scripts/mmse.py	\|	44	++++++++++++++++++++++++++++++++++++++++++++
A	scripts/simulation-run.fish	\|	39	+++++++++++++++++++++++++++++++++++++++
A	scripts/util.py	\|	58	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	scripts/viz_compare_x.py	\|	36	++++++++++++++++++++++++++++++++++++
A	src/mmserv.c	\|	356	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++