mmserv

Minimum Mean Square Error detection on RISC-V Vector Extension
git clone https://git.ea.contact/mmserv
Log | Files | Refs | README

commit cd5c7cbc34e6593198ef39c5fc61661a6998ae86
parent c3c44a832f0f41859028f99bd12f46af43e5131b
Author: Egor Achkasov <eaachkasov@gmail.com>
Date:   Tue, 20 May 2025 00:48:28 +0200

Massive refactor. Squash all the branches into master.

Diffstat:
MGNUmakefile | 108+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
Ainclude/common.h | 81+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Dinclude/define.h | 20--------------------
Mmain.c | 173+++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
Mscript/gen_data.py | 98++++++++++++++++++++++---------------------------------------------------------
Mscript/util.py | 36+++++-------------------------------
Asrc/cbackwardsub.c | 132+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/ccholesky.c | 236+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/cforwardsub.c | 131+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/cmatgram.c | 131+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/cmatvecmul.c | 100+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Dsrc/mmserv.c | 631-------------------------------------------------------------------------------
12 files changed, 1036 insertions(+), 841 deletions(-)

diff --git a/GNUmakefile b/GNUmakefile @@ -1,37 +1,91 @@ -CC = gcc -CFLAGS = -Iinclude -fno-builtin -Wall -Wextra -OBJDIR = obj -BUILDDIR = . +# Default values +ARCH ?= x86 +DATA_TYPE ?= float +PLATFORM ?= linux +NUM_RX ?= 4 +NUM_TX ?= 4 +NUM_SC ?= 1024 -SRC = $(wildcard src/*.c) -OBJ = $(SRC:src/%.c=$(OBJDIR)/%.o) -OBJ += $(OBJDIR)/main.o +# Valid values +VALID_ARCHS := x86 rv rvv +VALID_DATA_TYPES := float fixed +VALID_PLATFORMS := linux ara baremetal -TARGET = $(BUILDDIR)/mmse +# Validate inputs +ifneq ($(filter $(ARCH),$(VALID_ARCHS)),$(ARCH)) + $(error Invalid ARCH: $(ARCH). Supported: $(VALID_ARCHS)) +endif +ifneq ($(filter $(DATA_TYPE),$(VALID_DATA_TYPES)),$(DATA_TYPE)) + $(error Invalid DATA_TYPE: $(DATA_TYPE). Supported: $(VALID_DATA_TYPES)) +endif +ifneq ($(filter $(PLATFORM),$(VALID_PLATFORMS)),$(PLATFORM)) + $(error Invalid PLATFORM: $(PLATFORM). Supported: $(VALID_PLATFORMS)) +endif +ifneq ($(shell test $(NUM_RX) -gt 0 >/dev/null 2>&1 && echo valid),valid) + $(error NUM_RX must be an integer > 0) +endif +ifneq ($(shell test $(NUM_TX) -gt 0 >/dev/null 2>&1 && echo valid),valid) + $(error NUM_TX must be an integer > 0) +endif +ifneq ($(shell test $(NUM_SC) -gt 0 >/dev/null 2>&1 && echo valid),valid) + $(error NUM_SC must be an integer > 0) +endif -all: $(TARGET) +# CFLAGS +CFLAGS += -DARCH_$(ARCH) +CFLAGS += -DDATA_TYPE_$(DATA_TYPE) +CFLAGS += -DPLATFORM_$(PLATFORM) +CFLAGS += -DNUM_RX=$(NUM_RX) +CFLAGS += -DNUM_TX=$(NUM_TX) +CFLAGS += -DNUM_SC=$(NUM_SC) -# Debug target -dbg: CFLAGS += -g -dbg: $(TARGET) +# Compiler selection +ifeq ($(ARCH),x86) + CC := gcc +else + CC := riscv64-unknown-elf-gcc +endif -# Compile the elf -$(TARGET): $(OBJ) - @mkdir -p $(BUILDDIR) - $(CC) -o $@ $^ +# Output file +OUTPUT := build/mmse_$(ARCH)_$(DATA_TYPE)_$(PLATFORM)_$(NUM_RX)x$(NUM_TX)x$(NUM_SC).elf -# Compile the src/*.c object files -$(OBJDIR)/%.o: src/%.c - @mkdir -p $(OBJDIR) - $(CC) $(CFLAGS) -c $< -o $@ +# Source files +SRCS := main.c $(wildcard src/*.c) -# Compile 
the main.c object files -$(OBJDIR)/main.o: main.c - @mkdir -p $(OBJDIR) - $(CC) $(CFLAGS) -c $< -o $@ +# Phony targets +.PHONY: all help gen_data -clean: - rm -rf $(OBJDIR)/* $(BUILDDIR)/mmse +# Default target +all: gen_data $(OUTPUT) -.PHONY: all dbg clean +# Run data generation +gen_data: + python script/gen_data.py $(NUM_TX) $(NUM_RX) $(NUM_SC) +# Compile +$(OUTPUT): $(SRCS) + mkdir -p build + $(CC) $(CFLAGS) $^ -o $@ + +# Help target +help: + @echo "Usage:" + @echo " make [ARCH=<arch>] [DATA_TYPE=<type>] [PLATFORM=<platform>] [NUM_RX=<num_rx>] [NUM_TX=<num_tx>] [NUM_SC=<num_sc>]" + @echo "" + @echo "Supported ARCH values:" + @echo " - x86 (default)" + @echo " - rv" + @echo " - rvv" + @echo "" + @echo "Supported DATA_TYPE values:" + @echo " - float (default)" + @echo " - fixed" + @echo "" + @echo "Supported PLATFORM values:" + @echo " - linux (default)" + @echo " - ara" + @echo " - baremetal" + @echo "" + @echo "Supported NUM_RX values: integers > 0 (default = 4)" + @echo "Supported NUM_TX values: integers > 0 (default = 4)" + @echo "Supported NUM_SC values: integers > 0 (default = 1024)" diff --git a/include/common.h b/include/common.h @@ -0,0 +1,81 @@ +#ifndef COMMON_H +#define COMMON_H + +#include <stdint.h> /* for uint64_t, uint32_t */ + + +/* + * Typedefs + */ + +#if defined(DATA_TYPE_float) +typedef float data_t; +typedef float acc_t; +#elif defined(DATA_TYPE_fixed) +typedef int32_t data_t; +typedef int64_t acc_t; +#define FP_Q 31 +#else +#error "Please define DATA_TYPE_float or DATA_TYPE_fixed" +#endif + +typedef struct { + data_t *re; + data_t *im; +} vcomplex; + + +/* + * Global variables + */ + +/* Raw data */ +/* Transmitted signal */ +extern data_t x_re[NUM_TX][NUM_SC]; +extern data_t x_im[NUM_TX][NUM_SC]; +/* Channel */ +extern data_t H_re[NUM_RX][NUM_TX][NUM_SC]; +extern data_t H_im[NUM_RX][NUM_TX][NUM_SC]; +/* Noise covariance matrix */ +extern data_t R_re[NUM_TX][NUM_TX][NUM_SC]; +extern data_t R_im[NUM_TX][NUM_TX][NUM_SC]; +/* Received signal */ 
+extern data_t y_re[NUM_RX][NUM_SC]; +extern data_t y_im[NUM_RX][NUM_SC]; +/* MMSE raw data */ +extern data_t G_re[NUM_TX][NUM_TX][NUM_SC]; +extern data_t G_im[NUM_TX][NUM_TX][NUM_SC]; +extern data_t L_re[NUM_TX][NUM_TX][NUM_SC]; +extern data_t L_im[NUM_TX][NUM_TX][NUM_SC]; +extern data_t g_D[NUM_TX][NUM_SC]; /* no imaginary part in D */ +extern data_t HHy_re[NUM_TX][NUM_SC]; +extern data_t HHy_im[NUM_TX][NUM_SC]; +extern data_t z_re[NUM_TX][NUM_SC]; +extern data_t z_im[NUM_TX][NUM_SC]; +/* Result of MMSE approximation */ +extern data_t x_MMSE_re[NUM_TX][NUM_SC]; +extern data_t x_MMSE_im[NUM_TX][NUM_SC]; + +/* Same data but casted to vcomplex */ +extern vcomplex g_x; +extern vcomplex g_H; +extern vcomplex g_R; +extern vcomplex g_y; +extern vcomplex g_G; +extern vcomplex g_L; +extern vcomplex g_HHy; +extern vcomplex g_z; +extern vcomplex g_x_MMSE; + + +/* + * Complex matrix operations + */ + +extern void cmatgram(); +extern void ccholesky(); +extern void cmatvecmul(); +extern void cforwardsub(); +extern void cbackwardsub(); + +#endif diff --git a/include/define.h b/include/define.h @@ -1,20 +0,0 @@ -#ifndef __DEFINE_H -#define __DEFINE_H - -#define NUM_RX_ANT 4 -#define NUM_TX_ANT 4 -#define NUM_SC 1 - -#define DEBUG - -#define IN -#define OUT - -typedef float data_t; -typedef float acc_t; -typedef struct { - data_t *re; - data_t *im; -} vcomplex; - -#endif diff --git a/main.c b/main.c @@ -1,72 +1,123 @@ -#include "include/define.h" - -#include "../common/printf.h" - -/* extern functions */ -extern void cmatgram_TxRx_cadd(); -extern void ccholesky_TxTx(); -extern void cmatvecmul_TxRx(); -extern void cforwardsub_TxTx(); -extern void cbackwardsub_TxTx(); -extern acc_t mse(); - -/* Raw data */ -/* Transmitted signal */ -extern data_t x_re[NUM_TX_ANT][NUM_SC]; -extern data_t x_im[NUM_TX_ANT][NUM_SC]; -/* Channel */ -extern data_t H_re[NUM_RX_ANT][NUM_TX_ANT][NUM_SC]; -extern data_t H_im[NUM_RX_ANT][NUM_TX_ANT][NUM_SC]; -/* Noise covariance matrix */ -extern data_t 
R_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; -extern data_t R_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; -/* Received signal */ -extern data_t y_re[NUM_RX_ANT][NUM_SC]; -extern data_t y_im[NUM_RX_ANT][NUM_SC]; - -/* Same data but casted to vcomplex */ +#include "include/common.h" + +#include <stddef.h> /* for size_t */ + + +/* + * Defines + */ + +/* Got from https://elm-chan.org/junk/32bit/binclude.html */ +/* Import a binary file */ +#define IMPORT_BIN(sect, file, sym) __asm__ (\ + ".section " #sect "\n" /* Change section */\ + ".balign 4\n" /* Word alignment */\ + ".global " #sym "\n" /* Export the object address */\ + #sym ":\n" /* Define the object label */\ + ".incbin \"" file "\"\n" /* Import the file */\ + ".global _sizeof_" #sym "\n" /* Export the object size */\ + ".set _sizeof_" #sym ", . - " #sym "\n" /* Define the object size */\ + ".balign 4\n" /* Word alignment */\ + ".section \".text\"\n") /* Restore section */ + + +/* + * Global variables + */ + +/* Import data from binary files */ +IMPORT_BIN(.rodata, "data/x_re.bin", x_re); +IMPORT_BIN(.rodata, "data/x_im.bin", x_im); +IMPORT_BIN(.rodata, "data/H_re.bin", H_re); +IMPORT_BIN(.rodata, "data/H_im.bin", H_im); +IMPORT_BIN(.rodata, "data/R_re.bin", R_re); +IMPORT_BIN(.rodata, "data/R_im.bin", R_im); +IMPORT_BIN(.rodata, "data/y_re.bin", y_re); +IMPORT_BIN(.rodata, "data/y_im.bin", y_im); + +/* Allocate space for MMSE raw data */ +data_t G_re[NUM_TX][NUM_TX][NUM_SC]; +data_t G_im[NUM_TX][NUM_TX][NUM_SC]; +data_t L_re[NUM_TX][NUM_TX][NUM_SC]; +data_t L_im[NUM_TX][NUM_TX][NUM_SC]; +data_t g_D[NUM_TX][NUM_SC]; /* no imaginary part in D */ +data_t HHy_re[NUM_TX][NUM_SC]; +data_t HHy_im[NUM_TX][NUM_SC]; +data_t z_re[NUM_TX][NUM_SC]; +data_t z_im[NUM_TX][NUM_SC]; +data_t x_MMSE_re[NUM_TX][NUM_SC]; +data_t x_MMSE_im[NUM_TX][NUM_SC]; + +/* Initialize data */ vcomplex g_x = { .re = (data_t *)x_re, .im = (data_t *)x_im }; vcomplex g_H = { .re = (data_t *)H_re, .im = (data_t *)H_im }; vcomplex g_R = { .re = (data_t *)R_re, .im 
= (data_t *)R_im }; vcomplex g_y = { .re = (data_t *)y_re, .im = (data_t *)y_im }; - -/* MMSE approximation will be stored in this global*/ -data_t x_MMSE_re[NUM_TX_ANT][NUM_SC]; -data_t x_MMSE_im[NUM_TX_ANT][NUM_SC]; +vcomplex g_G = { .re = (data_t *)G_re, .im = (data_t *)G_im }; +vcomplex g_L = { .re = (data_t *)L_re, .im = (data_t *)L_im }; +vcomplex g_HHy = { .re = (data_t *)HHy_re, .im = (data_t *)HHy_im }; +vcomplex g_z = { .re = (data_t *)z_re, .im = (data_t *)z_im }; vcomplex g_x_MMSE = { .re = (data_t *)x_MMSE_re, .im = (data_t *)x_MMSE_im }; -extern vcomplex g_HH, h_y, g_HHy; -size_t num_rx_cur=1, num_tx_cur=1, num_sc_cur=1; + +/* + * Read cycles macro and function + */ + +/** Read and return the cycle counter value */ +uint64_t readcycle() { +#if defined(ARCH_rvv) || defined(ARCH_rv) + uint64_t val; + __asm__ volatile("rdcycle %0" : "=r"(val)); + return val; +#elif defined(ARCH_x86) + unsigned int hi, lo; + __asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi)); + return ((uint64_t)hi << 32) | lo; +#else +#error "Unknown architecture" +#endif +} + +/** Read cycles for a function call and store the result in out */ +#define FUNC_CYCLES(func, out) \ + do { \ + uint64_t start = readcycle(); \ + func; \ + uint64_t end = readcycle(); \ + out = end - start; \ + } while (0) + + +/* + * Printf function + */ + +#if defined(PLATFORM_ara) +#include "../../common/printf.h" +#elif defined(PLATFORM_linux) +#include <stdio.h> +#elif defined(PLATFORM_baremetal) +/* TODO */ +#else +#error "Unknown platform" +#endif + + +/* + * Main + */ int main() { - unsigned long long start, end; - - size_t t1,t2,t3,t4,t5; - for (num_rx_cur = 1; num_rx_cur <= NUM_RX_ANT; ++num_rx_cur) - for (num_tx_cur = 1; num_tx_cur <= NUM_TX_ANT; ++num_tx_cur) - for (num_sc_cur = 1; num_sc_cur <= NUM_SC; num_sc_cur += 1) { - __asm__ volatile("rdcycle %0" : "=r"(start)); - cmatgram_TxRx_cadd(); - __asm__ volatile("rdcycle %0" : "=r"(end)); - t1 = end - start; - __asm__ volatile("rdcycle %0" : "=r"(start)); - 
ccholesky_TxTx(); - __asm__ volatile("rdcycle %0" : "=r"(end)); - t2 = end - start; - __asm__ volatile("rdcycle %0" : "=r"(start)); - cmatvecmul_TxRx(); - __asm__ volatile("rdcycle %0" : "=r"(end)); - t3 = end - start; - __asm__ volatile("rdcycle %0" : "=r"(start)); - cforwardsub_TxTx(); - __asm__ volatile("rdcycle %0" : "=r"(end)); - t4 = end - start; - __asm__ volatile("rdcycle %0" : "=r"(start)); - cbackwardsub_TxTx(); - __asm__ volatile("rdcycle %0" : "=r"(end)); - t5 = end - start; - printf("%llu,%llu,%llu,%llu,%llu,", t1, t2, t3, t4, t5); - } + uint64_t start, end; + uint64_t t1,t2,t3,t4,t5; + + FUNC_CYCLES(cmatgram(), t1); + FUNC_CYCLES(ccholesky(), t2); + FUNC_CYCLES(cmatvecmul(), t3); + FUNC_CYCLES(cforwardsub(), t4); + FUNC_CYCLES(cbackwardsub(), t5); + printf("%lu,%lu,%lu,%lu,%lu,", t1, t2, t3, t4, t5); return 0; } diff --git a/script/gen_data.py b/script/gen_data.py @@ -1,42 +1,34 @@ #!/usr/bin/python -from util import read_defines - import numpy as np from numpy.random import random, normal from sys import argv from os import path, makedirs -if "--help" in argv: - print("Usage: python scripts/gen_data.py [--txt] [--bin] [--s]") - print(" --txt: write the data/*.txt files") - print(" --bin: write the data/*.bin files") - print(" --s: print the .S file to stdout") - exit(0) - -# Change these flags to control the output -WRITE_DATA_TXT = "--txt" in argv -WRITE_DATA_BIN = "--bin" in argv -WRITE_DATA_S = "--s" in argv or len(argv) == 1 +if len(argv) != 4: + print("Usage: python gen_data.py <NUM_RX> <NUM_TX> <NUM_SC>") + exit(1) +NUM_RX = int(argv[1]) # number of receive antennas +NUM_TX = int(argv[2]) # number of transmit antennas +NUM_SC = int(argv[3]) # number of subcarriers -NUM_RX_ANT, NUM_TX_ANT, NUM_SC = read_defines() NOISE_STD_DEVIATION = np.sqrt(.5) / 100 # noise standard deviation # Transmitter signal -x = random((NUM_TX_ANT, NUM_SC)) \ - + 1.j * random((NUM_TX_ANT, NUM_SC)) +x = random((NUM_TX, NUM_SC)) \ + + 1.j * random((NUM_TX, NUM_SC)) x = (x 
- 0.5 - 0.5j) * 2 # scale it from [0, 1] to [-1, 1] # Channel -H = random((NUM_RX_ANT, NUM_TX_ANT, NUM_SC)) \ - + 1.j * random((NUM_RX_ANT, NUM_TX_ANT, NUM_SC)) +H = random((NUM_RX, NUM_TX, NUM_SC)) \ + + 1.j * random((NUM_RX, NUM_TX, NUM_SC)) H = (H - 0.5 - 0.5j) * 2 # scale it from [0, 1] to [-1, 1] # Noise -n = normal(0, NOISE_STD_DEVIATION, (NUM_RX_ANT, NUM_SC)) \ - + 1.j * normal(0, NOISE_STD_DEVIATION, (NUM_RX_ANT, NUM_SC)) +n = normal(0, NOISE_STD_DEVIATION, (NUM_RX, NUM_SC)) \ + + 1.j * normal(0, NOISE_STD_DEVIATION, (NUM_RX, NUM_SC)) # Received signal y = np.einsum("ijk,jk->ik", H, x) + n # Noise covariance matrix -R = np.eye(NUM_TX_ANT, NUM_TX_ANT, dtype=np.complex64) * NOISE_STD_DEVIATION**2 +R = np.eye(NUM_TX, NUM_TX, dtype=np.complex64) * NOISE_STD_DEVIATION**2 data_tuple = ( x.real.astype(np.float32), x.imag.astype(np.float32), @@ -44,55 +36,19 @@ data_tuple = ( R.real.astype(np.float32), R.imag.astype(np.float32), y.real.astype(np.float32), y.imag.astype(np.float32) ) - - -class Section: - def __init__(self, name, source, align, sizeof, length, duplicate=False): - self.name = name - self.source = source - self.align = align - self.sizeof = sizeof - self.length = length - self.duplicate = duplicate - - -sections = [ - Section("x_re", "data/x_re.txt", "3", 32, x.size), - Section("x_im", "data/x_im.txt", "3", 32, x.size), - Section("H_re", "data/H_re.txt", "3", 32, H.size), - Section("H_im", "data/H_im.txt", "3", 32, H.size), - Section("R_re", "data/R_re.txt", "3", 32, R.size), - Section("R_im", "data/R_im.txt", "3", 32, R.size), - Section("y_re", "data/y_re.txt", "3", 32, y.size), - Section("y_im", "data/y_im.txt", "3", 32, y.size), -] +data_filenames = ( + "data/x_re.bin", "data/x_im.bin", + "data/H_re.bin", "data/H_im.bin", + "data/R_re.bin", "data/R_im.bin", + "data/y_re.bin", "data/y_im.bin" +) # Create "data" directory if it does not exist -if WRITE_DATA_BIN or WRITE_DATA_TXT: - data_dir = path.join(path.dirname(__file__), "..", "data") - if not 
path.exists(data_dir): - makedirs(data_dir) - -if WRITE_DATA_TXT: - for data, sec in zip(data_tuple, sections): - with open(sec.source, "w") as f: - for sample in data: - f.write(f"{sample}\n") - -if WRITE_DATA_BIN: - for data, sec in zip(data_tuple, sections): - with open(sec.source.replace(".txt", ".bin"), "wb") as f: - f.write(data.tobytes()) - -if WRITE_DATA_S: - print(".section .data,\"aw\",@progbits") - for data, sec in zip(data_tuple, sections): - print(f".global {sec.name}") - print(f"{sec.name}:") - for sample in data: - bs = sample.tobytes() - for i in range(0, len(bs), 4): - s = "" - for n in range(4): - s += "%02x" % bs[i+3-n] - print(" .word 0x%s" % s) +data_dir = path.join(path.dirname(__file__), "..", "data") +if not path.exists(data_dir): + makedirs(data_dir) + +# Write data to bin files +for data, filename in zip(data_tuple, data_filenames): + with open(filename, "wb") as f: + f.write(data.tobytes()) diff --git a/script/util.py b/script/util.py @@ -2,33 +2,7 @@ from os import path import numpy as np -def read_defines(): - """Read the defines from the define.h file - - Returns: - int: Number of receive antennas - int: Number of transmit antennas - int: Number of subcarriers - """ - with open(path.join(path.dirname(__file__), "..", "include", "define.h"), "r") as f: - lines = f.read().split("\n") - for line in lines: - if line.startswith("#define NUM_RX_ANT "): - NUM_RX_ANT = int(line[19:]) - if line.startswith("#define NUM_TX_ANT "): - NUM_TX_ANT = int(line[19:]) - if line.startswith("#define NUM_SC "): - NUM_SC = int(line[15:]) - - # Assert that all the defines are read - assert NUM_RX_ANT - assert NUM_TX_ANT - assert NUM_SC - - return NUM_RX_ANT, NUM_TX_ANT, NUM_SC - - -def load_xHRy(NUM_RX_ANT, NUM_TX_ANT, NUM_SC) -> tuple: +def load_xHRy(NUM_RX, NUM_TX, NUM_SC) -> tuple: """Load x, H, R and y from the data files. 
Assumes that the following files are present in the data directory: x_re.bin, x_im.bin, H_re.bin, H_im.bin, R_re.bin, R_im.bin, y_re.bin, y_im.bin @@ -39,15 +13,15 @@ def load_xHRy(NUM_RX_ANT, NUM_TX_ANT, NUM_SC) -> tuple: """ x = np.fromfile("data/x_re.bin", dtype=np.float32) x = x + 1j*np.fromfile("data/x_im.bin", dtype=np.float32) - x = x.reshape((NUM_TX_ANT, NUM_SC)) + x = x.reshape((NUM_TX, NUM_SC)) H = np.fromfile("data/H_re.bin", dtype=np.float32) H = H + 1j*np.fromfile("data/H_im.bin", dtype=np.float32) - H = H.reshape((NUM_RX_ANT, NUM_TX_ANT, NUM_SC)) + H = H.reshape((NUM_RX, NUM_TX, NUM_SC)) R = np.fromfile("data/R_re.bin", dtype=np.float32) R = R + 1j*np.fromfile("data/R_im.bin", dtype=np.float32) - R = R.reshape((NUM_TX_ANT, NUM_TX_ANT, NUM_SC)) + R = R.reshape((NUM_TX, NUM_TX, NUM_SC)) y = np.fromfile("data/y_re.bin", dtype=np.float32) y = y + 1j*np.fromfile("data/y_im.bin", dtype=np.float32) - y = y.reshape((NUM_RX_ANT, NUM_SC)) + y = y.reshape((NUM_RX, NUM_SC)) return x, H, R, y diff --git a/src/cbackwardsub.c b/src/cbackwardsub.c @@ -0,0 +1,132 @@ +#include "../include/common.h" + +#include <stddef.h> + +/** Complex backward substitution L^H*x_MMSE = z + * + * x_MMSE_t = (z_t - \sum_{tt=t+1}^{NUM_TX-1} L_{tt t} x_tt) / L_{t t} (for LL / float solution) + * x_MMSE_t = (z_t / D_t - \sum_{tt=t+1}^{NUM_TX-1} L_{tt t} x_tt) (for LDL / fixed solution) + * + * \global g_L lower triangular matrix. Shape [NUM_TX][NUM_TX][NUM_SC] + * \global g_D diagonal matrix (only if DATA_TYPE_fixed is defined). Shape [NUM_TX][NUM_SC] + * \global g_z rhs vector. Shape [NUM_TX][NUM_SC] + * \global g_x_MMSE output vector. 
Shape [NUM_TX][NUM_SC] + */ +void cbackwardsub() +{ +#if defined(ARCH_x86) || defined(ARCH_rv) + size_t t, tt, s; + size_t off_L, off_z, off_x_MMSE, off_D; + acc_t sum_re, sum_im; + for (t = NUM_TX - 1; t != (size_t)-1; --t) { + for (s = 0; s < NUM_SC; ++s) { + sum_re = sum_im = 0; + for (tt = t + 1; tt < NUM_TX; ++tt) { + off_L = tt * NUM_TX * NUM_SC + t * NUM_SC + s; + off_x_MMSE = tt * NUM_SC + s; + sum_re += (acc_t)g_L.re[off_L] * (acc_t)g_x_MMSE.re[off_x_MMSE] + - (acc_t)g_L.im[off_L] * (acc_t)g_x_MMSE.im[off_x_MMSE]; + sum_im += (acc_t)g_L.re[off_L] * (acc_t)g_x_MMSE.im[off_x_MMSE] + + (acc_t)g_L.im[off_L] * (acc_t)g_x_MMSE.re[off_x_MMSE]; + } + off_z = t * NUM_SC + s; + off_x_MMSE = t * NUM_SC + s; +#if defined(DATA_TYPE_float) + off_L = t * NUM_TX * NUM_SC + t * NUM_SC + s; + g_x_MMSE.re[off_x_MMSE] = (g_z.re[off_z] - (data_t)sum_re) / g_L.re[off_L]; + g_x_MMSE.im[off_x_MMSE] = (g_z.im[off_z] - (data_t)sum_im) / g_L.re[off_L]; +#elif defined(DATA_TYPE_fixed) + off_D = t * NUM_SC + s; + g_x_MMSE.re[off_x_MMSE] = (data_t)((acc_t)(g_z.re[off_z] << FP_Q) / g_D[off_D]) + - (data_t)(sum_re >> FP_Q); + g_x_MMSE.im[off_x_MMSE] = (data_t)((acc_t)(g_z.im[off_z] << FP_Q) / g_D[off_D]) + - (data_t)(sum_im >> FP_Q); +#else +#error "Unknown data type" +#endif + } + } +#elif defined(ARCH_rvv) + size_t i, j; + size_t sz, vl; + size_t off_sc; + + for (i = NUM_TX - 1; i != (size_t)-1; --i) { + off_sc = 0; + sz = NUM_SC; + + while (sz > 0){ + /* Initialize result registers as z */ + /* v0 - result real part */ + /* v1 - result imaginary part */ + __asm__ volatile( + "vsetvli %0, %1, e32, m1, ta, ma\n" + : "=r"(vl) + : "r"(sz)); + __asm__ volatile( + "vle32.v v0, (%0)\n" + "vle32.v v1, (%1)\n" + : + : "r"(&g_z.re[i * NUM_SC + off_sc]), + "r"(&g_z.im[i * NUM_SC + off_sc])); + + for (j = i + 1; j < NUM_TX; ++j) { + /* b - sum L_ji * z_j */ + /* v2 - L real part */ + /* v3 - L imaginary part */ + /* v4 - x_MMSE_j real part */ + /* v5 - x_MMSE_j imaginary part */ + __asm__ 
volatile( + "vle32.v v2, (%0)\n" + "vle32.v v3, (%1)\n" + "vle32.v v4, (%2)\n" + "vle32.v v5, (%3)\n" + /* real part */ + "vfnmsac.vv v0, v2, v4\n" + "vfmacc.vv v0, v3, v5\n" + /* imaginary part */ + "vfnmsac.vv v1, v3, v4\n" + "vfnmsac.vv v1, v2, v5\n" + : + : "r"(&g_L.re[j * NUM_TX * NUM_SC + i * NUM_SC + off_sc]), + "r"(&g_L.im[j * NUM_TX * NUM_SC + i * NUM_SC + off_sc]), + "r"(&g_x_MMSE.re[j * NUM_SC + off_sc]), + "r"(&g_x_MMSE.im[j * NUM_SC + off_sc]) + ); + } + + /* Divide by L_ii */ + /* v2 - L_ii real part */ + /* v3 - L_ii imaginary part */ + __asm__ volatile ( + "vle32.v v2, (%0)\n" + "vle32.v v3, (%1)\n" + /* calculate L_ii_re^2 + L_ii_im^2 -> v4 */ + "vfmul.vv v4, v2, v2\n" + "vfmacc.vv v4, v3, v3\n" + /* real part */ + "vfmul.vv v5, v0, v2\n" + "vfmacc.vv v5, v1, v3\n" + "vdiv.vv v0, v5, v4\n" + /* imaginary part */ + "vfmul.vv v6, v1, v2\n" + "vfnmsac.vv v6, v0, v3\n" + "vfdiv.vv v1, v6, v4\n" + /* store HH_H */ + "vse32.v v0, (%2)\n" + "vse32.v v1, (%3)\n" + : + : "r"(&g_L.re[i * NUM_TX * NUM_SC + i * NUM_SC + off_sc]), + "r"(&g_L.im[i * NUM_TX * NUM_SC + i * NUM_SC + off_sc]), + "r"(&g_x_MMSE.re[i * NUM_SC + off_sc]), + "r"(&g_x_MMSE.im[i * NUM_SC + off_sc]) + ); + + sz -= vl; + off_sc += vl; + } + } +#else +#error "Unknown architecture" +#endif +} diff --git a/src/ccholesky.c b/src/ccholesky.c @@ -0,0 +1,236 @@ +#include "../include/common.h" + +#include <stddef.h> + +/** Complex Cholesky decomposition of a Hermitian positive-definite matrix G + * + * LL (floating point solution): + * G = L*L^H + * L_ij = (G_ij - \sum_{k=0}^{j-1} L_ik L_jk^*) / L_jj + * L_ii = sqrt(G_ii - \sum_{k=0}^{i-1} L_ik L_ik^*) + * + * LDL (fixed point solution): + * G = L*D*L^H + * L_ij = (G_ij - \sum_{k=0}^{j-1} L_ik D_k L_jk^*) / D_j + * D_i = G_ii - \sum_{k=0}^{i-1} L_ik D_k L_ik^* + * + * \global g_G matrix. Shape [NUM_TX][NUM_TX][NUM_SC] + * \global g_L output lower triangular matrix. 
Shape [NUM_TX][NUM_TX][NUM_SC] + * \global g_D output diagonal matrix (if DATA_TYPE_fixed is defined). Shape [NUM_TX][NUM_SC] + */ +void ccholesky() +{ +#if defined(ARCH_x86) || defined(ARCH_rv) + size_t i, j, k, s; + size_t off_ij, off_jj, off_ii; + size_t off_ik, off_jk; + size_t off_i, off_j, off_k; + data_t tmp; /* Temporary variable for sqrt */ + acc_t sum_re, sum_im; + for (i = 0; i < NUM_TX; ++i) { + for (j = 0; j <= i; ++j) { + for (s = 0; s < NUM_SC; ++s) { + off_ij = i * NUM_TX * NUM_SC + j * NUM_SC + s; + sum_im = sum_re = 0; + + /* Calculate the sum */ + for (k = 0; k < j; ++k) { + off_ik = i * NUM_TX * NUM_SC + k * NUM_SC + s; + off_jk = j * NUM_TX * NUM_SC + k * NUM_SC + s; +#if defined(DATA_TYPE_float) + sum_re += g_L.re[off_ik] * g_L.re[off_jk] + - g_L.im[off_ik] * g_L.im[off_jk]; +#elif defined(DATA_TYPE_fixed) + sum_re += (g_L.re[off_ik] * g_L.re[off_jk] + - g_L.im[off_ik] * g_L.im[off_jk]) + * g_D[k * NUM_SC + s]; +#else +#error "Unknown data type" +#endif + sum_im += g_L.re[off_ik] * g_L.im[off_jk] + + g_L.im[off_ik] * g_L.re[off_jk]; + } + + if (i == j) { + off_ii = i * NUM_TX * NUM_SC + i * NUM_SC + s; +#if defined(DATA_TYPE_float) + +#if defined(ARCH_x86) + __asm__ volatile ( + "flds %1\n" + "fsubs %2\n" + "fsqrt\n" + "fstps %0\n" + : "=m" (g_L.re[off_ii]) + : "m" (g_G.re[off_ij]), "m" (sum_re) + ); +#elif defined(ARCH_rv) + __asm__ volatile ( + "fsub.s %0, %1, %2\n" /* tmp = g_G.re[off_ij] - sum_re */ + "fsqrt.s %0, %0\n" /* tmp = sqrtf(tmp) */ + : "=&f"(tmp) : "f"(g_G.re[off_ij]), "f"(sum_re) + ); + g_L.re[off_ii] = tmp; + g_L.im[off_ii] = 0; +#else +#error "Unknown architecture" +#endif + +#elif defined(DATA_TYPE_fixed) + /* Calculate D_i = G_ii - sum */ + g_D[i * NUM_SC + s] = g_G.re[off_ii] - (data_t)(sum_re >> FP_Q); +#else +#error "Unknown data type" +#endif + } else { /* i != j */ +#if defined(DATA_TYPE_float) + /* Calculate L_ij = (G_ij - sum) / L_jj */ + off_jj = j * NUM_TX * NUM_SC + j * NUM_SC + s; + g_L.re[off_ij] = 
(g_G.re[off_ij] - sum_re) / g_L.re[off_jj]; + g_L.im[off_ij] = (g_G.im[off_ij] - sum_im) / g_L.re[off_jj]; +#elif defined(DATA_TYPE_fixed) + /* Calculate L_ij = (G_ij - sum) / D_j */ + off_j = j * NUM_SC + s; + /* real */ + sum_re = ((acc_t)g_G.re[off_ij] << FP_Q) - sum_re; + /* TODO roubding? */ + g_L.re[off_ij] = (data_t)(sum_re / (acc_t)g_D[off_j]); + /* imaginary */ + sum_im = ((acc_t)g_G.im[off_ij] << FP_Q) - sum_im; + /* TODO roubding? */ + g_L.im[off_ij] = (data_t)(sum_im / (acc_t)g_D[off_j]); +#else +#error "Unknown data type" +#endif + } + } + } + } +#elif defined(ARCH_rvv) + size_t i, j, k; + size_t sz, vl; + size_t off_sc; + + /* Init float registers */ + register float f0 __asm__("f0") = 2.0f; + + for (i = 0; i < NUM_TX; ++i) + for (j = 0; j <= i; ++j) { + sz = NUM_SC; + off_sc = 0; + + while (sz > 0) { + /* Initialize L registers */ + /* v0 - L_ij real part = G_ij.re */ + /* v1 - L_ij imaginary part = G_ij.im */ + __asm__ volatile( + "vsetvli %0, %1, e32, m1, ta, ma\n" + : "=r"(vl) : "r"(sz)); + __asm__ volatile( + "vle32.v v0, (%0)\n" + "vle32.v v1, (%1)\n" + : + : "r"(&g_G.re[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]), + "r"(&g_G.im[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]) + ); + + /* Calculate sum_{k=0}^{j-1} L_ik L_jk^* */ + /* v2 - sum real part */ + /* v3 - sum imaginary part */ + __asm__ volatile( + "vmv.v.i v2, 0\n" + "vmv.v.i v3, 0\n" + ); + for (k = 0; k < j; ++k) { + __asm__ volatile( + "vle32.v v4, (%0)\n" + "vle32.v v5, (%1)\n" + "vle32.v v6, (%2)\n" + "vle32.v v7, (%3)\n" + /* real part */ + "vfmacc.vv v2, v4, v6\n" + "vfmacc.vv v2, v5, v7\n" + /* imaginary part */ + "vfmacc.vv v3, v5, v6\n" + "vfnmsac.vv v3, v4, v7\n" + : + : "r"(&g_L.re[i * NUM_TX * NUM_SC + k * NUM_SC + off_sc]), + "r"(&g_L.im[i * NUM_TX * NUM_SC + k * NUM_SC + off_sc]), + "r"(&g_L.re[j * NUM_TX * NUM_SC + k * NUM_SC + off_sc]), + "r"(&g_L.im[j * NUM_TX * NUM_SC + k * NUM_SC + off_sc]) + ); + } + + /* G_ii - sum */ + __asm__ volatile( + "vfsub.vv v0, v0, v2\n" + 
"vfsub.vv v1, v1, v3\n" + ); + + if (i == j) { + /* Calculate L_ii = sqrt(G_ii - sum_{k=0}^{i-1} L_ik L_ik^*) */ + __asm__ volatile( + /* Complex sqrt */ + + /* v2 = r = sqrt(re^2 + im^2) */ + "vfmul.vv v2, v0, v0\n" + "vfmacc.vv v2, v1, v1\n" + "vfsqrt.v v2, v2\n" + + /* v3 - real part */ + "vfadd.vv v3, v2, v0\n" /* r + re */ + "vfdiv.vf v3, v3, f0\n" /* (r + re) / 2 */ + "vfsqrt.v v3, v3\n" /* sqrt((r + re) / 2) */ + /* v4 - imaginary part */ + "vfsub.vv v4, v2, v0\n" /* r - re */ + "vfdiv.vf v4, v4, f0\n" /* (r - re) / 2 */ + "vfsqrt.v v4, v4\n" /* sqrt((r - re) / 2) */ + "vfsgnj.vv v4, v4, v1\n" /* sgn(im) * sqrt((r - re) / 2) */ + + /* TODO handle im == 0 */ + + /* Move the result to v0 and v1 */ + "vmv.v.v v0, v3\n" + "vmv.v.v v1, v4\n" + ); + } else { + /* Calculate L_ij = (G_ij - sum) / L_jj */ + __asm__ volatile( + /* L_jj */ + "vle32.v v2, (%0)\n" + "vle32.v v3, (%1)\n" + /* calculate L_jj_re^2 + L_jj_im^2 -> v4 */ + "vfmul.vv v4, v2, v2\n" + "vfmacc.vv v4, v3, v3\n" + /* real part */ + "vfmul.vv v5, v0, v2\n" + "vfmacc.vv v5, v1, v3\n" + /* imaginary part */ + "vfmul.vv v6, v1, v2\n" + "vfnmsac.vv v6, v0, v3\n" + /* divide and store at v0 and v1 */ + "vfdiv.vv v0, v5, v4\n" + "vfdiv.vv v1, v6, v4\n" + : + : "r"(&g_L.re[j * NUM_TX * NUM_SC + j * NUM_SC + off_sc]), + "r"(&g_L.im[j * NUM_TX * NUM_SC + j * NUM_SC + off_sc]) + ); + } + + /* Store result */ + __asm__ volatile( + "vse32.v v0, (%0)\n" + "vse32.v v1, (%1)\n" + : + : "r"(&g_L.re[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]), + "r"(&g_L.im[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]) + ); + + sz -= vl; + off_sc += vl; + } + } +#else +#error "Unknown architecture" +#endif +} diff --git a/src/cforwardsub.c b/src/cforwardsub.c @@ -0,0 +1,131 @@ +#include "../include/common.h" + +#include <stddef.h> + +/** Complex forward substitution L*z = HHy + * + * z_t = (HHy_t - \sum_{tt=0}^{t-1} L_{t tt} z_tt) / L_{t t} (for LL / float solution) + * z_t = (HHy_t - \sum_{tt=0}^{t-1} L_{t tt} z_tt) (for LDL / fixed 
solution) + * + * \global g_L lower triangular matrix. Shape [NUM_TX][NUM_TX][NUM_SC] + * \global g_HHy vector. Shape [NUM_TX][NUM_SC] + * \global g_z output vector. Shape [NUM_TX][NUM_SC] + */ +void cforwardsub() +{ +#if defined(ARCH_x86) || defined(ARCH_rv) + size_t t, tt, s; + size_t off_L, off_HHy, off_z; + acc_t sum_re, sum_im; + for (t = 0; t < NUM_TX; ++t) { + for (s = 0; s < NUM_SC; ++s) { + sum_re = sum_im = 0; + for (tt = 0; tt < t; ++tt) { + off_L = t * NUM_TX * NUM_SC + tt * NUM_SC + s; + off_z = tt * NUM_SC + s; + sum_re += (acc_t)g_L.re[off_L] * (acc_t)g_z.re[off_z] + - (acc_t)g_L.im[off_L] * (acc_t)g_z.im[off_z]; + sum_im += (acc_t)g_L.re[off_L] * (acc_t)g_z.im[off_z] + + (acc_t)g_L.im[off_L] * (acc_t)g_z.re[off_z]; + } + off_HHy = t * NUM_SC + s; + off_z = t * NUM_SC + s; +#if defined(DATA_TYPE_float) + off_L = t * NUM_TX * NUM_SC + t * NUM_SC + s; + g_z.re[off_z] = (g_HHy.re[off_HHy] - (data_t)sum_re) / g_L.re[off_L]; + g_z.im[off_z] = (g_HHy.im[off_HHy] - (data_t)sum_im) / g_L.re[off_L]; +#elif defined(DATA_TYPE_fixed) + g_z.re[off_z] = g_HHy.re[off_HHy] - (data_t)(sum_re >> FP_Q); + g_z.im[off_z] = g_HHy.im[off_HHy] - (data_t)(sum_im >> FP_Q); +#else +#error "Unknown data type" +#endif + } + } +#elif defined(ARCH_rvv) + size_t i, j; + size_t sz, vl; + size_t off_sc; + + for (i = 0; i < NUM_TX; ++i) { + off_sc = 0; + sz = NUM_SC; + + while (sz > 0) { + // printf("sz: %lu\n", sz); + // printf("vl: %lu\n", vl); + /* Initialize result registers as b */ + /* v0 - result real part */ + /* v1 - result imaginary part */ + __asm__ volatile ( + "vsetvli %0, %1, e32, m1, ta, ma\n" + : "=r"(vl) + : "r"(sz)); + __asm__ volatile ( + "vle32.v v0, (%0)\n" + "vle32.v v1, (%1)\n" + : + : "r"(&g_HHy.re[i * NUM_SC + off_sc]), + "r"(&g_HHy.im[i * NUM_SC + off_sc])); + // printf("vl: %lu\n", vl); + + for (j = 0; j != i; ++j) { + /* b - sum L_ij * z_j */ + /* v2 - L real part */ + /* v3 - L imaginary part */ + /* v4 - result_j real part */ + /* v5 - result_j imaginary 
part */ + __asm__ volatile ( + "vle32.v v2, (%0)\n" + "vle32.v v3, (%1)\n" + "vle32.v v4, (%2)\n" + "vle32.v v5, (%3)\n" + /* real part */ + "vfnmsac.vv v0, v2, v4\n" + "vfmacc.vv v0, v3, v5\n" + /* imaginary part */ + "vfnmsac.vv v1, v3, v4\n" + "vfnmsac.vv v1, v2, v5\n" + : + : "r"(&g_L.re[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]), + "r"(&g_L.im[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]), + "r"(&g_z.re[j * NUM_SC + off_sc]), + "r"(&g_z.im[j * NUM_SC + off_sc]) + ); + } + + /* Divide by L_ii */ + /* v2 - L_ii real part */ + /* v3 - L_ii imaginary part */ + __asm__ volatile ( + "vle32.v v2, (%0)\n" + "vle32.v v3, (%1)\n" + /* calculate L_ii_re^2 + L_ii_im^2 -> v4 */ + "vfmul.vv v4, v2, v2\n" + "vfmacc.vv v4, v3, v3\n" + /* real part */ + "vfmul.vv v5, v0, v2\n" + "vfmacc.vv v5, v1, v3\n" + "vdiv.vv v0, v5, v4\n" + /* imaginary part */ + "vfmul.vv v6, v1, v2\n" + "vfnmsac.vv v6, v0, v3\n" + "vfdiv.vv v1, v6, v4\n" + /* store result */ + "vse32.v v0, (%2)\n" + "vse32.v v1, (%3)\n" + : + : "r"(&g_L.re[i * NUM_TX * NUM_SC + i * NUM_SC + off_sc]), + "r"(&g_L.im[i * NUM_TX * NUM_SC + i * NUM_SC + off_sc]), + "r"(&g_z.re[i * NUM_SC + off_sc]), + "r"(&g_z.im[i * NUM_SC + off_sc]) + ); + + off_sc += vl; + sz -= vl; + } + } +#else +#error "Unknown architecture" +#endif +} diff --git a/src/cmatgram.c b/src/cmatgram.c @@ -0,0 +1,131 @@ +#include "../include/common.h" + +#include <stddef.h> + +/** Complex Gram matrix H^H*H and add complex matrix R + * + * G = H^H*H + R + * G_{t1t2} = \sum_{r=0}^{NUM_RX - 1} (H_{rt1}^* H_{rt2}) + R_{t1t2} + * + * \global g_H matrix of channel coefficients. Shape [NUM_RX][NUM_TX][NUM_SC] + * \global g_R noise covariance matrix. Shape [NUM_TX][NUM_TX][NUM_SC] + * \global g_G output Gram matrix + R. 
Shape [NUM_TX][NUM_TX][NUM_SC] + */ +void cmatgram() +{ +#if defined(ARCH_x86) || defined(ARCH_rv) + size_t r, t1, t2, s; + size_t off_G, off_R, off_H1, off_H2; + for (t1 = 0; t1 < NUM_TX; ++t1) { + for (t2 = 0; t2 < NUM_TX; ++t2) { + for (s = 0; s < NUM_SC; ++s) { + off_R = off_G = t1 * NUM_TX * NUM_SC + t2 * NUM_SC + s; + g_G.re[off_G] = g_R.re[off_R]; + g_G.im[off_G] = g_R.im[off_R]; + + for (r = 0; r < NUM_RX; ++r) { + off_H1 = r * NUM_TX * NUM_SC + t1 * NUM_SC + s; + off_H2 = r * NUM_TX * NUM_SC + t2 * NUM_SC + s; + g_G.re[off_G] += g_H.re[off_H1] * g_H.re[off_H2] + + g_H.im[off_H1] * g_H.im[off_H2]; + g_G.im[off_G] += g_H.re[off_H1] * g_H.im[off_H2] + - g_H.im[off_H1] * g_H.re[off_H2]; + } + } + } + } +#elif defined(ARCH_rvv) + size_t t1, t2, r; + size_t sz, vl; + size_t off_sc, off_A, off_AH; + size_t off_G_L, off_G_U; + data_t *A_re, *A_im, *AH_re, *AH_im; + data_t *R_L_re, *R_L_im, *R_U_re, *R_U_im; + data_t *G_L_re, *G_L_im, *G_U_re, *G_U_im; + + for (t1 = 0; t1 != NUM_TX; ++t1) + for (t2 = t1; t2 != NUM_TX; ++t2) { + off_sc = 0; + off_G_L = t1 * NUM_TX * NUM_SC + t2 * NUM_SC; + off_G_U = t2 * NUM_TX * NUM_SC + t1 * NUM_SC; + G_L_re = &g_G.re[off_G_L]; + G_L_im = &g_G.im[off_G_L]; + G_U_re = &g_G.re[off_G_U]; + G_U_im = &g_G.im[off_G_U]; + sz = NUM_SC; + + while (sz > 0) { + /* Initialize G registers */ + /* v0 - G real part */ + /* v1 - G imaginary part */ + __asm__ volatile( + "vsetvli %0, %1, e32, m1, ta, ma\n" + "vmv.v.i v0, 0\n" + "vmv.v.i v1, 0\n" + : "=r"(vl) : "r"(sz)); + + for (r = 0; r != NUM_RX; ++r) { + off_A = r * NUM_TX * NUM_SC + t1 * NUM_SC + off_sc; + off_AH = r * NUM_TX * NUM_SC + t2 * NUM_SC + off_sc; + A_re = &g_H.re[off_A]; + A_im = &g_H.im[off_A]; + AH_re = &g_H.re[off_AH]; + AH_im = &g_H.im[off_AH]; + + /* Calculate A^H*A */ + /* v2 - A real part */ + /* v3 - A imaginary part */ + /* v4 - A^H real part */ + /* v5 - A^H imaginary part */ + __asm__ volatile( + "vle32.v v2, (%0)\n" + "vle32.v v3, (%1)\n" + "vle32.v v4, (%2)\n" + 
"vle32.v v5, (%3)\n" + /* real part */ + "vfmacc.vv v0, v2, v4\n" + "vfmacc.vv v0, v3, v5\n" + /* imaginary part */ + "vfmacc.vv v1, v3, v4\n" + "vfnmsac.vv v1, v2, v5\n" + : + : "r"(A_re), "r"(A_im), "r"(AH_re), "r"(AH_im) + ); + } + + /* Add R */ + /* v2 - R real part */ + /* v3 - R imaginary part */ + R_U_re = &g_R.re[off_G_U]; + R_U_im = &g_R.im[off_G_U]; + __asm__ volatile( + "vle32.v v2, (%0)\n" + "vle32.v v3, (%1)\n" + "vfadd.vv v2, v2, v0\n" + "vfadd.vv v3, v3, v1\n" + "vse32.v v2, (%2)\n" + "vse32.v v3, (%3)\n" + : + : "r"(R_U_re), "r"(R_U_im), "r"(G_U_re), "r"(G_U_im)); + if (t1 != t2) { + R_L_re = &g_R.re[off_G_L]; + R_L_im = &g_R.im[off_G_L]; + __asm__ volatile( + /* Lower triangle */ + "vle32.v v2, (%0)\n" + "vle32.v v3, (%1)\n" + "vfadd.vv v2, v2, v0\n" + "vfsub.vv v3, v3, v1\n" + "vse32.v v2, (%2)\n" + "vse32.v v3, (%3)\n" + : + : "r"(R_L_re), "r"(R_L_im), "r"(G_L_re), "r"(G_L_im) + ); + } + + sz -= vl; + off_sc += vl; + } + } +#endif +} diff --git a/src/cmatvecmul.c b/src/cmatvecmul.c @@ -0,0 +1,100 @@ +#include "../include/common.h" + +#include <stddef.h> + +/** Complex matrix-vector multiplication HHy = HH*y + * + * HHy_t = \sum_{r=0}^{NUM_RX-1} H_{rt}^* * y_r + * + * \global g_H matrix. Shape [NUM_RX][NUM_TX][NUM_SC] + * \global g_y vector. Shape [NUM_RX][NUM_SC] + * \global g_HHy output vector. 
Shape [NUM_TX][NUM_SC] + */ +void cmatvecmul() +{ +#if defined(ARCH_x86) || defined(ARCH_rv) + size_t t, r, s; + size_t off_H, off_y, off_HHy; + acc_t sum_re, sum_im; + for (t = 0; t < NUM_TX; ++t) { + for (s = 0; s < NUM_SC; ++s) { + sum_re = sum_im = 0; + for (r = 0; r < NUM_RX; ++r) { + off_H = r * NUM_TX * NUM_SC + t * NUM_SC + s; + off_y = r * NUM_SC + s; + sum_re += (acc_t)g_H.re[off_H] * (acc_t)g_y.re[off_y] + - (acc_t)g_H.im[off_H] * (acc_t)g_y.im[off_y]; + sum_im += (acc_t)g_H.re[off_H] * (acc_t)g_y.im[off_y] + + (acc_t)g_H.im[off_H] * (acc_t)g_y.re[off_y]; + } + off_HHy = t * NUM_SC + s; +#if defined(DATA_TYPE_float) + g_HHy.re[off_HHy] = (data_t)sum_re; + g_HHy.im[off_HHy] = (data_t)sum_im; +#elif defined(DATA_TYPE_fixed) + g_HHy.re[off_HHy] = (data_t)(sum_re >> FP_Q); + g_HHy.im[off_HHy] = (data_t)(sum_im >> FP_Q); +#else +#error "Unknown data type" +#endif + } + } +#elif defined(ARCH_rvv) + size_t i, j; + size_t off_HH, off_y, off_HHy, off_sc; + size_t sz, vl; + + for (i = 0; i < NUM_TX; ++i) { + off_HHy = i * NUM_SC; + off_sc = 0; + sz = NUM_SC; + + while (sz > 0) { + /* Initialize result registers */ + /* v0 - HHy real part */ + /* v1 - HHy imaginary part */ + __asm__ volatile( + "vsetvli %0, %1, e32, m1, ta, ma\n" + "vmv.v.i v0, 0\n" + "vmv.v.i v1, 0\n" + : "=r"(vl) + : "r"(sz) + ); + + for (j = 0; j < NUM_RX; ++j) { + off_HH = i * NUM_RX * NUM_SC + j * NUM_SC + off_sc; + off_y = j * NUM_SC + off_sc; + __asm__ volatile( + "vle32.v v2, (%0)\n" + "vle32.v v3, (%1)\n" + "vle32.v v4, (%2)\n" + "vle32.v v5, (%3)\n" + /* real part */ + "vfmacc.vv v0, v2, v4\n" + "vfnmsac.vv v0, v3, v5\n" + /* imaginary part */ + "vfmacc.vv v1, v3, v4\n" + "vfmacc.vv v1, v2, v5\n" + : + : "r"(&g_HH.re[off_HH]), "r"(&g_HH.im[off_HH]), + "r"(&g_y.re[off_y]), "r"(&g_y.im[off_y]) + ); + } + + /* Store result */ + __asm__ volatile( + "vse32.v v0, (%0)\n" + "vse32.v v1, (%1)\n" + : + : "r"(&g_HHy.re[off_HHy]), "r"(&g_HHy.im[off_HHy]) + ); + + sz -= vl; + off_HHy += vl; + off_sc 
+= vl; + } + } +#else +#error "Unknown architecture" +#endif +} diff --git a/src/mmserv.c b/src/mmserv.c @@ -1,631 +0,0 @@ -#include "../include/define.h" - -#include <stddef.h> /* for size_t */ -#include <stdint.h> /* for uint64_t */ - -/* - * Debug - */ - -#ifdef DEBUG -#include "printf.h" - -static uint64_t g_timer; -void start_timer() -{ - __asm__ volatile("rdcycle %0" : "=r"(g_timer)); -} -void stop_timer() -{ - __asm__ volatile( - "rdcycle t0\n" - "sub %0, t0, %0" - : "+r"(g_timer) - ); -} -uint64_t get_timer() -{ - return g_timer; -} - -#define TIME(msg, func, ...) \ - start_timer(); \ - func(__VA_ARGS__); \ - stop_timer(); \ - printf(msg, get_timer()); - -#else -#define TIME(msg, func, ...) func(__VA_ARGS__); - -#endif - -/* - * Global variables - */ - -/* Externs */ -extern vcomplex g_x, g_H, g_R, g_y; -extern vcomplex g_x_MMSE; -extern size_t num_rx_cur, num_tx_cur, num_sc_cur; - - -/* Raw data */ -data_t HH_re[NUM_TX_ANT][NUM_RX_ANT][NUM_SC]; -data_t HH_im[NUM_TX_ANT][NUM_RX_ANT][NUM_SC]; -data_t HH_H_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; -data_t HH_H_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; -data_t L_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; -data_t L_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; -data_t HHy_re[NUM_TX_ANT][NUM_SC]; -data_t HHy_im[NUM_TX_ANT][NUM_SC]; -data_t z_re[NUM_TX_ANT][NUM_SC]; -data_t z_im[NUM_TX_ANT][NUM_SC]; -data_t LH_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; -data_t LH_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; - -/* Same data but casted to vcomplex */ -vcomplex g_HH = { .re = (data_t *)HH_re, .im = (data_t *)HH_im }; -vcomplex g_HH_H = { .re = (data_t *)HH_H_re, .im = (data_t *)HH_H_im }; -vcomplex g_L = { .re = (data_t *)L_re, .im = (data_t *)L_im }; -vcomplex g_HHy = { .re = (data_t *)HHy_re, .im = (data_t *)HHy_im }; -vcomplex g_z = { .re = (data_t *)z_re, .im = (data_t *)z_im }; -vcomplex g_LH = { .re = (data_t *)LH_re, .im = (data_t *)LH_im }; - -/* - * Complex matrix operations - */ - -/** Complex Gram matrix H^H*H and add complex matrix R - * - * HH_H = 
H^H*H + R - * - * \global g_H matrix of channel coefficients. Shape [NUM_RX_ANT][NUM_TX_ANT][NUM_SC] - * \global g_R noise covariance matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] - * \global g_HH_H output Gram matrix + R. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] - */ -void cmatgram_TxRx_cadd() -{ - size_t t1, t2, r; - size_t sz, vl; - size_t off_sc, off_A, off_AH; - size_t off_HH_H_L, off_HH_H_U; - data_t *A_re, *A_im, *AH_re, *AH_im; - data_t *R_L_re, *R_L_im, *R_U_re, *R_U_im; - data_t *HH_H_L_re, *HH_H_L_im, *HH_H_U_re, *HH_H_U_im; - - for (t1 = 0; t1 != num_tx_cur; ++t1) - for (t2 = t1; t2 != num_tx_cur; ++t2) { - off_sc = 0; - off_HH_H_L = t1 * num_tx_cur * num_sc_cur + t2 * num_sc_cur; - off_HH_H_U = t2 * num_tx_cur * num_sc_cur + t1 * num_sc_cur; - HH_H_L_re = &g_HH_H.re[off_HH_H_L]; - HH_H_L_im = &g_HH_H.im[off_HH_H_L]; - HH_H_U_re = &g_HH_H.re[off_HH_H_U]; - HH_H_U_im = &g_HH_H.im[off_HH_H_U]; - sz = num_sc_cur; - - while (sz > 0) { - /* Initialize HH_H registers */ - /* v0 - HH_H real part */ - /* v1 - HH_H imaginary part */ - __asm__ volatile( - "vsetvli %0, %1, e32, m1, ta, ma\n" - "vmv.v.i v0, 0\n" - "vmv.v.i v1, 0\n" - : "=r"(vl) : "r"(sz)); - - for (r = 0; r != num_rx_cur; ++r) { - off_A = r * num_tx_cur * num_sc_cur + t1 * num_sc_cur + off_sc; - off_AH = r * num_tx_cur * num_sc_cur + t2 * num_sc_cur + off_sc; - A_re = &g_H.re[off_A]; - A_im = &g_H.im[off_A]; - AH_re = &g_H.re[off_AH]; - AH_im = &g_H.im[off_AH]; - - /* Calculate A^H*A */ - /* v2 - A real part */ - /* v3 - A imaginary part */ - /* v4 - A^H real part */ - /* v5 - A^H imaginary part */ - __asm__ volatile( - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - "vle32.v v4, (%2)\n" - "vle32.v v5, (%3)\n" - /* real part */ - "vfmacc.vv v0, v2, v4\n" - "vfmacc.vv v0, v3, v5\n" - /* imaginary part */ - "vfmacc.vv v1, v3, v4\n" - "vfnmsac.vv v1, v2, v5\n" - : - : "r"(A_re), "r"(A_im), "r"(AH_re), "r"(AH_im) - ); - } - - /* Add R */ - /* v2 - R real part */ - /* v3 - R imaginary part */ - R_U_re = 
&g_R.re[off_HH_H_U]; - R_U_im = &g_R.im[off_HH_H_U]; - __asm__ volatile( - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - "vfadd.vv v2, v2, v0\n" - "vfadd.vv v3, v3, v1\n" - "vse32.v v2, (%2)\n" - "vse32.v v3, (%3)\n" - : - : "r"(R_U_re), "r"(R_U_im), "r"(HH_H_U_re), "r"(HH_H_U_im)); - if (t1 != t2) { - R_L_re = &g_R.re[off_HH_H_L]; - R_L_im = &g_R.im[off_HH_H_L]; - __asm__ volatile( - /* Lower triangle */ - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - "vfadd.vv v2, v2, v0\n" - "vfsub.vv v3, v3, v1\n" - "vse32.v v2, (%2)\n" - "vse32.v v3, (%3)\n" - : - : "r"(R_L_re), "r"(R_L_im), "r"(HH_H_L_re), "r"(HH_H_L_im) - ); - } - - sz -= vl; - off_sc += vl; - } - } -} - -/** Complex Cholesky decomposition L of a Hermitian positive-definite matrix HH_H - * - * HH_H = L*L^H - * L_ij = (HH_H_ij - \sum_{k=0}^{j-1} L_ik L_jk^*) / L_jj - * L_ii = sqrt(HH_H_ii - \sum_{k=0}^{i-1} L_ik L_ik^*) - * - * \global g_HH_H matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] - * \global g_L output lower triangular matrix. 
Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] - */ -void ccholesky_TxTx() -{ - size_t i, j, k; - size_t sz, vl; - size_t off_sc; - - /* Init float registers */ - register float f0 __asm__("f0") = 2.0f; - - for (i = 0; i < num_tx_cur; ++i) - for (j = 0; j <= i; ++j) { - sz = num_sc_cur; - off_sc = 0; - - while (sz > 0) { - /* Initialize L registers */ - /* v0 - L_ij real part = HH_H_ij.re */ - /* v1 - L_ij imaginary part = HH_H_ij.im */ - __asm__ volatile( - "vsetvli %0, %1, e32, m1, ta, ma\n" - : "=r"(vl) : "r"(sz)); - __asm__ volatile( - "vle32.v v0, (%0)\n" - "vle32.v v1, (%1)\n" - : - : "r"(&g_HH_H.re[i * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]), - "r"(&g_HH_H.im[i * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]) - ); - - /* Calculate sum_{k=0}^{j-1} L_ik L_jk^* */ - /* v2 - sum real part */ - /* v3 - sum imaginary part */ - __asm__ volatile( - "vmv.v.i v2, 0\n" - "vmv.v.i v3, 0\n" - ); - for (k = 0; k < j; ++k) { - __asm__ volatile( - "vle32.v v4, (%0)\n" - "vle32.v v5, (%1)\n" - "vle32.v v6, (%2)\n" - "vle32.v v7, (%3)\n" - /* real part */ - "vfmacc.vv v2, v4, v6\n" - "vfmacc.vv v2, v5, v7\n" - /* imaginary part */ - "vfmacc.vv v3, v5, v6\n" - "vfnmsac.vv v3, v4, v7\n" - : - : "r"(&g_L.re[i * num_tx_cur * num_sc_cur + k * num_sc_cur + off_sc]), - "r"(&g_L.im[i * num_tx_cur * num_sc_cur + k * num_sc_cur + off_sc]), - "r"(&g_L.re[j * num_tx_cur * num_sc_cur + k * num_sc_cur + off_sc]), - "r"(&g_L.im[j * num_tx_cur * num_sc_cur + k * num_sc_cur + off_sc]) - ); - } - - /* HH_H_ii - sum */ - __asm__ volatile( - "vfsub.vv v0, v0, v2\n" - "vfsub.vv v1, v1, v3\n" - ); - - if (i == j) { - /* Calculate L_ii = sqrt(HH_H_ii - sum_{k=0}^{i-1} L_ik L_ik^*) */ - __asm__ volatile( - /* Complex sqrt */ - - /* v2 = r = sqrt(re^2 + im^2) */ - "vfmul.vv v2, v0, v0\n" - "vfmacc.vv v2, v1, v1\n" - "vfsqrt.v v2, v2\n" - - /* v3 - real part */ - "vfadd.vv v3, v2, v0\n" /* r + re */ - "vfdiv.vf v3, v3, f0\n" /* (r + re) / 2 */ - "vfsqrt.v v3, v3\n" /* sqrt((r + re) / 2) */ - 
/* v4 - imaginary part */ - "vfsub.vv v4, v2, v0\n" /* r - re */ - "vfdiv.vf v4, v4, f0\n" /* (r - re) / 2 */ - "vfsqrt.v v4, v4\n" /* sqrt((r - re) / 2) */ - "vfsgnj.vv v4, v4, v1\n" /* sgn(im) * sqrt((r - re) / 2) */ - - /* TODO handle im == 0 */ - - /* Move the result to v0 and v1 */ - "vmv.v.v v0, v3\n" - "vmv.v.v v1, v4\n" - ); - } else { - /* Calculate L_ij = (HH_H_ij - sum) / L_jj */ - __asm__ volatile( - /* L_jj */ - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - /* calculate L_jj_re^2 + L_jj_im^2 -> v4 */ - "vfmul.vv v4, v2, v2\n" - "vfmacc.vv v4, v3, v3\n" - /* real part */ - "vfmul.vv v5, v0, v2\n" - "vfmacc.vv v5, v1, v3\n" - /* imaginary part */ - "vfmul.vv v6, v1, v2\n" - "vfnmsac.vv v6, v0, v3\n" - /* divide and store at v0 and v1 */ - "vfdiv.vv v0, v5, v4\n" - "vfdiv.vv v1, v6, v4\n" - : - : "r"(&g_L.re[j * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]), - "r"(&g_L.im[j * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]) - ); - } - - /* Store result */ - __asm__ volatile( - "vse32.v v0, (%0)\n" - "vse32.v v1, (%1)\n" - : - : "r"(&g_L.re[i * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]), - "r"(&g_L.im[i * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]) - ); - - sz -= vl; - off_sc += vl; - } - } -} - -/** Complex matrix-vector multiplication HHy = HH*y - * - * \global g_HH matrix. Shape [NUM_TX_ANT][NUM_RX_ANT][NUM_SC] - * \global g_y vector. Shape [NUM_RX_ANT][NUM_SC] - * \global g_HHy output vector. 
Shape [NUM_TX_ANT][NUM_SC] - */ -void cmatvecmul_TxRx() -{ - size_t i, j; - size_t off_HH, off_y, off_HHy, off_sc; - size_t sz, vl; - - for (i = 0; i < num_tx_cur; ++i) { - off_HHy = i * num_sc_cur; - off_sc = 0; - sz = num_sc_cur; - - while (sz > 0) { - /* Initialize result registers */ - /* v0 - HHy real part */ - /* v1 - HHy imaginary part */ - __asm__ volatile( - "vsetvli %0, %1, e32, m1, ta, ma\n" - "vmv.v.i v0, 0\n" - "vmv.v.i v1, 0\n" - : "=r"(vl) - : "r"(sz) - ); - - for (j = 0; j < num_rx_cur; ++j) { - off_HH = i * num_rx_cur * num_sc_cur + j * num_sc_cur + off_sc; - off_y = j * num_sc_cur + off_sc; - __asm__ volatile( - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - "vle32.v v4, (%2)\n" - "vle32.v v5, (%3)\n" - /* real part */ - "vfmacc.vv v0, v2, v4\n" - "vfnmsac.vv v0, v3, v5\n" - /* imaginary part */ - "vfmacc.vv v1, v3, v4\n" - "vfmacc.vv v1, v2, v5\n" - : - : "r"(&g_HH.re[off_HH]), "r"(&g_HH.im[off_HH]), - "r"(&g_y.re[off_y]), "r"(&g_y.im[off_y]) - ); - } - - /* Store result */ - __asm__ volatile( - "vse32.v v0, (%0)\n" - "vse32.v v1, (%1)\n" - : - : "r"(&g_HHy.re[off_HHy]), "r"(&g_HHy.im[off_HHy]) - ); - - sz -= vl; - off_HHy += vl; - off_sc += vl; - } - } -} - -/** Complex forward substitution L*z = HHy - * - * z_i = (HHy_i - \sum_{k=0}^{i-1} L_{ik} z_k) / L_{ii} - * - * \global g_L lower triangular matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] - * \global g_HHy vector. Shape [NUM_TX_ANT][NUM_SC] - * \global g_z output vector. 
Shape [NUM_TX_ANT][NUM_SC] - */ -void cforwardsub_TxTx() -{ - size_t i, j; - size_t sz, vl; - size_t off_sc; - - for (i = 0; i < num_tx_cur; ++i) { - off_sc = 0; - sz = num_sc_cur; - - while (sz > 0) { - // printf("sz: %lu\n", sz); - // printf("vl: %lu\n", vl); - /* Initialize result registers as b */ - /* v0 - result real part */ - /* v1 - result imaginary part */ - __asm__ volatile ( - "vsetvli %0, %1, e32, m1, ta, ma\n" - : "=r"(vl) - : "r"(sz)); - __asm__ volatile ( - "vle32.v v0, (%0)\n" - "vle32.v v1, (%1)\n" - : - : "r"(&g_HHy.re[i * num_sc_cur + off_sc]), - "r"(&g_HHy.im[i * num_sc_cur + off_sc])); - // printf("vl: %lu\n", vl); - - for (j = 0; j != i; ++j) { - /* b - sum L_ij * z_j */ - /* v2 - L real part */ - /* v3 - L imaginary part */ - /* v4 - result_j real part */ - /* v5 - result_j imaginary part */ - __asm__ volatile ( - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - "vle32.v v4, (%2)\n" - "vle32.v v5, (%3)\n" - /* real part */ - "vfnmsac.vv v0, v2, v4\n" - "vfmacc.vv v0, v3, v5\n" - /* imaginary part */ - "vfnmsac.vv v1, v3, v4\n" - "vfnmsac.vv v1, v2, v5\n" - : - : "r"(&g_L.re[i * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]), - "r"(&g_L.im[i * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]), - "r"(&g_z.re[j * num_sc_cur + off_sc]), - "r"(&g_z.im[j * num_sc_cur + off_sc]) - ); - } - - /* Divide by L_ii */ - /* v2 - L_ii real part */ - /* v3 - L_ii imaginary part */ - __asm__ volatile ( - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - /* calculate L_ii_re^2 + L_ii_im^2 -> v4 */ - "vfmul.vv v4, v2, v2\n" - "vfmacc.vv v4, v3, v3\n" - /* real part */ - "vfmul.vv v5, v0, v2\n" - "vfmacc.vv v5, v1, v3\n" - "vdiv.vv v0, v5, v4\n" - /* imaginary part */ - "vfmul.vv v6, v1, v2\n" - "vfnmsac.vv v6, v0, v3\n" - "vfdiv.vv v1, v6, v4\n" - /* store result */ - "vse32.v v0, (%2)\n" - "vse32.v v1, (%3)\n" - : - : "r"(&g_L.re[i * num_tx_cur * num_sc_cur + i * num_sc_cur + off_sc]), - "r"(&g_L.im[i * num_tx_cur * num_sc_cur + i * num_sc_cur + off_sc]), - 
"r"(&g_z.re[i * num_sc_cur + off_sc]), - "r"(&g_z.im[i * num_sc_cur + off_sc]) - ); - - off_sc += vl; - sz -= vl; - } - } -} - -/** Complex backward substitution L^H*x_MMSE = z - * - * x_i = (z_i - \sum_{j=i+1}^{n-1} L_{ji} x_k) / L_{ii} - * - * \global g_L lower triangular matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] - * \global g_z rhs vector. Shape [NUM_TX_ANT][NUM_SC] - * \global g_x_MMSE output vector. Shape [NUM_TX_ANT][NUM_SC] - */ -void cbackwardsub_TxTx() -{ - size_t i, j; - size_t sz, vl; - size_t off_sc; - - for (i = num_tx_cur - 1; i != (size_t)-1; --i) { - off_sc = 0; - sz = num_sc_cur; - - while (sz > 0){ - /* Initialize result registers as z */ - /* v0 - result real part */ - /* v1 - result imaginary part */ - __asm__ volatile( - "vsetvli %0, %1, e32, m1, ta, ma\n" - : "=r"(vl) - : "r"(sz)); - __asm__ volatile( - "vle32.v v0, (%0)\n" - "vle32.v v1, (%1)\n" - : - : "r"(&g_z.re[i * num_sc_cur + off_sc]), - "r"(&g_z.im[i * num_sc_cur + off_sc])); - - for (j = i + 1; j < num_tx_cur; ++j) { - /* b - sum L_ji * z_j */ - /* v2 - L real part */ - /* v3 - L imaginary part */ - /* v4 - x_MMSE_j real part */ - /* v5 - x_MMSE_j imaginary part */ - __asm__ volatile( - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - "vle32.v v4, (%2)\n" - "vle32.v v5, (%3)\n" - /* real part */ - "vfnmsac.vv v0, v2, v4\n" - "vfmacc.vv v0, v3, v5\n" - /* imaginary part */ - "vfnmsac.vv v1, v3, v4\n" - "vfnmsac.vv v1, v2, v5\n" - : - : "r"(&g_L.re[j * num_tx_cur * num_sc_cur + i * num_sc_cur + off_sc]), - "r"(&g_L.im[j * num_tx_cur * num_sc_cur + i * num_sc_cur + off_sc]), - "r"(&g_x_MMSE.re[j * num_sc_cur + off_sc]), - "r"(&g_x_MMSE.im[j * num_sc_cur + off_sc]) - ); - } - - /* Divide by L_ii */ - /* v2 - L_ii real part */ - /* v3 - L_ii imaginary part */ - __asm__ volatile ( - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - /* calculate L_ii_re^2 + L_ii_im^2 -> v4 */ - "vfmul.vv v4, v2, v2\n" - "vfmacc.vv v4, v3, v3\n" - /* real part */ - "vfmul.vv v5, v0, v2\n" - "vfmacc.vv v5, v1, 
v3\n" - "vdiv.vv v0, v5, v4\n" - /* imaginary part */ - "vfmul.vv v6, v1, v2\n" - "vfnmsac.vv v6, v0, v3\n" - "vfdiv.vv v1, v6, v4\n" - /* store HH_H */ - "vse32.v v0, (%2)\n" - "vse32.v v1, (%3)\n" - : - : "r"(&g_L.re[i * num_tx_cur * num_sc_cur + i * num_sc_cur + off_sc]), - "r"(&g_L.im[i * num_tx_cur * num_sc_cur + i * num_sc_cur + off_sc]), - "r"(&g_x_MMSE.re[i * num_sc_cur + off_sc]), - "r"(&g_x_MMSE.im[i * num_sc_cur + off_sc]) - ); - - sz -= vl; - off_sc += vl; - } - } -} - -/* - * MMSE - */ - -void mmse() -{ - /* H^H*H + R */ - TIME( - "Gram and add (RxTx x TxRx + TxTx): %ld\n", - cmatgram_TxRx_cadd); - /* L: (H^H*H + R) = L*L^H */ - TIME( - "Cholesky (TxTx): %ld\n", - ccholesky_TxTx); - /* z: L*z = H^H*y */ - TIME( - "Matrix-vector multiplication (TxRx x Rx): %ld\n", - cmatvecmul_TxRx); - TIME( - "Forward substitution (TxTx): %ld\n", - cforwardsub_TxTx); - /* x_MMSE: L^H*x_MMSE = z */ - TIME( - "Backward substitution (TxTx): %ld\n", - cbackwardsub_TxTx); -} - -acc_t mse() -{ - acc_t sum = 0.; - size_t off = 0; - data_t sub1, sub2; - size_t sz = num_tx_cur * num_sc_cur, vl; - register data_t num_tx_num_sc_reg __asm__("f0") = (data_t)(num_tx_cur * NUM_SC); - - while (sz > 0) { - __asm__ volatile ( - "vsetvli %0, %1, e32, m1, ta, ma\n" - : "=r"(vl) - : "r"(sz)); - __asm__ volatile ( - "vle32.v v0, (%1)\n" - "vle32.v v1, (%2)\n" - "vle32.v v2, (%3)\n" - "vle32.v v3, (%4)\n" - "vfsub.vv v0, v0, v2\n" - "vfsub.vv v1, v1, v3\n" - "vfmul.vv v0, v0, v0\n" - "vfmul.vv v1, v1, v1\n" - "vfadd.vv v0, v0, v1\n" - "vfdiv.vf v0, v0, %5\n" - "vfredusum.vs v0, v4, v5\n" - "vmv.x.s %0, v0\n" - : "+r"(sum) - : "r"(&g_x.re[off]), - "r"(&g_x.re[off]), - "r"(&g_x_MMSE.re[off]), - "r"(&g_x_MMSE.re[off]), - "f"(num_tx_num_sc_reg)); - sz -= vl; - off += vl; - } - - return sum; -}