commit cd5c7cbc34e6593198ef39c5fc61661a6998ae86
parent c3c44a832f0f41859028f99bd12f46af43e5131b
Author: Egor Achkasov <eaachkasov@gmail.com>
Date: Tue, 20 May 2025 00:48:28 +0200
Massive refactor. Squash all the branches into master.
Diffstat:
| M | GNUmakefile | | | 108 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------- |
| A | include/common.h | | | 81 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| D | include/define.h | | | 20 | -------------------- |
| M | main.c | | | 173 | +++++++++++++++++++++++++++++++++++++++++++++++++++---------------------------- |
| M | script/gen_data.py | | | 98 | ++++++++++++++++++++++--------------------------------------------------------- |
| M | script/util.py | | | 36 | +++++------------------------------- |
| A | src/cbackwardsub.c | | | 132 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | src/ccholesky.c | | | 236 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | src/cforwardsub.c | | | 131 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | src/cmatgram.c | | | 131 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | src/cmatvecmul.c | | | 100 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| D | src/mmserv.c | | | 631 | ------------------------------------------------------------------------------- |
12 files changed, 1036 insertions(+), 841 deletions(-)
diff --git a/GNUmakefile b/GNUmakefile
@@ -1,37 +1,91 @@
-CC = gcc
-CFLAGS = -Iinclude -fno-builtin -Wall -Wextra
-OBJDIR = obj
-BUILDDIR = .
+# Default values
+ARCH ?= x86
+DATA_TYPE ?= float
+PLATFORM ?= linux
+NUM_RX ?= 4
+NUM_TX ?= 4
+NUM_SC ?= 1024
-SRC = $(wildcard src/*.c)
-OBJ = $(SRC:src/%.c=$(OBJDIR)/%.o)
-OBJ += $(OBJDIR)/main.o
+# Valid values
+VALID_ARCHS := x86 rv rvv
+VALID_DATA_TYPES := float fixed
+VALID_PLATFORMS := linux ara baremetal
-TARGET = $(BUILDDIR)/mmse
+# Validate inputs
+ifneq ($(filter $(ARCH),$(VALID_ARCHS)),$(ARCH))
+ $(error Invalid ARCH: $(ARCH). Supported: $(VALID_ARCHS))
+endif
+ifneq ($(filter $(DATA_TYPE),$(VALID_DATA_TYPES)),$(DATA_TYPE))
+ $(error Invalid DATA_TYPE: $(DATA_TYPE). Supported: $(VALID_DATA_TYPES))
+endif
+ifneq ($(filter $(PLATFORM),$(VALID_PLATFORMS)),$(PLATFORM))
+ $(error Invalid PLATFORM: $(PLATFORM). Supported: $(VALID_PLATFORMS))
+endif
+ifneq ($(shell test $(NUM_RX) -gt 0 >/dev/null 2>&1 && echo valid),valid)
+ $(error NUM_RX must be an integer > 0)
+endif
+ifneq ($(shell test $(NUM_TX) -gt 0 >/dev/null 2>&1 && echo valid),valid)
+ $(error NUM_TX must be an integer > 0)
+endif
+ifneq ($(shell test $(NUM_SC) -gt 0 >/dev/null 2>&1 && echo valid),valid)
+ $(error NUM_SC must be an integer > 0)
+endif
-all: $(TARGET)
+# CFLAGS
+CFLAGS += -DARCH_$(ARCH)
+CFLAGS += -DDATA_TYPE_$(DATA_TYPE)
+CFLAGS += -DPLATFORM_$(PLATFORM)
+CFLAGS += -DNUM_RX=$(NUM_RX)
+CFLAGS += -DNUM_TX=$(NUM_TX)
+CFLAGS += -DNUM_SC=$(NUM_SC)
-# Debug target
-dbg: CFLAGS += -g
-dbg: $(TARGET)
+# Compiler selection
+ifeq ($(ARCH),x86)
+ CC := gcc
+else
+ CC := riscv64-unknown-elf-gcc
+endif
-# Compile the elf
-$(TARGET): $(OBJ)
- @mkdir -p $(BUILDDIR)
- $(CC) -o $@ $^
+# Output file
+OUTPUT := build/mmse_$(ARCH)_$(DATA_TYPE)_$(PLATFORM)_$(NUM_RX)x$(NUM_TX)x$(NUM_SC).elf
-# Compile the src/*.c object files
-$(OBJDIR)/%.o: src/%.c
- @mkdir -p $(OBJDIR)
- $(CC) $(CFLAGS) -c $< -o $@
+# Source files
+SRCS := main.c $(wildcard src/*.c)
-# Compile the main.c object files
-$(OBJDIR)/main.o: main.c
- @mkdir -p $(OBJDIR)
- $(CC) $(CFLAGS) -c $< -o $@
+# Phony targets
+.PHONY: all help gen_data
-clean:
- rm -rf $(OBJDIR)/* $(BUILDDIR)/mmse
+# Default target
+all: gen_data $(OUTPUT)
-.PHONY: all dbg clean
+# Run data generation
+gen_data:
+ python script/gen_data.py $(NUM_RX) $(NUM_TX) $(NUM_SC)
+# Compile
+$(OUTPUT): $(SRCS)
+ mkdir -p build
+ $(CC) $(CFLAGS) $^ -o $@
+
+# Help target
+help:
+ @echo "Usage:"
+ @echo " make [ARCH=<arch>] [DATA_TYPE=<type>] [PLATFORM=<platform>] [NUM_RX=<num_rx>] [NUM_TX=<num_tx>] [NUM_SC=<num_sc>]"
+ @echo ""
+ @echo "Supported ARCH values:"
+ @echo " - x86 (default)"
+ @echo " - rv"
+ @echo " - rvv"
+ @echo ""
+ @echo "Supported DATA_TYPE values:"
+ @echo " - float (default)"
+ @echo " - fixed"
+ @echo ""
+ @echo "Supported PLATFORM values:"
+ @echo " - linux (default)"
+ @echo " - ara"
+ @echo " - baremetal"
+ @echo ""
+ @echo "Supported NUM_RX values: integers > 0 (default = 4)"
+ @echo "Supported NUM_TX values: integers > 0 (default = 4)"
+ @echo "Supported NUM_SC values: integers > 0 (default = 1024)"
diff --git a/include/common.h b/include/common.h
@@ -0,0 +1,81 @@
+#ifndef COMMON_H
+#define COMMON_H
+
+#include <stdint.h> /* for uint64_t, uint32_t */
+
+
+/*
+ * Typedefs
+ */
+
+#if defined(DATA_TYPE_float)
+typedef float data_t;
+typedef float acc_t;
+#elif defined(DATA_TYPE_fixed)
+typedef int32_t data_t;
+typedef int64_t acc_t;
+#define FP_Q 31
+#else
+#error "Please define DATA_TYPE_float or DATA_TYPE_fixed"
+#endif
+
+typedef struct {
+ data_t *re;
+ data_t *im;
+} vcomplex;
+
+
+/*
+ * Global variables
+ */
+
+/* Raw data */
+/* Transmitted signal */
+extern data_t x_re[NUM_TX][NUM_SC];
+extern data_t x_im[NUM_TX][NUM_SC];
+/* Channel */
+extern data_t H_re[NUM_RX][NUM_TX][NUM_SC];
+extern data_t H_im[NUM_RX][NUM_TX][NUM_SC];
+/* Noise covariance matrix */
+extern data_t R_re[NUM_TX][NUM_TX][NUM_SC];
+extern data_t R_im[NUM_TX][NUM_TX][NUM_SC];
+/* Received signal */
+extern data_t y_re[NUM_RX][NUM_SC];
+extern data_t y_im[NUM_RX][NUM_SC];
+/* MMSE raw data */
+extern data_t G_re[NUM_TX][NUM_TX][NUM_SC];
+extern data_t G_im[NUM_TX][NUM_TX][NUM_SC];
+extern data_t L_re[NUM_TX][NUM_TX][NUM_SC];
+extern data_t L_im[NUM_TX][NUM_TX][NUM_SC];
+extern data_t g_D[NUM_TX][NUM_SC]; /* no imaginary part in D */
+extern data_t HHy_re[NUM_TX][NUM_SC];
+extern data_t HHy_im[NUM_TX][NUM_SC];
+extern data_t z_re[NUM_TX][NUM_SC];
+extern data_t z_im[NUM_TX][NUM_SC];
+/* Result of MMSE approximation */
+extern data_t x_MMSE_re[NUM_TX][NUM_SC];
+extern data_t x_MMSE_im[NUM_TX][NUM_SC];
+
+/* Same data but casted to vcomplex */
+extern vcomplex g_x;
+extern vcomplex g_H;
+extern vcomplex g_R;
+extern vcomplex g_y;
+extern vcomplex g_G;
+extern vcomplex g_L;
+extern vcomplex g_HHy;
+extern vcomplex g_z;
+extern vcomplex g_x_MMSE;
+
+
+/*
+ * Complex matrix operations
+ */
+
+extern void cmatgram();
+extern void ccholesky();
+extern void cmatvecmul();
+extern void cforwardsub();
+extern void cbackwardsub();
+
+#endif
diff --git a/include/define.h b/include/define.h
@@ -1,20 +0,0 @@
-#ifndef __DEFINE_H
-#define __DEFINE_H
-
-#define NUM_RX_ANT 4
-#define NUM_TX_ANT 4
-#define NUM_SC 1
-
-#define DEBUG
-
-#define IN
-#define OUT
-
-typedef float data_t;
-typedef float acc_t;
-typedef struct {
- data_t *re;
- data_t *im;
-} vcomplex;
-
-#endif
diff --git a/main.c b/main.c
@@ -1,72 +1,123 @@
-#include "include/define.h"
-
-#include "../common/printf.h"
-
-/* extern functions */
-extern void cmatgram_TxRx_cadd();
-extern void ccholesky_TxTx();
-extern void cmatvecmul_TxRx();
-extern void cforwardsub_TxTx();
-extern void cbackwardsub_TxTx();
-extern acc_t mse();
-
-/* Raw data */
-/* Transmitted signal */
-extern data_t x_re[NUM_TX_ANT][NUM_SC];
-extern data_t x_im[NUM_TX_ANT][NUM_SC];
-/* Channel */
-extern data_t H_re[NUM_RX_ANT][NUM_TX_ANT][NUM_SC];
-extern data_t H_im[NUM_RX_ANT][NUM_TX_ANT][NUM_SC];
-/* Noise covariance matrix */
-extern data_t R_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC];
-extern data_t R_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC];
-/* Received signal */
-extern data_t y_re[NUM_RX_ANT][NUM_SC];
-extern data_t y_im[NUM_RX_ANT][NUM_SC];
-
-/* Same data but casted to vcomplex */
+#include "include/common.h"
+
+#include <stddef.h> /* for size_t */
+
+
+/*
+ * Defines
+ */
+
+/* Got from https://elm-chan.org/junk/32bit/binclude.html */
+/* Import a binary file */
+#define IMPORT_BIN(sect, file, sym) __asm__ (\
+ ".section " #sect "\n" /* Change section */\
+ ".balign 4\n" /* Word alignment */\
+ ".global " #sym "\n" /* Export the object address */\
+ #sym ":\n" /* Define the object label */\
+ ".incbin \"" file "\"\n" /* Import the file */\
+ ".global _sizeof_" #sym "\n" /* Export the object size */\
+ ".set _sizeof_" #sym ", . - " #sym "\n" /* Define the object size */\
+ ".balign 4\n" /* Word alignment */\
+ ".section \".text\"\n") /* Restore section */
+
+
+/*
+ * Global variables
+ */
+
+/* Import data from binary files */
+IMPORT_BIN(.rodata, "data/x_re.bin", x_re);
+IMPORT_BIN(.rodata, "data/x_im.bin", x_im);
+IMPORT_BIN(.rodata, "data/H_re.bin", H_re);
+IMPORT_BIN(.rodata, "data/H_im.bin", H_im);
+IMPORT_BIN(.rodata, "data/R_re.bin", R_re);
+IMPORT_BIN(.rodata, "data/R_im.bin", R_im);
+IMPORT_BIN(.rodata, "data/y_re.bin", y_re);
+IMPORT_BIN(.rodata, "data/y_im.bin", y_im);
+
+/* Allocate space for MMSE raw data */
+data_t G_re[NUM_TX][NUM_TX][NUM_SC];
+data_t G_im[NUM_TX][NUM_TX][NUM_SC];
+data_t L_re[NUM_TX][NUM_TX][NUM_SC];
+data_t L_im[NUM_TX][NUM_TX][NUM_SC];
+data_t g_D[NUM_TX][NUM_SC]; /* no imaginary part in D */
+data_t HHy_re[NUM_TX][NUM_SC];
+data_t HHy_im[NUM_TX][NUM_SC];
+data_t z_re[NUM_TX][NUM_SC];
+data_t z_im[NUM_TX][NUM_SC];
+data_t x_MMSE_re[NUM_TX][NUM_SC];
+data_t x_MMSE_im[NUM_TX][NUM_SC];
+
+/* Initialize data */
vcomplex g_x = { .re = (data_t *)x_re, .im = (data_t *)x_im };
vcomplex g_H = { .re = (data_t *)H_re, .im = (data_t *)H_im };
vcomplex g_R = { .re = (data_t *)R_re, .im = (data_t *)R_im };
vcomplex g_y = { .re = (data_t *)y_re, .im = (data_t *)y_im };
-
-/* MMSE approximation will be stored in this global*/
-data_t x_MMSE_re[NUM_TX_ANT][NUM_SC];
-data_t x_MMSE_im[NUM_TX_ANT][NUM_SC];
+vcomplex g_G = { .re = (data_t *)G_re, .im = (data_t *)G_im };
+vcomplex g_L = { .re = (data_t *)L_re, .im = (data_t *)L_im };
+vcomplex g_HHy = { .re = (data_t *)HHy_re, .im = (data_t *)HHy_im };
+vcomplex g_z = { .re = (data_t *)z_re, .im = (data_t *)z_im };
vcomplex g_x_MMSE = { .re = (data_t *)x_MMSE_re, .im = (data_t *)x_MMSE_im };
-extern vcomplex g_HH, h_y, g_HHy;
-size_t num_rx_cur=1, num_tx_cur=1, num_sc_cur=1;
+
+/*
+ * Read cycles macro and function
+ */
+
+/** Read and return the cycle counter value */
+uint64_t readcycle() {
+#if defined(ARCH_rvv) || defined(ARCH_rv)
+ uint64_t val;
+ __asm__ volatile("rdcycle %0" : "=r"(val));
+ return val;
+#elif defined(ARCH_x86)
+ unsigned int hi, lo;
+ __asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi));
+ return ((uint64_t)hi << 32) | lo;
+#else
+#error "Unknown architecture"
+#endif
+}
+
+/** Read cycles for a function call and store the result in out */
+#define FUNC_CYCLES(func, out) \
+ do { \
+ uint64_t start = readcycle(); \
+ func; \
+ uint64_t end = readcycle(); \
+ out = end - start; \
+ } while (0)
+
+
+/*
+ * Printf function
+ */
+
+#if defined(PLATFORM_ara)
+#include "../../common/printf.h"
+#elif defined(PLATFORM_linux)
+#include <stdio.h>
+#elif defined(PLATFORM_baremetal)
+/* TODO */
+#else
+#error "Unknown platform"
+#endif
+
+
+/*
+ * Main
+ */
int main() {
- unsigned long long start, end;
-
- size_t t1,t2,t3,t4,t5;
- for (num_rx_cur = 1; num_rx_cur <= NUM_RX_ANT; ++num_rx_cur)
- for (num_tx_cur = 1; num_tx_cur <= NUM_TX_ANT; ++num_tx_cur)
- for (num_sc_cur = 1; num_sc_cur <= NUM_SC; num_sc_cur += 1) {
- __asm__ volatile("rdcycle %0" : "=r"(start));
- cmatgram_TxRx_cadd();
- __asm__ volatile("rdcycle %0" : "=r"(end));
- t1 = end - start;
- __asm__ volatile("rdcycle %0" : "=r"(start));
- ccholesky_TxTx();
- __asm__ volatile("rdcycle %0" : "=r"(end));
- t2 = end - start;
- __asm__ volatile("rdcycle %0" : "=r"(start));
- cmatvecmul_TxRx();
- __asm__ volatile("rdcycle %0" : "=r"(end));
- t3 = end - start;
- __asm__ volatile("rdcycle %0" : "=r"(start));
- cforwardsub_TxTx();
- __asm__ volatile("rdcycle %0" : "=r"(end));
- t4 = end - start;
- __asm__ volatile("rdcycle %0" : "=r"(start));
- cbackwardsub_TxTx();
- __asm__ volatile("rdcycle %0" : "=r"(end));
- t5 = end - start;
- printf("%llu,%llu,%llu,%llu,%llu,", t1, t2, t3, t4, t5);
- }
+ uint64_t start, end;
+ uint64_t t1,t2,t3,t4,t5;
+
+ FUNC_CYCLES(cmatgram(), t1);
+ FUNC_CYCLES(ccholesky(), t2);
+ FUNC_CYCLES(cmatvecmul(), t3);
+ FUNC_CYCLES(cforwardsub(), t4);
+ FUNC_CYCLES(cbackwardsub(), t5);
+ printf("%lu,%lu,%lu,%lu,%lu,", t1, t2, t3, t4, t5);
return 0;
}
diff --git a/script/gen_data.py b/script/gen_data.py
@@ -1,42 +1,34 @@
#!/usr/bin/python
-from util import read_defines
-
import numpy as np
from numpy.random import random, normal
from sys import argv
from os import path, makedirs
-if "--help" in argv:
- print("Usage: python scripts/gen_data.py [--txt] [--bin] [--s]")
- print(" --txt: write the data/*.txt files")
- print(" --bin: write the data/*.bin files")
- print(" --s: print the .S file to stdout")
- exit(0)
-
-# Change these flags to control the output
-WRITE_DATA_TXT = "--txt" in argv
-WRITE_DATA_BIN = "--bin" in argv
-WRITE_DATA_S = "--s" in argv or len(argv) == 1
+if len(argv) != 4:
+ print("Usage: python gen_data.py <NUM_RX> <NUM_TX> <NUM_SC>")
+ exit(1)
+NUM_RX = int(argv[1]) # number of receive antennas
+NUM_TX = int(argv[2]) # number of transmit antennas
+NUM_SC = int(argv[3]) # number of subcarriers
-NUM_RX_ANT, NUM_TX_ANT, NUM_SC = read_defines()
NOISE_STD_DEVIATION = np.sqrt(.5) / 100 # noise standard deviation
# Transmitter signal
-x = random((NUM_TX_ANT, NUM_SC)) \
- + 1.j * random((NUM_TX_ANT, NUM_SC))
+x = random((NUM_TX, NUM_SC)) \
+ + 1.j * random((NUM_TX, NUM_SC))
x = (x - 0.5 - 0.5j) * 2 # scale it from [0, 1] to [-1, 1]
# Channel
-H = random((NUM_RX_ANT, NUM_TX_ANT, NUM_SC)) \
- + 1.j * random((NUM_RX_ANT, NUM_TX_ANT, NUM_SC))
+H = random((NUM_RX, NUM_TX, NUM_SC)) \
+ + 1.j * random((NUM_RX, NUM_TX, NUM_SC))
H = (H - 0.5 - 0.5j) * 2 # scale it from [0, 1] to [-1, 1]
# Noise
-n = normal(0, NOISE_STD_DEVIATION, (NUM_RX_ANT, NUM_SC)) \
- + 1.j * normal(0, NOISE_STD_DEVIATION, (NUM_RX_ANT, NUM_SC))
+n = normal(0, NOISE_STD_DEVIATION, (NUM_RX, NUM_SC)) \
+ + 1.j * normal(0, NOISE_STD_DEVIATION, (NUM_RX, NUM_SC))
# Received signal
y = np.einsum("ijk,jk->ik", H, x) + n
# Noise covariance matrix
-R = np.eye(NUM_TX_ANT, NUM_TX_ANT, dtype=np.complex64) * NOISE_STD_DEVIATION**2
+R = np.eye(NUM_TX, NUM_TX, dtype=np.complex64) * NOISE_STD_DEVIATION**2
data_tuple = (
x.real.astype(np.float32), x.imag.astype(np.float32),
@@ -44,55 +36,19 @@ data_tuple = (
R.real.astype(np.float32), R.imag.astype(np.float32),
y.real.astype(np.float32), y.imag.astype(np.float32)
)
-
-
-class Section:
- def __init__(self, name, source, align, sizeof, length, duplicate=False):
- self.name = name
- self.source = source
- self.align = align
- self.sizeof = sizeof
- self.length = length
- self.duplicate = duplicate
-
-
-sections = [
- Section("x_re", "data/x_re.txt", "3", 32, x.size),
- Section("x_im", "data/x_im.txt", "3", 32, x.size),
- Section("H_re", "data/H_re.txt", "3", 32, H.size),
- Section("H_im", "data/H_im.txt", "3", 32, H.size),
- Section("R_re", "data/R_re.txt", "3", 32, R.size),
- Section("R_im", "data/R_im.txt", "3", 32, R.size),
- Section("y_re", "data/y_re.txt", "3", 32, y.size),
- Section("y_im", "data/y_im.txt", "3", 32, y.size),
-]
+data_filenames = (
+ "data/x_re.bin", "data/x_im.bin",
+ "data/H_re.bin", "data/H_im.bin",
+ "data/R_re.bin", "data/R_im.bin",
+ "data/y_re.bin", "data/y_im.bin"
+)
# Create "data" directory if it does not exist
-if WRITE_DATA_BIN or WRITE_DATA_TXT:
- data_dir = path.join(path.dirname(__file__), "..", "data")
- if not path.exists(data_dir):
- makedirs(data_dir)
-
-if WRITE_DATA_TXT:
- for data, sec in zip(data_tuple, sections):
- with open(sec.source, "w") as f:
- for sample in data:
- f.write(f"{sample}\n")
-
-if WRITE_DATA_BIN:
- for data, sec in zip(data_tuple, sections):
- with open(sec.source.replace(".txt", ".bin"), "wb") as f:
- f.write(data.tobytes())
-
-if WRITE_DATA_S:
- print(".section .data,\"aw\",@progbits")
- for data, sec in zip(data_tuple, sections):
- print(f".global {sec.name}")
- print(f"{sec.name}:")
- for sample in data:
- bs = sample.tobytes()
- for i in range(0, len(bs), 4):
- s = ""
- for n in range(4):
- s += "%02x" % bs[i+3-n]
- print(" .word 0x%s" % s)
+data_dir = path.join(path.dirname(__file__), "..", "data")
+if not path.exists(data_dir):
+ makedirs(data_dir)
+
+# Write data to bin files
+for data, filename in zip(data_tuple, data_filenames):
+ with open(filename, "wb") as f:
+ f.write(data.tobytes())
diff --git a/script/util.py b/script/util.py
@@ -2,33 +2,7 @@ from os import path
import numpy as np
-def read_defines():
- """Read the defines from the define.h file
-
- Returns:
- int: Number of receive antennas
- int: Number of transmit antennas
- int: Number of subcarriers
- """
- with open(path.join(path.dirname(__file__), "..", "include", "define.h"), "r") as f:
- lines = f.read().split("\n")
- for line in lines:
- if line.startswith("#define NUM_RX_ANT "):
- NUM_RX_ANT = int(line[19:])
- if line.startswith("#define NUM_TX_ANT "):
- NUM_TX_ANT = int(line[19:])
- if line.startswith("#define NUM_SC "):
- NUM_SC = int(line[15:])
-
- # Assert that all the defines are read
- assert NUM_RX_ANT
- assert NUM_TX_ANT
- assert NUM_SC
-
- return NUM_RX_ANT, NUM_TX_ANT, NUM_SC
-
-
-def load_xHRy(NUM_RX_ANT, NUM_TX_ANT, NUM_SC) -> tuple:
+def load_xHRy(NUM_RX, NUM_TX, NUM_SC) -> tuple:
"""Load x, H, R and y from the data files.
Assumes that the following files are present in the data directory:
x_re.bin, x_im.bin, H_re.bin, H_im.bin, R_re.bin, R_im.bin, y_re.bin, y_im.bin
@@ -39,15 +13,15 @@ def load_xHRy(NUM_RX_ANT, NUM_TX_ANT, NUM_SC) -> tuple:
"""
x = np.fromfile("data/x_re.bin", dtype=np.float32)
x = x + 1j*np.fromfile("data/x_im.bin", dtype=np.float32)
- x = x.reshape((NUM_TX_ANT, NUM_SC))
+ x = x.reshape((NUM_TX, NUM_SC))
H = np.fromfile("data/H_re.bin", dtype=np.float32)
H = H + 1j*np.fromfile("data/H_im.bin", dtype=np.float32)
- H = H.reshape((NUM_RX_ANT, NUM_TX_ANT, NUM_SC))
+ H = H.reshape((NUM_RX, NUM_TX, NUM_SC))
R = np.fromfile("data/R_re.bin", dtype=np.float32)
R = R + 1j*np.fromfile("data/R_im.bin", dtype=np.float32)
- R = R.reshape((NUM_TX_ANT, NUM_TX_ANT, NUM_SC))
+ R = R.reshape((NUM_TX, NUM_TX, NUM_SC))
y = np.fromfile("data/y_re.bin", dtype=np.float32)
y = y + 1j*np.fromfile("data/y_im.bin", dtype=np.float32)
- y = y.reshape((NUM_RX_ANT, NUM_SC))
+ y = y.reshape((NUM_RX, NUM_SC))
return x, H, R, y
diff --git a/src/cbackwardsub.c b/src/cbackwardsub.c
@@ -0,0 +1,132 @@
+#include "../include/common.h"
+
+#include <stddef.h>
+
+/** Complex backward substitution L^H*x_MMSE = z
+ *
+ * x_MMSE_t = (z_t - \sum_{tt=t+1}^{NUM_TX-1} L_{tt t} x_tt) / L_{t t} (for LL / float solution)
+ * x_MMSE_t = (z_t / D_t - \sum_{tt=t+1}^{NUM_TX-1} L_{tt t} x_tt) (for LDL / fixed solution)
+ *
+ * \global g_L lower triangular matrix. Shape [NUM_TX][NUM_TX][NUM_SC]
+ * \global g_D diagonal matrix (only if DATA_TYPE_fixed is defined). Shape [NUM_TX][NUM_SC]
+ * \global g_z rhs vector. Shape [NUM_TX][NUM_SC]
+ * \global g_x_MMSE output vector. Shape [NUM_TX][NUM_SC]
+ */
+void cbackwardsub()
+{
+#if defined(ARCH_x86) || defined(ARCH_rv)
+ size_t t, tt, s;
+ size_t off_L, off_z, off_x_MMSE, off_D;
+ acc_t sum_re, sum_im;
+ for (t = NUM_TX - 1; t != (size_t)-1; --t) {
+ for (s = 0; s < NUM_SC; ++s) {
+ sum_re = sum_im = 0;
+ for (tt = t + 1; tt < NUM_TX; ++tt) {
+ off_L = tt * NUM_TX * NUM_SC + t * NUM_SC + s;
+ off_x_MMSE = tt * NUM_SC + s;
+ sum_re += (acc_t)g_L.re[off_L] * (acc_t)g_x_MMSE.re[off_x_MMSE]
+ - (acc_t)g_L.im[off_L] * (acc_t)g_x_MMSE.im[off_x_MMSE];
+ sum_im += (acc_t)g_L.re[off_L] * (acc_t)g_x_MMSE.im[off_x_MMSE]
+ + (acc_t)g_L.im[off_L] * (acc_t)g_x_MMSE.re[off_x_MMSE];
+ }
+ off_z = t * NUM_SC + s;
+ off_x_MMSE = t * NUM_SC + s;
+#if defined(DATA_TYPE_float)
+ off_L = t * NUM_TX * NUM_SC + t * NUM_SC + s;
+ g_x_MMSE.re[off_x_MMSE] = (g_z.re[off_z] - (data_t)sum_re) / g_L.re[off_L];
+ g_x_MMSE.im[off_x_MMSE] = (g_z.im[off_z] - (data_t)sum_im) / g_L.re[off_L];
+#elif defined(DATA_TYPE_fixed)
+ off_D = t * NUM_SC + s;
+ g_x_MMSE.re[off_x_MMSE] = (data_t)((acc_t)(g_z.re[off_z] << FP_Q) / g_D[off_D])
+ - (data_t)(sum_re >> FP_Q);
+ g_x_MMSE.im[off_x_MMSE] = (data_t)((acc_t)(g_z.im[off_z] << FP_Q) / g_D[off_D])
+ - (data_t)(sum_im >> FP_Q);
+#else
+#error "Unknown data type"
+#endif
+ }
+ }
+#elif defined(ARCH_rvv)
+ size_t i, j;
+ size_t sz, vl;
+ size_t off_sc;
+
+ for (i = NUM_TX - 1; i != (size_t)-1; --i) {
+ off_sc = 0;
+ sz = NUM_SC;
+
+ while (sz > 0){
+ /* Initialize result registers as z */
+ /* v0 - result real part */
+ /* v1 - result imaginary part */
+ __asm__ volatile(
+ "vsetvli %0, %1, e32, m1, ta, ma\n"
+ : "=r"(vl)
+ : "r"(sz));
+ __asm__ volatile(
+ "vle32.v v0, (%0)\n"
+ "vle32.v v1, (%1)\n"
+ :
+ : "r"(&g_z.re[i * NUM_SC + off_sc]),
+ "r"(&g_z.im[i * NUM_SC + off_sc]));
+
+ for (j = i + 1; j < NUM_TX; ++j) {
+ /* b - sum L_ji * z_j */
+ /* v2 - L real part */
+ /* v3 - L imaginary part */
+ /* v4 - x_MMSE_j real part */
+ /* v5 - x_MMSE_j imaginary part */
+ __asm__ volatile(
+ "vle32.v v2, (%0)\n"
+ "vle32.v v3, (%1)\n"
+ "vle32.v v4, (%2)\n"
+ "vle32.v v5, (%3)\n"
+ /* real part */
+ "vfnmsac.vv v0, v2, v4\n"
+ "vfmacc.vv v0, v3, v5\n"
+ /* imaginary part */
+ "vfnmsac.vv v1, v3, v4\n"
+ "vfnmsac.vv v1, v2, v5\n"
+ :
+ : "r"(&g_L.re[j * NUM_TX * NUM_SC + i * NUM_SC + off_sc]),
+ "r"(&g_L.im[j * NUM_TX * NUM_SC + i * NUM_SC + off_sc]),
+ "r"(&g_x_MMSE.re[j * NUM_SC + off_sc]),
+ "r"(&g_x_MMSE.im[j * NUM_SC + off_sc])
+ );
+ }
+
+ /* Divide by L_ii */
+ /* v2 - L_ii real part */
+ /* v3 - L_ii imaginary part */
+ __asm__ volatile (
+ "vle32.v v2, (%0)\n"
+ "vle32.v v3, (%1)\n"
+ /* calculate L_ii_re^2 + L_ii_im^2 -> v4 */
+ "vfmul.vv v4, v2, v2\n"
+ "vfmacc.vv v4, v3, v3\n"
+ /* real part */
+ "vfmul.vv v5, v0, v2\n"
+ "vfmacc.vv v5, v1, v3\n"
+ "vdiv.vv v0, v5, v4\n"
+ /* imaginary part */
+ "vfmul.vv v6, v1, v2\n"
+ "vfnmsac.vv v6, v0, v3\n"
+ "vfdiv.vv v1, v6, v4\n"
+ /* store x_MMSE */
+ "vse32.v v0, (%2)\n"
+ "vse32.v v1, (%3)\n"
+ :
+ : "r"(&g_L.re[i * NUM_TX * NUM_SC + i * NUM_SC + off_sc]),
+ "r"(&g_L.im[i * NUM_TX * NUM_SC + i * NUM_SC + off_sc]),
+ "r"(&g_x_MMSE.re[i * NUM_SC + off_sc]),
+ "r"(&g_x_MMSE.im[i * NUM_SC + off_sc])
+ );
+
+ sz -= vl;
+ off_sc += vl;
+ }
+ }
+#else
+#error "Unknown architecture"
+#endif
+}
diff --git a/src/ccholesky.c b/src/ccholesky.c
@@ -0,0 +1,236 @@
+#include "../include/common.h"
+
+#include <stddef.h>
+
+/** Complex Cholesky decomposition of a Hermitian positive-definite matrix G
+ *
+ * LL (floating point solution):
+ * G = L*L^H
+ * L_ij = (G_ij - \sum_{k=0}^{j-1} L_ik L_jk^*) / L_jj
+ * L_ii = sqrt(G_ii - \sum_{k=0}^{i-1} L_ik L_ik^*)
+ *
+ * LDL (fixed point solution):
+ * G = L*D*L^H
+ * L_ij = (G_ij - \sum_{k=0}^{j-1} L_ik D_k L_jk^*) / D_j
+ * D_i = G_ii - \sum_{k=0}^{i-1} L_ik D_k L_ik^*
+ *
+ * \global g_G matrix. Shape [NUM_TX][NUM_TX][NUM_SC]
+ * \global g_L output lower triangular matrix. Shape [NUM_TX][NUM_TX][NUM_SC]
+ * \global g_D output diagonal matrix (if DATA_TYPE_fixed is defined). Shape [NUM_TX][NUM_SC]
+ */
+void ccholesky()
+{
+#if defined(ARCH_x86) || defined(ARCH_rv)
+ size_t i, j, k, s;
+ size_t off_ij, off_jj, off_ii;
+ size_t off_ik, off_jk;
+ size_t off_i, off_j, off_k;
+ data_t tmp; /* Temporary variable for sqrt */
+ acc_t sum_re, sum_im;
+ for (i = 0; i < NUM_TX; ++i) {
+ for (j = 0; j <= i; ++j) {
+ for (s = 0; s < NUM_SC; ++s) {
+ off_ij = i * NUM_TX * NUM_SC + j * NUM_SC + s;
+ sum_im = sum_re = 0;
+
+ /* Calculate the sum */
+ for (k = 0; k < j; ++k) {
+ off_ik = i * NUM_TX * NUM_SC + k * NUM_SC + s;
+ off_jk = j * NUM_TX * NUM_SC + k * NUM_SC + s;
+#if defined(DATA_TYPE_float)
+ sum_re += g_L.re[off_ik] * g_L.re[off_jk]
+ - g_L.im[off_ik] * g_L.im[off_jk];
+#elif defined(DATA_TYPE_fixed)
+ sum_re += (g_L.re[off_ik] * g_L.re[off_jk]
+ - g_L.im[off_ik] * g_L.im[off_jk])
+ * g_D[k * NUM_SC + s];
+#else
+#error "Unknown data type"
+#endif
+ sum_im += g_L.re[off_ik] * g_L.im[off_jk]
+ + g_L.im[off_ik] * g_L.re[off_jk];
+ }
+
+ if (i == j) {
+ off_ii = i * NUM_TX * NUM_SC + i * NUM_SC + s;
+#if defined(DATA_TYPE_float)
+
+#if defined(ARCH_x86)
+ __asm__ volatile (
+ "flds %1\n"
+ "fsubs %2\n"
+ "fsqrt\n"
+ "fstps %0\n"
+ : "=m" (g_L.re[off_ii])
+ : "m" (g_G.re[off_ij]), "m" (sum_re)
+ );
+#elif defined(ARCH_rv)
+ __asm__ volatile (
+ "fsub.s %0, %1, %2\n" /* tmp = g_G.re[off_ij] - sum_re */
+ "fsqrt.s %0, %0\n" /* tmp = sqrtf(tmp) */
+ : "=&f"(tmp) : "f"(g_G.re[off_ij]), "f"(sum_re)
+ );
+ g_L.re[off_ii] = tmp;
+ g_L.im[off_ii] = 0;
+#else
+#error "Unknown architecture"
+#endif
+
+#elif defined(DATA_TYPE_fixed)
+ /* Calculate D_i = G_ii - sum */
+ g_D[i * NUM_SC + s] = g_G.re[off_ii] - (data_t)(sum_re >> FP_Q);
+#else
+#error "Unknown data type"
+#endif
+ } else { /* i != j */
+#if defined(DATA_TYPE_float)
+ /* Calculate L_ij = (G_ij - sum) / L_jj */
+ off_jj = j * NUM_TX * NUM_SC + j * NUM_SC + s;
+ g_L.re[off_ij] = (g_G.re[off_ij] - sum_re) / g_L.re[off_jj];
+ g_L.im[off_ij] = (g_G.im[off_ij] - sum_im) / g_L.re[off_jj];
+#elif defined(DATA_TYPE_fixed)
+ /* Calculate L_ij = (G_ij - sum) / D_j */
+ off_j = j * NUM_SC + s;
+ /* real */
+ sum_re = ((acc_t)g_G.re[off_ij] << FP_Q) - sum_re;
+ /* TODO rounding? */
+ g_L.re[off_ij] = (data_t)(sum_re / (acc_t)g_D[off_j]);
+ /* imaginary */
+ sum_im = ((acc_t)g_G.im[off_ij] << FP_Q) - sum_im;
+ /* TODO rounding? */
+ g_L.im[off_ij] = (data_t)(sum_im / (acc_t)g_D[off_j]);
+#else
+#error "Unknown data type"
+#endif
+ }
+ }
+ }
+ }
+#elif defined(ARCH_rvv)
+ size_t i, j, k;
+ size_t sz, vl;
+ size_t off_sc;
+
+ /* Init float registers */
+ register float f0 __asm__("f0") = 2.0f;
+
+ for (i = 0; i < NUM_TX; ++i)
+ for (j = 0; j <= i; ++j) {
+ sz = NUM_SC;
+ off_sc = 0;
+
+ while (sz > 0) {
+ /* Initialize L registers */
+ /* v0 - L_ij real part = G_ij.re */
+ /* v1 - L_ij imaginary part = G_ij.im */
+ __asm__ volatile(
+ "vsetvli %0, %1, e32, m1, ta, ma\n"
+ : "=r"(vl) : "r"(sz));
+ __asm__ volatile(
+ "vle32.v v0, (%0)\n"
+ "vle32.v v1, (%1)\n"
+ :
+ : "r"(&g_G.re[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]),
+ "r"(&g_G.im[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc])
+ );
+
+ /* Calculate sum_{k=0}^{j-1} L_ik L_jk^* */
+ /* v2 - sum real part */
+ /* v3 - sum imaginary part */
+ __asm__ volatile(
+ "vmv.v.i v2, 0\n"
+ "vmv.v.i v3, 0\n"
+ );
+ for (k = 0; k < j; ++k) {
+ __asm__ volatile(
+ "vle32.v v4, (%0)\n"
+ "vle32.v v5, (%1)\n"
+ "vle32.v v6, (%2)\n"
+ "vle32.v v7, (%3)\n"
+ /* real part */
+ "vfmacc.vv v2, v4, v6\n"
+ "vfmacc.vv v2, v5, v7\n"
+ /* imaginary part */
+ "vfmacc.vv v3, v5, v6\n"
+ "vfnmsac.vv v3, v4, v7\n"
+ :
+ : "r"(&g_L.re[i * NUM_TX * NUM_SC + k * NUM_SC + off_sc]),
+ "r"(&g_L.im[i * NUM_TX * NUM_SC + k * NUM_SC + off_sc]),
+ "r"(&g_L.re[j * NUM_TX * NUM_SC + k * NUM_SC + off_sc]),
+ "r"(&g_L.im[j * NUM_TX * NUM_SC + k * NUM_SC + off_sc])
+ );
+ }
+
+ /* G_ij - sum */
+ __asm__ volatile(
+ "vfsub.vv v0, v0, v2\n"
+ "vfsub.vv v1, v1, v3\n"
+ );
+
+ if (i == j) {
+ /* Calculate L_ii = sqrt(G_ii - sum_{k=0}^{i-1} L_ik L_ik^*) */
+ __asm__ volatile(
+ /* Complex sqrt */
+
+ /* v2 = r = sqrt(re^2 + im^2) */
+ "vfmul.vv v2, v0, v0\n"
+ "vfmacc.vv v2, v1, v1\n"
+ "vfsqrt.v v2, v2\n"
+
+ /* v3 - real part */
+ "vfadd.vv v3, v2, v0\n" /* r + re */
+ "vfdiv.vf v3, v3, f0\n" /* (r + re) / 2 */
+ "vfsqrt.v v3, v3\n" /* sqrt((r + re) / 2) */
+ /* v4 - imaginary part */
+ "vfsub.vv v4, v2, v0\n" /* r - re */
+ "vfdiv.vf v4, v4, f0\n" /* (r - re) / 2 */
+ "vfsqrt.v v4, v4\n" /* sqrt((r - re) / 2) */
+ "vfsgnj.vv v4, v4, v1\n" /* sgn(im) * sqrt((r - re) / 2) */
+
+ /* TODO handle im == 0 */
+
+ /* Move the result to v0 and v1 */
+ "vmv.v.v v0, v3\n"
+ "vmv.v.v v1, v4\n"
+ );
+ } else {
+ /* Calculate L_ij = (G_ij - sum) / L_jj */
+ __asm__ volatile(
+ /* L_jj */
+ "vle32.v v2, (%0)\n"
+ "vle32.v v3, (%1)\n"
+ /* calculate L_jj_re^2 + L_jj_im^2 -> v4 */
+ "vfmul.vv v4, v2, v2\n"
+ "vfmacc.vv v4, v3, v3\n"
+ /* real part */
+ "vfmul.vv v5, v0, v2\n"
+ "vfmacc.vv v5, v1, v3\n"
+ /* imaginary part */
+ "vfmul.vv v6, v1, v2\n"
+ "vfnmsac.vv v6, v0, v3\n"
+ /* divide and store at v0 and v1 */
+ "vfdiv.vv v0, v5, v4\n"
+ "vfdiv.vv v1, v6, v4\n"
+ :
+ : "r"(&g_L.re[j * NUM_TX * NUM_SC + j * NUM_SC + off_sc]),
+ "r"(&g_L.im[j * NUM_TX * NUM_SC + j * NUM_SC + off_sc])
+ );
+ }
+
+ /* Store result */
+ __asm__ volatile(
+ "vse32.v v0, (%0)\n"
+ "vse32.v v1, (%1)\n"
+ :
+ : "r"(&g_L.re[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]),
+ "r"(&g_L.im[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc])
+ );
+
+ sz -= vl;
+ off_sc += vl;
+ }
+ }
+#else
+#error "Unknown architecture"
+#endif
+}
diff --git a/src/cforwardsub.c b/src/cforwardsub.c
@@ -0,0 +1,131 @@
+#include "../include/common.h"
+
+#include <stddef.h>
+
+/** Complex forward substitution L*z = HHy
+ *
+ * z_t = (HHy_t - \sum_{tt=0}^{t-1} L_{t tt} z_tt) / L_{t t} (for LL / float solution)
+ * z_t = (HHy_t - \sum_{tt=0}^{t-1} L_{t tt} z_tt) (for LDL / fixed solution)
+ *
+ * \global g_L lower triangular matrix. Shape [NUM_TX][NUM_TX][NUM_SC]
+ * \global g_HHy vector. Shape [NUM_TX][NUM_SC]
+ * \global g_z output vector. Shape [NUM_TX][NUM_SC]
+ */
+void cforwardsub()
+{
+#if defined(ARCH_x86) || defined(ARCH_rv)
+ size_t t, tt, s;
+ size_t off_L, off_HHy, off_z;
+ acc_t sum_re, sum_im;
+ for (t = 0; t < NUM_TX; ++t) {
+ for (s = 0; s < NUM_SC; ++s) {
+ sum_re = sum_im = 0;
+ for (tt = 0; tt < t; ++tt) {
+ off_L = t * NUM_TX * NUM_SC + tt * NUM_SC + s;
+ off_z = tt * NUM_SC + s;
+ sum_re += (acc_t)g_L.re[off_L] * (acc_t)g_z.re[off_z]
+ - (acc_t)g_L.im[off_L] * (acc_t)g_z.im[off_z];
+ sum_im += (acc_t)g_L.re[off_L] * (acc_t)g_z.im[off_z]
+ + (acc_t)g_L.im[off_L] * (acc_t)g_z.re[off_z];
+ }
+ off_HHy = t * NUM_SC + s;
+ off_z = t * NUM_SC + s;
+#if defined(DATA_TYPE_float)
+ off_L = t * NUM_TX * NUM_SC + t * NUM_SC + s;
+ g_z.re[off_z] = (g_HHy.re[off_HHy] - (data_t)sum_re) / g_L.re[off_L];
+ g_z.im[off_z] = (g_HHy.im[off_HHy] - (data_t)sum_im) / g_L.re[off_L];
+#elif defined(DATA_TYPE_fixed)
+ g_z.re[off_z] = g_HHy.re[off_HHy] - (data_t)(sum_re >> FP_Q);
+ g_z.im[off_z] = g_HHy.im[off_HHy] - (data_t)(sum_im >> FP_Q);
+#else
+#error "Unknown data type"
+#endif
+ }
+ }
+#elif defined(ARCH_rvv)
+ size_t i, j;
+ size_t sz, vl;
+ size_t off_sc;
+
+ for (i = 0; i < NUM_TX; ++i) {
+ off_sc = 0;
+ sz = NUM_SC;
+
+ while (sz > 0) {
+ // printf("sz: %lu\n", sz);
+ // printf("vl: %lu\n", vl);
+ /* Initialize result registers as b */
+ /* v0 - result real part */
+ /* v1 - result imaginary part */
+ __asm__ volatile (
+ "vsetvli %0, %1, e32, m1, ta, ma\n"
+ : "=r"(vl)
+ : "r"(sz));
+ __asm__ volatile (
+ "vle32.v v0, (%0)\n"
+ "vle32.v v1, (%1)\n"
+ :
+ : "r"(&g_HHy.re[i * NUM_SC + off_sc]),
+ "r"(&g_HHy.im[i * NUM_SC + off_sc]));
+ // printf("vl: %lu\n", vl);
+
+ for (j = 0; j != i; ++j) {
+ /* b - sum L_ij * z_j */
+ /* v2 - L real part */
+ /* v3 - L imaginary part */
+ /* v4 - result_j real part */
+ /* v5 - result_j imaginary part */
+ __asm__ volatile (
+ "vle32.v v2, (%0)\n"
+ "vle32.v v3, (%1)\n"
+ "vle32.v v4, (%2)\n"
+ "vle32.v v5, (%3)\n"
+ /* real part */
+ "vfnmsac.vv v0, v2, v4\n"
+ "vfmacc.vv v0, v3, v5\n"
+ /* imaginary part */
+ "vfnmsac.vv v1, v3, v4\n"
+ "vfnmsac.vv v1, v2, v5\n"
+ :
+ : "r"(&g_L.re[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]),
+ "r"(&g_L.im[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]),
+ "r"(&g_z.re[j * NUM_SC + off_sc]),
+ "r"(&g_z.im[j * NUM_SC + off_sc])
+ );
+ }
+
+ /* Divide by L_ii */
+ /* v2 - L_ii real part */
+ /* v3 - L_ii imaginary part */
+ __asm__ volatile (
+ "vle32.v v2, (%0)\n"
+ "vle32.v v3, (%1)\n"
+ /* calculate L_ii_re^2 + L_ii_im^2 -> v4 */
+ "vfmul.vv v4, v2, v2\n"
+ "vfmacc.vv v4, v3, v3\n"
+ /* real part */
+ "vfmul.vv v5, v0, v2\n"
+ "vfmacc.vv v5, v1, v3\n"
+ "vdiv.vv v0, v5, v4\n"
+ /* imaginary part */
+ "vfmul.vv v6, v1, v2\n"
+ "vfnmsac.vv v6, v0, v3\n"
+ "vfdiv.vv v1, v6, v4\n"
+ /* store result */
+ "vse32.v v0, (%2)\n"
+ "vse32.v v1, (%3)\n"
+ :
+ : "r"(&g_L.re[i * NUM_TX * NUM_SC + i * NUM_SC + off_sc]),
+ "r"(&g_L.im[i * NUM_TX * NUM_SC + i * NUM_SC + off_sc]),
+ "r"(&g_z.re[i * NUM_SC + off_sc]),
+ "r"(&g_z.im[i * NUM_SC + off_sc])
+ );
+
+ off_sc += vl;
+ sz -= vl;
+ }
+ }
+#else
+#error "Unknown architecture"
+#endif
+}
diff --git a/src/cmatgram.c b/src/cmatgram.c
@@ -0,0 +1,131 @@
+#include "../include/common.h"
+
+#include <stddef.h>
+
+/** Complex Gram matrix H^H*H and add complex matrix R
+ *
+ * G = H^H*H + R
+ * G_{t1t2} = \sum_{r=0}^{NUM_RX - 1} (H_{rt1}^* H_{rt2}) + R_{t1t2}
+ *
+ * \global g_H matrix of channel coefficients. Shape [NUM_RX][NUM_TX][NUM_SC]
+ * \global g_R noise covariance matrix. Shape [NUM_TX][NUM_TX][NUM_SC]
+ * \global g_G output Gram matrix + R. Shape [NUM_TX][NUM_TX][NUM_SC]
+ */
+void cmatgram()
+{
+#if defined(ARCH_x86) || defined(ARCH_rv)
+ size_t r, t1, t2, s;
+ size_t off_G, off_R, off_H1, off_H2;
+ for (t1 = 0; t1 < NUM_TX; ++t1) {
+ for (t2 = 0; t2 < NUM_TX; ++t2) {
+ for (s = 0; s < NUM_SC; ++s) {
+ off_R = off_G = t1 * NUM_TX * NUM_SC + t2 * NUM_SC + s;
+ g_G.re[off_G] = g_R.re[off_R];
+ g_G.im[off_G] = g_R.im[off_R];
+
+ for (r = 0; r < NUM_RX; ++r) {
+ off_H1 = r * NUM_TX * NUM_SC + t1 * NUM_SC + s;
+ off_H2 = r * NUM_TX * NUM_SC + t2 * NUM_SC + s;
+ g_G.re[off_G] += g_H.re[off_H1] * g_H.re[off_H2]
+ + g_H.im[off_H1] * g_H.im[off_H2];
+ g_G.im[off_G] += g_H.re[off_H1] * g_H.im[off_H2]
+ - g_H.im[off_H1] * g_H.re[off_H2];
+ }
+ }
+ }
+ }
+#elif defined(ARCH_rvv)
+ size_t t1, t2, r;
+ size_t sz, vl;
+ size_t off_sc, off_A, off_AH;
+ size_t off_G_L, off_G_U;
+ data_t *A_re, *A_im, *AH_re, *AH_im;
+ data_t *R_L_re, *R_L_im, *R_U_re, *R_U_im;
+ data_t *G_L_re, *G_L_im, *G_U_re, *G_U_im;
+
+ for (t1 = 0; t1 != NUM_TX; ++t1)
+ for (t2 = t1; t2 != NUM_TX; ++t2) {
+ off_sc = 0;
+ off_G_L = t1 * NUM_TX * NUM_SC + t2 * NUM_SC;
+ off_G_U = t2 * NUM_TX * NUM_SC + t1 * NUM_SC;
+ G_L_re = &g_G.re[off_G_L];
+ G_L_im = &g_G.im[off_G_L];
+ G_U_re = &g_G.re[off_G_U];
+ G_U_im = &g_G.im[off_G_U];
+ sz = NUM_SC;
+
+ while (sz > 0) {
+ /* Initialize G registers */
+ /* v0 - G real part */
+ /* v1 - G imaginary part */
+ __asm__ volatile(
+ "vsetvli %0, %1, e32, m1, ta, ma\n"
+ "vmv.v.i v0, 0\n"
+ "vmv.v.i v1, 0\n"
+ : "=r"(vl) : "r"(sz));
+
+ for (r = 0; r != NUM_RX; ++r) {
+ off_A = r * NUM_TX * NUM_SC + t1 * NUM_SC + off_sc;
+ off_AH = r * NUM_TX * NUM_SC + t2 * NUM_SC + off_sc;
+ A_re = &g_H.re[off_A];
+ A_im = &g_H.im[off_A];
+ AH_re = &g_H.re[off_AH];
+ AH_im = &g_H.im[off_AH];
+
+ /* Calculate A^H*A */
+ /* v2 - A real part */
+ /* v3 - A imaginary part */
+ /* v4 - A^H real part */
+ /* v5 - A^H imaginary part */
+ __asm__ volatile(
+ "vle32.v v2, (%0)\n"
+ "vle32.v v3, (%1)\n"
+ "vle32.v v4, (%2)\n"
+ "vle32.v v5, (%3)\n"
+ /* real part */
+ "vfmacc.vv v0, v2, v4\n"
+ "vfmacc.vv v0, v3, v5\n"
+ /* imaginary part */
+ "vfmacc.vv v1, v3, v4\n"
+ "vfnmsac.vv v1, v2, v5\n"
+ :
+ : "r"(A_re), "r"(A_im), "r"(AH_re), "r"(AH_im)
+ );
+ }
+
+ /* Add R */
+ /* v2 - R real part */
+ /* v3 - R imaginary part */
+ R_U_re = &g_R.re[off_G_U];
+ R_U_im = &g_R.im[off_G_U];
+ __asm__ volatile(
+ "vle32.v v2, (%0)\n"
+ "vle32.v v3, (%1)\n"
+ "vfadd.vv v2, v2, v0\n"
+ "vfadd.vv v3, v3, v1\n"
+ "vse32.v v2, (%2)\n"
+ "vse32.v v3, (%3)\n"
+ :
+ : "r"(R_U_re), "r"(R_U_im), "r"(G_U_re), "r"(G_U_im));
+ if (t1 != t2) {
+ R_L_re = &g_R.re[off_G_L];
+ R_L_im = &g_R.im[off_G_L];
+ __asm__ volatile(
+ /* Lower triangle */
+ "vle32.v v2, (%0)\n"
+ "vle32.v v3, (%1)\n"
+ "vfadd.vv v2, v2, v0\n"
+ "vfsub.vv v3, v3, v1\n"
+ "vse32.v v2, (%2)\n"
+ "vse32.v v3, (%3)\n"
+ :
+ : "r"(R_L_re), "r"(R_L_im), "r"(G_L_re), "r"(G_L_im)
+ );
+ }
+
+ sz -= vl;
+ off_sc += vl;
+ }
+ }
+#endif
+}
diff --git a/src/cmatvecmul.c b/src/cmatvecmul.c
@@ -0,0 +1,100 @@
+#include "../include/common.h"
+
+#include <stddef.h>
+
+/** Complex matrix-vector multiplication HHy = HH*y
+ *
+ * HHy_t = \sum_{r=0}^{NUM_RX-1} H_{rt}^* * y_r
+ *
+ * \global g_H matrix. Shape [NUM_RX][NUM_TX][NUM_SC]
+ * \global g_y vector. Shape [NUM_RX][NUM_SC]
+ * \global g_HHy output vector. Shape [NUM_TX][NUM_SC]
+ */
+void cmatvecmul()
+{
+#if defined(ARCH_x86) || defined(ARCH_rv)
+ size_t t, r, s;
+ size_t off_H, off_y, off_HHy;
+ acc_t sum_re, sum_im;
+ for (t = 0; t < NUM_TX; ++t) {
+ for (s = 0; s < NUM_SC; ++s) {
+ sum_re = sum_im = 0;
+ for (r = 0; r < NUM_RX; ++r) {
+ off_H = r * NUM_TX * NUM_SC + t * NUM_SC + s;
+ off_y = r * NUM_SC + s;
+ sum_re += (acc_t)g_H.re[off_H] * (acc_t)g_y.re[off_y]
+ - (acc_t)g_H.im[off_H] * (acc_t)g_y.im[off_y];
+ sum_im += (acc_t)g_H.re[off_H] * (acc_t)g_y.im[off_y]
+ + (acc_t)g_H.im[off_H] * (acc_t)g_y.re[off_y];
+ }
+ off_HHy = t * NUM_SC + s;
+#if defined(DATA_TYPE_float)
+ g_HHy.re[off_HHy] = (data_t)sum_re;
+ g_HHy.im[off_HHy] = (data_t)sum_im;
+#elif defined(DATA_TYPE_fixed)
+ g_HHy.re[off_HHy] = (data_t)(sum_re >> FP_Q);
+ g_HHy.im[off_HHy] = (data_t)(sum_im >> FP_Q);
+#else
+#error "Unknown data type"
+#endif
+ }
+ }
+#elif defined(ARCH_rvv)
+ size_t i, j;
+ size_t off_HH, off_y, off_HHy, off_sc;
+ size_t sz, vl;
+
+ for (i = 0; i < NUM_TX; ++i) {
+ off_HHy = i * NUM_SC;
+ off_sc = 0;
+ sz = NUM_SC;
+
+ while (sz > 0) {
+ /* Initialize result registers */
+ /* v0 - HHy real part */
+ /* v1 - HHy imaginary part */
+ __asm__ volatile(
+ "vsetvli %0, %1, e32, m1, ta, ma\n"
+ "vmv.v.i v0, 0\n"
+ "vmv.v.i v1, 0\n"
+ : "=r"(vl)
+ : "r"(sz)
+ );
+
+ for (j = 0; j < NUM_RX; ++j) {
+ off_HH = i * NUM_RX * NUM_SC + j * NUM_SC + off_sc;
+ off_y = j * NUM_SC + off_sc;
+ __asm__ volatile(
+ "vle32.v v2, (%0)\n"
+ "vle32.v v3, (%1)\n"
+ "vle32.v v4, (%2)\n"
+ "vle32.v v5, (%3)\n"
+ /* real part */
+ "vfmacc.vv v0, v2, v4\n"
+ "vfnmsac.vv v0, v3, v5\n"
+ /* imaginary part */
+ "vfmacc.vv v1, v3, v4\n"
+ "vfmacc.vv v1, v2, v5\n"
+ :
+ : "r"(&g_HH.re[off_HH]), "r"(&g_HH.im[off_HH]),
+ "r"(&g_y.re[off_y]), "r"(&g_y.im[off_y])
+ );
+ }
+
+ /* Store result */
+ __asm__ volatile(
+ "vse32.v v0, (%0)\n"
+ "vse32.v v1, (%1)\n"
+ :
+ : "r"(&g_HHy.re[off_HHy]), "r"(&g_HHy.im[off_HHy])
+ );
+
+ sz -= vl;
+ off_HHy += vl;
+ off_sc += vl;
+ }
+ }
+#else
+#error "Unknown architecture"
+#endif
+}
diff --git a/src/mmserv.c b/src/mmserv.c
@@ -1,631 +0,0 @@
-#include "../include/define.h"
-
-#include <stddef.h> /* for size_t */
-#include <stdint.h> /* for uint64_t */
-
-/*
- * Debug
- */
-
-#ifdef DEBUG
-#include "printf.h"
-
-static uint64_t g_timer;
-void start_timer()
-{
- __asm__ volatile("rdcycle %0" : "=r"(g_timer));
-}
-void stop_timer()
-{
- __asm__ volatile(
- "rdcycle t0\n"
- "sub %0, t0, %0"
- : "+r"(g_timer)
- );
-}
-uint64_t get_timer()
-{
- return g_timer;
-}
-
-#define TIME(msg, func, ...) \
- start_timer(); \
- func(__VA_ARGS__); \
- stop_timer(); \
- printf(msg, get_timer());
-
-#else
-#define TIME(msg, func, ...) func(__VA_ARGS__);
-
-#endif
-
-/*
- * Global variables
- */
-
-/* Externs */
-extern vcomplex g_x, g_H, g_R, g_y;
-extern vcomplex g_x_MMSE;
-extern size_t num_rx_cur, num_tx_cur, num_sc_cur;
-
-
-/* Raw data */
-data_t HH_re[NUM_TX_ANT][NUM_RX_ANT][NUM_SC];
-data_t HH_im[NUM_TX_ANT][NUM_RX_ANT][NUM_SC];
-data_t HH_H_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC];
-data_t HH_H_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC];
-data_t L_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC];
-data_t L_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC];
-data_t HHy_re[NUM_TX_ANT][NUM_SC];
-data_t HHy_im[NUM_TX_ANT][NUM_SC];
-data_t z_re[NUM_TX_ANT][NUM_SC];
-data_t z_im[NUM_TX_ANT][NUM_SC];
-data_t LH_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC];
-data_t LH_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC];
-
-/* Same data but casted to vcomplex */
-vcomplex g_HH = { .re = (data_t *)HH_re, .im = (data_t *)HH_im };
-vcomplex g_HH_H = { .re = (data_t *)HH_H_re, .im = (data_t *)HH_H_im };
-vcomplex g_L = { .re = (data_t *)L_re, .im = (data_t *)L_im };
-vcomplex g_HHy = { .re = (data_t *)HHy_re, .im = (data_t *)HHy_im };
-vcomplex g_z = { .re = (data_t *)z_re, .im = (data_t *)z_im };
-vcomplex g_LH = { .re = (data_t *)LH_re, .im = (data_t *)LH_im };
-
-/*
- * Complex matrix operations
- */
-
-/** Complex Gram matrix H^H*H and add complex matrix R
- *
- * HH_H = H^H*H + R
- *
- * \global g_H matrix of channel coefficients. Shape [NUM_RX_ANT][NUM_TX_ANT][NUM_SC]
- * \global g_R noise covariance matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
- * \global g_HH_H output Gram matrix + R. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
- */
-void cmatgram_TxRx_cadd()
-{
- size_t t1, t2, r;
- size_t sz, vl;
- size_t off_sc, off_A, off_AH;
- size_t off_HH_H_L, off_HH_H_U;
- data_t *A_re, *A_im, *AH_re, *AH_im;
- data_t *R_L_re, *R_L_im, *R_U_re, *R_U_im;
- data_t *HH_H_L_re, *HH_H_L_im, *HH_H_U_re, *HH_H_U_im;
-
- for (t1 = 0; t1 != num_tx_cur; ++t1)
- for (t2 = t1; t2 != num_tx_cur; ++t2) {
- off_sc = 0;
- off_HH_H_L = t1 * num_tx_cur * num_sc_cur + t2 * num_sc_cur;
- off_HH_H_U = t2 * num_tx_cur * num_sc_cur + t1 * num_sc_cur;
- HH_H_L_re = &g_HH_H.re[off_HH_H_L];
- HH_H_L_im = &g_HH_H.im[off_HH_H_L];
- HH_H_U_re = &g_HH_H.re[off_HH_H_U];
- HH_H_U_im = &g_HH_H.im[off_HH_H_U];
- sz = num_sc_cur;
-
- while (sz > 0) {
- /* Initialize HH_H registers */
- /* v0 - HH_H real part */
- /* v1 - HH_H imaginary part */
- __asm__ volatile(
- "vsetvli %0, %1, e32, m1, ta, ma\n"
- "vmv.v.i v0, 0\n"
- "vmv.v.i v1, 0\n"
- : "=r"(vl) : "r"(sz));
-
- for (r = 0; r != num_rx_cur; ++r) {
- off_A = r * num_tx_cur * num_sc_cur + t1 * num_sc_cur + off_sc;
- off_AH = r * num_tx_cur * num_sc_cur + t2 * num_sc_cur + off_sc;
- A_re = &g_H.re[off_A];
- A_im = &g_H.im[off_A];
- AH_re = &g_H.re[off_AH];
- AH_im = &g_H.im[off_AH];
-
- /* Calculate A^H*A */
- /* v2 - A real part */
- /* v3 - A imaginary part */
- /* v4 - A^H real part */
- /* v5 - A^H imaginary part */
- __asm__ volatile(
- "vle32.v v2, (%0)\n"
- "vle32.v v3, (%1)\n"
- "vle32.v v4, (%2)\n"
- "vle32.v v5, (%3)\n"
- /* real part */
- "vfmacc.vv v0, v2, v4\n"
- "vfmacc.vv v0, v3, v5\n"
- /* imaginary part */
- "vfmacc.vv v1, v3, v4\n"
- "vfnmsac.vv v1, v2, v5\n"
- :
- : "r"(A_re), "r"(A_im), "r"(AH_re), "r"(AH_im)
- );
- }
-
- /* Add R */
- /* v2 - R real part */
- /* v3 - R imaginary part */
- R_U_re = &g_R.re[off_HH_H_U];
- R_U_im = &g_R.im[off_HH_H_U];
- __asm__ volatile(
- "vle32.v v2, (%0)\n"
- "vle32.v v3, (%1)\n"
- "vfadd.vv v2, v2, v0\n"
- "vfadd.vv v3, v3, v1\n"
- "vse32.v v2, (%2)\n"
- "vse32.v v3, (%3)\n"
- :
- : "r"(R_U_re), "r"(R_U_im), "r"(HH_H_U_re), "r"(HH_H_U_im));
- if (t1 != t2) {
- R_L_re = &g_R.re[off_HH_H_L];
- R_L_im = &g_R.im[off_HH_H_L];
- __asm__ volatile(
- /* Lower triangle */
- "vle32.v v2, (%0)\n"
- "vle32.v v3, (%1)\n"
- "vfadd.vv v2, v2, v0\n"
- "vfsub.vv v3, v3, v1\n"
- "vse32.v v2, (%2)\n"
- "vse32.v v3, (%3)\n"
- :
- : "r"(R_L_re), "r"(R_L_im), "r"(HH_H_L_re), "r"(HH_H_L_im)
- );
- }
-
- sz -= vl;
- off_sc += vl;
- }
- }
-}
-
-/** Complex Cholesky decomposition L of a Hermitian positive-definite matrix HH_H
- *
- * HH_H = L*L^H
- * L_ij = (HH_H_ij - \sum_{k=0}^{j-1} L_ik L_jk^*) / L_jj
- * L_ii = sqrt(HH_H_ii - \sum_{k=0}^{i-1} L_ik L_ik^*)
- *
- * \global g_HH_H matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
- * \global g_L output lower triangular matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
- */
-void ccholesky_TxTx()
-{
- size_t i, j, k;
- size_t sz, vl;
- size_t off_sc;
-
- /* Init float registers */
- register float f0 __asm__("f0") = 2.0f;
-
- for (i = 0; i < num_tx_cur; ++i)
- for (j = 0; j <= i; ++j) {
- sz = num_sc_cur;
- off_sc = 0;
-
- while (sz > 0) {
- /* Initialize L registers */
- /* v0 - L_ij real part = HH_H_ij.re */
- /* v1 - L_ij imaginary part = HH_H_ij.im */
- __asm__ volatile(
- "vsetvli %0, %1, e32, m1, ta, ma\n"
- : "=r"(vl) : "r"(sz));
- __asm__ volatile(
- "vle32.v v0, (%0)\n"
- "vle32.v v1, (%1)\n"
- :
- : "r"(&g_HH_H.re[i * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]),
- "r"(&g_HH_H.im[i * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc])
- );
-
- /* Calculate sum_{k=0}^{j-1} L_ik L_jk^* */
- /* v2 - sum real part */
- /* v3 - sum imaginary part */
- __asm__ volatile(
- "vmv.v.i v2, 0\n"
- "vmv.v.i v3, 0\n"
- );
- for (k = 0; k < j; ++k) {
- __asm__ volatile(
- "vle32.v v4, (%0)\n"
- "vle32.v v5, (%1)\n"
- "vle32.v v6, (%2)\n"
- "vle32.v v7, (%3)\n"
- /* real part */
- "vfmacc.vv v2, v4, v6\n"
- "vfmacc.vv v2, v5, v7\n"
- /* imaginary part */
- "vfmacc.vv v3, v5, v6\n"
- "vfnmsac.vv v3, v4, v7\n"
- :
- : "r"(&g_L.re[i * num_tx_cur * num_sc_cur + k * num_sc_cur + off_sc]),
- "r"(&g_L.im[i * num_tx_cur * num_sc_cur + k * num_sc_cur + off_sc]),
- "r"(&g_L.re[j * num_tx_cur * num_sc_cur + k * num_sc_cur + off_sc]),
- "r"(&g_L.im[j * num_tx_cur * num_sc_cur + k * num_sc_cur + off_sc])
- );
- }
-
- /* HH_H_ii - sum */
- __asm__ volatile(
- "vfsub.vv v0, v0, v2\n"
- "vfsub.vv v1, v1, v3\n"
- );
-
- if (i == j) {
- /* Calculate L_ii = sqrt(HH_H_ii - sum_{k=0}^{i-1} L_ik L_ik^*) */
- __asm__ volatile(
- /* Complex sqrt */
-
- /* v2 = r = sqrt(re^2 + im^2) */
- "vfmul.vv v2, v0, v0\n"
- "vfmacc.vv v2, v1, v1\n"
- "vfsqrt.v v2, v2\n"
-
- /* v3 - real part */
- "vfadd.vv v3, v2, v0\n" /* r + re */
- "vfdiv.vf v3, v3, f0\n" /* (r + re) / 2 */
- "vfsqrt.v v3, v3\n" /* sqrt((r + re) / 2) */
- /* v4 - imaginary part */
- "vfsub.vv v4, v2, v0\n" /* r - re */
- "vfdiv.vf v4, v4, f0\n" /* (r - re) / 2 */
- "vfsqrt.v v4, v4\n" /* sqrt((r - re) / 2) */
- "vfsgnj.vv v4, v4, v1\n" /* sgn(im) * sqrt((r - re) / 2) */
-
- /* TODO handle im == 0 */
-
- /* Move the result to v0 and v1 */
- "vmv.v.v v0, v3\n"
- "vmv.v.v v1, v4\n"
- );
- } else {
- /* Calculate L_ij = (HH_H_ij - sum) / L_jj */
- __asm__ volatile(
- /* L_jj */
- "vle32.v v2, (%0)\n"
- "vle32.v v3, (%1)\n"
- /* calculate L_jj_re^2 + L_jj_im^2 -> v4 */
- "vfmul.vv v4, v2, v2\n"
- "vfmacc.vv v4, v3, v3\n"
- /* real part */
- "vfmul.vv v5, v0, v2\n"
- "vfmacc.vv v5, v1, v3\n"
- /* imaginary part */
- "vfmul.vv v6, v1, v2\n"
- "vfnmsac.vv v6, v0, v3\n"
- /* divide and store at v0 and v1 */
- "vfdiv.vv v0, v5, v4\n"
- "vfdiv.vv v1, v6, v4\n"
- :
- : "r"(&g_L.re[j * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]),
- "r"(&g_L.im[j * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc])
- );
- }
-
- /* Store result */
- __asm__ volatile(
- "vse32.v v0, (%0)\n"
- "vse32.v v1, (%1)\n"
- :
- : "r"(&g_L.re[i * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]),
- "r"(&g_L.im[i * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc])
- );
-
- sz -= vl;
- off_sc += vl;
- }
- }
-}
-
-/** Complex matrix-vector multiplication HHy = HH*y
- *
- * \global g_HH matrix. Shape [NUM_TX_ANT][NUM_RX_ANT][NUM_SC]
- * \global g_y vector. Shape [NUM_RX_ANT][NUM_SC]
- * \global g_HHy output vector. Shape [NUM_TX_ANT][NUM_SC]
- */
-void cmatvecmul_TxRx()
-{
- size_t i, j;
- size_t off_HH, off_y, off_HHy, off_sc;
- size_t sz, vl;
-
- for (i = 0; i < num_tx_cur; ++i) {
- off_HHy = i * num_sc_cur;
- off_sc = 0;
- sz = num_sc_cur;
-
- while (sz > 0) {
- /* Initialize result registers */
- /* v0 - HHy real part */
- /* v1 - HHy imaginary part */
- __asm__ volatile(
- "vsetvli %0, %1, e32, m1, ta, ma\n"
- "vmv.v.i v0, 0\n"
- "vmv.v.i v1, 0\n"
- : "=r"(vl)
- : "r"(sz)
- );
-
- for (j = 0; j < num_rx_cur; ++j) {
- off_HH = i * num_rx_cur * num_sc_cur + j * num_sc_cur + off_sc;
- off_y = j * num_sc_cur + off_sc;
- __asm__ volatile(
- "vle32.v v2, (%0)\n"
- "vle32.v v3, (%1)\n"
- "vle32.v v4, (%2)\n"
- "vle32.v v5, (%3)\n"
- /* real part */
- "vfmacc.vv v0, v2, v4\n"
- "vfnmsac.vv v0, v3, v5\n"
- /* imaginary part */
- "vfmacc.vv v1, v3, v4\n"
- "vfmacc.vv v1, v2, v5\n"
- :
- : "r"(&g_HH.re[off_HH]), "r"(&g_HH.im[off_HH]),
- "r"(&g_y.re[off_y]), "r"(&g_y.im[off_y])
- );
- }
-
- /* Store result */
- __asm__ volatile(
- "vse32.v v0, (%0)\n"
- "vse32.v v1, (%1)\n"
- :
- : "r"(&g_HHy.re[off_HHy]), "r"(&g_HHy.im[off_HHy])
- );
-
- sz -= vl;
- off_HHy += vl;
- off_sc += vl;
- }
- }
-}
-
-/** Complex forward substitution L*z = HHy
- *
- * z_i = (HHy_i - \sum_{k=0}^{i-1} L_{ik} z_k) / L_{ii}
- *
- * \global g_L lower triangular matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
- * \global g_HHy vector. Shape [NUM_TX_ANT][NUM_SC]
- * \global g_z output vector. Shape [NUM_TX_ANT][NUM_SC]
- */
-void cforwardsub_TxTx()
-{
- size_t i, j;
- size_t sz, vl;
- size_t off_sc;
-
- for (i = 0; i < num_tx_cur; ++i) {
- off_sc = 0;
- sz = num_sc_cur;
-
- while (sz > 0) {
- // printf("sz: %lu\n", sz);
- // printf("vl: %lu\n", vl);
- /* Initialize result registers as b */
- /* v0 - result real part */
- /* v1 - result imaginary part */
- __asm__ volatile (
- "vsetvli %0, %1, e32, m1, ta, ma\n"
- : "=r"(vl)
- : "r"(sz));
- __asm__ volatile (
- "vle32.v v0, (%0)\n"
- "vle32.v v1, (%1)\n"
- :
- : "r"(&g_HHy.re[i * num_sc_cur + off_sc]),
- "r"(&g_HHy.im[i * num_sc_cur + off_sc]));
- // printf("vl: %lu\n", vl);
-
- for (j = 0; j != i; ++j) {
- /* b - sum L_ij * z_j */
- /* v2 - L real part */
- /* v3 - L imaginary part */
- /* v4 - result_j real part */
- /* v5 - result_j imaginary part */
- __asm__ volatile (
- "vle32.v v2, (%0)\n"
- "vle32.v v3, (%1)\n"
- "vle32.v v4, (%2)\n"
- "vle32.v v5, (%3)\n"
- /* real part */
- "vfnmsac.vv v0, v2, v4\n"
- "vfmacc.vv v0, v3, v5\n"
- /* imaginary part */
- "vfnmsac.vv v1, v3, v4\n"
- "vfnmsac.vv v1, v2, v5\n"
- :
- : "r"(&g_L.re[i * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]),
- "r"(&g_L.im[i * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]),
- "r"(&g_z.re[j * num_sc_cur + off_sc]),
- "r"(&g_z.im[j * num_sc_cur + off_sc])
- );
- }
-
- /* Divide by L_ii */
- /* v2 - L_ii real part */
- /* v3 - L_ii imaginary part */
- __asm__ volatile (
- "vle32.v v2, (%0)\n"
- "vle32.v v3, (%1)\n"
- /* calculate L_ii_re^2 + L_ii_im^2 -> v4 */
- "vfmul.vv v4, v2, v2\n"
- "vfmacc.vv v4, v3, v3\n"
- /* real part */
- "vfmul.vv v5, v0, v2\n"
- "vfmacc.vv v5, v1, v3\n"
- "vdiv.vv v0, v5, v4\n"
- /* imaginary part */
- "vfmul.vv v6, v1, v2\n"
- "vfnmsac.vv v6, v0, v3\n"
- "vfdiv.vv v1, v6, v4\n"
- /* store result */
- "vse32.v v0, (%2)\n"
- "vse32.v v1, (%3)\n"
- :
- : "r"(&g_L.re[i * num_tx_cur * num_sc_cur + i * num_sc_cur + off_sc]),
- "r"(&g_L.im[i * num_tx_cur * num_sc_cur + i * num_sc_cur + off_sc]),
- "r"(&g_z.re[i * num_sc_cur + off_sc]),
- "r"(&g_z.im[i * num_sc_cur + off_sc])
- );
-
- off_sc += vl;
- sz -= vl;
- }
- }
-}
-
-/** Complex backward substitution L^H*x_MMSE = z
- *
- * x_i = (z_i - \sum_{j=i+1}^{n-1} L_{ji} x_k) / L_{ii}
- *
- * \global g_L lower triangular matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC]
- * \global g_z rhs vector. Shape [NUM_TX_ANT][NUM_SC]
- * \global g_x_MMSE output vector. Shape [NUM_TX_ANT][NUM_SC]
- */
-void cbackwardsub_TxTx()
-{
- size_t i, j;
- size_t sz, vl;
- size_t off_sc;
-
- for (i = num_tx_cur - 1; i != (size_t)-1; --i) {
- off_sc = 0;
- sz = num_sc_cur;
-
- while (sz > 0){
- /* Initialize result registers as z */
- /* v0 - result real part */
- /* v1 - result imaginary part */
- __asm__ volatile(
- "vsetvli %0, %1, e32, m1, ta, ma\n"
- : "=r"(vl)
- : "r"(sz));
- __asm__ volatile(
- "vle32.v v0, (%0)\n"
- "vle32.v v1, (%1)\n"
- :
- : "r"(&g_z.re[i * num_sc_cur + off_sc]),
- "r"(&g_z.im[i * num_sc_cur + off_sc]));
-
- for (j = i + 1; j < num_tx_cur; ++j) {
- /* b - sum L_ji * z_j */
- /* v2 - L real part */
- /* v3 - L imaginary part */
- /* v4 - x_MMSE_j real part */
- /* v5 - x_MMSE_j imaginary part */
- __asm__ volatile(
- "vle32.v v2, (%0)\n"
- "vle32.v v3, (%1)\n"
- "vle32.v v4, (%2)\n"
- "vle32.v v5, (%3)\n"
- /* real part */
- "vfnmsac.vv v0, v2, v4\n"
- "vfmacc.vv v0, v3, v5\n"
- /* imaginary part */
- "vfnmsac.vv v1, v3, v4\n"
- "vfnmsac.vv v1, v2, v5\n"
- :
- : "r"(&g_L.re[j * num_tx_cur * num_sc_cur + i * num_sc_cur + off_sc]),
- "r"(&g_L.im[j * num_tx_cur * num_sc_cur + i * num_sc_cur + off_sc]),
- "r"(&g_x_MMSE.re[j * num_sc_cur + off_sc]),
- "r"(&g_x_MMSE.im[j * num_sc_cur + off_sc])
- );
- }
-
- /* Divide by L_ii */
- /* v2 - L_ii real part */
- /* v3 - L_ii imaginary part */
- __asm__ volatile (
- "vle32.v v2, (%0)\n"
- "vle32.v v3, (%1)\n"
- /* calculate L_ii_re^2 + L_ii_im^2 -> v4 */
- "vfmul.vv v4, v2, v2\n"
- "vfmacc.vv v4, v3, v3\n"
- /* real part */
- "vfmul.vv v5, v0, v2\n"
- "vfmacc.vv v5, v1, v3\n"
- "vdiv.vv v0, v5, v4\n"
- /* imaginary part */
- "vfmul.vv v6, v1, v2\n"
- "vfnmsac.vv v6, v0, v3\n"
- "vfdiv.vv v1, v6, v4\n"
- /* store HH_H */
- "vse32.v v0, (%2)\n"
- "vse32.v v1, (%3)\n"
- :
- : "r"(&g_L.re[i * num_tx_cur * num_sc_cur + i * num_sc_cur + off_sc]),
- "r"(&g_L.im[i * num_tx_cur * num_sc_cur + i * num_sc_cur + off_sc]),
- "r"(&g_x_MMSE.re[i * num_sc_cur + off_sc]),
- "r"(&g_x_MMSE.im[i * num_sc_cur + off_sc])
- );
-
- sz -= vl;
- off_sc += vl;
- }
- }
-}
-
-/*
- * MMSE
- */
-
-void mmse()
-{
- /* H^H*H + R */
- TIME(
- "Gram and add (RxTx x TxRx + TxTx): %ld\n",
- cmatgram_TxRx_cadd);
- /* L: (H^H*H + R) = L*L^H */
- TIME(
- "Cholesky (TxTx): %ld\n",
- ccholesky_TxTx);
- /* z: L*z = H^H*y */
- TIME(
- "Matrix-vector multiplication (TxRx x Rx): %ld\n",
- cmatvecmul_TxRx);
- TIME(
- "Forward substitution (TxTx): %ld\n",
- cforwardsub_TxTx);
- /* x_MMSE: L^H*x_MMSE = z */
- TIME(
- "Backward substitution (TxTx): %ld\n",
- cbackwardsub_TxTx);
-}
-
-acc_t mse()
-{
- acc_t sum = 0.;
- size_t off = 0;
- data_t sub1, sub2;
- size_t sz = num_tx_cur * num_sc_cur, vl;
- register data_t num_tx_num_sc_reg __asm__("f0") = (data_t)(num_tx_cur * NUM_SC);
-
- while (sz > 0) {
- __asm__ volatile (
- "vsetvli %0, %1, e32, m1, ta, ma\n"
- : "=r"(vl)
- : "r"(sz));
- __asm__ volatile (
- "vle32.v v0, (%1)\n"
- "vle32.v v1, (%2)\n"
- "vle32.v v2, (%3)\n"
- "vle32.v v3, (%4)\n"
- "vfsub.vv v0, v0, v2\n"
- "vfsub.vv v1, v1, v3\n"
- "vfmul.vv v0, v0, v0\n"
- "vfmul.vv v1, v1, v1\n"
- "vfadd.vv v0, v0, v1\n"
- "vfdiv.vf v0, v0, %5\n"
- "vfredusum.vs v0, v4, v5\n"
- "vmv.x.s %0, v0\n"
- : "+r"(sum)
- : "r"(&g_x.re[off]),
- "r"(&g_x.re[off]),
- "r"(&g_x_MMSE.re[off]),
- "r"(&g_x_MMSE.re[off]),
- "f"(num_tx_num_sc_reg));
- sz -= vl;
- off += vl;
- }
-
- return sum;
-}