mmserv

Minimum Mean Square Error detection on RISC-V Vector Extension
git clone https://git.ea.contact/mmserv
Log | Files | Refs | README

commit cd5c7cbc34e6593198ef39c5fc61661a6998ae86
parent c3c44a832f0f41859028f99bd12f46af43e5131b
Author: Egor Achkasov <eaachkasov@gmail.com>
Date:   Tue, 20 May 2025 00:48:28 +0200

Massive refactor. Squash all the branches into master.

Diffstat:
MGNUmakefile | 108+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
Ainclude/common.h | 81+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Dinclude/define.h | 20--------------------
Mmain.c | 173+++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
Mscript/gen_data.py | 98++++++++++++++++++++++---------------------------------------------------------
Mscript/util.py | 36+++++-------------------------------
Asrc/cbackwardsub.c | 132+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/ccholesky.c | 236+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/cforwardsub.c | 131+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/cmatgram.c | 131+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/cmatvecmul.c | 100+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Dsrc/mmserv.c | 631-------------------------------------------------------------------------------
12 files changed, 1036 insertions(+), 841 deletions(-)

diff --git a/GNUmakefile b/GNUmakefile @@ -1,37 +1,91 @@ -CC = gcc -CFLAGS = -Iinclude -fno-builtin -Wall -Wextra -OBJDIR = obj -BUILDDIR = . +# Default values +ARCH ?= x86 +DATA_TYPE ?= float +PLATFORM ?= linux +NUM_RX ?= 4 +NUM_TX ?= 4 +NUM_SC ?= 1024 -SRC = $(wildcard src/*.c) -OBJ = $(SRC:src/%.c=$(OBJDIR)/%.o) -OBJ += $(OBJDIR)/main.o +# Valid values +VALID_ARCHS := x86 rv rvv +VALID_DATA_TYPES := float fixed +VALID_PLATFORMS := linux ara baremetal -TARGET = $(BUILDDIR)/mmse +# Validate inputs +ifneq ($(filter $(ARCH),$(VALID_ARCHS)),$(ARCH)) + $(error Invalid ARCH: $(ARCH). Supported: $(VALID_ARCHS)) +endif +ifneq ($(filter $(DATA_TYPE),$(VALID_DATA_TYPES)),$(DATA_TYPE)) + $(error Invalid DATA_TYPE: $(DATA_TYPE). Supported: $(VALID_DATA_TYPES)) +endif +ifneq ($(filter $(PLATFORM),$(VALID_PLATFORMS)),$(PLATFORM)) + $(error Invalid PLATFORM: $(PLATFORM). Supported: $(VALID_PLATFORMS)) +endif +ifneq ($(shell test $(NUM_RX) -gt 0 >/dev/null 2>&1 && echo valid),valid) + $(error NUM_RX must be an integer > 0) +endif +ifneq ($(shell test $(NUM_TX) -gt 0 >/dev/null 2>&1 && echo valid),valid) + $(error NUM_TX must be an integer > 0) +endif +ifneq ($(shell test $(NUM_SC) -gt 0 >/dev/null 2>&1 && echo valid),valid) + $(error NUM_SC must be an integer > 0) +endif -all: $(TARGET) +# CFLAGS +CFLAGS += -DARCH_$(ARCH) +CFLAGS += -DDATA_TYPE_$(DATA_TYPE) +CFLAGS += -DPLATFORM_$(PLATFORM) +CFLAGS += -DNUM_RX=$(NUM_RX) +CFLAGS += -DNUM_TX=$(NUM_TX) +CFLAGS += -DNUM_SC=$(NUM_SC) -# Debug target -dbg: CFLAGS += -g -dbg: $(TARGET) +# Compiler selection +ifeq ($(ARCH),x86) + CC := gcc +else + CC := riscv64-unknown-elf-gcc +endif -# Compile the elf -$(TARGET): $(OBJ) - @mkdir -p $(BUILDDIR) - $(CC) -o $@ $^ +# Output file +OUTPUT := build/mmse_$(ARCH)_$(DATA_TYPE)_$(PLATFORM)_$(NUM_RX)x$(NUM_TX)x$(NUM_SC).elf -# Compile the src/*.c object files -$(OBJDIR)/%.o: src/%.c - @mkdir -p $(OBJDIR) - $(CC) $(CFLAGS) -c $< -o $@ +# Source files +SRCS := main.c $(wildcard src/*.c) -# Compile 
the main.c object files -$(OBJDIR)/main.o: main.c - @mkdir -p $(OBJDIR) - $(CC) $(CFLAGS) -c $< -o $@ +# Phony targets +.PHONY: all help gen_data -clean: - rm -rf $(OBJDIR)/* $(BUILDDIR)/mmse +# Default target +all: gen_data $(OUTPUT) -.PHONY: all dbg clean +# Run data generation +gen_data: + python script/gen_data.py $(NUM_TX) $(NUM_RX) $(NUM_SC) +# Compile +$(OUTPUT): $(SRCS) + mkdir -p build + $(CC) $(CFLAGS) $^ -o $@ + +# Help target +help: + @echo "Usage:" + @echo " make [ARCH=<arch>] [DATA_TYPE=<type>] [PLATFORM=<platform>] [NUM_RX=<num_rx>] [NUM_TX=<num_tx>] [NUM_SC=<num_sc>]" + @echo "" + @echo "Supported ARCH values:" + @echo " - x86 (default)" + @echo " - rv" + @echo " - rvv" + @echo "" + @echo "Supported DATA_TYPE values:" + @echo " - float (default)" + @echo " - fixed" + @echo "" + @echo "Supported PLATFORM values:" + @echo " - linux (default)" + @echo " - ara" + @echo " - baremetal" + @echo "" + @echo "Supported NUM_RX values: integers > 0 (default = 4)" + @echo "Supported NUM_TX values: integers > 0 (default = 4)" + @echo "Supported NUM_SC values: integers > 0 (default = 1024)" diff --git a/include/common.h b/include/common.h @@ -0,0 +1,81 @@ +#ifndef COMMON_H +#define COMMON_H + +#include <stdint.h> /* for uint64_t, uint32_t */ + + +/* + * Typedefs + */ + +#if defined(DATA_TYPE_float) +typedef float data_t; +typedef float acc_t; +#elif defined(DATA_TYPE_fixed) +typedef int32_t data_t; +typedef int64_t acc_t; +#define FP_Q 31 +#else +#error "Please define DATA_TYPE_float or DATA_TYPE_fixed" +#endif + +typedef struct { + data_t *re; + data_t *im; +} vcomplex; + + +/* + * Global variables + */ + +/* Raw data */ +/* Transmitted signal */ +extern data_t x_re[NUM_TX][NUM_SC]; +extern data_t x_im[NUM_TX][NUM_SC]; +/* Channel */ +extern data_t H_re[NUM_RX][NUM_TX][NUM_SC]; +extern data_t H_im[NUM_RX][NUM_TX][NUM_SC]; +/* Noise covariance matrix */ +extern data_t R_re[NUM_TX][NUM_TX][NUM_SC]; +extern data_t R_im[NUM_TX][NUM_TX][NUM_SC]; +/* Received signal */ 
+extern data_t y_re[NUM_RX][NUM_SC]; +extern data_t y_im[NUM_RX][NUM_SC]; +/* MMSE raw data */ +extern data_t G_re[NUM_TX][NUM_TX][NUM_SC]; +extern data_t G_im[NUM_TX][NUM_TX][NUM_SC]; +extern data_t L_re[NUM_TX][NUM_TX][NUM_SC]; +extern data_t L_im[NUM_TX][NUM_TX][NUM_SC]; +extern data_t g_D[NUM_TX][NUM_SC]; /* no imaginary part in D */ +extern data_t HHy_re[NUM_TX][NUM_SC]; +extern data_t HHy_im[NUM_TX][NUM_SC]; +extern data_t z_re[NUM_TX][NUM_SC]; +extern data_t z_im[NUM_TX][NUM_SC]; +/* Result of MMSE approximation */ +extern data_t x_MMSE_re[NUM_TX][NUM_SC]; +extern data_t x_MMSE_im[NUM_TX][NUM_SC]; + +/* Same data but casted to vcomplex */ +extern vcomplex g_x; +extern vcomplex g_H; +extern vcomplex g_R; +extern vcomplex g_y; +extern vcomplex g_G; +extern vcomplex g_L; +extern vcomplex g_HHy; +extern vcomplex g_z; +extern vcomplex g_x_MMSE; + + +/* + * Complex matrix operations + */ + +extern void cmatgram(); +extern void ccholesky(); +extern void cmatvecmul(); +extern void cforwardsub(); +extern void cbackwardsub(); + +#endif diff --git a/include/define.h b/include/define.h @@ -1,20 +0,0 @@ -#ifndef __DEFINE_H -#define __DEFINE_H - -#define NUM_RX_ANT 4 -#define NUM_TX_ANT 4 -#define NUM_SC 1 - -#define DEBUG - -#define IN -#define OUT - -typedef float data_t; -typedef float acc_t; -typedef struct { - data_t *re; - data_t *im; -} vcomplex; - -#endif diff --git a/main.c b/main.c @@ -1,72 +1,123 @@ -#include "include/define.h" - -#include "../common/printf.h" - -/* extern functions */ -extern void cmatgram_TxRx_cadd(); -extern void ccholesky_TxTx(); -extern void cmatvecmul_TxRx(); -extern void cforwardsub_TxTx(); -extern void cbackwardsub_TxTx(); -extern acc_t mse(); - -/* Raw data */ -/* Transmitted signal */ -extern data_t x_re[NUM_TX_ANT][NUM_SC]; -extern data_t x_im[NUM_TX_ANT][NUM_SC]; -/* Channel */ -extern data_t H_re[NUM_RX_ANT][NUM_TX_ANT][NUM_SC]; -extern data_t H_im[NUM_RX_ANT][NUM_TX_ANT][NUM_SC]; -/* Noise covariance matrix */ -extern data_t 
R_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; -extern data_t R_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; -/* Received signal */ -extern data_t y_re[NUM_RX_ANT][NUM_SC]; -extern data_t y_im[NUM_RX_ANT][NUM_SC]; - -/* Same data but casted to vcomplex */ +#include "include/common.h" + +#include <stddef.h> /* for size_t */ + + +/* + * Defines + */ + +/* Got from https://elm-chan.org/junk/32bit/binclude.html */ +/* Import a binary file */ +#define IMPORT_BIN(sect, file, sym) __asm__ (\ + ".section " #sect "\n" /* Change section */\ + ".balign 4\n" /* Word alignment */\ + ".global " #sym "\n" /* Export the object address */\ + #sym ":\n" /* Define the object label */\ + ".incbin \"" file "\"\n" /* Import the file */\ + ".global _sizeof_" #sym "\n" /* Export the object size */\ + ".set _sizeof_" #sym ", . - " #sym "\n" /* Define the object size */\ + ".balign 4\n" /* Word alignment */\ + ".section \".text\"\n") /* Restore section */ + + +/* + * Global variables + */ + +/* Import data from binary files */ +IMPORT_BIN(.rodata, "data/x_re.bin", x_re); +IMPORT_BIN(.rodata, "data/x_im.bin", x_im); +IMPORT_BIN(.rodata, "data/H_re.bin", H_re); +IMPORT_BIN(.rodata, "data/H_im.bin", H_im); +IMPORT_BIN(.rodata, "data/R_re.bin", R_re); +IMPORT_BIN(.rodata, "data/R_im.bin", R_im); +IMPORT_BIN(.rodata, "data/y_re.bin", y_re); +IMPORT_BIN(.rodata, "data/y_im.bin", y_im); + +/* Allocate space for MMSE raw data */ +data_t G_re[NUM_TX][NUM_TX][NUM_SC]; +data_t G_im[NUM_TX][NUM_TX][NUM_SC]; +data_t L_re[NUM_TX][NUM_TX][NUM_SC]; +data_t L_im[NUM_TX][NUM_TX][NUM_SC]; +data_t g_D[NUM_TX][NUM_SC]; /* no imaginary part in D */ +data_t HHy_re[NUM_TX][NUM_SC]; +data_t HHy_im[NUM_TX][NUM_SC]; +data_t z_re[NUM_TX][NUM_SC]; +data_t z_im[NUM_TX][NUM_SC]; +data_t x_MMSE_re[NUM_TX][NUM_SC]; +data_t x_MMSE_im[NUM_TX][NUM_SC]; + +/* Initialize data */ vcomplex g_x = { .re = (data_t *)x_re, .im = (data_t *)x_im }; vcomplex g_H = { .re = (data_t *)H_re, .im = (data_t *)H_im }; vcomplex g_R = { .re = (data_t *)R_re, .im 
= (data_t *)R_im }; vcomplex g_y = { .re = (data_t *)y_re, .im = (data_t *)y_im }; - -/* MMSE approximation will be stored in this global*/ -data_t x_MMSE_re[NUM_TX_ANT][NUM_SC]; -data_t x_MMSE_im[NUM_TX_ANT][NUM_SC]; +vcomplex g_G = { .re = (data_t *)G_re, .im = (data_t *)G_im }; +vcomplex g_L = { .re = (data_t *)L_re, .im = (data_t *)L_im }; +vcomplex g_HHy = { .re = (data_t *)HHy_re, .im = (data_t *)HHy_im }; +vcomplex g_z = { .re = (data_t *)z_re, .im = (data_t *)z_im }; vcomplex g_x_MMSE = { .re = (data_t *)x_MMSE_re, .im = (data_t *)x_MMSE_im }; -extern vcomplex g_HH, h_y, g_HHy; -size_t num_rx_cur=1, num_tx_cur=1, num_sc_cur=1; + +/* + * Read cycles macro and function + */ + +/** Read and return the cycle counter value */ +uint64_t readcycle() { +#if defined(ARCH_rvv) || defined(ARCH_rv) + uint64_t val; + __asm__ volatile("rdcycle %0" : "=r"(val)); + return val; +#elif defined(ARCH_x86) + unsigned int hi, lo; + __asm__ volatile("rdtsc" : "=a"(lo), "=d"(hi)); + return ((uint64_t)hi << 32) | lo; +#else +#error "Unknown architecture" +#endif +} + +/** Read cycles for a function call and store the result in out */ +#define FUNC_CYCLES(func, out) \ + do { \ + uint64_t start = readcycle(); \ + func; \ + uint64_t end = readcycle(); \ + out = end - start; \ + } while (0) + + +/* + * Printf function + */ + +#if defined(PLATFORM_ara) +#include "../../common/printf.h" +#elif defined(PLATFORM_linux) +#include <stdio.h> +#elif defined(PLATFORM_baremetal) +/* TODO */ +#else +#error "Unknown platform" +#endif + + +/* + * Main + */ int main() { - unsigned long long start, end; - - size_t t1,t2,t3,t4,t5; - for (num_rx_cur = 1; num_rx_cur <= NUM_RX_ANT; ++num_rx_cur) - for (num_tx_cur = 1; num_tx_cur <= NUM_TX_ANT; ++num_tx_cur) - for (num_sc_cur = 1; num_sc_cur <= NUM_SC; num_sc_cur += 1) { - __asm__ volatile("rdcycle %0" : "=r"(start)); - cmatgram_TxRx_cadd(); - __asm__ volatile("rdcycle %0" : "=r"(end)); - t1 = end - start; - __asm__ volatile("rdcycle %0" : "=r"(start)); - 
ccholesky_TxTx(); - __asm__ volatile("rdcycle %0" : "=r"(end)); - t2 = end - start; - __asm__ volatile("rdcycle %0" : "=r"(start)); - cmatvecmul_TxRx(); - __asm__ volatile("rdcycle %0" : "=r"(end)); - t3 = end - start; - __asm__ volatile("rdcycle %0" : "=r"(start)); - cforwardsub_TxTx(); - __asm__ volatile("rdcycle %0" : "=r"(end)); - t4 = end - start; - __asm__ volatile("rdcycle %0" : "=r"(start)); - cbackwardsub_TxTx(); - __asm__ volatile("rdcycle %0" : "=r"(end)); - t5 = end - start; - printf("%llu,%llu,%llu,%llu,%llu,", t1, t2, t3, t4, t5); - } + uint64_t start, end; + uint64_t t1,t2,t3,t4,t5; + + FUNC_CYCLES(cmatgram(), t1); + FUNC_CYCLES(ccholesky(), t2); + FUNC_CYCLES(cmatvecmul(), t3); + FUNC_CYCLES(cforwardsub(), t4); + FUNC_CYCLES(cbackwardsub(), t5); + printf("%lu,%lu,%lu,%lu,%lu,", t1, t2, t3, t4, t5); return 0; } diff --git a/script/gen_data.py b/script/gen_data.py @@ -1,42 +1,34 @@ #!/usr/bin/python -from util import read_defines - import numpy as np from numpy.random import random, normal from sys import argv from os import path, makedirs -if "--help" in argv: - print("Usage: python scripts/gen_data.py [--txt] [--bin] [--s]") - print(" --txt: write the data/*.txt files") - print(" --bin: write the data/*.bin files") - print(" --s: print the .S file to stdout") - exit(0) - -# Change these flags to control the output -WRITE_DATA_TXT = "--txt" in argv -WRITE_DATA_BIN = "--bin" in argv -WRITE_DATA_S = "--s" in argv or len(argv) == 1 +if len(argv) != 4: + print("Usage: python gen_data.py <NUM_RX> <NUM_TX> <NUM_SC>") + exit(1) +NUM_RX = int(argv[1]) # number of receive antennas +NUM_TX = int(argv[2]) # number of transmit antennas +NUM_SC = int(argv[3]) # number of subcarriers -NUM_RX_ANT, NUM_TX_ANT, NUM_SC = read_defines() NOISE_STD_DEVIATION = np.sqrt(.5) / 100 # noise standard deviation # Transmitter signal -x = random((NUM_TX_ANT, NUM_SC)) \ - + 1.j * random((NUM_TX_ANT, NUM_SC)) +x = random((NUM_TX, NUM_SC)) \ + + 1.j * random((NUM_TX, NUM_SC)) x = (x 
- 0.5 - 0.5j) * 2 # scale it from [0, 1] to [-1, 1] # Channel -H = random((NUM_RX_ANT, NUM_TX_ANT, NUM_SC)) \ - + 1.j * random((NUM_RX_ANT, NUM_TX_ANT, NUM_SC)) +H = random((NUM_RX, NUM_TX, NUM_SC)) \ + + 1.j * random((NUM_RX, NUM_TX, NUM_SC)) H = (H - 0.5 - 0.5j) * 2 # scale it from [0, 1] to [-1, 1] # Noise -n = normal(0, NOISE_STD_DEVIATION, (NUM_RX_ANT, NUM_SC)) \ - + 1.j * normal(0, NOISE_STD_DEVIATION, (NUM_RX_ANT, NUM_SC)) +n = normal(0, NOISE_STD_DEVIATION, (NUM_RX, NUM_SC)) \ + + 1.j * normal(0, NOISE_STD_DEVIATION, (NUM_RX, NUM_SC)) # Received signal y = np.einsum("ijk,jk->ik", H, x) + n # Noise covariance matrix -R = np.eye(NUM_TX_ANT, NUM_TX_ANT, dtype=np.complex64) * NOISE_STD_DEVIATION**2 +R = np.eye(NUM_TX, NUM_TX, dtype=np.complex64) * NOISE_STD_DEVIATION**2 data_tuple = ( x.real.astype(np.float32), x.imag.astype(np.float32), @@ -44,55 +36,19 @@ data_tuple = ( R.real.astype(np.float32), R.imag.astype(np.float32), y.real.astype(np.float32), y.imag.astype(np.float32) ) - - -class Section: - def __init__(self, name, source, align, sizeof, length, duplicate=False): - self.name = name - self.source = source - self.align = align - self.sizeof = sizeof - self.length = length - self.duplicate = duplicate - - -sections = [ - Section("x_re", "data/x_re.txt", "3", 32, x.size), - Section("x_im", "data/x_im.txt", "3", 32, x.size), - Section("H_re", "data/H_re.txt", "3", 32, H.size), - Section("H_im", "data/H_im.txt", "3", 32, H.size), - Section("R_re", "data/R_re.txt", "3", 32, R.size), - Section("R_im", "data/R_im.txt", "3", 32, R.size), - Section("y_re", "data/y_re.txt", "3", 32, y.size), - Section("y_im", "data/y_im.txt", "3", 32, y.size), -] +data_filenames = ( + "data/x_re.bin", "data/x_im.bin", + "data/H_re.bin", "data/H_im.bin", + "data/R_re.bin", "data/R_im.bin", + "data/y_re.bin", "data/y_im.bin" +) # Create "data" directory if it does not exist -if WRITE_DATA_BIN or WRITE_DATA_TXT: - data_dir = path.join(path.dirname(__file__), "..", "data") - if not 
path.exists(data_dir): - makedirs(data_dir) - -if WRITE_DATA_TXT: - for data, sec in zip(data_tuple, sections): - with open(sec.source, "w") as f: - for sample in data: - f.write(f"{sample}\n") - -if WRITE_DATA_BIN: - for data, sec in zip(data_tuple, sections): - with open(sec.source.replace(".txt", ".bin"), "wb") as f: - f.write(data.tobytes()) - -if WRITE_DATA_S: - print(".section .data,\"aw\",@progbits") - for data, sec in zip(data_tuple, sections): - print(f".global {sec.name}") - print(f"{sec.name}:") - for sample in data: - bs = sample.tobytes() - for i in range(0, len(bs), 4): - s = "" - for n in range(4): - s += "%02x" % bs[i+3-n] - print(" .word 0x%s" % s) +data_dir = path.join(path.dirname(__file__), "..", "data") +if not path.exists(data_dir): + makedirs(data_dir) + +# Write data to bin files +for data, filename in zip(data_tuple, data_filenames): + with open(filename, "wb") as f: + f.write(data.tobytes()) diff --git a/script/util.py b/script/util.py @@ -2,33 +2,7 @@ from os import path import numpy as np -def read_defines(): - """Read the defines from the define.h file - - Returns: - int: Number of receive antennas - int: Number of transmit antennas - int: Number of subcarriers - """ - with open(path.join(path.dirname(__file__), "..", "include", "define.h"), "r") as f: - lines = f.read().split("\n") - for line in lines: - if line.startswith("#define NUM_RX_ANT "): - NUM_RX_ANT = int(line[19:]) - if line.startswith("#define NUM_TX_ANT "): - NUM_TX_ANT = int(line[19:]) - if line.startswith("#define NUM_SC "): - NUM_SC = int(line[15:]) - - # Assert that all the defines are read - assert NUM_RX_ANT - assert NUM_TX_ANT - assert NUM_SC - - return NUM_RX_ANT, NUM_TX_ANT, NUM_SC - - -def load_xHRy(NUM_RX_ANT, NUM_TX_ANT, NUM_SC) -> tuple: +def load_xHRy(NUM_RX, NUM_TX, NUM_SC) -> tuple: """Load x, H, R and y from the data files. 
Assumes that the following files are present in the data directory: x_re.bin, x_im.bin, H_re.bin, H_im.bin, R_re.bin, R_im.bin, y_re.bin, y_im.bin @@ -39,15 +13,15 @@ def load_xHRy(NUM_RX_ANT, NUM_TX_ANT, NUM_SC) -> tuple: """ x = np.fromfile("data/x_re.bin", dtype=np.float32) x = x + 1j*np.fromfile("data/x_im.bin", dtype=np.float32) - x = x.reshape((NUM_TX_ANT, NUM_SC)) + x = x.reshape((NUM_TX, NUM_SC)) H = np.fromfile("data/H_re.bin", dtype=np.float32) H = H + 1j*np.fromfile("data/H_im.bin", dtype=np.float32) - H = H.reshape((NUM_RX_ANT, NUM_TX_ANT, NUM_SC)) + H = H.reshape((NUM_RX, NUM_TX, NUM_SC)) R = np.fromfile("data/R_re.bin", dtype=np.float32) R = R + 1j*np.fromfile("data/R_im.bin", dtype=np.float32) - R = R.reshape((NUM_TX_ANT, NUM_TX_ANT, NUM_SC)) + R = R.reshape((NUM_TX, NUM_TX, NUM_SC)) y = np.fromfile("data/y_re.bin", dtype=np.float32) y = y + 1j*np.fromfile("data/y_im.bin", dtype=np.float32) - y = y.reshape((NUM_RX_ANT, NUM_SC)) + y = y.reshape((NUM_RX, NUM_SC)) return x, H, R, y diff --git a/src/cbackwardsub.c b/src/cbackwardsub.c @@ -0,0 +1,132 @@ +#include "../include/common.h" + +#include <stddef.h> + +/** Complex backward substitution L^H*x_MMSE = z + * + * x_MMSE_t = (z_t - \sum_{tt=t+1}^{NUM_TX-1} L_{tt t} x_tt) / L_{t t} (for LL / float solution) + * x_MMSE_t = (z_t / D_t - \sum_{tt=t+1}^{NUM_TX-1} L_{tt t} x_tt) (for LDL / fixed solution) + * + * \global g_L lower triangular matrix. Shape [NUM_TX][NUM_TX][NUM_SC] + * \global g_D diagonal matrix (only if DATA_TYPE_fixed is defined). Shape [NUM_TX][NUM_SC] + * \global g_z rhs vector. Shape [NUM_TX][NUM_SC] + * \global g_x_MMSE output vector. 
Shape [NUM_TX][NUM_SC] + */ +void cbackwardsub() +{ +#if defined(ARCH_x86) || defined(ARCH_rv) + size_t t, tt, s; + size_t off_L, off_z, off_x_MMSE, off_D; + acc_t sum_re, sum_im; + for (t = NUM_TX - 1; t != (size_t)-1; --t) { + for (s = 0; s < NUM_SC; ++s) { + sum_re = sum_im = 0; + for (tt = t + 1; tt < NUM_TX; ++tt) { + off_L = tt * NUM_TX * NUM_SC + t * NUM_SC + s; + off_x_MMSE = tt * NUM_SC + s; + sum_re += (acc_t)g_L.re[off_L] * (acc_t)g_x_MMSE.re[off_x_MMSE] + - (acc_t)g_L.im[off_L] * (acc_t)g_x_MMSE.im[off_x_MMSE]; + sum_im += (acc_t)g_L.re[off_L] * (acc_t)g_x_MMSE.im[off_x_MMSE] + + (acc_t)g_L.im[off_L] * (acc_t)g_x_MMSE.re[off_x_MMSE]; + } + off_z = t * NUM_SC + s; + off_x_MMSE = t * NUM_SC + s; +#if defined(DATA_TYPE_float) + off_L = t * NUM_TX * NUM_SC + t * NUM_SC + s; + g_x_MMSE.re[off_x_MMSE] = (g_z.re[off_z] - (data_t)sum_re) / g_L.re[off_L]; + g_x_MMSE.im[off_x_MMSE] = (g_z.im[off_z] - (data_t)sum_im) / g_L.re[off_L]; +#elif defined(DATA_TYPE_fixed) + off_D = t * NUM_SC + s; + g_x_MMSE.re[off_x_MMSE] = (data_t)((acc_t)(g_z.re[off_z] << FP_Q) / g_D[off_D]) + - (data_t)(sum_re >> FP_Q); + g_x_MMSE.im[off_x_MMSE] = (data_t)((acc_t)(g_z.im[off_z] << FP_Q) / g_D[off_D]) + - (data_t)(sum_im >> FP_Q); +#else +#error "Unknown data type" +#endif + } + } +#elif defined(ARCH_rvv) + size_t i, j; + size_t sz, vl; + size_t off_sc; + + for (i = NUM_TX - 1; i != (size_t)-1; --i) { + off_sc = 0; + sz = NUM_SC; + + while (sz > 0){ + /* Initialize result registers as z */ + /* v0 - result real part */ + /* v1 - result imaginary part */ + __asm__ volatile( + "vsetvli %0, %1, e32, m1, ta, ma\n" + : "=r"(vl) + : "r"(sz)); + __asm__ volatile( + "vle32.v v0, (%0)\n" + "vle32.v v1, (%1)\n" + : + : "r"(&g_z.re[i * NUM_SC + off_sc]), + "r"(&g_z.im[i * NUM_SC + off_sc])); + + for (j = i + 1; j < NUM_TX; ++j) { + /* b - sum L_ji * z_j */ + /* v2 - L real part */ + /* v3 - L imaginary part */ + /* v4 - x_MMSE_j real part */ + /* v5 - x_MMSE_j imaginary part */ + __asm__ 
volatile( + "vle32.v v2, (%0)\n" + "vle32.v v3, (%1)\n" + "vle32.v v4, (%2)\n" + "vle32.v v5, (%3)\n" + /* real part */ + "vfnmsac.vv v0, v2, v4\n" + "vfmacc.vv v0, v3, v5\n" + /* imaginary part */ + "vfnmsac.vv v1, v3, v4\n" + "vfnmsac.vv v1, v2, v5\n" + : + : "r"(&g_L.re[j * NUM_TX * NUM_SC + i * NUM_SC + off_sc]), + "r"(&g_L.im[j * NUM_TX * NUM_SC + i * NUM_SC + off_sc]), + "r"(&g_x_MMSE.re[j * NUM_SC + off_sc]), + "r"(&g_x_MMSE.im[j * NUM_SC + off_sc]) + ); + } + + /* Divide by L_ii */ + /* v2 - L_ii real part */ + /* v3 - L_ii imaginary part */ + __asm__ volatile ( + "vle32.v v2, (%0)\n" + "vle32.v v3, (%1)\n" + /* calculate L_ii_re^2 + L_ii_im^2 -> v4 */ + "vfmul.vv v4, v2, v2\n" + "vfmacc.vv v4, v3, v3\n" + /* real part */ + "vfmul.vv v5, v0, v2\n" + "vfmacc.vv v5, v1, v3\n" + "vdiv.vv v0, v5, v4\n" + /* imaginary part */ + "vfmul.vv v6, v1, v2\n" + "vfnmsac.vv v6, v0, v3\n" + "vfdiv.vv v1, v6, v4\n" + /* store HH_H */ + "vse32.v v0, (%2)\n" + "vse32.v v1, (%3)\n" + : + : "r"(&g_L.re[i * NUM_TX * NUM_SC + i * NUM_SC + off_sc]), + "r"(&g_L.im[i * NUM_TX * NUM_SC + i * NUM_SC + off_sc]), + "r"(&g_x_MMSE.re[i * NUM_SC + off_sc]), + "r"(&g_x_MMSE.im[i * NUM_SC + off_sc]) + ); + + sz -= vl; + off_sc += vl; + } + } +#else +#error "Unknown architecture" +#endif +} diff --git a/src/ccholesky.c b/src/ccholesky.c @@ -0,0 +1,236 @@ +#include "../include/common.h" + +#include <stddef.h> + +/** Complex Cholesky decomposition of a Hermitian positive-definite matrix G + * + * LL (floating point solution): + * G = L*L^H + * L_ij = (G_ij - \sum_{k=0}^{j-1} L_ik L_jk^*) / L_jj + * L_ii = sqrt(G_ii - \sum_{k=0}^{i-1} L_ik L_ik^*) + * + * LDL (fixed point solution): + * G = L*D*L^H + * L_ij = (G_ij - \sum_{k=0}^{j-1} L_ik D_k L_jk^*) / D_j + * D_i = G_ii - \sum_{k=0}^{i-1} L_ik D_k L_ik^* + * + * \global g_G matrix. Shape [NUM_TX][NUM_TX][NUM_SC] + * \global g_L output lower triangular matrix. 
Shape [NUM_TX][NUM_TX][NUM_SC] + * \global g_D output diagonal matrix (if DATA_TYPE_fixed is defined). Shape [NUM_TX][NUM_SC] + */ +void ccholesky() +{ +#if defined(ARCH_x86) || defined(ARCH_rv) + size_t i, j, k, s; + size_t off_ij, off_jj, off_ii; + size_t off_ik, off_jk; + size_t off_i, off_j, off_k; + data_t tmp; /* Temporary variable for sqrt */ + acc_t sum_re, sum_im; + for (i = 0; i < NUM_TX; ++i) { + for (j = 0; j <= i; ++j) { + for (s = 0; s < NUM_SC; ++s) { + off_ij = i * NUM_TX * NUM_SC + j * NUM_SC + s; + sum_im = sum_re = 0; + + /* Calculate the sum */ + for (k = 0; k < j; ++k) { + off_ik = i * NUM_TX * NUM_SC + k * NUM_SC + s; + off_jk = j * NUM_TX * NUM_SC + k * NUM_SC + s; +#if defined(DATA_TYPE_float) + sum_re += g_L.re[off_ik] * g_L.re[off_jk] + - g_L.im[off_ik] * g_L.im[off_jk]; +#elif defined(DATA_TYPE_fixed) + sum_re += (g_L.re[off_ik] * g_L.re[off_jk] + - g_L.im[off_ik] * g_L.im[off_jk]) + * g_D[k * NUM_SC + s]; +#else +#error "Unknown data type" +#endif + sum_im += g_L.re[off_ik] * g_L.im[off_jk] + + g_L.im[off_ik] * g_L.re[off_jk]; + } + + if (i == j) { + off_ii = i * NUM_TX * NUM_SC + i * NUM_SC + s; +#if defined(DATA_TYPE_float) + +#if defined(ARCH_x86) + __asm__ volatile ( + "flds %1\n" + "fsubs %2\n" + "fsqrt\n" + "fstps %0\n" + : "=m" (g_L.re[off_ii]) + : "m" (g_G.re[off_ij]), "m" (sum_re) + ); +#elif defined(ARCH_rv) + __asm__ volatile ( + "fsub.s %0, %1, %2\n" /* tmp = g_G.re[off_ij] - sum_re */ + "fsqrt.s %0, %0\n" /* tmp = sqrtf(tmp) */ + : "=&f"(tmp) : "f"(g_G.re[off_ij]), "f"(sum_re) + ); + g_L.re[off_ii] = tmp; + g_L.im[off_ii] = 0; +#else +#error "Unknown architecture" +#endif + +#elif defined(DATA_TYPE_fixed) + /* Calculate D_i = G_ii - sum */ + g_D[i * NUM_SC + s] = g_G.re[off_ii] - (data_t)(sum_re >> FP_Q); +#else +#error "Unknown data type" +#endif + } else { /* i != j */ +#if defined(DATA_TYPE_float) + /* Calculate L_ij = (G_ij - sum) / L_jj */ + off_jj = j * NUM_TX * NUM_SC + j * NUM_SC + s; + g_L.re[off_ij] = 
(g_G.re[off_ij] - sum_re) / g_L.re[off_jj]; + g_L.im[off_ij] = (g_G.im[off_ij] - sum_im) / g_L.re[off_jj]; +#elif defined(DATA_TYPE_fixed) + /* Calculate L_ij = (G_ij - sum) / D_j */ + off_j = j * NUM_SC + s; + /* real */ + sum_re = ((acc_t)g_G.re[off_ij] << FP_Q) - sum_re; + /* TODO roubding? */ + g_L.re[off_ij] = (data_t)(sum_re / (acc_t)g_D[off_j]); + /* imaginary */ + sum_im = ((acc_t)g_G.im[off_ij] << FP_Q) - sum_im; + /* TODO roubding? */ + g_L.im[off_ij] = (data_t)(sum_im / (acc_t)g_D[off_j]); +#else +#error "Unknown data type" +#endif + } + } + } + } +#elif defined(ARCH_rvv) + size_t i, j, k; + size_t sz, vl; + size_t off_sc; + + /* Init float registers */ + register float f0 __asm__("f0") = 2.0f; + + for (i = 0; i < NUM_TX; ++i) + for (j = 0; j <= i; ++j) { + sz = NUM_SC; + off_sc = 0; + + while (sz > 0) { + /* Initialize L registers */ + /* v0 - L_ij real part = G_ij.re */ + /* v1 - L_ij imaginary part = G_ij.im */ + __asm__ volatile( + "vsetvli %0, %1, e32, m1, ta, ma\n" + : "=r"(vl) : "r"(sz)); + __asm__ volatile( + "vle32.v v0, (%0)\n" + "vle32.v v1, (%1)\n" + : + : "r"(&g_G.re[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]), + "r"(&g_G.im[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]) + ); + + /* Calculate sum_{k=0}^{j-1} L_ik L_jk^* */ + /* v2 - sum real part */ + /* v3 - sum imaginary part */ + __asm__ volatile( + "vmv.v.i v2, 0\n" + "vmv.v.i v3, 0\n" + ); + for (k = 0; k < j; ++k) { + __asm__ volatile( + "vle32.v v4, (%0)\n" + "vle32.v v5, (%1)\n" + "vle32.v v6, (%2)\n" + "vle32.v v7, (%3)\n" + /* real part */ + "vfmacc.vv v2, v4, v6\n" + "vfmacc.vv v2, v5, v7\n" + /* imaginary part */ + "vfmacc.vv v3, v5, v6\n" + "vfnmsac.vv v3, v4, v7\n" + : + : "r"(&g_L.re[i * NUM_TX * NUM_SC + k * NUM_SC + off_sc]), + "r"(&g_L.im[i * NUM_TX * NUM_SC + k * NUM_SC + off_sc]), + "r"(&g_L.re[j * NUM_TX * NUM_SC + k * NUM_SC + off_sc]), + "r"(&g_L.im[j * NUM_TX * NUM_SC + k * NUM_SC + off_sc]) + ); + } + + /* G_ii - sum */ + __asm__ volatile( + "vfsub.vv v0, v0, v2\n" + 
"vfsub.vv v1, v1, v3\n" + ); + + if (i == j) { + /* Calculate L_ii = sqrt(G_ii - sum_{k=0}^{i-1} L_ik L_ik^*) */ + __asm__ volatile( + /* Complex sqrt */ + + /* v2 = r = sqrt(re^2 + im^2) */ + "vfmul.vv v2, v0, v0\n" + "vfmacc.vv v2, v1, v1\n" + "vfsqrt.v v2, v2\n" + + /* v3 - real part */ + "vfadd.vv v3, v2, v0\n" /* r + re */ + "vfdiv.vf v3, v3, f0\n" /* (r + re) / 2 */ + "vfsqrt.v v3, v3\n" /* sqrt((r + re) / 2) */ + /* v4 - imaginary part */ + "vfsub.vv v4, v2, v0\n" /* r - re */ + "vfdiv.vf v4, v4, f0\n" /* (r - re) / 2 */ + "vfsqrt.v v4, v4\n" /* sqrt((r - re) / 2) */ + "vfsgnj.vv v4, v4, v1\n" /* sgn(im) * sqrt((r - re) / 2) */ + + /* TODO handle im == 0 */ + + /* Move the result to v0 and v1 */ + "vmv.v.v v0, v3\n" + "vmv.v.v v1, v4\n" + ); + } else { + /* Calculate L_ij = (G_ij - sum) / L_jj */ + __asm__ volatile( + /* L_jj */ + "vle32.v v2, (%0)\n" + "vle32.v v3, (%1)\n" + /* calculate L_jj_re^2 + L_jj_im^2 -> v4 */ + "vfmul.vv v4, v2, v2\n" + "vfmacc.vv v4, v3, v3\n" + /* real part */ + "vfmul.vv v5, v0, v2\n" + "vfmacc.vv v5, v1, v3\n" + /* imaginary part */ + "vfmul.vv v6, v1, v2\n" + "vfnmsac.vv v6, v0, v3\n" + /* divide and store at v0 and v1 */ + "vfdiv.vv v0, v5, v4\n" + "vfdiv.vv v1, v6, v4\n" + : + : "r"(&g_L.re[j * NUM_TX * NUM_SC + j * NUM_SC + off_sc]), + "r"(&g_L.im[j * NUM_TX * NUM_SC + j * NUM_SC + off_sc]) + ); + } + + /* Store result */ + __asm__ volatile( + "vse32.v v0, (%0)\n" + "vse32.v v1, (%1)\n" + : + : "r"(&g_L.re[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]), + "r"(&g_L.im[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]) + ); + + sz -= vl; + off_sc += vl; + } + } +#else +#error "Unknown architecture" +#endif +} diff --git a/src/cforwardsub.c b/src/cforwardsub.c @@ -0,0 +1,131 @@ +#include "../include/common.h" + +#include <stddef.h> + +/** Complex forward substitution L*z = HHy + * + * z_t = (HHy_t - \sum_{tt=0}^{t-1} L_{t tt} z_tt) / L_{t t} (for LL / float solution) + * z_t = (HHy_t - \sum_{tt=0}^{t-1} L_{t tt} z_tt) (for LDL / fixed 
solution) + * + * \global g_L lower triangular matrix. Shape [NUM_TX][NUM_TX][NUM_SC] + * \global g_HHy vector. Shape [NUM_TX][NUM_SC] + * \global g_z output vector. Shape [NUM_TX][NUM_SC] + */ +void cforwardsub() +{ +#if defined(ARCH_x86) || defined(ARCH_rv) + size_t t, tt, s; + size_t off_L, off_HHy, off_z; + acc_t sum_re, sum_im; + for (t = 0; t < NUM_TX; ++t) { + for (s = 0; s < NUM_SC; ++s) { + sum_re = sum_im = 0; + for (tt = 0; tt < t; ++tt) { + off_L = t * NUM_TX * NUM_SC + tt * NUM_SC + s; + off_z = tt * NUM_SC + s; + sum_re += (acc_t)g_L.re[off_L] * (acc_t)g_z.re[off_z] + - (acc_t)g_L.im[off_L] * (acc_t)g_z.im[off_z]; + sum_im += (acc_t)g_L.re[off_L] * (acc_t)g_z.im[off_z] + + (acc_t)g_L.im[off_L] * (acc_t)g_z.re[off_z]; + } + off_HHy = t * NUM_SC + s; + off_z = t * NUM_SC + s; +#if defined(DATA_TYPE_float) + off_L = t * NUM_TX * NUM_SC + t * NUM_SC + s; + g_z.re[off_z] = (g_HHy.re[off_HHy] - (data_t)sum_re) / g_L.re[off_L]; + g_z.im[off_z] = (g_HHy.im[off_HHy] - (data_t)sum_im) / g_L.re[off_L]; +#elif defined(DATA_TYPE_fixed) + g_z.re[off_z] = g_HHy.re[off_HHy] - (data_t)(sum_re >> FP_Q); + g_z.im[off_z] = g_HHy.im[off_HHy] - (data_t)(sum_im >> FP_Q); +#else +#error "Unknown data type" +#endif + } + } +#elif defined(ARCH_rvv) + size_t i, j; + size_t sz, vl; + size_t off_sc; + + for (i = 0; i < NUM_TX; ++i) { + off_sc = 0; + sz = NUM_SC; + + while (sz > 0) { + // printf("sz: %lu\n", sz); + // printf("vl: %lu\n", vl); + /* Initialize result registers as b */ + /* v0 - result real part */ + /* v1 - result imaginary part */ + __asm__ volatile ( + "vsetvli %0, %1, e32, m1, ta, ma\n" + : "=r"(vl) + : "r"(sz)); + __asm__ volatile ( + "vle32.v v0, (%0)\n" + "vle32.v v1, (%1)\n" + : + : "r"(&g_HHy.re[i * NUM_SC + off_sc]), + "r"(&g_HHy.im[i * NUM_SC + off_sc])); + // printf("vl: %lu\n", vl); + + for (j = 0; j != i; ++j) { + /* b - sum L_ij * z_j */ + /* v2 - L real part */ + /* v3 - L imaginary part */ + /* v4 - result_j real part */ + /* v5 - result_j imaginary 
part */ + __asm__ volatile ( + "vle32.v v2, (%0)\n" + "vle32.v v3, (%1)\n" + "vle32.v v4, (%2)\n" + "vle32.v v5, (%3)\n" + /* real part */ + "vfnmsac.vv v0, v2, v4\n" + "vfmacc.vv v0, v3, v5\n" + /* imaginary part */ + "vfnmsac.vv v1, v3, v4\n" + "vfnmsac.vv v1, v2, v5\n" + : + : "r"(&g_L.re[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]), + "r"(&g_L.im[i * NUM_TX * NUM_SC + j * NUM_SC + off_sc]), + "r"(&g_z.re[j * NUM_SC + off_sc]), + "r"(&g_z.im[j * NUM_SC + off_sc]) + ); + } + + /* Divide by L_ii */ + /* v2 - L_ii real part */ + /* v3 - L_ii imaginary part */ + __asm__ volatile ( + "vle32.v v2, (%0)\n" + "vle32.v v3, (%1)\n" + /* calculate L_ii_re^2 + L_ii_im^2 -> v4 */ + "vfmul.vv v4, v2, v2\n" + "vfmacc.vv v4, v3, v3\n" + /* real part */ + "vfmul.vv v5, v0, v2\n" + "vfmacc.vv v5, v1, v3\n" + "vdiv.vv v0, v5, v4\n" + /* imaginary part */ + "vfmul.vv v6, v1, v2\n" + "vfnmsac.vv v6, v0, v3\n" + "vfdiv.vv v1, v6, v4\n" + /* store result */ + "vse32.v v0, (%2)\n" + "vse32.v v1, (%3)\n" + : + : "r"(&g_L.re[i * NUM_TX * NUM_SC + i * NUM_SC + off_sc]), + "r"(&g_L.im[i * NUM_TX * NUM_SC + i * NUM_SC + off_sc]), + "r"(&g_z.re[i * NUM_SC + off_sc]), + "r"(&g_z.im[i * NUM_SC + off_sc]) + ); + + off_sc += vl; + sz -= vl; + } + } +#else +#error "Unknown architecture" +#endif +} diff --git a/src/cmatgram.c b/src/cmatgram.c @@ -0,0 +1,131 @@ +#include "../include/common.h" + +#include <stddef.h> + +/** Complex Gram matrix H^H*H and add complex matrix R + * + * G = H^H*H + R + * G_{t1t2} = \sum_{r=0}^{NUM_RX - 1} (H_{rt1}^* H_{rt2}) + R_{t1t2} + * + * \global g_H matrix of channel coefficients. Shape [NUM_RX][NUM_TX][NUM_SC] + * \global g_R noise covariance matrix. Shape [NUM_TX][NUM_TX][NUM_SC] + * \global g_G output Gram matrix + R. 
Shape [NUM_TX][NUM_TX][NUM_SC] + */ +void cmatgram() +{ +#if defined(ARCH_x86) || defined(ARCH_rv) + size_t r, t1, t2, s; + size_t off_G, off_R, off_H1, off_H2; + for (t1 = 0; t1 < NUM_TX; ++t1) { + for (t2 = 0; t2 < NUM_TX; ++t2) { + for (s = 0; s < NUM_SC; ++s) { + off_R = off_G = t1 * NUM_TX * NUM_SC + t2 * NUM_SC + s; + g_G.re[off_G] = g_R.re[off_R]; + g_G.im[off_G] = g_R.im[off_R]; + + for (r = 0; r < NUM_RX; ++r) { + off_H1 = r * NUM_TX * NUM_SC + t1 * NUM_SC + s; + off_H2 = r * NUM_TX * NUM_SC + t2 * NUM_SC + s; + g_G.re[off_G] += g_H.re[off_H1] * g_H.re[off_H2] + + g_H.im[off_H1] * g_H.im[off_H2]; + g_G.im[off_G] += g_H.re[off_H1] * g_H.im[off_H2] + - g_H.im[off_H1] * g_H.re[off_H2]; + } + } + } + } +#elif defined(ARCH_rvv) + size_t t1, t2, r; + size_t sz, vl; + size_t off_sc, off_A, off_AH; + size_t off_G_L, off_G_U; + data_t *A_re, *A_im, *AH_re, *AH_im; + data_t *R_L_re, *R_L_im, *R_U_re, *R_U_im; + data_t *G_L_re, *G_L_im, *G_U_re, *G_U_im; + + for (t1 = 0; t1 != NUM_TX; ++t1) + for (t2 = t1; t2 != NUM_TX; ++t2) { + off_sc = 0; + off_G_L = t1 * NUM_TX * NUM_SC + t2 * NUM_SC; + off_G_U = t2 * NUM_TX * NUM_SC + t1 * NUM_SC; + G_L_re = &g_G.re[off_G_L]; + G_L_im = &g_G.im[off_G_L]; + G_U_re = &g_G.re[off_G_U]; + G_U_im = &g_G.im[off_G_U]; + sz = NUM_SC; + + while (sz > 0) { + /* Initialize G registers */ + /* v0 - G real part */ + /* v1 - G imaginary part */ + __asm__ volatile( + "vsetvli %0, %1, e32, m1, ta, ma\n" + "vmv.v.i v0, 0\n" + "vmv.v.i v1, 0\n" + : "=r"(vl) : "r"(sz)); + + for (r = 0; r != NUM_RX; ++r) { + off_A = r * NUM_TX * NUM_SC + t1 * NUM_SC + off_sc; + off_AH = r * NUM_TX * NUM_SC + t2 * NUM_SC + off_sc; + A_re = &g_H.re[off_A]; + A_im = &g_H.im[off_A]; + AH_re = &g_H.re[off_AH]; + AH_im = &g_H.im[off_AH]; + + /* Calculate A^H*A */ + /* v2 - A real part */ + /* v3 - A imaginary part */ + /* v4 - A^H real part */ + /* v5 - A^H imaginary part */ + __asm__ volatile( + "vle32.v v2, (%0)\n" + "vle32.v v3, (%1)\n" + "vle32.v v4, (%2)\n" + 
"vle32.v v5, (%3)\n" + /* real part */ + "vfmacc.vv v0, v2, v4\n" + "vfmacc.vv v0, v3, v5\n" + /* imaginary part */ + "vfmacc.vv v1, v3, v4\n" + "vfnmsac.vv v1, v2, v5\n" + : + : "r"(A_re), "r"(A_im), "r"(AH_re), "r"(AH_im) + ); + } + + /* Add R */ + /* v2 - R real part */ + /* v3 - R imaginary part */ + R_U_re = &g_R.re[off_G_U]; + R_U_im = &g_R.im[off_G_U]; + __asm__ volatile( + "vle32.v v2, (%0)\n" + "vle32.v v3, (%1)\n" + "vfadd.vv v2, v2, v0\n" + "vfadd.vv v3, v3, v1\n" + "vse32.v v2, (%2)\n" + "vse32.v v3, (%3)\n" + : + : "r"(R_U_re), "r"(R_U_im), "r"(G_U_re), "r"(G_U_im)); + if (t1 != t2) { + R_L_re = &g_R.re[off_G_L]; + R_L_im = &g_R.im[off_G_L]; + __asm__ volatile( + /* Lower triangle */ + "vle32.v v2, (%0)\n" + "vle32.v v3, (%1)\n" + "vfadd.vv v2, v2, v0\n" + "vfsub.vv v3, v3, v1\n" + "vse32.v v2, (%2)\n" + "vse32.v v3, (%3)\n" + : + : "r"(R_L_re), "r"(R_L_im), "r"(G_L_re), "r"(G_L_im) + ); + } + + sz -= vl; + off_sc += vl; + } + } +#endif +} diff --git a/src/cmatvecmul.c b/src/cmatvecmul.c @@ -0,0 +1,100 @@ +#include "../include/common.h" + +#include <stddef.h> + +/** Complex matrix-vector multiplication HHy = HH*y + * + * HHy_t = \sum_{r=0}^{NUM_RX-1} H_{rt}^* * y_r + * + * \global g_H matrix. Shape [NUM_RX][NUM_TX][NUM_SC] + * \global g_y vector. Shape [NUM_RX][NUM_SC] + * \global g_HHy output vector. 
Shape [NUM_TX][NUM_SC] + */ +void cmatvecmul() +{ +#if defined(ARCH_x86) || defined(ARCH_rv) + size_t t, r, s; + size_t off_H, off_y, off_HHy; + acc_t sum_re, sum_im; + for (t = 0; t < NUM_TX; ++t) { + for (s = 0; s < NUM_SC; ++s) { + sum_re = sum_im = 0; + for (r = 0; r < NUM_RX; ++r) { + off_H = r * NUM_TX * NUM_SC + t * NUM_SC + s; + off_y = r * NUM_SC + s; + sum_re += (acc_t)g_H.re[off_H] * (acc_t)g_y.re[off_y] + - (acc_t)g_H.im[off_H] * (acc_t)g_y.im[off_y]; + sum_im += (acc_t)g_H.re[off_H] * (acc_t)g_y.im[off_y] + + (acc_t)g_H.im[off_H] * (acc_t)g_y.re[off_y]; + } + off_HHy = t * NUM_SC + s; +#if defined(DATA_TYPE_float) + g_HHy.re[off_HHy] = (data_t)sum_re; + g_HHy.im[off_HHy] = (data_t)sum_im; +#elif defined(DATA_TYPE_fixed) + g_HHy.re[off_HHy] = (data_t)(sum_re >> FP_Q); + g_HHy.im[off_HHy] = (data_t)(sum_im >> FP_Q); +#else +#error "Unknown data type" +#endif + } + } +#elif defined(ARCH_rvv) + size_t i, j; + size_t off_HH, off_y, off_HHy, off_sc; + size_t sz, vl; + + for (i = 0; i < NUM_TX; ++i) { + off_HHy = i * NUM_SC; + off_sc = 0; + sz = NUM_SC; + + while (sz > 0) { + /* Initialize result registers */ + /* v0 - HHy real part */ + /* v1 - HHy imaginary part */ + __asm__ volatile( + "vsetvli %0, %1, e32, m1, ta, ma\n" + "vmv.v.i v0, 0\n" + "vmv.v.i v1, 0\n" + : "=r"(vl) + : "r"(sz) + ); + + for (j = 0; j < NUM_RX; ++j) { + off_HH = i * NUM_RX * NUM_SC + j * NUM_SC + off_sc; + off_y = j * NUM_SC + off_sc; + __asm__ volatile( + "vle32.v v2, (%0)\n" + "vle32.v v3, (%1)\n" + "vle32.v v4, (%2)\n" + "vle32.v v5, (%3)\n" + /* real part */ + "vfmacc.vv v0, v2, v4\n" + "vfnmsac.vv v0, v3, v5\n" + /* imaginary part */ + "vfmacc.vv v1, v3, v4\n" + "vfmacc.vv v1, v2, v5\n" + : + : "r"(&g_HH.re[off_HH]), "r"(&g_HH.im[off_HH]), + "r"(&g_y.re[off_y]), "r"(&g_y.im[off_y]) + ); + } + + /* Store result */ + __asm__ volatile( + "vse32.v v0, (%0)\n" + "vse32.v v1, (%1)\n" + : + : "r"(&g_HHy.re[off_HHy]), "r"(&g_HHy.im[off_HHy]) + ); + + sz -= vl; + off_HHy += vl; + off_sc 
+= vl; + } + } +#else +#error "Unknown architecture" +#endif +} diff --git a/src/mmserv.c b/src/mmserv.c @@ -1,631 +0,0 @@ -#include "../include/define.h" - -#include <stddef.h> /* for size_t */ -#include <stdint.h> /* for uint64_t */ - -/* - * Debug - */ - -#ifdef DEBUG -#include "printf.h" - -static uint64_t g_timer; -void start_timer() -{ - __asm__ volatile("rdcycle %0" : "=r"(g_timer)); -} -void stop_timer() -{ - __asm__ volatile( - "rdcycle t0\n" - "sub %0, t0, %0" - : "+r"(g_timer) - ); -} -uint64_t get_timer() -{ - return g_timer; -} - -#define TIME(msg, func, ...) \ - start_timer(); \ - func(__VA_ARGS__); \ - stop_timer(); \ - printf(msg, get_timer()); - -#else -#define TIME(msg, func, ...) func(__VA_ARGS__); - -#endif - -/* - * Global variables - */ - -/* Externs */ -extern vcomplex g_x, g_H, g_R, g_y; -extern vcomplex g_x_MMSE; -extern size_t num_rx_cur, num_tx_cur, num_sc_cur; - - -/* Raw data */ -data_t HH_re[NUM_TX_ANT][NUM_RX_ANT][NUM_SC]; -data_t HH_im[NUM_TX_ANT][NUM_RX_ANT][NUM_SC]; -data_t HH_H_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; -data_t HH_H_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; -data_t L_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; -data_t L_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; -data_t HHy_re[NUM_TX_ANT][NUM_SC]; -data_t HHy_im[NUM_TX_ANT][NUM_SC]; -data_t z_re[NUM_TX_ANT][NUM_SC]; -data_t z_im[NUM_TX_ANT][NUM_SC]; -data_t LH_re[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; -data_t LH_im[NUM_TX_ANT][NUM_TX_ANT][NUM_SC]; - -/* Same data but casted to vcomplex */ -vcomplex g_HH = { .re = (data_t *)HH_re, .im = (data_t *)HH_im }; -vcomplex g_HH_H = { .re = (data_t *)HH_H_re, .im = (data_t *)HH_H_im }; -vcomplex g_L = { .re = (data_t *)L_re, .im = (data_t *)L_im }; -vcomplex g_HHy = { .re = (data_t *)HHy_re, .im = (data_t *)HHy_im }; -vcomplex g_z = { .re = (data_t *)z_re, .im = (data_t *)z_im }; -vcomplex g_LH = { .re = (data_t *)LH_re, .im = (data_t *)LH_im }; - -/* - * Complex matrix operations - */ - -/** Complex Gram matrix H^H*H and add complex matrix R - * - * HH_H = 
H^H*H + R - * - * \global g_H matrix of channel coefficients. Shape [NUM_RX_ANT][NUM_TX_ANT][NUM_SC] - * \global g_R noise covariance matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] - * \global g_HH_H output Gram matrix + R. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] - */ -void cmatgram_TxRx_cadd() -{ - size_t t1, t2, r; - size_t sz, vl; - size_t off_sc, off_A, off_AH; - size_t off_HH_H_L, off_HH_H_U; - data_t *A_re, *A_im, *AH_re, *AH_im; - data_t *R_L_re, *R_L_im, *R_U_re, *R_U_im; - data_t *HH_H_L_re, *HH_H_L_im, *HH_H_U_re, *HH_H_U_im; - - for (t1 = 0; t1 != num_tx_cur; ++t1) - for (t2 = t1; t2 != num_tx_cur; ++t2) { - off_sc = 0; - off_HH_H_L = t1 * num_tx_cur * num_sc_cur + t2 * num_sc_cur; - off_HH_H_U = t2 * num_tx_cur * num_sc_cur + t1 * num_sc_cur; - HH_H_L_re = &g_HH_H.re[off_HH_H_L]; - HH_H_L_im = &g_HH_H.im[off_HH_H_L]; - HH_H_U_re = &g_HH_H.re[off_HH_H_U]; - HH_H_U_im = &g_HH_H.im[off_HH_H_U]; - sz = num_sc_cur; - - while (sz > 0) { - /* Initialize HH_H registers */ - /* v0 - HH_H real part */ - /* v1 - HH_H imaginary part */ - __asm__ volatile( - "vsetvli %0, %1, e32, m1, ta, ma\n" - "vmv.v.i v0, 0\n" - "vmv.v.i v1, 0\n" - : "=r"(vl) : "r"(sz)); - - for (r = 0; r != num_rx_cur; ++r) { - off_A = r * num_tx_cur * num_sc_cur + t1 * num_sc_cur + off_sc; - off_AH = r * num_tx_cur * num_sc_cur + t2 * num_sc_cur + off_sc; - A_re = &g_H.re[off_A]; - A_im = &g_H.im[off_A]; - AH_re = &g_H.re[off_AH]; - AH_im = &g_H.im[off_AH]; - - /* Calculate A^H*A */ - /* v2 - A real part */ - /* v3 - A imaginary part */ - /* v4 - A^H real part */ - /* v5 - A^H imaginary part */ - __asm__ volatile( - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - "vle32.v v4, (%2)\n" - "vle32.v v5, (%3)\n" - /* real part */ - "vfmacc.vv v0, v2, v4\n" - "vfmacc.vv v0, v3, v5\n" - /* imaginary part */ - "vfmacc.vv v1, v3, v4\n" - "vfnmsac.vv v1, v2, v5\n" - : - : "r"(A_re), "r"(A_im), "r"(AH_re), "r"(AH_im) - ); - } - - /* Add R */ - /* v2 - R real part */ - /* v3 - R imaginary part */ - R_U_re = 
&g_R.re[off_HH_H_U]; - R_U_im = &g_R.im[off_HH_H_U]; - __asm__ volatile( - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - "vfadd.vv v2, v2, v0\n" - "vfadd.vv v3, v3, v1\n" - "vse32.v v2, (%2)\n" - "vse32.v v3, (%3)\n" - : - : "r"(R_U_re), "r"(R_U_im), "r"(HH_H_U_re), "r"(HH_H_U_im)); - if (t1 != t2) { - R_L_re = &g_R.re[off_HH_H_L]; - R_L_im = &g_R.im[off_HH_H_L]; - __asm__ volatile( - /* Lower triangle */ - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - "vfadd.vv v2, v2, v0\n" - "vfsub.vv v3, v3, v1\n" - "vse32.v v2, (%2)\n" - "vse32.v v3, (%3)\n" - : - : "r"(R_L_re), "r"(R_L_im), "r"(HH_H_L_re), "r"(HH_H_L_im) - ); - } - - sz -= vl; - off_sc += vl; - } - } -} - -/** Complex Cholesky decomposition L of a Hermitian positive-definite matrix HH_H - * - * HH_H = L*L^H - * L_ij = (HH_H_ij - \sum_{k=0}^{j-1} L_ik L_jk^*) / L_jj - * L_ii = sqrt(HH_H_ii - \sum_{k=0}^{i-1} L_ik L_ik^*) - * - * \global g_HH_H matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] - * \global g_L output lower triangular matrix. 
Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] - */ -void ccholesky_TxTx() -{ - size_t i, j, k; - size_t sz, vl; - size_t off_sc; - - /* Init float registers */ - register float f0 __asm__("f0") = 2.0f; - - for (i = 0; i < num_tx_cur; ++i) - for (j = 0; j <= i; ++j) { - sz = num_sc_cur; - off_sc = 0; - - while (sz > 0) { - /* Initialize L registers */ - /* v0 - L_ij real part = HH_H_ij.re */ - /* v1 - L_ij imaginary part = HH_H_ij.im */ - __asm__ volatile( - "vsetvli %0, %1, e32, m1, ta, ma\n" - : "=r"(vl) : "r"(sz)); - __asm__ volatile( - "vle32.v v0, (%0)\n" - "vle32.v v1, (%1)\n" - : - : "r"(&g_HH_H.re[i * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]), - "r"(&g_HH_H.im[i * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]) - ); - - /* Calculate sum_{k=0}^{j-1} L_ik L_jk^* */ - /* v2 - sum real part */ - /* v3 - sum imaginary part */ - __asm__ volatile( - "vmv.v.i v2, 0\n" - "vmv.v.i v3, 0\n" - ); - for (k = 0; k < j; ++k) { - __asm__ volatile( - "vle32.v v4, (%0)\n" - "vle32.v v5, (%1)\n" - "vle32.v v6, (%2)\n" - "vle32.v v7, (%3)\n" - /* real part */ - "vfmacc.vv v2, v4, v6\n" - "vfmacc.vv v2, v5, v7\n" - /* imaginary part */ - "vfmacc.vv v3, v5, v6\n" - "vfnmsac.vv v3, v4, v7\n" - : - : "r"(&g_L.re[i * num_tx_cur * num_sc_cur + k * num_sc_cur + off_sc]), - "r"(&g_L.im[i * num_tx_cur * num_sc_cur + k * num_sc_cur + off_sc]), - "r"(&g_L.re[j * num_tx_cur * num_sc_cur + k * num_sc_cur + off_sc]), - "r"(&g_L.im[j * num_tx_cur * num_sc_cur + k * num_sc_cur + off_sc]) - ); - } - - /* HH_H_ii - sum */ - __asm__ volatile( - "vfsub.vv v0, v0, v2\n" - "vfsub.vv v1, v1, v3\n" - ); - - if (i == j) { - /* Calculate L_ii = sqrt(HH_H_ii - sum_{k=0}^{i-1} L_ik L_ik^*) */ - __asm__ volatile( - /* Complex sqrt */ - - /* v2 = r = sqrt(re^2 + im^2) */ - "vfmul.vv v2, v0, v0\n" - "vfmacc.vv v2, v1, v1\n" - "vfsqrt.v v2, v2\n" - - /* v3 - real part */ - "vfadd.vv v3, v2, v0\n" /* r + re */ - "vfdiv.vf v3, v3, f0\n" /* (r + re) / 2 */ - "vfsqrt.v v3, v3\n" /* sqrt((r + re) / 2) */ - 
/* v4 - imaginary part */ - "vfsub.vv v4, v2, v0\n" /* r - re */ - "vfdiv.vf v4, v4, f0\n" /* (r - re) / 2 */ - "vfsqrt.v v4, v4\n" /* sqrt((r - re) / 2) */ - "vfsgnj.vv v4, v4, v1\n" /* sgn(im) * sqrt((r - re) / 2) */ - - /* TODO handle im == 0 */ - - /* Move the result to v0 and v1 */ - "vmv.v.v v0, v3\n" - "vmv.v.v v1, v4\n" - ); - } else { - /* Calculate L_ij = (HH_H_ij - sum) / L_jj */ - __asm__ volatile( - /* L_jj */ - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - /* calculate L_jj_re^2 + L_jj_im^2 -> v4 */ - "vfmul.vv v4, v2, v2\n" - "vfmacc.vv v4, v3, v3\n" - /* real part */ - "vfmul.vv v5, v0, v2\n" - "vfmacc.vv v5, v1, v3\n" - /* imaginary part */ - "vfmul.vv v6, v1, v2\n" - "vfnmsac.vv v6, v0, v3\n" - /* divide and store at v0 and v1 */ - "vfdiv.vv v0, v5, v4\n" - "vfdiv.vv v1, v6, v4\n" - : - : "r"(&g_L.re[j * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]), - "r"(&g_L.im[j * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]) - ); - } - - /* Store result */ - __asm__ volatile( - "vse32.v v0, (%0)\n" - "vse32.v v1, (%1)\n" - : - : "r"(&g_L.re[i * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]), - "r"(&g_L.im[i * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]) - ); - - sz -= vl; - off_sc += vl; - } - } -} - -/** Complex matrix-vector multiplication HHy = HH*y - * - * \global g_HH matrix. Shape [NUM_TX_ANT][NUM_RX_ANT][NUM_SC] - * \global g_y vector. Shape [NUM_RX_ANT][NUM_SC] - * \global g_HHy output vector. 
Shape [NUM_TX_ANT][NUM_SC] - */ -void cmatvecmul_TxRx() -{ - size_t i, j; - size_t off_HH, off_y, off_HHy, off_sc; - size_t sz, vl; - - for (i = 0; i < num_tx_cur; ++i) { - off_HHy = i * num_sc_cur; - off_sc = 0; - sz = num_sc_cur; - - while (sz > 0) { - /* Initialize result registers */ - /* v0 - HHy real part */ - /* v1 - HHy imaginary part */ - __asm__ volatile( - "vsetvli %0, %1, e32, m1, ta, ma\n" - "vmv.v.i v0, 0\n" - "vmv.v.i v1, 0\n" - : "=r"(vl) - : "r"(sz) - ); - - for (j = 0; j < num_rx_cur; ++j) { - off_HH = i * num_rx_cur * num_sc_cur + j * num_sc_cur + off_sc; - off_y = j * num_sc_cur + off_sc; - __asm__ volatile( - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - "vle32.v v4, (%2)\n" - "vle32.v v5, (%3)\n" - /* real part */ - "vfmacc.vv v0, v2, v4\n" - "vfnmsac.vv v0, v3, v5\n" - /* imaginary part */ - "vfmacc.vv v1, v3, v4\n" - "vfmacc.vv v1, v2, v5\n" - : - : "r"(&g_HH.re[off_HH]), "r"(&g_HH.im[off_HH]), - "r"(&g_y.re[off_y]), "r"(&g_y.im[off_y]) - ); - } - - /* Store result */ - __asm__ volatile( - "vse32.v v0, (%0)\n" - "vse32.v v1, (%1)\n" - : - : "r"(&g_HHy.re[off_HHy]), "r"(&g_HHy.im[off_HHy]) - ); - - sz -= vl; - off_HHy += vl; - off_sc += vl; - } - } -} - -/** Complex forward substitution L*z = HHy - * - * z_i = (HHy_i - \sum_{k=0}^{i-1} L_{ik} z_k) / L_{ii} - * - * \global g_L lower triangular matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] - * \global g_HHy vector. Shape [NUM_TX_ANT][NUM_SC] - * \global g_z output vector. 
Shape [NUM_TX_ANT][NUM_SC] - */ -void cforwardsub_TxTx() -{ - size_t i, j; - size_t sz, vl; - size_t off_sc; - - for (i = 0; i < num_tx_cur; ++i) { - off_sc = 0; - sz = num_sc_cur; - - while (sz > 0) { - // printf("sz: %lu\n", sz); - // printf("vl: %lu\n", vl); - /* Initialize result registers as b */ - /* v0 - result real part */ - /* v1 - result imaginary part */ - __asm__ volatile ( - "vsetvli %0, %1, e32, m1, ta, ma\n" - : "=r"(vl) - : "r"(sz)); - __asm__ volatile ( - "vle32.v v0, (%0)\n" - "vle32.v v1, (%1)\n" - : - : "r"(&g_HHy.re[i * num_sc_cur + off_sc]), - "r"(&g_HHy.im[i * num_sc_cur + off_sc])); - // printf("vl: %lu\n", vl); - - for (j = 0; j != i; ++j) { - /* b - sum L_ij * z_j */ - /* v2 - L real part */ - /* v3 - L imaginary part */ - /* v4 - result_j real part */ - /* v5 - result_j imaginary part */ - __asm__ volatile ( - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - "vle32.v v4, (%2)\n" - "vle32.v v5, (%3)\n" - /* real part */ - "vfnmsac.vv v0, v2, v4\n" - "vfmacc.vv v0, v3, v5\n" - /* imaginary part */ - "vfnmsac.vv v1, v3, v4\n" - "vfnmsac.vv v1, v2, v5\n" - : - : "r"(&g_L.re[i * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]), - "r"(&g_L.im[i * num_tx_cur * num_sc_cur + j * num_sc_cur + off_sc]), - "r"(&g_z.re[j * num_sc_cur + off_sc]), - "r"(&g_z.im[j * num_sc_cur + off_sc]) - ); - } - - /* Divide by L_ii */ - /* v2 - L_ii real part */ - /* v3 - L_ii imaginary part */ - __asm__ volatile ( - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - /* calculate L_ii_re^2 + L_ii_im^2 -> v4 */ - "vfmul.vv v4, v2, v2\n" - "vfmacc.vv v4, v3, v3\n" - /* real part */ - "vfmul.vv v5, v0, v2\n" - "vfmacc.vv v5, v1, v3\n" - "vdiv.vv v0, v5, v4\n" - /* imaginary part */ - "vfmul.vv v6, v1, v2\n" - "vfnmsac.vv v6, v0, v3\n" - "vfdiv.vv v1, v6, v4\n" - /* store result */ - "vse32.v v0, (%2)\n" - "vse32.v v1, (%3)\n" - : - : "r"(&g_L.re[i * num_tx_cur * num_sc_cur + i * num_sc_cur + off_sc]), - "r"(&g_L.im[i * num_tx_cur * num_sc_cur + i * num_sc_cur + off_sc]), - 
"r"(&g_z.re[i * num_sc_cur + off_sc]), - "r"(&g_z.im[i * num_sc_cur + off_sc]) - ); - - off_sc += vl; - sz -= vl; - } - } -} - -/** Complex backward substitution L^H*x_MMSE = z - * - * x_i = (z_i - \sum_{j=i+1}^{n-1} L_{ji} x_k) / L_{ii} - * - * \global g_L lower triangular matrix. Shape [NUM_TX_ANT][NUM_TX_ANT][NUM_SC] - * \global g_z rhs vector. Shape [NUM_TX_ANT][NUM_SC] - * \global g_x_MMSE output vector. Shape [NUM_TX_ANT][NUM_SC] - */ -void cbackwardsub_TxTx() -{ - size_t i, j; - size_t sz, vl; - size_t off_sc; - - for (i = num_tx_cur - 1; i != (size_t)-1; --i) { - off_sc = 0; - sz = num_sc_cur; - - while (sz > 0){ - /* Initialize result registers as z */ - /* v0 - result real part */ - /* v1 - result imaginary part */ - __asm__ volatile( - "vsetvli %0, %1, e32, m1, ta, ma\n" - : "=r"(vl) - : "r"(sz)); - __asm__ volatile( - "vle32.v v0, (%0)\n" - "vle32.v v1, (%1)\n" - : - : "r"(&g_z.re[i * num_sc_cur + off_sc]), - "r"(&g_z.im[i * num_sc_cur + off_sc])); - - for (j = i + 1; j < num_tx_cur; ++j) { - /* b - sum L_ji * z_j */ - /* v2 - L real part */ - /* v3 - L imaginary part */ - /* v4 - x_MMSE_j real part */ - /* v5 - x_MMSE_j imaginary part */ - __asm__ volatile( - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - "vle32.v v4, (%2)\n" - "vle32.v v5, (%3)\n" - /* real part */ - "vfnmsac.vv v0, v2, v4\n" - "vfmacc.vv v0, v3, v5\n" - /* imaginary part */ - "vfnmsac.vv v1, v3, v4\n" - "vfnmsac.vv v1, v2, v5\n" - : - : "r"(&g_L.re[j * num_tx_cur * num_sc_cur + i * num_sc_cur + off_sc]), - "r"(&g_L.im[j * num_tx_cur * num_sc_cur + i * num_sc_cur + off_sc]), - "r"(&g_x_MMSE.re[j * num_sc_cur + off_sc]), - "r"(&g_x_MMSE.im[j * num_sc_cur + off_sc]) - ); - } - - /* Divide by L_ii */ - /* v2 - L_ii real part */ - /* v3 - L_ii imaginary part */ - __asm__ volatile ( - "vle32.v v2, (%0)\n" - "vle32.v v3, (%1)\n" - /* calculate L_ii_re^2 + L_ii_im^2 -> v4 */ - "vfmul.vv v4, v2, v2\n" - "vfmacc.vv v4, v3, v3\n" - /* real part */ - "vfmul.vv v5, v0, v2\n" - "vfmacc.vv v5, v1, 
v3\n" - "vdiv.vv v0, v5, v4\n" - /* imaginary part */ - "vfmul.vv v6, v1, v2\n" - "vfnmsac.vv v6, v0, v3\n" - "vfdiv.vv v1, v6, v4\n" - /* store HH_H */ - "vse32.v v0, (%2)\n" - "vse32.v v1, (%3)\n" - : - : "r"(&g_L.re[i * num_tx_cur * num_sc_cur + i * num_sc_cur + off_sc]), - "r"(&g_L.im[i * num_tx_cur * num_sc_cur + i * num_sc_cur + off_sc]), - "r"(&g_x_MMSE.re[i * num_sc_cur + off_sc]), - "r"(&g_x_MMSE.im[i * num_sc_cur + off_sc]) - ); - - sz -= vl; - off_sc += vl; - } - } -} - -/* - * MMSE - */ - -void mmse() -{ - /* H^H*H + R */ - TIME( - "Gram and add (RxTx x TxRx + TxTx): %ld\n", - cmatgram_TxRx_cadd); - /* L: (H^H*H + R) = L*L^H */ - TIME( - "Cholesky (TxTx): %ld\n", - ccholesky_TxTx); - /* z: L*z = H^H*y */ - TIME( - "Matrix-vector multiplication (TxRx x Rx): %ld\n", - cmatvecmul_TxRx); - TIME( - "Forward substitution (TxTx): %ld\n", - cforwardsub_TxTx); - /* x_MMSE: L^H*x_MMSE = z */ - TIME( - "Backward substitution (TxTx): %ld\n", - cbackwardsub_TxTx); -} - -acc_t mse() -{ - acc_t sum = 0.; - size_t off = 0; - data_t sub1, sub2; - size_t sz = num_tx_cur * num_sc_cur, vl; - register data_t num_tx_num_sc_reg __asm__("f0") = (data_t)(num_tx_cur * NUM_SC); - - while (sz > 0) { - __asm__ volatile ( - "vsetvli %0, %1, e32, m1, ta, ma\n" - : "=r"(vl) - : "r"(sz)); - __asm__ volatile ( - "vle32.v v0, (%1)\n" - "vle32.v v1, (%2)\n" - "vle32.v v2, (%3)\n" - "vle32.v v3, (%4)\n" - "vfsub.vv v0, v0, v2\n" - "vfsub.vv v1, v1, v3\n" - "vfmul.vv v0, v0, v0\n" - "vfmul.vv v1, v1, v1\n" - "vfadd.vv v0, v0, v1\n" - "vfdiv.vf v0, v0, %5\n" - "vfredusum.vs v0, v4, v5\n" - "vmv.x.s %0, v0\n" - : "+r"(sum) - : "r"(&g_x.re[off]), - "r"(&g_x.re[off]), - "r"(&g_x_MMSE.re[off]), - "r"(&g_x_MMSE.re[off]), - "f"(num_tx_num_sc_reg)); - sz -= vl; - off += vl; - } - - return sum; -}