cforwardsub.c (3807B)
1 #include "../include/common.h" 2 3 #include <stddef.h> 4 5 /** Complex forward substitution L*z = HHy 6 * 7 * z_t = (HHy_t - \sum_{tt=0}^{t-1} L_{t tt} z_tt) / L_{t t} (for LL / float solution) 8 * z_t = (HHy_t - \sum_{tt=0}^{t-1} L_{t tt} z_tt) (for LDL / fixed solution) 9 * 10 * \global g_L lower triangular matrix. Shape [NUM_TX][NUM_TX][NUM_SC] 11 * \global g_HHy vector. Shape [NUM_TX][NUM_SC] 12 * \global g_z output vector. Shape [NUM_TX][NUM_SC] 13 */ 14 void cforwardsub() 15 { 16 size_t t, tt, s; 17 size_t off_L, off_HHy, off_z; 18 19 #if defined(ARCH_x86) || defined(ARCH_rv) 20 acc_t sum_re, sum_im; 21 22 for (t = 0; t < NUM_TX; ++t) { 23 for (s = 0; s < NUM_SC; ++s) { 24 sum_re = sum_im = 0; 25 for (tt = 0; tt < t; ++tt) { 26 off_L = t * NUM_TX * NUM_SC + tt * NUM_SC + s; 27 off_z = tt * NUM_SC + s; 28 sum_re += (acc_t)g_L.re[off_L] * (acc_t)g_z.re[off_z] 29 - (acc_t)g_L.im[off_L] * (acc_t)g_z.im[off_z]; 30 sum_im += (acc_t)g_L.re[off_L] * (acc_t)g_z.im[off_z] 31 + (acc_t)g_L.im[off_L] * (acc_t)g_z.re[off_z]; 32 } 33 off_HHy = t * NUM_SC + s; 34 off_z = t * NUM_SC + s; 35 #if defined(DATA_TYPE_float) 36 off_L = t * NUM_TX * NUM_SC + t * NUM_SC + s; 37 g_z.re[off_z] = (g_HHy.re[off_HHy] - (data_t)sum_re) / g_L.re[off_L]; 38 g_z.im[off_z] = (g_HHy.im[off_HHy] - (data_t)sum_im) / g_L.re[off_L]; 39 #elif defined(DATA_TYPE_fixed) 40 g_z.re[off_z] = g_HHy.re[off_HHy] - (data_t)(sum_re >> FP_Q); 41 g_z.im[off_z] = g_HHy.im[off_HHy] - (data_t)(sum_im >> FP_Q); 42 #else 43 #error "Unknown data type" 44 #endif 45 } 46 } 47 #elif defined(ARCH_rvv) 48 size_t sz, vl; 49 50 for (t = 0; t < NUM_TX; ++t) { 51 sz = NUM_SC; 52 s = 0; 53 54 while (sz > 0) { 55 off_HHy = t * NUM_SC + s; 56 57 /* Initialize z as HHy */ 58 /* v0 - z real part */ 59 /* v4 - z imaginary part */ 60 __asm__ volatile ( 61 "vsetvli %0, %1, e32, m4, ta, ma\n" 62 "vle32.v v0, (%2)\n" 63 "vle32.v v4, (%3)\n" 64 : "=r"(vl) 65 : "r"(sz), 66 "r"(&g_HHy.re[off_HHy]), "r"(&g_HHy.im[off_HHy]) 67 ); 68 69 for (tt = 0; tt != t; ++tt) { 70 off_L = t * NUM_TX * NUM_SC + tt * NUM_SC + s; 71 off_z = tt * NUM_SC + s; 72 73 /* HHy_t - sum L_{t tt} * z_{tt} */ 74 __asm__ volatile ( 75 "vle32.v v8, (%0)\n" 76 "vle32.v v12, (%1)\n" 77 "vle32.v v16, (%2)\n" 78 "vle32.v v20, (%3)\n" 79 #if defined(DATA_TYPE_float) 80 /* real part */ 81 "vfnmsac.vv v0, v8, v16\n" 82 "vfmacc.vv v0, v12, v20\n" 83 /* imaginary part */ 84 "vfnmsac.vv v4, v12, v16\n" 85 "vfnmsac.vv v4, v8, v20\n" 86 #elif defined(DATA_TYPE_fixed) 87 /* real part */ 88 "vsmul.vv v24, v8, v16\n" 89 "vssub.vv v0, v0, v24\n" 90 "vsmul.vv v24, v12, v20\n" 91 "vsadd.vv v0, v0, v24\n" 92 /* imaginary part */ 93 "vsmul.vv v24, v12, v16\n" 94 "vssub.vv v4, v4, v24\n" 95 "vsmul.vv v24, v8, v20\n" 96 "vssub.vv v4, v4, v24\n" 97 #else 98 #error "Unknown data type" 99 #endif 100 : 101 : "r"(&g_L.re[off_L]), "r"(&g_L.im[off_L]), 102 "r"(&g_z.re[off_z]), "r"(&g_z.im[off_z]) 103 ); 104 } 105 106 #if defined(DATA_TYPE_float) 107 /* Divide by L_ii */ 108 /* L_ii imaginary part is zero, so we ignore it */ 109 off_L = t * NUM_TX * NUM_SC + t * NUM_SC + s; 110 __asm__ volatile ( 111 "vle32.v v8, (%0)\n" 112 "vfdiv.vv v0, v0, v8\n" 113 "vfdiv.vv v4, v4, v8\n" 114 : 115 : "r"(&g_L.re[off_L]) 116 ); 117 #endif 118 119 /* Store result */ 120 off_z = t * NUM_SC + s; 121 __asm__ volatile ( 122 "vse32.v v0, (%0)\n" 123 "vse32.v v4, (%1)\n" 124 : 125 : "r"(&g_z.re[off_z]), "r"(&g_z.im[off_z]) 126 ); 127 128 s += vl; 129 sz -= vl; 130 } 131 } 132 133 #else 134 #error "Unknown architecture" 135 #endif 136 }