diff --git a/kernel/riscv64/sgemm_kernel_16x8_zvl256b.c b/kernel/riscv64/sgemm_kernel_16x8_zvl256b.c
index e22df34f99..9b823ad4ae 100644
--- a/kernel/riscv64/sgemm_kernel_16x8_zvl256b.c
+++ b/kernel/riscv64/sgemm_kernel_16x8_zvl256b.c
@@ -40,1040 +40,2331 @@ AUTOGENERATED KERNEL
 #include "common.h"
+#include <riscv_vector.h>

-int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc)
-
-{
-   BLASLONG gvl = 0;
-   BLASLONG m_top = 0;
-   BLASLONG n_top = 0;
-
-
-   // -- MAIN PASS
+#define GEMM_RIGHT_EDGE   // One pass for right edge - swap A & B - transpose at end
+#define GEMM_BOTTOM_EDGE  // One pass for bottom edge - combo of vector and scalar ops
+#define GEMM_RIGHT_CHUNK  // Break K into chunks (causes epsilon differences)
+#define GEMM_BOTTOM_CHUNK // Break K into chunks (causes epsilon differences)

-   for (BLASLONG j=0; j<N/8; j+=1) {
+        if (K >= 8) {
+            vfloat32m8_t B00 = __riscv_vle32_v_f32m8(B, N * 8);
+            B0 = __riscv_vget_v_f32m8_f32m1(B00, 0);
+            B1 = __riscv_vget_v_f32m8_f32m1(B00, 1);
+            B2 = __riscv_vget_v_f32m8_f32m1(B00, 2);
+            B3 = __riscv_vget_v_f32m8_f32m1(B00, 3);
+            B4 = __riscv_vget_v_f32m8_f32m1(B00, 4);
+            B5 = __riscv_vget_v_f32m8_f32m1(B00, 5);
+            B6 = __riscv_vget_v_f32m8_f32m1(B00, 6);
+            B7 = __riscv_vget_v_f32m8_f32m1(B00, 7);
+            B += (N * 8);
+
+#ifdef GEMM_NEW_PACKING
+            resultE = __riscv_vfmul_vf_f32m1(B0, A0[0 + (M & 0x6) + (M * 0)], N);
+            result0 = __riscv_vfmul_vf_f32m1(B1, A0[0 + (M & 0x6) + (M * 1)], N);
+            result1 = __riscv_vfmul_vf_f32m1(B2, A0[0 + (M & 0x6) + (M * 2)], N);
+            result2 = __riscv_vfmul_vf_f32m1(B3, A0[0 + (M & 0x6) + (M * 3)], N);
+            result3 = __riscv_vfmul_vf_f32m1(B4, A0[0 + (M & 0x6) + (M * 4)], N);
+            result4 = __riscv_vfmul_vf_f32m1(B5, A0[0 + (M & 0x6) + (M * 5)], N);
+            result5 = __riscv_vfmul_vf_f32m1(B6, A0[0 + (M & 0x6) + (M * 6)], N);
+            result6 = __riscv_vfmul_vf_f32m1(B7, A0[0 + (M & 0x6) + (M * 7)], N);
+            A0 += (M * 8);
+#else
+            resultE = __riscv_vfmul_vf_f32m1(B0, A3[0 + (1 * 0)], N);
+            result0 = __riscv_vfmul_vf_f32m1(B1, A3[0 + (1 * 1)], N);
+            result1 = __riscv_vfmul_vf_f32m1(B2, A3[0 + (1 * 2)], N);
+            result2 = __riscv_vfmul_vf_f32m1(B3, A3[0 + (1 * 3)], N);
+            result3 = __riscv_vfmul_vf_f32m1(B4, A3[0 + (1 * 4)], N);
+            result4 = __riscv_vfmul_vf_f32m1(B5, A3[0 + (1 * 5)], N);
+            result5 = __riscv_vfmul_vf_f32m1(B6, A3[0 + (1 * 6)], N);
+            result6 = __riscv_vfmul_vf_f32m1(B7, A3[0 + (1 * 7)], N);
+            A3 += (1 * 8);
+#endif
+
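The first K-step is peeled above so each accumulator is initialized with a plain vfmul; the steady-state loop below then extends it with vfmacc. For reference, this is what each __riscv_vfmacc_vf_f32m1(acc, s, v, vl) call computes, as a scalar C sketch (illustrative only, not part of the patch; the helper name is invented):

    #include <stddef.h>

    /* acc[i] += s * v[i] for i in [0, vl): scalar-times-vector fused multiply-accumulate */
    static void vfmacc_vf_ref(float *acc, float s, const float *v, size_t vl)
    {
        for (size_t i = 0; i < vl; i++)
            acc[i] += s * v[i];
    }
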
+            for (BLASLONG k = (K / 8); --k; ) {
+                B00 = __riscv_vle32_v_f32m8(B, N * 8);
+                B0 = __riscv_vget_v_f32m8_f32m1(B00, 0);
+                B1 = __riscv_vget_v_f32m8_f32m1(B00, 1);
+                B2 = __riscv_vget_v_f32m8_f32m1(B00, 2);
+                B3 = __riscv_vget_v_f32m8_f32m1(B00, 3);
+                B4 = __riscv_vget_v_f32m8_f32m1(B00, 4);
+                B5 = __riscv_vget_v_f32m8_f32m1(B00, 5);
+                B6 = __riscv_vget_v_f32m8_f32m1(B00, 6);
+                B7 = __riscv_vget_v_f32m8_f32m1(B00, 7);
+                B += (N * 8);
+
+#ifdef GEMM_NEW_PACKING
+                resultE = __riscv_vfmacc_vf_f32m1(resultE, A0[0 + (M & 0x6) + (M * 0)], B0, N);
+                result0 = __riscv_vfmacc_vf_f32m1(result0, A0[0 + (M & 0x6) + (M * 1)], B1, N);
+                result1 = __riscv_vfmacc_vf_f32m1(result1, A0[0 + (M & 0x6) + (M * 2)], B2, N);
+                result2 = __riscv_vfmacc_vf_f32m1(result2, A0[0 + (M & 0x6) + (M * 3)], B3, N);
+                result3 = __riscv_vfmacc_vf_f32m1(result3, A0[0 + (M & 0x6) + (M * 4)], B4, N);
+                result4 = __riscv_vfmacc_vf_f32m1(result4, A0[0 + (M & 0x6) + (M * 5)], B5, N);
+                result5 = __riscv_vfmacc_vf_f32m1(result5, A0[0 + (M & 0x6) + (M * 6)], B6, N);
+                result6 = __riscv_vfmacc_vf_f32m1(result6, A0[0 + (M & 0x6) + (M * 7)], B7, N);
+                A0 += (M * 8);
+#else
+                resultE = __riscv_vfmacc_vf_f32m1(resultE, A3[0 + (1 * 0)], B0, N);
+                result0 = __riscv_vfmacc_vf_f32m1(result0, A3[0 + (1 * 1)], B1, N);
+                result1 = __riscv_vfmacc_vf_f32m1(result1, A3[0 + (1 * 2)], B2, N);
+                result2 = __riscv_vfmacc_vf_f32m1(result2, A3[0 + (1 * 3)], B3, N);
+                result3 = __riscv_vfmacc_vf_f32m1(result3, A3[0 + (1 * 4)], B4, N);
+                result4 = __riscv_vfmacc_vf_f32m1(result4, A3[0 + (1 * 5)], B5, N);
+                result5 = __riscv_vfmacc_vf_f32m1(result5, A3[0 + (1 * 6)], B6, N);
+                result6 = __riscv_vfmacc_vf_f32m1(result6, A3[0 + (1 * 7)], B7, N);
+                A3 += (1 * 8);
+#endif
+            }
+
+            resultE = __riscv_vfadd_vv_f32m1(resultE, result0, N);
+            result1 = __riscv_vfadd_vv_f32m1(result1, result2, N);
+            result3 = __riscv_vfadd_vv_f32m1(result3, result4, N);
+            result5 = __riscv_vfadd_vv_f32m1(result5, result6, N);
+            resultE = __riscv_vfadd_vv_f32m1(resultE, result1, N);
+            result3 = __riscv_vfadd_vv_f32m1(result3, result5, N);
+            resultE = __riscv_vfadd_vv_f32m1(resultE, result3, N);
+
+            K &= 7;
+        } else {
+            resultE = __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vmv_v_x_u32m1(0, N));
+        }
+    } else if (M <= 3) {
+        if (K >= 4) {
+            vfloat32m4_t B00 = __riscv_vle32_v_f32m4(B, N * 4);
+            B0 = __riscv_vget_v_f32m4_f32m1(B00, 0);
+            B1 = __riscv_vget_v_f32m4_f32m1(B00, 1);
+            B2 = __riscv_vget_v_f32m4_f32m1(B00, 2);
+            B3 = __riscv_vget_v_f32m4_f32m1(B00, 3);
+            B += (N * 4);
+
+#ifdef GEMM_NEW_PACKING
+            if (M & 2) {
+                resultC = __riscv_vfmul_vf_f32m1(B0, A0[0 + (M & 0x4) + (M * 0)], N);
+                resultD = __riscv_vfmul_vf_f32m1(B0, A0[1 + (M & 0x4) + (M * 0)], N);
+                result0 = __riscv_vfmul_vf_f32m1(B1, A0[0 + (M & 0x4) + (M * 1)], N);
+                result1 = __riscv_vfmul_vf_f32m1(B1, A0[1 + (M & 0x4) + (M * 1)], N);
+                result4 = __riscv_vfmul_vf_f32m1(B2, A0[0 + (M & 0x4) + (M * 2)], N);
+                result5 = __riscv_vfmul_vf_f32m1(B2, A0[1 + (M & 0x4) + (M * 2)], N);
+                result8 = __riscv_vfmul_vf_f32m1(B3, A0[0 + (M & 0x4) + (M * 3)], N);
+                result9 = __riscv_vfmul_vf_f32m1(B3, A0[1 + (M & 0x4) + (M * 3)], N);
+            }
+            if (M & 1) {
+                resultE = __riscv_vfmul_vf_f32m1(B0, A0[0 + (M & 0x6) + (M * 0)], N);
+                result2 = __riscv_vfmul_vf_f32m1(B1, A0[0 + (M & 0x6) + (M * 1)], N);
+                result6 = __riscv_vfmul_vf_f32m1(B2, A0[0 + (M & 0x6) + (M * 2)], N);
+                resultA = __riscv_vfmul_vf_f32m1(B3, A0[0 + (M & 0x6) + (M * 3)], N);
+            }
+            A0 += (M * 4);
+#else
+            if (M & 2) {
+                resultC = __riscv_vfmul_vf_f32m1(B0, A2[0 + (2 * 0)], N);
+                resultD = __riscv_vfmul_vf_f32m1(B0, A2[1 + (2 * 0)], N);
+                result0 = __riscv_vfmul_vf_f32m1(B1, A2[0 + (2 * 1)], N);
+                result1 = __riscv_vfmul_vf_f32m1(B1, A2[1 + (2 * 1)], N);
+                result4 = __riscv_vfmul_vf_f32m1(B2, A2[0 + (2 * 2)], N);
+                result5 = __riscv_vfmul_vf_f32m1(B2, A2[1 + (2 * 2)], N);
+                result8 = __riscv_vfmul_vf_f32m1(B3, A2[0 + (2 * 3)], N);
+                result9 = __riscv_vfmul_vf_f32m1(B3, A2[1 + (2 * 3)], N);
+                A2 += (2 * 4);
+            }
+            if (M & 1) {
+                resultE = __riscv_vfmul_vf_f32m1(B0, A3[0 + (1 * 0)], N);
+                result2 = __riscv_vfmul_vf_f32m1(B1, A3[0 + (1 * 1)], N);
+                result6 = __riscv_vfmul_vf_f32m1(B2, A3[0 + (1 * 2)], N);
+                resultA = __riscv_vfmul_vf_f32m1(B3, A3[0 + (1 * 3)], N);
+                A3 += (1 * 4);
+            }
+#endif
+
+            for (BLASLONG k = (K / 4); --k; ) {
+                B00 = __riscv_vle32_v_f32m4(B, N * 4);
+                B0 = __riscv_vget_v_f32m4_f32m1(B00, 0);
+                B1 = __riscv_vget_v_f32m4_f32m1(B00, 1);
+                B2 = __riscv_vget_v_f32m4_f32m1(B00, 2);
+                B3 = __riscv_vget_v_f32m4_f32m1(B00, 3);
+                B += (N * 4);
+
+#ifdef GEMM_NEW_PACKING
+                if (M & 2) {
+                    resultC = __riscv_vfmacc_vf_f32m1(resultC, A0[0 + (M & 0x4) + (M * 0)], B0, N);
+                    resultD = __riscv_vfmacc_vf_f32m1(resultD, A0[1 + (M & 0x4) + (M * 0)], B0, N);
+                    result0 = __riscv_vfmacc_vf_f32m1(result0, A0[0 + (M & 0x4) + (M * 1)], B1, N);
+                    result1 = __riscv_vfmacc_vf_f32m1(result1, A0[1 + (M & 0x4) + (M * 1)], B1, N);
+                    result4 = __riscv_vfmacc_vf_f32m1(result4, A0[0 + (M & 0x4) + (M * 2)], B2, N);
+                    result5 = __riscv_vfmacc_vf_f32m1(result5, A0[1 + (M & 0x4) + (M * 2)], B2, N);
+                    result8 = __riscv_vfmacc_vf_f32m1(result8, A0[0 + (M & 0x4) + (M * 3)], B3, N);
+                    result9 = __riscv_vfmacc_vf_f32m1(result9, A0[1 + (M & 0x4) + (M * 3)], B3, N);
+                }
+                if (M & 1) {
+                    resultE = __riscv_vfmacc_vf_f32m1(resultE, A0[0 + (M & 0x6) + (M * 0)], B0, N);
+                    result2 = __riscv_vfmacc_vf_f32m1(result2, A0[0 + (M & 0x6) + (M * 1)], B1, N);
+                    result6 = __riscv_vfmacc_vf_f32m1(result6, A0[0 + (M & 0x6) + (M * 2)], B2, N);
+                    resultA = __riscv_vfmacc_vf_f32m1(resultA, A0[0 + (M & 0x6) + (M * 3)], B3, N);
+                }
+                A0 += (M * 4);
+#else
+                if (M & 2) {
+                    resultC = __riscv_vfmacc_vf_f32m1(resultC, A2[0 + (2 * 0)], B0, N);
+                    resultD = __riscv_vfmacc_vf_f32m1(resultD, A2[1 + (2 * 0)], B0, N);
+                    result0 = __riscv_vfmacc_vf_f32m1(result0, A2[0 + (2 * 1)], B1, N);
+                    result1 = __riscv_vfmacc_vf_f32m1(result1, A2[1 + (2 * 1)], B1, N);
+                    result4 = __riscv_vfmacc_vf_f32m1(result4, A2[0 + (2 * 2)], B2, N);
+                    result5 = __riscv_vfmacc_vf_f32m1(result5, A2[1 + (2 * 2)], B2, N);
+                    result8 = __riscv_vfmacc_vf_f32m1(result8, A2[0 + (2 * 3)], B3, N);
+                    result9 = __riscv_vfmacc_vf_f32m1(result9, A2[1 + (2 * 3)], B3, N);
+                    A2 += (2 * 4);
+                }
+                if (M & 1) {
+                    resultE = __riscv_vfmacc_vf_f32m1(resultE, A3[0 + (1 * 0)], B0, N);
+                    result2 = __riscv_vfmacc_vf_f32m1(result2, A3[0 + (1 * 1)], B1, N);
+                    result6 = __riscv_vfmacc_vf_f32m1(result6, A3[0 + (1 * 2)], B2, N);
+                    resultA = __riscv_vfmacc_vf_f32m1(resultA, A3[0 + (1 * 3)], B3, N);
+                    A3 += (1 * 4);
+                }
+#endif
+            }
+
+            if (M & 2) {
+                resultC = __riscv_vfadd_vv_f32m1(resultC, result0, N);
+                resultD = __riscv_vfadd_vv_f32m1(resultD, result1, N);
+                result4 = __riscv_vfadd_vv_f32m1(result4, result8, N);
+                result5 = __riscv_vfadd_vv_f32m1(result5, result9, N);
+                resultC = __riscv_vfadd_vv_f32m1(resultC, result4, N);
+                resultD = __riscv_vfadd_vv_f32m1(resultD, result5, N);
+            }
+            if (M & 1) {
+                resultE = __riscv_vfadd_vv_f32m1(resultE, result2, N);
+                result6 = __riscv_vfadd_vv_f32m1(result6, resultA, N);
+                resultE = __riscv_vfadd_vv_f32m1(resultE, result6, N);
+            }
+
+            K &= 3;
+        } else {
+            if (M & 2) {
+                resultC = __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vmv_v_x_u32m1(0, N));
+                resultD = __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vmv_v_x_u32m1(0, N));
+            }
+            if (M & 1) {
+                resultE = __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vmv_v_x_u32m1(0, N));
+            }
+        }
+    } else if (M <= 8) {
+        if (K >= 2) {
+            vfloat32m2_t B00, A00;
+            vfloat32m1_t A5;
+            vfloat32m1_t resultF;
+
+            if (!S2) {
+                B00 = __riscv_vle32_v_f32m2(B, N * 2);
+                B0 = __riscv_vget_v_f32m2_f32m1(B00, 0);
+                B1 = __riscv_vget_v_f32m2_f32m1(B00, 1);
+            }
+
+            if (M == 8) {
+                if (S2) {
+                    A00 = __riscv_vle32_v_f32m2(A0, N * 2);
+                    A4 = __riscv_vget_v_f32m2_f32m1(A00, 0);
+                    A5 = __riscv_vget_v_f32m2_f32m1(A00, 1);
+
+                    result0 = __riscv_vfmul_vf_f32m1(A4, B[0], N);
+                    result1 = __riscv_vfmul_vf_f32m1(A4, B[1], N);
+                    result2 = __riscv_vfmul_vf_f32m1(A4, B[2], N);
+                    result3 = __riscv_vfmul_vf_f32m1(A4, B[3], N);
+                    result4 = __riscv_vfmul_vf_f32m1(A4, B[4], N);
+                    result5 = __riscv_vfmul_vf_f32m1(A4, B[5], N);
+                    result6 = __riscv_vfmul_vf_f32m1(A4, B[6], N);
+                    result7 = __riscv_vfmul_vf_f32m1(A4, B[7], N);
+                    result8 = __riscv_vfmul_vf_f32m1(A5, B[8], N);
+                    result9 = __riscv_vfmul_vf_f32m1(A5, B[9], N);
+                    resultA = __riscv_vfmul_vf_f32m1(A5, B[10], N);
+                    resultB = __riscv_vfmul_vf_f32m1(A5, B[11], N);
+                    resultC = __riscv_vfmul_vf_f32m1(A5, B[12], N);
+                    resultD = __riscv_vfmul_vf_f32m1(A5, B[13], N);
+                    resultE = __riscv_vfmul_vf_f32m1(A5, B[14], N);
+                    resultF = __riscv_vfmul_vf_f32m1(A5, B[15], N);
+                } else {
+                    result0 = __riscv_vfmul_vf_f32m1(B0, A0[0], N);
+                    result1 = __riscv_vfmul_vf_f32m1(B0, A0[1], N);
+                    result2 = __riscv_vfmul_vf_f32m1(B0, A0[2], N);
+                    result3 = __riscv_vfmul_vf_f32m1(B0, A0[3], N);
+                    result4 = __riscv_vfmul_vf_f32m1(B0, A0[4], N);
+                    result5 = __riscv_vfmul_vf_f32m1(B0, A0[5], N);
+                    result6 = __riscv_vfmul_vf_f32m1(B0, A0[6], N);
+                    result7 = __riscv_vfmul_vf_f32m1(B0, A0[7], N);
+                    result8 = __riscv_vfmul_vf_f32m1(B1, A0[8], N);
+                    result9 = __riscv_vfmul_vf_f32m1(B1, A0[9], N);
+                    resultA = __riscv_vfmul_vf_f32m1(B1, A0[10], N);
+                    resultB = __riscv_vfmul_vf_f32m1(B1, A0[11], N);
+                    resultC = __riscv_vfmul_vf_f32m1(B1, A0[12], N);
+                    resultD = __riscv_vfmul_vf_f32m1(B1, A0[13], N);
+                    resultE = __riscv_vfmul_vf_f32m1(B1, A0[14], N);
+                    resultF = __riscv_vfmul_vf_f32m1(B1, A0[15], N);
+                }
+            }
+#ifdef GEMM_NEW_PACKING
+            if (M & 4) {
+                result8 = __riscv_vfmul_vf_f32m1(B0, A0[0 + (M * 0)], N);
+                result9 = __riscv_vfmul_vf_f32m1(B0, A0[1 + (M * 0)], N);
+                resultA = __riscv_vfmul_vf_f32m1(B0, A0[2 + (M * 0)], N);
+                resultB = __riscv_vfmul_vf_f32m1(B0, A0[3 + (M * 0)], N);
+                result0 = __riscv_vfmul_vf_f32m1(B1, A0[0 + (M * 1)], N);
+                result1 = __riscv_vfmul_vf_f32m1(B1, A0[1 + (M * 1)], N);
+                result2 = __riscv_vfmul_vf_f32m1(B1, A0[2 + (M * 1)], N);
+                result3 = __riscv_vfmul_vf_f32m1(B1, A0[3 + (M * 1)], N);
+            }
+            if (M & 2) {
+                resultC = __riscv_vfmul_vf_f32m1(B0, A0[0 + (M & 0x4) + (M * 0)], N);
+                resultD = __riscv_vfmul_vf_f32m1(B0, A0[1 + (M & 0x4) + (M * 0)], N);
+                result4 = __riscv_vfmul_vf_f32m1(B1, A0[0 + (M & 0x4) + (M * 1)], N);
+                result5 = __riscv_vfmul_vf_f32m1(B1, A0[1 + (M & 0x4) + (M * 1)], N);
+            }
+            if (M & 1) {
+                resultE = __riscv_vfmul_vf_f32m1(B0, A0[0 + (M & 0x6) + (M * 0)], N);
+                result6 = __riscv_vfmul_vf_f32m1(B1, A0[0 + (M & 0x6) + (M * 1)], N);
+            }
+            A0 += (M * 2);
+#else
+            if (M & 4) {
+                result8 = __riscv_vfmul_vf_f32m1(B0, A1[0 + (4 * 0)], N);
+                result9 = __riscv_vfmul_vf_f32m1(B0, A1[1 + (4 * 0)], N);
+                resultA = __riscv_vfmul_vf_f32m1(B0, A1[2 + (4 * 0)], N);
+                resultB = __riscv_vfmul_vf_f32m1(B0, A1[3 + (4 * 0)], N);
+                result0 = __riscv_vfmul_vf_f32m1(B1, A1[0 + (4 * 1)], N);
+                result1 = __riscv_vfmul_vf_f32m1(B1, A1[1 + (4 * 1)], N);
+                result2 = __riscv_vfmul_vf_f32m1(B1, A1[2 + (4 * 1)], N);
+                result3 = __riscv_vfmul_vf_f32m1(B1, A1[3 + (4 * 1)], N);
+                A1 += (4 * 2);
+            }
+            if (M & 2) {
+                resultC = __riscv_vfmul_vf_f32m1(B0, A2[0 + (2 * 0)], N);
+                resultD = __riscv_vfmul_vf_f32m1(B0, A2[1 + (2 * 0)], N);
+                result4 = __riscv_vfmul_vf_f32m1(B1, A2[0 + (2 * 1)], N);
+                result5 = __riscv_vfmul_vf_f32m1(B1, A2[1 + (2 * 1)], N);
+                A2 += (2 * 2);
+            }
+            if (M & 1) {
+                resultE = __riscv_vfmul_vf_f32m1(B0, A3[0 + (1 * 0)], N);
+                result6 = __riscv_vfmul_vf_f32m1(B1, A3[0 + (1 * 1)], N);
+                A3 += (1 * 2);
+            }
+            if (M == 8) {
+                A0 += (N * 2);
+            }
+#endif
+            B += (N * 2);
+
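Note the role swap in the loop below: the S2 path loads A as vectors (A4/A5) and broadcasts scalars from B, while the !S2 path loads B as vectors (B0/B1) and broadcasts scalars from A. That appears to be the "swap A & B - transpose at end" strategy named by the GEMM_RIGHT_EDGE define: both orientations accumulate the same rank-1 updates, only the accumulator layout comes out transposed. A minimal scalar sketch of the shared update (illustrative only; helper and layout are assumptions):

    #include <stddef.h>

    /* Same update acc[i][j] += a[i] * b[j] either way; the kernel vectorizes
       along i in the S2 path and along j in the !S2 path. */
    static void rank1_update(float *acc, const float *a, const float *b,
                             size_t m, size_t n)
    {
        for (size_t i = 0; i < m; i++)
            for (size_t j = 0; j < n; j++)
                acc[i * n + j] += a[i] * b[j];
    }
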
+            for (BLASLONG k = (K / 2); --k; ) {
+                if (!S2) {
+                    B00 = __riscv_vle32_v_f32m2(B, N * 2);
+                    B0 = __riscv_vget_v_f32m2_f32m1(B00, 0);
+                    B1 = __riscv_vget_v_f32m2_f32m1(B00, 1);
+                }
+
+                if (M == 8) {
+                    if (S2) {
+                        A00 = __riscv_vle32_v_f32m2(A0, N * 2);
+                        A4 = __riscv_vget_v_f32m2_f32m1(A00, 0);
+                        A5 = __riscv_vget_v_f32m2_f32m1(A00, 1);
+
+                        result0 = __riscv_vfmacc_vf_f32m1(result0, B[0], A4, N);
+                        result1 = __riscv_vfmacc_vf_f32m1(result1, B[1], A4, N);
+                        result2 = __riscv_vfmacc_vf_f32m1(result2, B[2], A4, N);
+                        result3 = __riscv_vfmacc_vf_f32m1(result3, B[3], A4, N);
+                        result4 = __riscv_vfmacc_vf_f32m1(result4, B[4], A4, N);
+                        result5 = __riscv_vfmacc_vf_f32m1(result5, B[5], A4, N);
+                        result6 = __riscv_vfmacc_vf_f32m1(result6, B[6], A4, N);
+                        result7 = __riscv_vfmacc_vf_f32m1(result7, B[7], A4, N);
+                        result8 = __riscv_vfmacc_vf_f32m1(result8, B[8], A5, N);
+                        result9 = __riscv_vfmacc_vf_f32m1(result9, B[9], A5, N);
+                        resultA = __riscv_vfmacc_vf_f32m1(resultA, B[10], A5, N);
+                        resultB = __riscv_vfmacc_vf_f32m1(resultB, B[11], A5, N);
+                        resultC = __riscv_vfmacc_vf_f32m1(resultC, B[12], A5, N);
+                        resultD = __riscv_vfmacc_vf_f32m1(resultD, B[13], A5, N);
+                        resultE = __riscv_vfmacc_vf_f32m1(resultE, B[14], A5, N);
+                        resultF = __riscv_vfmacc_vf_f32m1(resultF, B[15], A5, N);
+                    } else {
+                        result0 = __riscv_vfmacc_vf_f32m1(result0, A0[0], B0, N);
+                        result1 = __riscv_vfmacc_vf_f32m1(result1, A0[1], B0, N);
+                        result2 = __riscv_vfmacc_vf_f32m1(result2, A0[2], B0, N);
+                        result3 = __riscv_vfmacc_vf_f32m1(result3, A0[3], B0, N);
+                        result4 = __riscv_vfmacc_vf_f32m1(result4, A0[4], B0, N);
+                        result5 = __riscv_vfmacc_vf_f32m1(result5, A0[5], B0, N);
+                        result6 = __riscv_vfmacc_vf_f32m1(result6, A0[6], B0, N);
+                        result7 = __riscv_vfmacc_vf_f32m1(result7, A0[7], B0, N);
+                        result8 = __riscv_vfmacc_vf_f32m1(result8, A0[8], B1, N);
+                        result9 = __riscv_vfmacc_vf_f32m1(result9, A0[9], B1, N);
+                        resultA = __riscv_vfmacc_vf_f32m1(resultA, A0[10], B1, N);
+                        resultB = __riscv_vfmacc_vf_f32m1(resultB, A0[11], B1, N);
+                        resultC = __riscv_vfmacc_vf_f32m1(resultC, A0[12], B1, N);
+                        resultD = __riscv_vfmacc_vf_f32m1(resultD, A0[13], B1, N);
+                        resultE = __riscv_vfmacc_vf_f32m1(resultE, A0[14], B1, N);
+                        resultF = __riscv_vfmacc_vf_f32m1(resultF, A0[15], B1, N);
+                    }
+                }
+#ifdef GEMM_NEW_PACKING
+                if (M & 4) {
+                    result8 = __riscv_vfmacc_vf_f32m1(result8, A0[0 + (M * 0)], B0, N);
+                    result9 = __riscv_vfmacc_vf_f32m1(result9, A0[1 + (M * 0)], B0, N);
+                    resultA = __riscv_vfmacc_vf_f32m1(resultA, A0[2 + (M * 0)], B0, N);
+                    resultB = __riscv_vfmacc_vf_f32m1(resultB, A0[3 + (M * 0)], B0, N);
+                    result0 = __riscv_vfmacc_vf_f32m1(result0, A0[0 + (M * 1)], B1, N);
+                    result1 = __riscv_vfmacc_vf_f32m1(result1, A0[1 + (M * 1)], B1, N);
+                    result2 = __riscv_vfmacc_vf_f32m1(result2, A0[2 + (M * 1)], B1, N);
+                    result3 = __riscv_vfmacc_vf_f32m1(result3, A0[3 + (M * 1)], B1, N);
+                }
+                if (M & 2) {
+                    resultC = __riscv_vfmacc_vf_f32m1(resultC, A0[0 + (M & 0x4) + (M * 0)], B0, N);
+                    resultD = __riscv_vfmacc_vf_f32m1(resultD, A0[1 + (M & 0x4) + (M * 0)], B0, N);
+                    result4 = __riscv_vfmacc_vf_f32m1(result4, A0[0 + (M & 0x4) + (M * 1)], B1, N);
+                    result5 = __riscv_vfmacc_vf_f32m1(result5, A0[1 + (M & 0x4) + (M * 1)], B1, N);
+                }
+                if (M & 1) {
+                    resultE = __riscv_vfmacc_vf_f32m1(resultE, A0[0 + (M & 0x6) + (M * 0)], B0, N);
+                    result6 = __riscv_vfmacc_vf_f32m1(result6, A0[0 + (M & 0x6) + (M * 1)], B1, N);
+                }
+                A0 += (M * 2);
+#else
+                if (M & 4) {
+                    result8 = __riscv_vfmacc_vf_f32m1(result8, A1[0 + (4 * 0)], B0, N);
+                    result9 = __riscv_vfmacc_vf_f32m1(result9, A1[1 + (4 * 0)], B0, N);
+                    resultA = __riscv_vfmacc_vf_f32m1(resultA, A1[2 + (4 * 0)], B0, N);
+                    resultB = __riscv_vfmacc_vf_f32m1(resultB, A1[3 + (4 * 0)], B0, N);
+                    result0 = __riscv_vfmacc_vf_f32m1(result0, A1[0 + (4 * 1)], B1, N);
+                    result1 = __riscv_vfmacc_vf_f32m1(result1, A1[1 + (4 * 1)], B1, N);
+                    result2 = __riscv_vfmacc_vf_f32m1(result2, A1[2 + (4 * 1)], B1, N);
+                    result3 = __riscv_vfmacc_vf_f32m1(result3, A1[3 + (4 * 1)], B1, N);
+                    A1 += (4 * 2);
+                }
+                if (M & 2) {
+                    resultC = __riscv_vfmacc_vf_f32m1(resultC, A2[0 + (2 * 0)], B0, N);
+                    resultD = __riscv_vfmacc_vf_f32m1(resultD, A2[1 + (2 * 0)], B0, N);
+                    result4 = __riscv_vfmacc_vf_f32m1(result4, A2[0 + (2 * 1)], B1, N);
+                    result5 = __riscv_vfmacc_vf_f32m1(result5, A2[1 + (2 * 1)], B1, N);
+                    A2 += (2 * 2);
+                }
+                if (M & 1) {
+                    resultE = __riscv_vfmacc_vf_f32m1(resultE, A3[0 + (1 * 0)], B0, N);
+                    result6 = __riscv_vfmacc_vf_f32m1(result6, A3[0 + (1 * 1)], B1, N);
+                    A3 += (1 * 2);
+                }
+                if (M == 8) {
+                    A0 += (N * 2);
+                }
+#endif
+                B += (N * 2);
+            }
+
+            if (M == 8) {
+                result0 = __riscv_vfadd_vv_f32m1(result0, result8, N);
+                result1 = __riscv_vfadd_vv_f32m1(result1, result9, N);
+                result2 = __riscv_vfadd_vv_f32m1(result2, resultA, N);
+                result3 = __riscv_vfadd_vv_f32m1(result3, resultB, N);
+                result4 = __riscv_vfadd_vv_f32m1(result4, resultC, N);
+                result5 = __riscv_vfadd_vv_f32m1(result5, resultD, N);
+                result6 = __riscv_vfadd_vv_f32m1(result6, resultE, N);
+                result7 = __riscv_vfadd_vv_f32m1(result7, resultF, N);
+            }
+            if (M & 4) {
+                result8 = __riscv_vfadd_vv_f32m1(result8, result0, N);
+                result9 = __riscv_vfadd_vv_f32m1(result9, result1, N);
+                resultA = __riscv_vfadd_vv_f32m1(resultA, result2, N);
+                resultB = __riscv_vfadd_vv_f32m1(resultB, result3, N);
+            }
+            if (M & 2) {
+                resultC = __riscv_vfadd_vv_f32m1(resultC, result4, N);
+                resultD = __riscv_vfadd_vv_f32m1(resultD, result5, N);
+            }
+            if (M & 1) {
+                resultE = __riscv_vfadd_vv_f32m1(resultE, result6, N);
+            }
+
+            K &= 1;
+        } else {
+            if (M == 8) {
+                result0 = __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vmv_v_x_u32m1(0, N));
+                result1 = __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vmv_v_x_u32m1(0, N));
+                result2 = __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vmv_v_x_u32m1(0, N));
+                result3 = __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vmv_v_x_u32m1(0, N));
+                result4 = __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vmv_v_x_u32m1(0, N));
+                result5 = __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vmv_v_x_u32m1(0, N));
+                result6 = __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vmv_v_x_u32m1(0, N));
+                result7 = __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vmv_v_x_u32m1(0, N));
+            }
+            if (M & 4) {
+                result8 = __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vmv_v_x_u32m1(0, N));
+                result9 = __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vmv_v_x_u32m1(0, N));
+                resultA = __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vmv_v_x_u32m1(0, N));
+                resultB = __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vmv_v_x_u32m1(0, N));
+            }
+            if (M & 2) {
+                resultC = __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vmv_v_x_u32m1(0, N));
+                resultD = __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vmv_v_x_u32m1(0, N));
+            }
+            if (M & 1) {
+                resultE = __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vmv_v_x_u32m1(0, N));
+            }
+        }
+    } else
+#endif
+    {
+        if (!S2) {
+            B0 = __riscv_vle32_v_f32m1(B, N);
+        }
-      for(BLASLONG k=1; k<K; k++) {
+        if (N & 4) {
+            if (!S2) {
+                resultF = __riscv_vle32_v_f32mf2(B, 4);
+            }
-
-      BLASLONG ci=n_top*ldc+m_top;
-
-      vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
-      vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
-      vfloat32m1_t c2 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
-      vfloat32m1_t c3 = __riscv_vle32_v_f32m1( &C[ci], gvl);
-      c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
-      c1 = __riscv_vfmacc_vf_f32m1( c1, alpha, result1, gvl );
-      c2 = __riscv_vfmacc_vf_f32m1( c2, alpha, result2, gvl );
-      c3 = __riscv_vfmacc_vf_f32m1( c3, alpha, result3, gvl );
-
-      ci=n_top*ldc+m_top;
-
-      __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
-      __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
-      __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
-      __riscv_vse32_v_f32m1( &C[ci], c3, gvl);
-      m_top += 8;
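The removed block above shows the writeback contract the new paths keep as well: load a strip of C, fuse in alpha times the accumulated A*B partial with vfmacc, and store it back. As a scalar reference (illustrative only, not part of the patch; the helper name is invented):

    #include <stddef.h>

    /* One strip of C: C[i] := C[i] + alpha * acc[i] */
    static void gemm_writeback_ref(float *c, float alpha, const float *acc, size_t vl)
    {
        for (size_t i = 0; i < vl; i++)
            c[i] += alpha * acc[i];
    }
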
+        if (M & 8) {
+            if (S2) {
+                result08 = __riscv_vfmul_vf_f32m1(result03, B[0], 8);
+                result09 = __riscv_vfmul_vf_f32m1(result03, B[1], 8);
+                result0A = __riscv_vfmul_vf_f32m1(result03, B[2], 8);
+                result0B = __riscv_vfmul_vf_f32m1(result03, B[3], 8);
+            } else {
+                result0 = __riscv_vfmul_vf_f32mf2(resultF, A0[0], 4);
+                result1 = __riscv_vfmul_vf_f32mf2(resultF, A0[1], 4);
+                result2 = __riscv_vfmul_vf_f32mf2(resultF, A0[2], 4);
+                result3 = __riscv_vfmul_vf_f32mf2(resultF, A0[3], 4);
+                result4 = __riscv_vfmul_vf_f32mf2(resultF, A0[4], 4);
+                result5 = __riscv_vfmul_vf_f32mf2(resultF, A0[5], 4);
+                result6 = __riscv_vfmul_vf_f32mf2(resultF, A0[6], 4);
+                result7 = __riscv_vfmul_vf_f32mf2(resultF, A0[7], 4);
+            }
+        }
+#ifdef GEMM_NEW_PACKING
+        if (M & 4) {
+            result8 = __riscv_vfmul_vf_f32mf2(resultF, A0[0 + (M & 0x8)], 4);
+            result9 = __riscv_vfmul_vf_f32mf2(resultF, A0[1 + (M & 0x8)], 4);
+            resultA = __riscv_vfmul_vf_f32mf2(resultF, A0[2 + (M & 0x8)], 4);
+            resultB = __riscv_vfmul_vf_f32mf2(resultF, A0[3 + (M & 0x8)], 4);
+        }
+        if (M & 2) {
+            resultC = __riscv_vfmul_vf_f32mf2(resultF, A0[0 + (M & 0xC)], 4);
+            resultD = __riscv_vfmul_vf_f32mf2(resultF, A0[1 + (M & 0xC)], 4);
+        }
+        if (M & 1) {
+            resultE = __riscv_vfmul_vf_f32mf2(resultF, A0[0 + (M & 0xE)], 4);
+        }
+#else
+        if (M & 4) {
+            result8 = __riscv_vfmul_vf_f32mf2(resultF, A1[0], 4);
+            result9 = __riscv_vfmul_vf_f32mf2(resultF, A1[1], 4);
+            resultA = __riscv_vfmul_vf_f32mf2(resultF, A1[2], 4);
+            resultB = __riscv_vfmul_vf_f32mf2(resultF, A1[3], 4);
+        }
+        if (M & 2) {
+            resultC = __riscv_vfmul_vf_f32mf2(resultF, A2[0], 4);
+            resultD = __riscv_vfmul_vf_f32mf2(resultF, A2[1], 4);
+        }
+        if (M & 1) {
+            resultE = __riscv_vfmul_vf_f32mf2(resultF, A3[0], 4);
+        }
+#endif
+    }
-
-   if( M & 4 ) {
-      gvl = __riscv_vsetvl_e32m1(4);
-
-      BLASLONG ai=m_top*K;
-      BLASLONG bi=n_top*K;
-      float B0 = B[bi+0];
-      float B1 = B[bi+1];
-      float B2 = B[bi+2];
-      float B3 = B[bi+3];
-      bi += 4;
-
-      vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
-      ai += 4;
-
-      vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
-      vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
-      vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
-      vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
-
-      for(BLASLONG k=1; k<K; k++) {
+#ifdef GEMM_NEW_PACKING
+        if (N & 2) {
+            B4 = B00[0 + (N & 4)];
+            B5 = B00[1 + (N & 4)];
+        }
+        if (N & 1) {
+            B6 = B00[0 + (N & 6)];
+        }
+        B00 += N;
+#else
+        if (N & 2) {
+            B4 = B01[0];
+            B5 = B01[1];
+            B01 += 2;
+        }
+        if (N & 1) {
+            B6 = B02[0];
+            B02 += 1;
+        }
+#endif
+
+        A00 = __riscv_vle32_v_f32m2(*A, 8 * 2);
+        A0 = __riscv_vget_v_f32m2_f32m1(A00, 0);
+        A1 = __riscv_vget_v_f32m2_f32m1(A00, 1);
+        *A += 16;
+
+        if (N & 4) {
+            result0 = __riscv_vfmul_vf_f32m1(A0, B0, 8);
+            result1 = __riscv_vfmul_vf_f32m1(A1, B0, 8);
+            result2 = __riscv_vfmul_vf_f32m1(A0, B1, 8);
+            result3 = __riscv_vfmul_vf_f32m1(A1, B1, 8);
+            result4 = __riscv_vfmul_vf_f32m1(A0, B2, 8);
+            result5 = __riscv_vfmul_vf_f32m1(A1, B2, 8);
+            result6 = __riscv_vfmul_vf_f32m1(A0, B3, 8);
+            result7 = __riscv_vfmul_vf_f32m1(A1, B3, 8);
+        }
+        if (N & 2) {
+            result8 = __riscv_vfmul_vf_f32m1(A0, B4, 8);
+            result9 = __riscv_vfmul_vf_f32m1(A1, B4, 8);
+            resultA = __riscv_vfmul_vf_f32m1(A0, B5, 8);
+            resultB = __riscv_vfmul_vf_f32m1(A1, B5, 8);
+        }
+        if (N & 1) {
+            resultC = __riscv_vfmul_vf_f32m1(A0, B6, 8);
+            resultD = __riscv_vfmul_vf_f32m1(A1, B6, 8);
+        }
-
-      BLASLONG ci=n_top*ldc+m_top;
-
-      vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
-      vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl);
-      c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
-      c1 = __riscv_vfmacc_vf_f32m1( c1, alpha, result1, gvl );
-
-      ci=n_top*ldc+m_top;
-
-      __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += gvl;
-      __riscv_vse32_v_f32m1( &C[ci], c1, gvl);
-      m_top += 16;
-   }
-
-   if( M & 8 ) {
-      gvl = __riscv_vsetvl_e32m1(8);
-
-      BLASLONG ai=m_top*K;
-      BLASLONG bi=n_top*K;
-      float B0 = B[bi+0];
-      bi += 1;
-
-      vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
-      ai += 8;
-
-      vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
-
-      for(BLASLONG k=1; k
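On the "causes epsilon differences" remark attached to GEMM_RIGHT_CHUNK and GEMM_BOTTOM_CHUNK: breaking K into chunks keeps several independent partial sums that are only folded together by the vfadd trees at the end, which reorders the floating-point additions; float addition is not associative, so results can differ from the serial order by a few ulps. A self-contained illustration (not part of the patch):

    #include <stdio.h>

    int main(void)
    {
        float x[4] = {1e8f, 1.0f, -1e8f, 1.0f};

        float serial = 0.0f;                  /* one running sum over k */
        for (int k = 0; k < 4; k++)
            serial += x[k];                   /* 1e8f absorbs the 1.0f: result 1 */

        float even = x[0] + x[2];             /* two chunked partial sums,  */
        float odd  = x[1] + x[3];             /* reduced once at the end    */
        printf("%g vs %g\n", serial, even + odd);   /* prints "1 vs 2" */
        return 0;
    }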