Committed, thanks Kito. Pan From: Kito Cheng Sent: Monday, October 23, 2023 5:50 PM To: Juzhe-Zhong Cc: GCC Patches ; Kito Cheng ; Jeff Law ; Robin Dapp Subject: Re: [PATCH V2] RISC-V: Fix ICE for the fusion case from vsetvl to scalar move[PR111927] LGTM Juzhe-Zhong > 於 2023年10月23日 週一 17:41 寫道: ICE: during RTL pass: vsetvl : In function 'riscv_lms_f32': :240:1: internal compiler error: in merge, at config/riscv/riscv-vsetvl.cc:1997 240 | } In general compatible_p (avl_equal_p) has: if (next.has_vl () && next.vl_used_by_non_rvv_insn_p ()) return false; Don't fuse AVL of vsetvl if the VL operand is used by non-RVV instructions. It is reasonable to add it into 'can_use_next_avl_p' since we don't want to fuse AVL of vsetvl into a scalar move instruction which doesn't demand AVL. And after the fusion, we will always use compatible_p to check whether the demand is correct or not. PR target/111927 gcc/ChangeLog: * config/riscv/riscv-vsetvl.cc: Fix bug. gcc/testsuite/ChangeLog: * gcc.target/riscv/rvv/vsetvl/pr111927.c: New test. --- gcc/config/riscv/riscv-vsetvl.cc | 23 +++ .../gcc.target/riscv/rvv/vsetvl/pr111927.c | 170 ++++++++++++++++++ 2 files changed, 193 insertions(+) create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c diff --git a/gcc/config/riscv/riscv-vsetvl.cc b/gcc/config/riscv/riscv-vsetvl.cc index 47b459fddd4..f3922a051c5 100644 --- a/gcc/config/riscv/riscv-vsetvl.cc +++ b/gcc/config/riscv/riscv-vsetvl.cc @@ -1541,6 +1541,29 @@ private: inline bool can_use_next_avl_p (const vsetvl_info &prev, const vsetvl_info &next) { + /* Forbid the AVL/VL propagation if VL of NEXT is used + by non-RVV instructions. This is because: + + bb 2: + PREV: scalar move (no AVL) + bb 3: + NEXT: vsetvl a5(VL), a4(AVL) ... 
+ branch a5,zero + + Since user vsetvl instruction is no side effect instruction + which should be placed in the correct and optimal location + of the program by the previous PASS, it is unreasonable that + VSETVL PASS tries to move it to another place if it is used by + non-RVV instructions. + + Note: We only forbid the cases that VL is used by the following + non-RVV instructions which will cause issues. We don't forbid + other cases since it won't cause correctness issues and we still + want more demand info to be fused backward. The later LCM algorithm + should know the optimal location of the vsetvl. */ + if (next.has_vl () && next.vl_used_by_non_rvv_insn_p ()) + return false; + if (!next.has_nonvlmax_reg_avl () && !next.has_vl ()) return true; diff --git a/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c new file mode 100644 index 00000000000..ab599add57f --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/rvv/vsetvl/pr111927.c @@ -0,0 +1,170 @@ +/* { dg-do compile } */ +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3" } */ + +#include "riscv_vector.h" + +#define RISCV_MATH_LOOPUNROLL +#define RISCV_MATH_VECTOR +typedef float float32_t; + + typedef struct + { + uint16_t numTaps; /**< number of coefficients in the filter. */ + float32_t *pState; /**< points to the state variable array. The array is of length numTaps+blockSize-1. */ + float32_t *pCoeffs; /**< points to the coefficient array. The array is of length numTaps. */ + float32_t mu; /**< step size that controls filter coefficient updates. 
*/ + } riscv_lms_instance_f32; + + +void riscv_lms_f32( + const riscv_lms_instance_f32 * S, + const float32_t * pSrc, + float32_t * pRef, + float32_t * pOut, + float32_t * pErr, + uint32_t blockSize) +{ + float32_t *pState = S->pState; /* State pointer */ + float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ + float32_t *pStateCurnt; /* Points to the current sample of the state */ + float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */ + float32_t mu = S->mu; /* Adaptive factor */ + float32_t acc, e; /* Accumulator, error */ + float32_t w; /* Weight factor */ + uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */ + uint32_t tapCnt, blkCnt; /* Loop counters */ + + /* Initializations of error, difference, Coefficient update */ + e = 0.0f; + w = 0.0f; + + /* S->pState points to state array which contains previous frame (numTaps - 1) samples */ + /* pStateCurnt points to the location where the new input data should be written */ + pStateCurnt = &(S->pState[(numTaps - 1U)]); + + /* initialise loop count */ + blkCnt = blockSize; + + while (blkCnt > 0U) + { + /* Copy the new input sample into the state buffer */ + *pStateCurnt++ = *pSrc++; + + /* Initialize pState pointer */ + px = pState; + + /* Initialize coefficient pointer */ + pb = pCoeffs; + + /* Set the accumulator to zero */ + acc = 0.0f; + uint32_t vblkCnt = numTaps; /* Loop counter */ + size_t l; + vfloat32m8_t vx, vy; + vfloat32m1_t temp00m1; + l = __riscv_vsetvl_e32m1(1); + temp00m1 = __riscv_vfmv_v_f_f32m1(0, l); + for (; (l = __riscv_vsetvl_e32m8(vblkCnt)) > 0; vblkCnt -= l) { + vx = __riscv_vle32_v_f32m8(px, l); + px += l; + vy = __riscv_vle32_v_f32m8(pb, l); + pb += l; + temp00m1 = __riscv_vfredusum_vs_f32m8_f32m1(__riscv_vfmul_vv_f32m8(vx, vy, l), temp00m1, l); + } + acc += __riscv_vfmv_f_s_f32m1_f32(temp00m1); + + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + acc += (*px++) * (*pb++); + + /* Decrement the loop counter */ + 
tapCnt--; + } + /* Store the result from accumulator into the destination buffer. */ + *pOut++ = acc; + + /* Compute and store error */ + e = (float32_t) *pRef++ - acc; + *pErr++ = e; + + /* Calculation of Weighting factor for updating filter coefficients */ + w = e * mu; + + /* Initialize pState pointer */ + /* Advance state pointer by 1 for the next sample */ + px = pState++; + + /* Initialize coefficient pointer */ + pb = pCoeffs; + + vblkCnt = numTaps; + for (; (l = __riscv_vsetvl_e32m8(vblkCnt)) > 0; vblkCnt -= l) { + vx = __riscv_vle32_v_f32m8(px, l); + px += l; + __riscv_vse32_v_f32m8(pb, __riscv_vfadd_vv_f32m8(__riscv_vfmul_vf_f32m8(vx, w, l), __riscv_vle32_v_f32m8(pb, l), l) , l); + pb += l; + } + while (tapCnt > 0U) + { + /* Perform the multiply-accumulate */ + *pb += w * (*px++); + pb++; + + /* Decrement loop counter */ + tapCnt--; + } + /* Decrement loop counter */ + blkCnt--; + } + + /* Processing is complete. + Now copy the last numTaps - 1 samples to the start of the state buffer. + This prepares the state buffer for the next function call. */ + + /* Points to the start of the pState buffer */ + pStateCurnt = S->pState; + + /* copy data */ + + uint32_t vblkCnt = (numTaps - 1U); /* Loop counter */ + size_t l; + for (; (l = __riscv_vsetvl_e32m8(vblkCnt)) > 0; vblkCnt -= l) { + __riscv_vse32_v_f32m8(pStateCurnt, __riscv_vle32_v_f32m8(pState, l) , l); + pState += l; + pStateCurnt += l; + } + + + /* Loop unrolling: Compute 4 taps at a time. */ + tapCnt = (numTaps - 1U) >> 2U; + + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + *pStateCurnt++ = *pState++; + + /* Decrement loop counter */ + tapCnt--; + } + + /* Loop unrolling: Compute remaining taps */ + tapCnt = (numTaps - 1U) & 0x3U; + + + + /* Initialize tapCnt with number of samples */ + tapCnt = (numTaps - 1U); + + + + while (tapCnt > 0U) + { + *pStateCurnt++ = *pState++; + + /* Decrement loop counter */ + tapCnt--; + } +} -- 2.36.3