The implementation is copied directly from ARM SVE. I have applied it in my downstream GCC for a year and there has been no issue so far.

Ok for trunk?

juzhe.zhong@rivai.ai

From: juzhe.zhong
Date: 2023-05-13 19:44
To: gcc-patches
CC: kito.cheng; palmer; rdapp.gcc; jeffreyalaw; Juzhe-Zhong
Subject: [PATCH] RISC-V: Support TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT to optimize codegen of RVV auto-vectorization

From: Juzhe-Zhong

This patch supports vector alignment for RVV auto-vectorization.

Consider the following case:

void __attribute__((noinline, noclone))
f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int count)
{
  for (int i = 0; i < count; ++i)
    dst[i] = op1[i] + op2[i];
}

Before this patch:

        ble     a3,zero,.L1
        srli    a4,a1,2
        negw    a4,a4
        andi    a5,a4,3
        sext.w  a3,a3
        beq     a5,zero,.L3
        lw      a7,0(a1)
        lw      a6,0(a2)
        andi    a4,a4,2
        addw    a6,a6,a7
        sw      a6,0(a0)
        beq     a4,zero,.L3
        lw      a7,4(a1)
        lw      a4,4(a2)
        li      a6,3
        addw    a4,a4,a7
        sw      a4,4(a0)
        bne     a5,a6,.L3
        lw      a6,8(a2)
        lw      a4,8(a1)
        addw    a4,a4,a6
        sw      a4,8(a0)
.L3:
        subw    a3,a3,a5
        slli    a4,a3,32
        csrr    a6,vlenb
        srli    a4,a4,32
        srli    a6,a6,2
        slli    a3,a5,2
        mv      a5,a4
        bgtu    a4,a6,.L17
.L5:
        csrr    a6,vlenb
        add     a1,a1,a3
        add     a2,a2,a3
        add     a0,a0,a3
        srli    a7,a6,2
        li      a3,0
.L8:
        vsetvli zero,a5,e32,m1,ta,ma
        vle32.v v1,0(a1)
        vle32.v v2,0(a2)
        vsetvli t1,zero,e32,m1,ta,ma
        add     a3,a3,a7
        vadd.vv v1,v1,v2
        vsetvli zero,a5,e32,m1,ta,ma
        vse32.v v1,0(a0)
        mv      a5,a4
        bleu    a4,a3,.L6
        mv      a5,a3
.L6:
        sub     a5,a4,a5
        bleu    a5,a7,.L7
        mv      a5,a7
.L7:
        add     a1,a1,a6
        add     a2,a2,a6
        add     a0,a0,a6
        bne     a5,zero,.L8
.L1:
        ret
.L17:
        mv      a5,a6
        j       .L5

After this patch:

f:
        ble     a3,zero,.L1
        csrr    a4,vlenb
        srli    a4,a4,2
        mv      a5,a3
        bgtu    a3,a4,.L9
.L3:
        csrr    a6,vlenb
        li      a4,0
        srli    a7,a6,2
.L6:
        vsetvli zero,a5,e32,m1,ta,ma
        vle32.v v2,0(a1)
        vle32.v v1,0(a2)
        vsetvli t1,zero,e32,m1,ta,ma
        add     a4,a4,a7
        vadd.vv v1,v1,v2
        vsetvli zero,a5,e32,m1,ta,ma
        vse32.v v1,0(a0)
        mv      a5,a3
        bleu    a3,a4,.L4
        mv      a5,a4
.L4:
        sub     a5,a3,a5
        bleu    a5,a7,.L5
        mv      a5,a7
.L5:
        add     a0,a0,a6
        add     a2,a2,a6
        add     a1,a1,a6
        bne     a5,zero,.L6
.L1:
        ret
.L9:
        mv      a5,a4
        j       .L3

gcc/ChangeLog:

        * config/riscv/riscv.cc (riscv_vectorize_preferred_vector_alignment): New function.
        (TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT): New target hook.

gcc/testsuite/ChangeLog:

        * gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c: Adapt testcase.
        * gcc.target/riscv/rvv/autovec/align-1.c: New test.

---
 gcc/config/riscv/riscv.cc                        | 22 +++++++++++++++++++
 .../gcc.target/riscv/rvv/autovec/align-1.c       | 12 ++++++++++
 .../riscv/rvv/autovec/binop/shift-rv32gcv.c      | 10 +++++----
 3 files changed, 40 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index de578b5b899..a5776a550b2 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -7499,6 +7499,24 @@ riscv_preferred_simd_mode (scalar_mode mode)
   return word_mode;
 }
 
+/* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
+
+static poly_uint64
+riscv_vectorize_preferred_vector_alignment (const_tree type)
+{
+  if (riscv_v_ext_vector_mode_p (TYPE_MODE (type)))
+    {
+      /* If the length of the vector is a fixed power of 2, try to align
+	 to that length, otherwise don't try to align at all.  */
+      HOST_WIDE_INT result;
+      if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
+	  || !pow2p_hwi (result))
+	result = TYPE_ALIGN (TREE_TYPE (type));
+      return result;
+    }
+  return TYPE_ALIGN (type);
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_ASM_ALIGNED_HI_OP
 #define TARGET_ASM_ALIGNED_HI_OP "\t.half\t"
@@ -7771,6 +7789,10 @@ riscv_preferred_simd_mode (scalar_mode mode)
 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE riscv_preferred_simd_mode
 
+#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
+#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
+  riscv_vectorize_preferred_vector_alignment
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-riscv.h"
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
new file mode 100644
index 00000000000..14201e1f7e0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/align-1.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3 --param riscv-autovec-preference=scalable" } */
+
+void __attribute__((noinline, noclone))
+f (int * __restrict dst, int * __restrict op1, int * __restrict op2, int count)
+{
+  for (int i = 0; i < count; ++i)
+    dst[i] = op1[i] + op2[i];
+}
+
+/* { dg-final { scan-assembler-not "lw" } } */
+/* { dg-final { scan-assembler-not "sw" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
index da0f79a1cf0..d98100b3276 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/binop/shift-rv32gcv.c
@@ -4,8 +4,10 @@
 #include "shift-template.h"
 
 /* TODO: For int16_t and uint16_t we need widening/promotion patterns.
-   Therefore, expect only 4 vsll.vv instead of 6 for now.  */
+   We don't check the assembler number since lacking patterns make
+   auto-vectorization inconsistent in LMUL = 1/2/4/8.  */
+
+/* { dg-final { scan-assembler {\tvsll\.vv} } } */
+/* { dg-final { scan-assembler {\tvsrl\.vv} } } */
+/* { dg-final { scan-assembler {\tvsra\.vv} } } */
-/* { dg-final { scan-assembler-times {\tvsll\.vv} 4 } } */
-/* { dg-final { scan-assembler-times {\tvsrl\.vv} 3 } } */
-/* { dg-final { scan-assembler-times {\tvsra\.vv} 3 } } */
-- 
2.36.1
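
P.S. For readers unfamiliar with this target hook, below is a minimal standalone sketch of the decision riscv_vectorize_preferred_vector_alignment makes: if the vector type's bit size is a compile-time-constant power of two, prefer aligning to the full vector size; otherwise (e.g. a scalable VLA type) fall back to the element alignment, so the vectorizer does not peel a scalar prologue for alignment. This is only an illustration, not GCC code; the helper names and the plain-integer interface here are hypothetical, while the real hook works on trees and poly_uint64.

/* Illustrative sketch only (not part of the patch).  */

#include <stdbool.h>
#include <stdio.h>

/* True iff X is a power of two (models pow2p_hwi).  */
static bool
pow2_p (long x)
{
  return x > 0 && (x & (x - 1)) == 0;
}

/* BITSIZE_KNOWN is false when the vector bit size is not a compile-time
   constant (e.g. a scalable RVV type).  VECTOR_BITS is the bit size when
   known; ELEM_ALIGN_BITS is the element alignment.  Returns the preferred
   alignment in bits.  */
static long
preferred_vector_alignment (bool bitsize_known, long vector_bits,
                            long elem_align_bits)
{
  if (bitsize_known && pow2_p (vector_bits))
    return vector_bits;        /* Fixed power-of-two size: align to it.  */
  return elem_align_bits;      /* Otherwise only require element alignment.  */
}

int
main (void)
{
  /* Fixed 128-bit vector of 32-bit ints -> prefer 128-bit alignment.  */
  printf ("%ld\n", preferred_vector_alignment (true, 128, 32));
  /* Scalable vector (size unknown at compile time) -> 32-bit alignment.  */
  printf ("%ld\n", preferred_vector_alignment (false, 0, 32));
  return 0;
}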