public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH v1] RISC-V: Refine bswap16 auto vectorization code gen
@ 2023-10-09  8:51 pan2.li
  2023-10-09  9:04 ` juzhe.zhong
  2023-10-09 13:09 ` [PATCH v2] " pan2.li
  0 siblings, 2 replies; 6+ messages in thread
From: pan2.li @ 2023-10-09  8:51 UTC (permalink / raw)
  To: gcc-patches; +Cc: juzhe.zhong, pan2.li, yanzhang.wang, kito.cheng

From: Pan Li <pan2.li@intel.com>

This patch refines the code gen for bswap16.

We will have VEC_PERM_EXPR after rtl expand when invoking
__builtin_bswap. It will generate about 9 instructions in the
loop as below, no matter whether it is bswap16, bswap32 or bswap64.

  .L2:
1 vle16.v v4,0(a0)
2 vmv.v.x v2,a7
3 vand.vv v2,v6,v2
4 slli    a2,a5,1
5 vrgatherei16.vv v1,v4,v2
6 sub     a4,a4,a5
7 vse16.v v1,0(a3)
8 add     a0,a0,a2
9 add     a3,a3,a2
  bne     a4,zero,.L2

But for bswap16 we may have an even simpler code gen, which
has only 7 instructions in the loop as below.

  .L5:
1 vle8.v  v2,0(a5)
2 addi    a5,a5,32
3 vsrl.vi v4,v2,8
4 vsll.vi v2,v2,8
5 vor.vv  v4,v4,v2
6 vse8.v  v4,0(a4)
7 addi    a4,a4,32
  bne     a5,a6,.L5

Unfortunately, this approach makes the number of insns in the loop
grow to 13 and 24 for bswap32 and bswap64 respectively. Thus, we
refine the code gen for bswap16 only, and leave both bswap32 and
bswap64 as is.

gcc/ChangeLog:

	* config/riscv/riscv-v.cc (emit_vec_sll_scalar): New helper func
	impl for emitting vsll.vi/vsll.vx.
	(emit_vec_srl_scalar): Likewise for vsrl.vi/vsrl.vx.
	(emit_vec_or): Likewise for vor.vv.
	(shuffle_bswap_pattern): New func impl for shuffle bswap.
	(expand_vec_perm_const_1): Add shuffle bswap pattern.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/autovec/vls/perm-4.c: Adjust checker.
	* gcc.target/riscv/rvv/autovec/unop/bswap16-0.c: New test.
	* gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c: New test.
	* gcc.target/riscv/rvv/autovec/vls/bswap16-0.c: New test.

Signed-off-by: Pan Li <pan2.li@intel.com>
---
 gcc/config/riscv/riscv-v.cc                   | 117 ++++++++++++++++++
 .../riscv/rvv/autovec/unop/bswap16-0.c        |  17 +++
 .../riscv/rvv/autovec/unop/bswap16-run-0.c    |  44 +++++++
 .../riscv/rvv/autovec/vls/bswap16-0.c         |  34 +++++
 .../gcc.target/riscv/rvv/autovec/vls/perm-4.c |   4 +-
 5 files changed, 214 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 23633a2a74d..3e3b5f2e797 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -878,6 +878,33 @@ emit_vlmax_decompress_insn (rtx target, rtx op0, rtx op1, rtx mask)
   emit_vlmax_masked_gather_mu_insn (target, op1, sel, mask);
 }
 
+static void
+emit_vec_sll_scalar (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
+{
+  rtx sll_ops[] = {op_0, op_1, op_2};
+  insn_code icode = code_for_pred_scalar (ASHIFT, vec_mode);
+
+  emit_vlmax_insn (icode, BINARY_OP, sll_ops);
+}
+
+static void
+emit_vec_srl_scalar (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
+{
+  rtx srl_ops[] = {op_0, op_1, op_2};
+  insn_code icode = code_for_pred_scalar (LSHIFTRT, vec_mode);
+
+  emit_vlmax_insn (icode, BINARY_OP, srl_ops);
+}
+
+static void
+emit_vec_or (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
+{
+  rtx or_ops[] = {op_0, op_1, op_2};
+  insn_code icode = code_for_pred (IOR, vec_mode);
+
+  emit_vlmax_insn (icode, BINARY_OP, or_ops);
+}
+
 /* Emit merge instruction.  */
 
 static machine_mode
@@ -3030,6 +3057,94 @@ shuffle_decompress_patterns (struct expand_vec_perm_d *d)
   return true;
 }
 
+static bool
+shuffle_bswap_pattern (struct expand_vec_perm_d *d)
+{
+  HOST_WIDE_INT diff;
+  unsigned i, size, step;
+
+  if (!d->one_vector_p || !d->perm[0].is_constant (&diff) || !diff)
+    return false;
+
+  step = diff + 1;
+  size = step * GET_MODE_UNIT_BITSIZE (d->vmode);
+
+  switch (size)
+    {
+    case 16:
+      break;
+    case 32:
+    case 64:
+      /* We will have VEC_PERM_EXPR after rtl expand when invoking
+	 __builtin_bswap. It will generate about 9 instructions in
+	 loop as below, no matter it is bswap16, bswap32 or bswap64.
+	   .L2:
+	 1 vle16.v v4,0(a0)
+	 2 vmv.v.x v2,a7
+	 3 vand.vv v2,v6,v2
+	 4 slli    a2,a5,1
+	 5 vrgatherei16.vv v1,v4,v2
+	 6 sub     a4,a4,a5
+	 7 vse16.v v1,0(a3)
+	 8 add     a0,a0,a2
+	 9 add     a3,a3,a2
+	   bne     a4,zero,.L2
+
+	 But for bswap16 we may have an even simpler code gen, which
+	 has only 7 instructions in the loop as below.
+	   .L5:
+	 1 vle8.v  v2,0(a5)
+	 2 addi    a5,a5,32
+	 3 vsrl.vi v4,v2,8
+	 4 vsll.vi v2,v2,8
+	 5 vor.vv  v4,v4,v2
+	 6 vse8.v  v4,0(a4)
+	 7 addi    a4,a4,32
+	   bne     a5,a6,.L5
+
+	 Unfortunately, the instructions in loop will grow to 13 and 24
+	 for bswap32 and bswap64. Thus, we will leverage vrgather (9 insn)
+	 for both the bswap64 and bswap32, but take shift and or (7 insn)
+	 for bswap16.
+       */
+    default:
+      return false;
+    }
+
+  for (i = 0; i < step; i++)
+    if (!d->perm.series_p (i, step, diff - i, step))
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  machine_mode vhi_mode;
+  poly_uint64 vhi_nunits = exact_div (GET_MODE_NUNITS (d->vmode), 2);
+
+  if (!get_vector_mode (HImode, vhi_nunits).exists (&vhi_mode))
+    return false;
+
+  rtx src = gen_reg_rtx (vhi_mode);
+  rtx dest = gen_reg_rtx (vhi_mode);
+
+  /* Step-1: Move op0 to src with VHI mode.  */
+  emit_move_insn (src, gen_lowpart (vhi_mode, d->op0));
+
+  /* Step-2: Shift right 8 bits to dest.  */
+  emit_vec_srl_scalar (dest, src, GEN_INT (8), vhi_mode);
+
+  /* Step-3: Shift left 8 bits to src.  */
+  emit_vec_sll_scalar (src, src, GEN_INT (8), vhi_mode);
+
+  /* Step-4: Logic Or dest and src to dest.  */
+  emit_vec_or (dest, dest, src, vhi_mode);
+
+  /* Step-5: Move dest to target with VQI mode.  */
+  emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
+
+  return true;
+}
+
 /* Recognize the pattern that can be shuffled by generic approach.  */
 
 static bool
@@ -3089,6 +3204,8 @@ expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 	    return true;
 	  if (shuffle_decompress_patterns (d))
 	    return true;
+	  if (shuffle_bswap_pattern (d))
+	    return true;
 	  if (shuffle_generic_patterns (d))
 	    return true;
 	  return false;
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c
new file mode 100644
index 00000000000..10d235a8edf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize -fno-vect-cost-model -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <stdint-gcc.h>
+#include "test-math.h"
+
+/*
+** test_uint16_t___builtin_bswap16:
+**   ...
+**   vsetvli\s+[atx][0-9]+,\s*zero,\s*e16,\s*m1,\s*ta,\s*ma
+**   vsrl\.vi\s+v[0-9]+,\s*v[0-9],\s*8+
+**   vsll\.vi\s+v[0-9]+,\s*v[0-9],\s*8+
+**   vor\.vv\s+v[0-9]+,\s*v[0-9],\s*v[0-9]+
+**   ...
+*/
+TEST_UNARY_CALL (uint16_t, __builtin_bswap16)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c
new file mode 100644
index 00000000000..8d45cebc6c2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c
@@ -0,0 +1,44 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-std=c99 -O3 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include <stdint-gcc.h>
+#include "test-math.h"
+
+#define ARRAY_SIZE 128
+
+uint16_t in[ARRAY_SIZE];
+uint16_t out[ARRAY_SIZE];
+uint16_t ref[ARRAY_SIZE];
+
+TEST_UNARY_CALL (uint16_t, __builtin_bswap16)
+TEST_ASSERT (uint16_t)
+
+/* TEST_INIT Arguments:
+	  +-------+-------+---------------------------+---------+
+	  | type  | input | reference                 | test id |
+	  +-------+-------+---------------------------+---------+
+*/
+TEST_INIT (uint16_t, 0x1234u, __builtin_bswap16 (0x1234u), 1)
+TEST_INIT (uint16_t, 0x1122u, __builtin_bswap16 (0x1122u), 2)
+TEST_INIT (uint16_t, 0xa55au, __builtin_bswap16 (0xa55au), 3)
+TEST_INIT (uint16_t, 0x0000u, __builtin_bswap16 (0x0000u), 4)
+TEST_INIT (uint16_t, 0xffffu, __builtin_bswap16 (0xffffu), 5)
+TEST_INIT (uint16_t, 0x4321u, __builtin_bswap16 (0x4321u), 6)
+
+int
+main ()
+{
+  /* RUN_TEST Arguments:
+	   +------+---------+-------------+----+-----+-----+------------+
+	   | type | test id | fun to test | in | out | ref | array size |
+	   +------+---------+-------------+----+-----+-----+------------+
+  */
+  RUN_TEST (uint16_t, 1, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 2, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 3, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 4, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 5, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 6, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c
new file mode 100644
index 00000000000..11880bae1f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8 -ffast-math -fdump-tree-optimized" } */
+
+#include "def.h"
+
+DEF_OP_V (bswap16, 1, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 2, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 4, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 8, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 16, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 32, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 64, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 128, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 256, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 512, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 1024, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 2048, uint16_t, __builtin_bswap16)
+
+/* { dg-final { scan-assembler-not {csrr} } } */
+/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4,4" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "16,16" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "32,32" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "64,64" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "128,128" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "256,256" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "512,512" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "1024,1024" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2048,2048" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4096,4096" "optimized" } } */
+/* { dg-final { scan-assembler-times {vsrl\.vi\s+v[0-9]+,\s*v[0-9]+,\s*8} 11 } } */
+/* { dg-final { scan-assembler-times {vsll\.vi\s+v[0-9]+,\s*v[0-9]+,\s*8} 11 } } */
+/* { dg-final { scan-assembler-times {vor\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 11 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
index 4d6862cf1c0..d2d49388a39 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
@@ -3,7 +3,7 @@
 
 #include "../vls-vlmax/perm-4.c"
 
-/* { dg-final { scan-assembler-times {vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 19 } } */
+/* { dg-final { scan-assembler-times {vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 18 } } */
 /* { dg-final { scan-assembler-times {vrgatherei16\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 12 } } */
-/* { dg-final { scan-assembler-times {vrsub\.vi} 24 } } */
+/* { dg-final { scan-assembler-times {vrsub\.vi} 23 } } */
 /* { dg-final { scan-assembler-times {vrsub\.vx} 7 } } */
-- 
2.34.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v1] RISC-V: Refine bswap16 auto vectorization code gen
  2023-10-09  8:51 [PATCH v1] RISC-V: Refine bswap16 auto vectorization code gen pan2.li
@ 2023-10-09  9:04 ` juzhe.zhong
  2023-10-09 10:45   ` Li, Pan2
  2023-10-09 13:09 ` [PATCH v2] " pan2.li
  1 sibling, 1 reply; 6+ messages in thread
From: juzhe.zhong @ 2023-10-09  9:04 UTC (permalink / raw)
  To: pan2.li, gcc-patches; +Cc: pan2.li, yanzhang.wang, kito.cheng

[-- Attachment #1: Type: text/plain, Size: 13564 bytes --]

Remove these functions:

+static void
+emit_vec_sll_scalar (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
+{
+  rtx sll_ops[] = {op_0, op_1, op_2};
+  insn_code icode = code_for_pred_scalar (ASHIFT, vec_mode);
+
+  emit_vlmax_insn (icode, BINARY_OP, sll_ops);
+}
+
+static void
+emit_vec_srl_scalar (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
+{
+  rtx srl_ops[] = {op_0, op_1, op_2};
+  insn_code icode = code_for_pred_scalar (LSHIFTRT, vec_mode);
+
+  emit_vlmax_insn (icode, BINARY_OP, srl_ops);
+}
+
+static void
+emit_vec_or (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
+{
+  rtx or_ops[] = {op_0, op_1, op_2};
+  insn_code icode = code_for_pred (IOR, vec_mode);
+
+  emit_vlmax_insn (icode, BINARY_OP, or_ops);
+}
+

Instead, 

For sll, you should use :
rtx tmp
    = expand_binop (Pmode, ashl_optab, op_1,
        gen_int_mode (8, Pmode), NULL_RTX, 0,
        OPTAB_DIRECT);

For srl, you should use:
rtx tmp
    = expand_binop (Pmode, lshiftrt_optab, op_1,
        gen_int_mode (8, Pmode), NULL_RTX, 0,
        OPTAB_DIRECT);


For or, you should use:
expand_binop (Pmode, ior_optab, tmp, dest, NULL_RTX, 0,
               OPTAB_DIRECT);



juzhe.zhong@rivai.ai
 
From: pan2.li
Date: 2023-10-09 16:51
To: gcc-patches
CC: juzhe.zhong; pan2.li; yanzhang.wang; kito.cheng
Subject: [PATCH v1] RISC-V: Refine bswap16 auto vectorization code gen
From: Pan Li <pan2.li@intel.com>
 
This patch would like to refine the code gen for the bswap16.
 
We will have VEC_PERM_EXPR after rtl expand when invoking
__builtin_bswap. It will generate about 9 instructions in
loop as below, no matter it is bswap16, bswap32 or bswap64.
 
  .L2:
1 vle16.v v4,0(a0)
2 vmv.v.x v2,a7
3 vand.vv v2,v6,v2
4 slli    a2,a5,1
5 vrgatherei16.vv v1,v4,v2
6 sub     a4,a4,a5
7 vse16.v v1,0(a3)
8 add     a0,a0,a2
9 add     a3,a3,a2
  bne     a4,zero,.L2
 
But for bswap16 we may have a even simple code gen, which
has only 7 instructions in loop as below.
 
  .L5
1 vle8.v  v2,0(a5)
2 addi    a5,a5,32
3 vsrl.vi v4,v2,8
4 vsll.vi v2,v2,8
5 vor.vv  v4,v4,v2
6 vse8.v  v4,0(a4)
7 addi    a4,a4,32
  bne     a5,a6,.L5
 
Unfortunately, this way will make the insn in loop will grow up to
13 and 24 for bswap32 and bswap64. Thus, we will refine the code
gen for the bswap16 only, and leave both the bswap32 and bswap64
as is.
 
gcc/ChangeLog:
 
* config/riscv/riscv-v.cc (emit_vec_sll_scalar): New help func
impl for emit vsll.vi/vsll.vx
(emit_vec_srl_scalar): Likewise for vsrl.vi/vsrl.vx.
(emit_vec_or): Likewise for vor.vv.
(shuffle_bswap_pattern): New func impl for shuffle bswap.
(expand_vec_perm_const_1): Add shuffle bswap pattern.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/autovec/vls/perm-4.c: Adjust checker.
* gcc.target/riscv/rvv/autovec/unop/bswap16-0.c: New test.
* gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c: New test.
* gcc.target/riscv/rvv/autovec/vls/bswap16-0.c: New test.
 
Signed-off-by: Pan Li <pan2.li@intel.com>
---
gcc/config/riscv/riscv-v.cc                   | 117 ++++++++++++++++++
.../riscv/rvv/autovec/unop/bswap16-0.c        |  17 +++
.../riscv/rvv/autovec/unop/bswap16-run-0.c    |  44 +++++++
.../riscv/rvv/autovec/vls/bswap16-0.c         |  34 +++++
.../gcc.target/riscv/rvv/autovec/vls/perm-4.c |   4 +-
5 files changed, 214 insertions(+), 2 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c
 
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 23633a2a74d..3e3b5f2e797 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -878,6 +878,33 @@ emit_vlmax_decompress_insn (rtx target, rtx op0, rtx op1, rtx mask)
   emit_vlmax_masked_gather_mu_insn (target, op1, sel, mask);
}
+static void
+emit_vec_sll_scalar (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
+{
+  rtx sll_ops[] = {op_0, op_1, op_2};
+  insn_code icode = code_for_pred_scalar (ASHIFT, vec_mode);
+
+  emit_vlmax_insn (icode, BINARY_OP, sll_ops);
+}
+
+static void
+emit_vec_srl_scalar (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
+{
+  rtx srl_ops[] = {op_0, op_1, op_2};
+  insn_code icode = code_for_pred_scalar (LSHIFTRT, vec_mode);
+
+  emit_vlmax_insn (icode, BINARY_OP, srl_ops);
+}
+
+static void
+emit_vec_or (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
+{
+  rtx or_ops[] = {op_0, op_1, op_2};
+  insn_code icode = code_for_pred (IOR, vec_mode);
+
+  emit_vlmax_insn (icode, BINARY_OP, or_ops);
+}
+
/* Emit merge instruction.  */
static machine_mode
@@ -3030,6 +3057,94 @@ shuffle_decompress_patterns (struct expand_vec_perm_d *d)
   return true;
}
+static bool
+shuffle_bswap_pattern (struct expand_vec_perm_d *d)
+{
+  HOST_WIDE_INT diff;
+  unsigned i, size, step;
+
+  if (!d->one_vector_p || !d->perm[0].is_constant (&diff) || !diff)
+    return false;
+
+  step = diff + 1;
+  size = step * GET_MODE_UNIT_BITSIZE (d->vmode);
+
+  switch (size)
+    {
+    case 16:
+      break;
+    case 32:
+    case 64:
+      /* We will have VEC_PERM_EXPR after rtl expand when invoking
+ __builtin_bswap. It will generate about 9 instructions in
+ loop as below, no matter it is bswap16, bswap32 or bswap64.
+    .L2:
+ 1 vle16.v v4,0(a0)
+ 2 vmv.v.x v2,a7
+ 3 vand.vv v2,v6,v2
+ 4 slli    a2,a5,1
+ 5 vrgatherei16.vv v1,v4,v2
+ 6 sub     a4,a4,a5
+ 7 vse16.v v1,0(a3)
+ 8 add     a0,a0,a2
+ 9 add     a3,a3,a2
+    bne     a4,zero,.L2
+
+ But for bswap16 we may have a even simple code gen, which
+ has only 7 instructions in loop as below.
+    .L5
+ 1 vle8.v  v2,0(a5)
+ 2 addi    a5,a5,32
+ 3 vsrl.vi v4,v2,8
+ 4 vsll.vi v2,v2,8
+ 5 vor.vv  v4,v4,v2
+ 6 vse8.v  v4,0(a4)
+ 7 addi    a4,a4,32
+    bne     a5,a6,.L5
+
+ Unfortunately, the instructions in loop will grow to 13 and 24
+ for bswap32 and bswap64. Thus, we will leverage vrgather (9 insn)
+ for both the bswap64 and bswap32, but take shift and or (7 insn)
+ for bswap16.
+       */
+    default:
+      return false;
+    }
+
+  for (i = 0; i < step; i++)
+    if (!d->perm.series_p (i, step, diff - i, step))
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  machine_mode vhi_mode;
+  poly_uint64 vhi_nunits = exact_div (GET_MODE_NUNITS (d->vmode), 2);
+
+  if (!get_vector_mode (HImode, vhi_nunits).exists (&vhi_mode))
+    return false;
+
+  rtx src = gen_reg_rtx (vhi_mode);
+  rtx dest = gen_reg_rtx (vhi_mode);
+
+  /* Step-1: Move op0 to src with VHI mode.  */
+  emit_move_insn (src, gen_lowpart (vhi_mode, d->op0));
+
+  /* Step-2: Shift right 8 bits to dest.  */
+  emit_vec_srl_scalar (dest, src, GEN_INT (8), vhi_mode);
+
+  /* Step-3: Shift left 8 bits to src.  */
+  emit_vec_sll_scalar (src, src, GEN_INT (8), vhi_mode);
+
+  /* Step-4: Logic Or dest and src to dest.  */
+  emit_vec_or (dest, dest, src, vhi_mode);
+
+  /* Step-5: Move src to target with VQI mode.  */
+  emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
+
+  return true;
+}
+
/* Recognize the pattern that can be shuffled by generic approach.  */
static bool
@@ -3089,6 +3204,8 @@ expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
    return true;
  if (shuffle_decompress_patterns (d))
    return true;
+   if (shuffle_bswap_pattern (d))
+     return true;
  if (shuffle_generic_patterns (d))
    return true;
  return false;
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c
new file mode 100644
index 00000000000..10d235a8edf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize -fno-vect-cost-model -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <stdint-gcc.h>
+#include "test-math.h"
+
+/*
+** test_uint16_t___builtin_bswap16:
+**   ...
+**   vsetvli\s+[atx][0-9]+,\s*zero,\s*e16,\s*m1,\s*ta,\s*ma
+**   vsrl\.vi\s+v[0-9]+,\s*v[0-9],\s*8+
+**   vsll\.vi\s+v[0-9]+,\s*v[0-9],\s*8+
+**   vor\.vv\s+v[0-9]+,\s*v[0-9],\s*v[0-9]+
+**   ...
+*/
+TEST_UNARY_CALL (uint16_t, __builtin_bswap16)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c
new file mode 100644
index 00000000000..8d45cebc6c2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c
@@ -0,0 +1,44 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-std=c99 -O3 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include <stdint-gcc.h>
+#include "test-math.h"
+
+#define ARRAY_SIZE 128
+
+uint16_t in[ARRAY_SIZE];
+uint16_t out[ARRAY_SIZE];
+uint16_t ref[ARRAY_SIZE];
+
+TEST_UNARY_CALL (uint16_t, __builtin_bswap16)
+TEST_ASSERT (uint16_t)
+
+/* TEST_INIT Arguments:
+   +-------+-------+---------------------------+---------+
+   | type  | input | reference                 | test id |
+   +-------+-------+---------------------------+---------+
+*/
+TEST_INIT (uint16_t, 0x1234u, __builtin_bswap16 (0x1234u), 1)
+TEST_INIT (uint16_t, 0x1122u, __builtin_bswap16 (0x1122u), 2)
+TEST_INIT (uint16_t, 0xa55au, __builtin_bswap16 (0xa55au), 3)
+TEST_INIT (uint16_t, 0x0000u, __builtin_bswap16 (0x0000u), 4)
+TEST_INIT (uint16_t, 0xffffu, __builtin_bswap16 (0xffffu), 5)
+TEST_INIT (uint16_t, 0x4321u, __builtin_bswap16 (0x4321u), 6)
+
+int
+main ()
+{
+  /* RUN_TEST Arguments:
+    +------+---------+-------------+----+-----+-----+------------+
+    | type | test id | fun to test | in | out | ref | array size |
+    +------+---------+-------------+----+-----+-----+------------+
+  */
+  RUN_TEST (uint16_t, 1, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 2, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 3, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 4, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 5, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 6, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c
new file mode 100644
index 00000000000..11880bae1f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8 -ffast-math -fdump-tree-optimized" } */
+
+#include "def.h"
+
+DEF_OP_V (bswap16, 1, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 2, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 4, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 8, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 16, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 32, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 64, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 128, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 256, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 512, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 1024, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 2048, uint16_t, __builtin_bswap16)
+
+/* { dg-final { scan-assembler-not {csrr} } } */
+/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4,4" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "16,16" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "32,32" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "64,64" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "128,128" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "256,256" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "512,512" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "1024,1024" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2048,2048" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4096,4096" "optimized" } } */
+/* { dg-final { scan-assembler-times {vsrl\.vi\s+v[0-9]+,\s*v[0-9]+,\s*8} 11 } } */
+/* { dg-final { scan-assembler-times {vsll\.vi\s+v[0-9]+,\s*v[0-9]+,\s*8} 11 } } */
+/* { dg-final { scan-assembler-times {vor\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 11 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
index 4d6862cf1c0..d2d49388a39 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
@@ -3,7 +3,7 @@
#include "../vls-vlmax/perm-4.c"
-/* { dg-final { scan-assembler-times {vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 19 } } */
+/* { dg-final { scan-assembler-times {vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 18 } } */
/* { dg-final { scan-assembler-times {vrgatherei16\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 12 } } */
-/* { dg-final { scan-assembler-times {vrsub\.vi} 24 } } */
+/* { dg-final { scan-assembler-times {vrsub\.vi} 23 } } */
/* { dg-final { scan-assembler-times {vrsub\.vx} 7 } } */
-- 
2.34.1
 
 

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [PATCH v1] RISC-V: Refine bswap16 auto vectorization code gen
  2023-10-09  9:04 ` juzhe.zhong
@ 2023-10-09 10:45   ` Li, Pan2
  0 siblings, 0 replies; 6+ messages in thread
From: Li, Pan2 @ 2023-10-09 10:45 UTC (permalink / raw)
  To: juzhe.zhong, gcc-patches; +Cc: Wang, Yanzhang, kito.cheng

[-- Attachment #1: Type: text/plain, Size: 14316 bytes --]

Sure thing, will send V2 for this change.

Pan

From: juzhe.zhong@rivai.ai <juzhe.zhong@rivai.ai>
Sent: Monday, October 9, 2023 5:04 PM
To: Li, Pan2 <pan2.li@intel.com>; gcc-patches <gcc-patches@gcc.gnu.org>
Cc: Li, Pan2 <pan2.li@intel.com>; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng <kito.cheng@gmail.com>
Subject: Re: [PATCH v1] RISC-V: Refine bswap16 auto vectorization code gen

Remove these functions:


+static void

+emit_vec_sll_scalar (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)

+{

+  rtx sll_ops[] = {op_0, op_1, op_2};

+  insn_code icode = code_for_pred_scalar (ASHIFT, vec_mode);

+

+  emit_vlmax_insn (icode, BINARY_OP, sll_ops);

+}

+

+static void

+emit_vec_srl_scalar (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)

+{

+  rtx srl_ops[] = {op_0, op_1, op_2};

+  insn_code icode = code_for_pred_scalar (LSHIFTRT, vec_mode);

+

+  emit_vlmax_insn (icode, BINARY_OP, srl_ops);

+}

+

+static void

+emit_vec_or (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)

+{

+  rtx or_ops[] = {op_0, op_1, op_2};

+  insn_code icode = code_for_pred (IOR, vec_mode);

+

+  emit_vlmax_insn (icode, BINARY_OP, or_ops);

+}

+

Instead,

For sll, you should use :
rtx tmp
    = expand_binop (Pmode, ashl_optab, op_1,
        gen_int_mode (8, Pmode), NULL_RTX, 0,
        OPTAB_DIRECT);

For srl, you should use:
rtx tmp
    = expand_binop (Pmode, lshiftrt_optab, op_1,
        gen_int_mode (8, Pmode), NULL_RTX, 0,
        OPTAB_DIRECT);


For or, you should use:
expand_binop (Pmode, ior_optab, tmp, dest, NULL_RTX, 0,
               OPTAB_DIRECT);

________________________________
juzhe.zhong@rivai.ai<mailto:juzhe.zhong@rivai.ai>

From: pan2.li<mailto:pan2.li@intel.com>
Date: 2023-10-09 16:51
To: gcc-patches<mailto:gcc-patches@gcc.gnu.org>
CC: juzhe.zhong<mailto:juzhe.zhong@rivai.ai>; pan2.li<mailto:pan2.li@intel.com>; yanzhang.wang<mailto:yanzhang.wang@intel.com>; kito.cheng<mailto:kito.cheng@gmail.com>
Subject: [PATCH v1] RISC-V: Refine bswap16 auto vectorization code gen
From: Pan Li <pan2.li@intel.com<mailto:pan2.li@intel.com>>

This patch would like to refine the code gen for the bswap16.

We will have VEC_PERM_EXPR after rtl expand when invoking
__builtin_bswap. It will generate about 9 instructions in
loop as below, no matter it is bswap16, bswap32 or bswap64.

  .L2:
1 vle16.v v4,0(a0)
2 vmv.v.x v2,a7
3 vand.vv v2,v6,v2
4 slli    a2,a5,1
5 vrgatherei16.vv v1,v4,v2
6 sub     a4,a4,a5
7 vse16.v v1,0(a3)
8 add     a0,a0,a2
9 add     a3,a3,a2
  bne     a4,zero,.L2

But for bswap16 we may have a even simple code gen, which
has only 7 instructions in loop as below.

  .L5
1 vle8.v  v2,0(a5)
2 addi    a5,a5,32
3 vsrl.vi v4,v2,8
4 vsll.vi v2,v2,8
5 vor.vv  v4,v4,v2
6 vse8.v  v4,0(a4)
7 addi    a4,a4,32
  bne     a5,a6,.L5

Unfortunately, this way will make the insn in loop will grow up to
13 and 24 for bswap32 and bswap64. Thus, we will refine the code
gen for the bswap16 only, and leave both the bswap32 and bswap64
as is.

gcc/ChangeLog:

* config/riscv/riscv-v.cc (emit_vec_sll_scalar): New help func
impl for emit vsll.vi/vsll.vx
(emit_vec_srl_scalar): Likewise for vsrl.vi/vsrl.vx.
(emit_vec_or): Likewise for vor.vv.
(shuffle_bswap_pattern): New func impl for shuffle bswap.
(expand_vec_perm_const_1): Add shuffle bswap pattern.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls/perm-4.c: Adjust checker.
* gcc.target/riscv/rvv/autovec/unop/bswap16-0.c: New test.
* gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c: New test.
* gcc.target/riscv/rvv/autovec/vls/bswap16-0.c: New test.

Signed-off-by: Pan Li <pan2.li@intel.com<mailto:pan2.li@intel.com>>
---
gcc/config/riscv/riscv-v.cc                   | 117 ++++++++++++++++++
.../riscv/rvv/autovec/unop/bswap16-0.c        |  17 +++
.../riscv/rvv/autovec/unop/bswap16-run-0.c    |  44 +++++++
.../riscv/rvv/autovec/vls/bswap16-0.c         |  34 +++++
.../gcc.target/riscv/rvv/autovec/vls/perm-4.c |   4 +-
5 files changed, 214 insertions(+), 2 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 23633a2a74d..3e3b5f2e797 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -878,6 +878,33 @@ emit_vlmax_decompress_insn (rtx target, rtx op0, rtx op1, rtx mask)
   emit_vlmax_masked_gather_mu_insn (target, op1, sel, mask);
}
+static void
+emit_vec_sll_scalar (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
+{
+  rtx sll_ops[] = {op_0, op_1, op_2};
+  insn_code icode = code_for_pred_scalar (ASHIFT, vec_mode);
+
+  emit_vlmax_insn (icode, BINARY_OP, sll_ops);
+}
+
+static void
+emit_vec_srl_scalar (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
+{
+  rtx srl_ops[] = {op_0, op_1, op_2};
+  insn_code icode = code_for_pred_scalar (LSHIFTRT, vec_mode);
+
+  emit_vlmax_insn (icode, BINARY_OP, srl_ops);
+}
+
+static void
+emit_vec_or (rtx op_0, rtx op_1, rtx op_2, machine_mode vec_mode)
+{
+  rtx or_ops[] = {op_0, op_1, op_2};
+  insn_code icode = code_for_pred (IOR, vec_mode);
+
+  emit_vlmax_insn (icode, BINARY_OP, or_ops);
+}
+
/* Emit merge instruction.  */
static machine_mode
@@ -3030,6 +3057,94 @@ shuffle_decompress_patterns (struct expand_vec_perm_d *d)
   return true;
}
+static bool
+shuffle_bswap_pattern (struct expand_vec_perm_d *d)
+{
+  HOST_WIDE_INT diff;
+  unsigned i, size, step;
+
+  if (!d->one_vector_p || !d->perm[0].is_constant (&diff) || !diff)
+    return false;
+
+  step = diff + 1;
+  size = step * GET_MODE_UNIT_BITSIZE (d->vmode);
+
+  switch (size)
+    {
+    case 16:
+      break;
+    case 32:
+    case 64:
+      /* We will have VEC_PERM_EXPR after rtl expand when invoking
+ __builtin_bswap. It will generate about 9 instructions in
+ loop as below, no matter it is bswap16, bswap32 or bswap64.
+    .L2:
+ 1 vle16.v v4,0(a0)
+ 2 vmv.v.x v2,a7
+ 3 vand.vv v2,v6,v2
+ 4 slli    a2,a5,1
+ 5 vrgatherei16.vv v1,v4,v2
+ 6 sub     a4,a4,a5
+ 7 vse16.v v1,0(a3)
+ 8 add     a0,a0,a2
+ 9 add     a3,a3,a2
+    bne     a4,zero,.L2
+
+ But for bswap16 we may have a even simple code gen, which
+ has only 7 instructions in loop as below.
+    .L5
+ 1 vle8.v  v2,0(a5)
+ 2 addi    a5,a5,32
+ 3 vsrl.vi v4,v2,8
+ 4 vsll.vi v2,v2,8
+ 5 vor.vv  v4,v4,v2
+ 6 vse8.v  v4,0(a4)
+ 7 addi    a4,a4,32
+    bne     a5,a6,.L5
+
+ Unfortunately, the instructions in loop will grow to 13 and 24
+ for bswap32 and bswap64. Thus, we will leverage vrgather (9 insn)
+ for both the bswap64 and bswap32, but take shift and or (7 insn)
+ for bswap16.
+       */
+    default:
+      return false;
+    }
+
+  for (i = 0; i < step; i++)
+    if (!d->perm.series_p (i, step, diff - i, step))
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  machine_mode vhi_mode;
+  poly_uint64 vhi_nunits = exact_div (GET_MODE_NUNITS (d->vmode), 2);
+
+  if (!get_vector_mode (HImode, vhi_nunits).exists (&vhi_mode))
+    return false;
+
+  rtx src = gen_reg_rtx (vhi_mode);
+  rtx dest = gen_reg_rtx (vhi_mode);
+
+  /* Step-1: Move op0 to src with VHI mode.  */
+  emit_move_insn (src, gen_lowpart (vhi_mode, d->op0));
+
+  /* Step-2: Shift right 8 bits to dest.  */
+  emit_vec_srl_scalar (dest, src, GEN_INT (8), vhi_mode);
+
+  /* Step-3: Shift left 8 bits to src.  */
+  emit_vec_sll_scalar (src, src, GEN_INT (8), vhi_mode);
+
+  /* Step-4: Logic Or dest and src to dest.  */
+  emit_vec_or (dest, dest, src, vhi_mode);
+
+  /* Step-5: Move src to target with VQI mode.  */
+  emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
+
+  return true;
+}
+
/* Recognize the pattern that can be shuffled by generic approach.  */
static bool
@@ -3089,6 +3204,8 @@ expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
    return true;
  if (shuffle_decompress_patterns (d))
    return true;
+   if (shuffle_bswap_pattern (d))
+     return true;
  if (shuffle_generic_patterns (d))
    return true;
  return false;
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c
new file mode 100644
index 00000000000..10d235a8edf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize -fno-vect-cost-model -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <stdint-gcc.h>
+#include "test-math.h"
+
+/*
+** test_uint16_t___builtin_bswap16:
+**   ...
+**   vsetvli\s+[atx][0-9]+,\s*zero,\s*e16,\s*m1,\s*ta,\s*ma
+**   vsrl\.vi\s+v[0-9]+,\s*v[0-9],\s*8+
+**   vsll\.vi\s+v[0-9]+,\s*v[0-9],\s*8+
+**   vor\.vv\s+v[0-9]+,\s*v[0-9],\s*v[0-9]+
+**   ...
+*/
+TEST_UNARY_CALL (uint16_t, __builtin_bswap16)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c
new file mode 100644
index 00000000000..8d45cebc6c2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c
@@ -0,0 +1,44 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-std=c99 -O3 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include <stdint-gcc.h>
+#include "test-math.h"
+
+#define ARRAY_SIZE 128
+
+uint16_t in[ARRAY_SIZE];
+uint16_t out[ARRAY_SIZE];
+uint16_t ref[ARRAY_SIZE];
+
+TEST_UNARY_CALL (uint16_t, __builtin_bswap16)
+TEST_ASSERT (uint16_t)
+
+/* TEST_INIT Arguments:
+   +-------+-------+---------------------------+---------+
+   | type  | input | reference                 | test id |
+   +-------+-------+---------------------------+---------+
+*/
+TEST_INIT (uint16_t, 0x1234u, __builtin_bswap16 (0x1234u), 1)
+TEST_INIT (uint16_t, 0x1122u, __builtin_bswap16 (0x1122u), 2)
+TEST_INIT (uint16_t, 0xa55au, __builtin_bswap16 (0xa55au), 3)
+TEST_INIT (uint16_t, 0x0000u, __builtin_bswap16 (0x0000u), 4)
+TEST_INIT (uint16_t, 0xffffu, __builtin_bswap16 (0xffffu), 5)
+TEST_INIT (uint16_t, 0x4321u, __builtin_bswap16 (0x4321u), 6)
+
+int
+main ()
+{
+  /* RUN_TEST Arguments:
+    +------+---------+-------------+----+-----+-----+------------+
+    | type | test id | fun to test | in | out | ref | array size |
+    +------+---------+-------------+----+-----+-----+------------+
+  */
+  RUN_TEST (uint16_t, 1, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 2, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 3, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 4, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 5, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 6, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c
new file mode 100644
index 00000000000..11880bae1f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8 -ffast-math -fdump-tree-optimized" } */
+
+#include "def.h"
+
+DEF_OP_V (bswap16, 1, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 2, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 4, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 8, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 16, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 32, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 64, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 128, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 256, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 512, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 1024, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 2048, uint16_t, __builtin_bswap16)
+
+/* { dg-final { scan-assembler-not {csrr} } } */
+/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4,4" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "16,16" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "32,32" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "64,64" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "128,128" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "256,256" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "512,512" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "1024,1024" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2048,2048" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4096,4096" "optimized" } } */
+/* { dg-final { scan-assembler-times {vsrl\.vi\s+v[0-9]+,\s*v[0-9]+,\s*8} 11 } } */
+/* { dg-final { scan-assembler-times {vsll\.vi\s+v[0-9]+,\s*v[0-9]+,\s*8} 11 } } */
+/* { dg-final { scan-assembler-times {vor\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 11 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
index 4d6862cf1c0..d2d49388a39 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
@@ -3,7 +3,7 @@
#include "../vls-vlmax/perm-4.c"
-/* { dg-final { scan-assembler-times {vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 19 } } */
+/* { dg-final { scan-assembler-times {vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 18 } } */
/* { dg-final { scan-assembler-times {vrgatherei16\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 12 } } */
-/* { dg-final { scan-assembler-times {vrsub\.vi} 24 } } */
+/* { dg-final { scan-assembler-times {vrsub\.vi} 23 } } */
/* { dg-final { scan-assembler-times {vrsub\.vx} 7 } } */
--
2.34.1



^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH v2] RISC-V: Refine bswap16 auto vectorization code gen
  2023-10-09  8:51 [PATCH v1] RISC-V: Refine bswap16 auto vectorization code gen pan2.li
  2023-10-09  9:04 ` juzhe.zhong
@ 2023-10-09 13:09 ` pan2.li
  2023-10-09 13:11   ` juzhe.zhong
  1 sibling, 1 reply; 6+ messages in thread
From: pan2.li @ 2023-10-09 13:09 UTC (permalink / raw)
  To: gcc-patches; +Cc: juzhe.zhong, pan2.li, yanzhang.wang, kito.cheng

From: Pan Li <pan2.li@intel.com>

Update in v2

* Remove emit helper functions.
* Take expand_binop instead.

Original log:

This patch would like to refine the code gen for the bswap16.

We will have VEC_PERM_EXPR after rtl expand when invoking
__builtin_bswap. It will generate about 9 instructions in the
loop as below, no matter whether it is bswap16, bswap32 or bswap64.

  .L2:
1 vle16.v v4,0(a0)
2 vmv.v.x v2,a7
3 vand.vv v2,v6,v2
4 slli    a2,a5,1
5 vrgatherei16.vv v1,v4,v2
6 sub     a4,a4,a5
7 vse16.v v1,0(a3)
8 add     a0,a0,a2
9 add     a3,a3,a2
  bne     a4,zero,.L2

But for bswap16 we can have an even simpler code gen, which
has only 7 instructions in the loop, as below.

  .L5:
1 vle8.v  v2,0(a5)
2 addi    a5,a5,32
3 vsrl.vi v4,v2,8
4 vsll.vi v2,v2,8
5 vor.vv  v4,v4,v2
6 vse8.v  v4,0(a4)
7 addi    a4,a4,32
  bne     a5,a6,.L5

Unfortunately, this approach makes the instruction count in the loop
grow to 13 and 24 for bswap32 and bswap64, respectively. Thus, we
refine the code gen for bswap16 only, and leave both bswap32 and
bswap64 as is.

gcc/ChangeLog:

	* config/riscv/riscv-v.cc (shuffle_bswap_pattern): New func impl
	for shuffle bswap.
	(expand_vec_perm_const_1): Add handling for shuffle bswap pattern.

gcc/testsuite/ChangeLog:

	* gcc.target/riscv/rvv/autovec/vls/perm-4.c: Adjust checker.
	* gcc.target/riscv/rvv/autovec/unop/bswap16-0.c: New test.
	* gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c: New test.
	* gcc.target/riscv/rvv/autovec/vls/bswap16-0.c: New test.

Signed-off-by: Pan Li <pan2.li@intel.com>
---
 gcc/config/riscv/riscv-v.cc                   | 91 +++++++++++++++++++
 .../riscv/rvv/autovec/unop/bswap16-0.c        | 17 ++++
 .../riscv/rvv/autovec/unop/bswap16-run-0.c    | 44 +++++++++
 .../riscv/rvv/autovec/vls/bswap16-0.c         | 34 +++++++
 .../gcc.target/riscv/rvv/autovec/vls/perm-4.c |  4 +-
 5 files changed, 188 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c
 create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 23633a2a74d..c72e411f125 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -3030,6 +3030,95 @@ shuffle_decompress_patterns (struct expand_vec_perm_d *d)
   return true;
 }
 
+static bool
+shuffle_bswap_pattern (struct expand_vec_perm_d *d)
+{
+  HOST_WIDE_INT diff;
+  unsigned i, size, step;
+
+  if (!d->one_vector_p || !d->perm[0].is_constant (&diff) || !diff)
+    return false;
+
+  step = diff + 1;
+  size = step * GET_MODE_UNIT_BITSIZE (d->vmode);
+
+  switch (size)
+    {
+    case 16:
+      break;
+    case 32:
+    case 64:
+      /* We will have VEC_PERM_EXPR after rtl expand when invoking
+	 __builtin_bswap. It will generate about 9 instructions in
+	 loop as below, no matter it is bswap16, bswap32 or bswap64.
+	   .L2:
+	 1 vle16.v v4,0(a0)
+	 2 vmv.v.x v2,a7
+	 3 vand.vv v2,v6,v2
+	 4 slli    a2,a5,1
+	 5 vrgatherei16.vv v1,v4,v2
+	 6 sub     a4,a4,a5
+	 7 vse16.v v1,0(a3)
+	 8 add     a0,a0,a2
+	 9 add     a3,a3,a2
+	   bne     a4,zero,.L2
+
+	 But for bswap16 we may have a even simple code gen, which
+	 has only 7 instructions in loop as below.
+	   .L5
+	 1 vle8.v  v2,0(a5)
+	 2 addi    a5,a5,32
+	 3 vsrl.vi v4,v2,8
+	 4 vsll.vi v2,v2,8
+	 5 vor.vv  v4,v4,v2
+	 6 vse8.v  v4,0(a4)
+	 7 addi    a4,a4,32
+	   bne     a5,a6,.L5
+
+	 Unfortunately, the instructions in loop will grow to 13 and 24
+	 for bswap32 and bswap64. Thus, we will leverage vrgather (9 insn)
+	 for both the bswap64 and bswap32, but take shift and or (7 insn)
+	 for bswap16.
+       */
+    default:
+      return false;
+    }
+
+  for (i = 0; i < step; i++)
+    if (!d->perm.series_p (i, step, diff - i, step))
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  machine_mode vhi_mode;
+  poly_uint64 vhi_nunits = exact_div (GET_MODE_NUNITS (d->vmode), 2);
+
+  if (!get_vector_mode (HImode, vhi_nunits).exists (&vhi_mode))
+    return false;
+
+  /* Step-1: Move op0 to src with VHI mode.  */
+  rtx src = gen_reg_rtx (vhi_mode);
+  emit_move_insn (src, gen_lowpart (vhi_mode, d->op0));
+
+  /* Step-2: Shift right 8 bits to dest.  */
+  rtx dest = expand_binop (vhi_mode, lshr_optab, src, gen_int_mode (8, Pmode),
+			   NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* Step-3: Shift left 8 bits to src.  */
+  src = expand_binop (vhi_mode, ashl_optab, src, gen_int_mode (8, Pmode),
+		      NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* Step-4: Logic Or dest and src to dest.  */
+  dest = expand_binop (vhi_mode, ior_optab, dest, src,
+		       NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* Step-5: Move dest to target with VQI mode.  */
+  emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
+
+  return true;
+}
+
 /* Recognize the pattern that can be shuffled by generic approach.  */
 
 static bool
@@ -3089,6 +3178,8 @@ expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 	    return true;
 	  if (shuffle_decompress_patterns (d))
 	    return true;
+	  if (shuffle_bswap_pattern (d))
+	    return true;
 	  if (shuffle_generic_patterns (d))
 	    return true;
 	  return false;
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c
new file mode 100644
index 00000000000..10d235a8edf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize -fno-vect-cost-model -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <stdint-gcc.h>
+#include "test-math.h"
+
+/*
+** test_uint16_t___builtin_bswap16:
+**   ...
+**   vsetvli\s+[atx][0-9]+,\s*zero,\s*e16,\s*m1,\s*ta,\s*ma
+**   vsrl\.vi\s+v[0-9]+,\s*v[0-9],\s*8+
+**   vsll\.vi\s+v[0-9]+,\s*v[0-9],\s*8+
+**   vor\.vv\s+v[0-9]+,\s*v[0-9],\s*v[0-9]+
+**   ...
+*/
+TEST_UNARY_CALL (uint16_t, __builtin_bswap16)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c
new file mode 100644
index 00000000000..8d45cebc6c2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c
@@ -0,0 +1,44 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-std=c99 -O3 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include <stdint-gcc.h>
+#include "test-math.h"
+
+#define ARRAY_SIZE 128
+
+uint16_t in[ARRAY_SIZE];
+uint16_t out[ARRAY_SIZE];
+uint16_t ref[ARRAY_SIZE];
+
+TEST_UNARY_CALL (uint16_t, __builtin_bswap16)
+TEST_ASSERT (uint16_t)
+
+/* TEST_INIT Arguments:
+	  +-------+-------+---------------------------+---------+
+	  | type  | input | reference                 | test id |
+	  +-------+-------+---------------------------+---------+
+*/
+TEST_INIT (uint16_t, 0x1234u, __builtin_bswap16 (0x1234u), 1)
+TEST_INIT (uint16_t, 0x1122u, __builtin_bswap16 (0x1122u), 2)
+TEST_INIT (uint16_t, 0xa55au, __builtin_bswap16 (0xa55au), 3)
+TEST_INIT (uint16_t, 0x0000u, __builtin_bswap16 (0x0000u), 4)
+TEST_INIT (uint16_t, 0xffffu, __builtin_bswap16 (0xffffu), 5)
+TEST_INIT (uint16_t, 0x4321u, __builtin_bswap16 (0x4321u), 6)
+
+int
+main ()
+{
+  /* RUN_TEST Arguments:
+	   +------+---------+-------------+----+-----+-----+------------+
+	   | type | test id | fun to test | in | out | ref | array size |
+	   +------+---------+-------------+----+-----+-----+------------+
+  */
+  RUN_TEST (uint16_t, 1, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 2, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 3, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 4, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 5, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 6, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c
new file mode 100644
index 00000000000..11880bae1f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8 -ffast-math -fdump-tree-optimized" } */
+
+#include "def.h"
+
+DEF_OP_V (bswap16, 1, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 2, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 4, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 8, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 16, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 32, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 64, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 128, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 256, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 512, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 1024, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 2048, uint16_t, __builtin_bswap16)
+
+/* { dg-final { scan-assembler-not {csrr} } } */
+/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4,4" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "16,16" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "32,32" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "64,64" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "128,128" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "256,256" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "512,512" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "1024,1024" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2048,2048" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4096,4096" "optimized" } } */
+/* { dg-final { scan-assembler-times {vsrl\.vi\s+v[0-9]+,\s*v[0-9]+,\s*8} 11 } } */
+/* { dg-final { scan-assembler-times {vsll\.vi\s+v[0-9]+,\s*v[0-9]+,\s*8} 11 } } */
+/* { dg-final { scan-assembler-times {vor\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 11 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
index 4d6862cf1c0..d2d49388a39 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
@@ -3,7 +3,7 @@
 
 #include "../vls-vlmax/perm-4.c"
 
-/* { dg-final { scan-assembler-times {vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 19 } } */
+/* { dg-final { scan-assembler-times {vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 18 } } */
 /* { dg-final { scan-assembler-times {vrgatherei16\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 12 } } */
-/* { dg-final { scan-assembler-times {vrsub\.vi} 24 } } */
+/* { dg-final { scan-assembler-times {vrsub\.vi} 23 } } */
 /* { dg-final { scan-assembler-times {vrsub\.vx} 7 } } */
-- 
2.34.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] RISC-V: Refine bswap16 auto vectorization code gen
  2023-10-09 13:09 ` [PATCH v2] " pan2.li
@ 2023-10-09 13:11   ` juzhe.zhong
  2023-10-09 13:13     ` Li, Pan2
  0 siblings, 1 reply; 6+ messages in thread
From: juzhe.zhong @ 2023-10-09 13:11 UTC (permalink / raw)
  To: pan2.li, gcc-patches; +Cc: pan2.li, yanzhang.wang, kito.cheng

[-- Attachment #1: Type: text/plain, Size: 11506 bytes --]

LGTM now.

Thanks.



juzhe.zhong@rivai.ai
 
From: pan2.li
Date: 2023-10-09 21:09
To: gcc-patches
CC: juzhe.zhong; pan2.li; yanzhang.wang; kito.cheng
Subject: [PATCH v2] RISC-V: Refine bswap16 auto vectorization code gen
From: Pan Li <pan2.li@intel.com>
 
Update in v2
 
* Remove emit helper functions.
* Take expand_binop instead.
 
Original log:
 
This patch would like to refine the code gen for the bswap16.
 
We will have VEC_PERM_EXPR after rtl expand when invoking
__builtin_bswap. It will generate about 9 instructions in
loop as below, no matter it is bswap16, bswap32 or bswap64.
 
  .L2:
1 vle16.v v4,0(a0)
2 vmv.v.x v2,a7
3 vand.vv v2,v6,v2
4 slli    a2,a5,1
5 vrgatherei16.vv v1,v4,v2
6 sub     a4,a4,a5
7 vse16.v v1,0(a3)
8 add     a0,a0,a2
9 add     a3,a3,a2
  bne     a4,zero,.L2
 
But for bswap16 we may have a even simple code gen, which
has only 7 instructions in loop as below.
 
  .L5
1 vle8.v  v2,0(a5)
2 addi    a5,a5,32
3 vsrl.vi v4,v2,8
4 vsll.vi v2,v2,8
5 vor.vv  v4,v4,v2
6 vse8.v  v4,0(a4)
7 addi    a4,a4,32
  bne     a5,a6,.L5
 
Unfortunately, this way will make the insn in loop will grow up to
13 and 24 for bswap32 and bswap64. Thus, we will refine the code
gen for the bswap16 only, and leave both the bswap32 and bswap64
as is.
 
gcc/ChangeLog:
 
* config/riscv/riscv-v.cc (shuffle_bswap_pattern): New func impl
for shuffle bswap.
(expand_vec_perm_const_1): Add handling for shuffle bswap pattern.
 
gcc/testsuite/ChangeLog:
 
* gcc.target/riscv/rvv/autovec/vls/perm-4.c: Adjust checker.
* gcc.target/riscv/rvv/autovec/unop/bswap16-0.c: New test.
* gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c: New test.
* gcc.target/riscv/rvv/autovec/vls/bswap16-0.c: New test.
 
Signed-off-by: Pan Li <pan2.li@intel.com>
---
gcc/config/riscv/riscv-v.cc                   | 91 +++++++++++++++++++
.../riscv/rvv/autovec/unop/bswap16-0.c        | 17 ++++
.../riscv/rvv/autovec/unop/bswap16-run-0.c    | 44 +++++++++
.../riscv/rvv/autovec/vls/bswap16-0.c         | 34 +++++++
.../gcc.target/riscv/rvv/autovec/vls/perm-4.c |  4 +-
5 files changed, 188 insertions(+), 2 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c
 
diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 23633a2a74d..c72e411f125 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -3030,6 +3030,95 @@ shuffle_decompress_patterns (struct expand_vec_perm_d *d)
   return true;
}
+static bool
+shuffle_bswap_pattern (struct expand_vec_perm_d *d)
+{
+  HOST_WIDE_INT diff;
+  unsigned i, size, step;
+
+  if (!d->one_vector_p || !d->perm[0].is_constant (&diff) || !diff)
+    return false;
+
+  step = diff + 1;
+  size = step * GET_MODE_UNIT_BITSIZE (d->vmode);
+
+  switch (size)
+    {
+    case 16:
+      break;
+    case 32:
+    case 64:
+      /* We will have VEC_PERM_EXPR after rtl expand when invoking
+ __builtin_bswap. It will generate about 9 instructions in
+ loop as below, no matter it is bswap16, bswap32 or bswap64.
+    .L2:
+ 1 vle16.v v4,0(a0)
+ 2 vmv.v.x v2,a7
+ 3 vand.vv v2,v6,v2
+ 4 slli    a2,a5,1
+ 5 vrgatherei16.vv v1,v4,v2
+ 6 sub     a4,a4,a5
+ 7 vse16.v v1,0(a3)
+ 8 add     a0,a0,a2
+ 9 add     a3,a3,a2
+    bne     a4,zero,.L2
+
+ But for bswap16 we may have a even simple code gen, which
+ has only 7 instructions in loop as below.
+    .L5
+ 1 vle8.v  v2,0(a5)
+ 2 addi    a5,a5,32
+ 3 vsrl.vi v4,v2,8
+ 4 vsll.vi v2,v2,8
+ 5 vor.vv  v4,v4,v2
+ 6 vse8.v  v4,0(a4)
+ 7 addi    a4,a4,32
+    bne     a5,a6,.L5
+
+ Unfortunately, the instructions in loop will grow to 13 and 24
+ for bswap32 and bswap64. Thus, we will leverage vrgather (9 insn)
+ for both the bswap64 and bswap32, but take shift and or (7 insn)
+ for bswap16.
+       */
+    default:
+      return false;
+    }
+
+  for (i = 0; i < step; i++)
+    if (!d->perm.series_p (i, step, diff - i, step))
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  machine_mode vhi_mode;
+  poly_uint64 vhi_nunits = exact_div (GET_MODE_NUNITS (d->vmode), 2);
+
+  if (!get_vector_mode (HImode, vhi_nunits).exists (&vhi_mode))
+    return false;
+
+  /* Step-1: Move op0 to src with VHI mode.  */
+  rtx src = gen_reg_rtx (vhi_mode);
+  emit_move_insn (src, gen_lowpart (vhi_mode, d->op0));
+
+  /* Step-2: Shift right 8 bits to dest.  */
+  rtx dest = expand_binop (vhi_mode, lshr_optab, src, gen_int_mode (8, Pmode),
+    NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* Step-3: Shift left 8 bits to src.  */
+  src = expand_binop (vhi_mode, ashl_optab, src, gen_int_mode (8, Pmode),
+       NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* Step-4: Logic Or dest and src to dest.  */
+  dest = expand_binop (vhi_mode, ior_optab, dest, src,
+        NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* Step-5: Move src to target with VQI mode.  */
+  emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
+
+  return true;
+}
+
/* Recognize the pattern that can be shuffled by generic approach.  */
static bool
@@ -3089,6 +3178,8 @@ expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
    return true;
  if (shuffle_decompress_patterns (d))
    return true;
+   if (shuffle_bswap_pattern (d))
+     return true;
  if (shuffle_generic_patterns (d))
    return true;
  return false;
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c
new file mode 100644
index 00000000000..10d235a8edf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize -fno-vect-cost-model -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <stdint-gcc.h>
+#include "test-math.h"
+
+/*
+** test_uint16_t___builtin_bswap16:
+**   ...
+**   vsetvli\s+[atx][0-9]+,\s*zero,\s*e16,\s*m1,\s*ta,\s*ma
+**   vsrl\.vi\s+v[0-9]+,\s*v[0-9],\s*8+
+**   vsll\.vi\s+v[0-9]+,\s*v[0-9],\s*8+
+**   vor\.vv\s+v[0-9]+,\s*v[0-9],\s*v[0-9]+
+**   ...
+*/
+TEST_UNARY_CALL (uint16_t, __builtin_bswap16)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c
new file mode 100644
index 00000000000..8d45cebc6c2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c
@@ -0,0 +1,44 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-std=c99 -O3 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include <stdint-gcc.h>
+#include "test-math.h"
+
+#define ARRAY_SIZE 128
+
+uint16_t in[ARRAY_SIZE];
+uint16_t out[ARRAY_SIZE];
+uint16_t ref[ARRAY_SIZE];
+
+TEST_UNARY_CALL (uint16_t, __builtin_bswap16)
+TEST_ASSERT (uint16_t)
+
+/* TEST_INIT Arguments:
+   +-------+-------+---------------------------+---------+
+   | type  | input | reference                 | test id |
+   +-------+-------+---------------------------+---------+
+*/
+TEST_INIT (uint16_t, 0x1234u, __builtin_bswap16 (0x1234u), 1)
+TEST_INIT (uint16_t, 0x1122u, __builtin_bswap16 (0x1122u), 2)
+TEST_INIT (uint16_t, 0xa55au, __builtin_bswap16 (0xa55au), 3)
+TEST_INIT (uint16_t, 0x0000u, __builtin_bswap16 (0x0000u), 4)
+TEST_INIT (uint16_t, 0xffffu, __builtin_bswap16 (0xffffu), 5)
+TEST_INIT (uint16_t, 0x4321u, __builtin_bswap16 (0x4321u), 6)
+
+int
+main ()
+{
+  /* RUN_TEST Arguments:
+    +------+---------+-------------+----+-----+-----+------------+
+    | type | test id | fun to test | in | out | ref | array size |
+    +------+---------+-------------+----+-----+-----+------------+
+  */
+  RUN_TEST (uint16_t, 1, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 2, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 3, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 4, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 5, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 6, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c
new file mode 100644
index 00000000000..11880bae1f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8 -ffast-math -fdump-tree-optimized" } */
+
+#include "def.h"
+
+DEF_OP_V (bswap16, 1, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 2, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 4, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 8, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 16, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 32, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 64, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 128, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 256, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 512, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 1024, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 2048, uint16_t, __builtin_bswap16)
+
+/* { dg-final { scan-assembler-not {csrr} } } */
+/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4,4" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "16,16" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "32,32" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "64,64" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "128,128" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "256,256" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "512,512" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "1024,1024" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2048,2048" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4096,4096" "optimized" } } */
+/* { dg-final { scan-assembler-times {vsrl\.vi\s+v[0-9]+,\s*v[0-9]+,\s*8} 11 } } */
+/* { dg-final { scan-assembler-times {vsll\.vi\s+v[0-9]+,\s*v[0-9]+,\s*8} 11 } } */
+/* { dg-final { scan-assembler-times {vor\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 11 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
index 4d6862cf1c0..d2d49388a39 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
@@ -3,7 +3,7 @@
#include "../vls-vlmax/perm-4.c"
-/* { dg-final { scan-assembler-times {vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 19 } } */
+/* { dg-final { scan-assembler-times {vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 18 } } */
/* { dg-final { scan-assembler-times {vrgatherei16\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 12 } } */
-/* { dg-final { scan-assembler-times {vrsub\.vi} 24 } } */
+/* { dg-final { scan-assembler-times {vrsub\.vi} 23 } } */
/* { dg-final { scan-assembler-times {vrsub\.vx} 7 } } */
-- 
2.34.1
 
 

^ permalink raw reply	[flat|nested] 6+ messages in thread

* RE: [PATCH v2] RISC-V: Refine bswap16 auto vectorization code gen
  2023-10-09 13:11   ` juzhe.zhong
@ 2023-10-09 13:13     ` Li, Pan2
  0 siblings, 0 replies; 6+ messages in thread
From: Li, Pan2 @ 2023-10-09 13:13 UTC (permalink / raw)
  To: juzhe.zhong, gcc-patches; +Cc: Wang, Yanzhang, kito.cheng

[-- Attachment #1: Type: text/plain, Size: 12159 bytes --]

Committed, thanks Juzhe.

Pan

From: juzhe.zhong@rivai.ai <juzhe.zhong@rivai.ai>
Sent: Monday, October 9, 2023 9:11 PM
To: Li, Pan2 <pan2.li@intel.com>; gcc-patches <gcc-patches@gcc.gnu.org>
Cc: Li, Pan2 <pan2.li@intel.com>; Wang, Yanzhang <yanzhang.wang@intel.com>; kito.cheng <kito.cheng@gmail.com>
Subject: Re: [PATCH v2] RISC-V: Refine bswap16 auto vectorization code gen

LGTM now.

Thanks.

________________________________
juzhe.zhong@rivai.ai<mailto:juzhe.zhong@rivai.ai>

From: pan2.li<mailto:pan2.li@intel.com>
Date: 2023-10-09 21:09
To: gcc-patches<mailto:gcc-patches@gcc.gnu.org>
CC: juzhe.zhong<mailto:juzhe.zhong@rivai.ai>; pan2.li<mailto:pan2.li@intel.com>; yanzhang.wang<mailto:yanzhang.wang@intel.com>; kito.cheng<mailto:kito.cheng@gmail.com>
Subject: [PATCH v2] RISC-V: Refine bswap16 auto vectorization code gen
From: Pan Li <pan2.li@intel.com<mailto:pan2.li@intel.com>>

Update in v2

* Remove emit helper functions.
* Take expand_binop instead.

Original log:

This patch refines the code gen for bswap16.

We will have VEC_PERM_EXPR after rtl expand when invoking
__builtin_bswap. It will generate about 9 instructions in the
loop as below, no matter whether it is bswap16, bswap32 or bswap64.

  .L2:
1 vle16.v v4,0(a0)
2 vmv.v.x v2,a7
3 vand.vv v2,v6,v2
4 slli    a2,a5,1
5 vrgatherei16.vv v1,v4,v2
6 sub     a4,a4,a5
7 vse16.v v1,0(a3)
8 add     a0,a0,a2
9 add     a3,a3,a2
  bne     a4,zero,.L2

But for bswap16 we may have an even simpler code gen, which
has only 7 instructions in the loop as below.

  .L5:
1 vle8.v  v2,0(a5)
2 addi    a5,a5,32
3 vsrl.vi v4,v2,8
4 vsll.vi v2,v2,8
5 vor.vv  v4,v4,v2
6 vse8.v  v4,0(a4)
7 addi    a4,a4,32
  bne     a5,a6,.L5

Unfortunately, this approach makes the instruction count in the loop
grow to 13 and 24 for bswap32 and bswap64 respectively. Thus, we refine
the code gen for bswap16 only, and leave both bswap32 and bswap64
as is.

gcc/ChangeLog:

* config/riscv/riscv-v.cc (shuffle_bswap_pattern): New func impl
for shuffle bswap.
(expand_vec_perm_const_1): Add handling for shuffle bswap pattern.

gcc/testsuite/ChangeLog:

* gcc.target/riscv/rvv/autovec/vls/perm-4.c: Adjust checker.
* gcc.target/riscv/rvv/autovec/unop/bswap16-0.c: New test.
* gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c: New test.
* gcc.target/riscv/rvv/autovec/vls/bswap16-0.c: New test.

Signed-off-by: Pan Li <pan2.li@intel.com<mailto:pan2.li@intel.com>>
---
gcc/config/riscv/riscv-v.cc                   | 91 +++++++++++++++++++
.../riscv/rvv/autovec/unop/bswap16-0.c        | 17 ++++
.../riscv/rvv/autovec/unop/bswap16-run-0.c    | 44 +++++++++
.../riscv/rvv/autovec/vls/bswap16-0.c         | 34 +++++++
.../gcc.target/riscv/rvv/autovec/vls/perm-4.c |  4 +-
5 files changed, 188 insertions(+), 2 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c
create mode 100644 gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c

diff --git a/gcc/config/riscv/riscv-v.cc b/gcc/config/riscv/riscv-v.cc
index 23633a2a74d..c72e411f125 100644
--- a/gcc/config/riscv/riscv-v.cc
+++ b/gcc/config/riscv/riscv-v.cc
@@ -3030,6 +3030,95 @@ shuffle_decompress_patterns (struct expand_vec_perm_d *d)
   return true;
}
+static bool
+shuffle_bswap_pattern (struct expand_vec_perm_d *d)
+{
+  HOST_WIDE_INT diff;
+  unsigned i, size, step;
+
+  if (!d->one_vector_p || !d->perm[0].is_constant (&diff) || !diff)
+    return false;
+
+  step = diff + 1;
+  size = step * GET_MODE_UNIT_BITSIZE (d->vmode);
+
+  switch (size)
+    {
+    case 16:
+      break;
+    case 32:
+    case 64:
+      /* We will have VEC_PERM_EXPR after rtl expand when invoking
+ __builtin_bswap.  It will generate about 9 instructions in the
+ loop as below, no matter whether it is bswap16, bswap32 or bswap64.
+    .L2:
+ 1 vle16.v v4,0(a0)
+ 2 vmv.v.x v2,a7
+ 3 vand.vv v2,v6,v2
+ 4 slli    a2,a5,1
+ 5 vrgatherei16.vv v1,v4,v2
+ 6 sub     a4,a4,a5
+ 7 vse16.v v1,0(a3)
+ 8 add     a0,a0,a2
+ 9 add     a3,a3,a2
+    bne     a4,zero,.L2
+
+ But for bswap16 we may have an even simpler code gen, which
+ has only 7 instructions in the loop as below.
+    .L5:
+ 1 vle8.v  v2,0(a5)
+ 2 addi    a5,a5,32
+ 3 vsrl.vi v4,v2,8
+ 4 vsll.vi v2,v2,8
+ 5 vor.vv  v4,v4,v2
+ 6 vse8.v  v4,0(a4)
+ 7 addi    a4,a4,32
+    bne     a5,a6,.L5
+
+ Unfortunately, the instructions in the loop would grow to 13 and 24
+ for bswap32 and bswap64.  Thus, we leverage vrgather (9 insns)
+ for both bswap64 and bswap32, but take shift and or (7 insns)
+ for bswap16.
+       */
+    default:
+      return false;
+    }
+
+  for (i = 0; i < step; i++)
+    if (!d->perm.series_p (i, step, diff - i, step))
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  machine_mode vhi_mode;
+  poly_uint64 vhi_nunits = exact_div (GET_MODE_NUNITS (d->vmode), 2);
+
+  if (!get_vector_mode (HImode, vhi_nunits).exists (&vhi_mode))
+    return false;
+
+  /* Step-1: Move op0 to src, viewed as a vector of HImode elements.  */
+  rtx src = gen_reg_rtx (vhi_mode);
+  emit_move_insn (src, gen_lowpart (vhi_mode, d->op0));
+
+  /* Step-2: Logical shift right each HImode element by 8 bits to dest.  */
+  rtx dest = expand_binop (vhi_mode, lshr_optab, src, gen_int_mode (8, Pmode),
+    NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* Step-3: Shift left each HImode element by 8 bits, back into src.  */
+  src = expand_binop (vhi_mode, ashl_optab, src, gen_int_mode (8, Pmode),
+       NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* Step-4: Inclusive-OR dest and src into dest.  */
+  dest = expand_binop (vhi_mode, ior_optab, dest, src,
+        NULL_RTX, 0, OPTAB_DIRECT);
+
+  /* Step-5: Move dest to target, viewed in the original mode d->vmode.  */
+  emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
+
+  return true;
+}
+
/* Recognize the pattern that can be shuffled by generic approach.  */
static bool
@@ -3089,6 +3178,8 @@ expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
    return true;
  if (shuffle_decompress_patterns (d))
    return true;
+   if (shuffle_bswap_pattern (d))
+     return true;
  if (shuffle_generic_patterns (d))
    return true;
  return false;
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c
new file mode 100644
index 00000000000..10d235a8edf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-0.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize -fno-vect-cost-model -fno-schedule-insns -fno-schedule-insns2" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include <stdint-gcc.h>
+#include "test-math.h"
+
+/*
+** test_uint16_t___builtin_bswap16:
+**   ...
+**   vsetvli\s+[atx][0-9]+,\s*zero,\s*e16,\s*m1,\s*ta,\s*ma
+**   vsrl\.vi\s+v[0-9]+,\s*v[0-9],\s*8+
+**   vsll\.vi\s+v[0-9]+,\s*v[0-9],\s*8+
+**   vor\.vv\s+v[0-9]+,\s*v[0-9],\s*v[0-9]+
+**   ...
+*/
+TEST_UNARY_CALL (uint16_t, __builtin_bswap16)
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c
new file mode 100644
index 00000000000..8d45cebc6c2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/unop/bswap16-run-0.c
@@ -0,0 +1,44 @@
+/* { dg-do run { target { riscv_v } } } */
+/* { dg-additional-options "-std=c99 -O3 -ftree-vectorize -fno-vect-cost-model" } */
+
+#include <stdint-gcc.h>
+#include "test-math.h"
+
+#define ARRAY_SIZE 128
+
+uint16_t in[ARRAY_SIZE];
+uint16_t out[ARRAY_SIZE];
+uint16_t ref[ARRAY_SIZE];
+
+TEST_UNARY_CALL (uint16_t, __builtin_bswap16)
+TEST_ASSERT (uint16_t)
+
+/* TEST_INIT Arguments:
+   +-------+-------+---------------------------+---------+
+   | type  | input | reference                 | test id |
+   +-------+-------+---------------------------+---------+
+*/
+TEST_INIT (uint16_t, 0x1234u, __builtin_bswap16 (0x1234u), 1)
+TEST_INIT (uint16_t, 0x1122u, __builtin_bswap16 (0x1122u), 2)
+TEST_INIT (uint16_t, 0xa55au, __builtin_bswap16 (0xa55au), 3)
+TEST_INIT (uint16_t, 0x0000u, __builtin_bswap16 (0x0000u), 4)
+TEST_INIT (uint16_t, 0xffffu, __builtin_bswap16 (0xffffu), 5)
+TEST_INIT (uint16_t, 0x4321u, __builtin_bswap16 (0x4321u), 6)
+
+int
+main ()
+{
+  /* Run test ids 1-6 (matching the TEST_INIT ids).  RUN_TEST arguments:
+    +------+---------+-------------+----+-----+-----+------------+
+    | type | test id | fun to test | in | out | ref | array size |
+    +------+---------+-------------+----+-----+-----+------------+
+  */
+  RUN_TEST (uint16_t, 1, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 2, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 3, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 4, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 5, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+  RUN_TEST (uint16_t, 6, __builtin_bswap16, in, out, ref, ARRAY_SIZE);
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c
new file mode 100644
index 00000000000..11880bae1f8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/bswap16-0.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv_zvfh_zvl4096b -mabi=lp64d -O3 --param=riscv-autovec-lmul=m8 -ffast-math -fdump-tree-optimized" } */
+
+#include "def.h"
+
+DEF_OP_V (bswap16, 1, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 2, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 4, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 8, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 16, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 32, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 64, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 128, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 256, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 512, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 1024, uint16_t, __builtin_bswap16)
+DEF_OP_V (bswap16, 2048, uint16_t, __builtin_bswap16)
+
+/* { dg-final { scan-assembler-not {csrr} } } */
+/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4,4" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "16,16" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "32,32" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "64,64" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "128,128" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "256,256" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "512,512" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "1024,1024" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "2048,2048" "optimized" } } */
+/* { dg-final { scan-tree-dump-not "4096,4096" "optimized" } } */
+/* { dg-final { scan-assembler-times {vsrl\.vi\s+v[0-9]+,\s*v[0-9]+,\s*8} 11 } } */
+/* { dg-final { scan-assembler-times {vsll\.vi\s+v[0-9]+,\s*v[0-9]+,\s*8} 11 } } */
+/* { dg-final { scan-assembler-times {vor\.vv\s+v[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 11 } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
index 4d6862cf1c0..d2d49388a39 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/perm-4.c
@@ -3,7 +3,7 @@
#include "../vls-vlmax/perm-4.c"
-/* { dg-final { scan-assembler-times {vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 19 } } */
+/* { dg-final { scan-assembler-times {vrgather\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 18 } } */
/* { dg-final { scan-assembler-times {vrgatherei16\.vv\tv[0-9]+,\s*v[0-9]+,\s*v[0-9]+} 12 } } */
-/* { dg-final { scan-assembler-times {vrsub\.vi} 24 } } */
+/* { dg-final { scan-assembler-times {vrsub\.vi} 23 } } */
/* { dg-final { scan-assembler-times {vrsub\.vx} 7 } } */
--
2.34.1



^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2023-10-09 13:13 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-10-09  8:51 [PATCH v1] RISC-V: Refine bswap16 auto vectorization code gen pan2.li
2023-10-09  9:04 ` juzhe.zhong
2023-10-09 10:45   ` Li, Pan2
2023-10-09 13:09 ` [PATCH v2] " pan2.li
2023-10-09 13:11   ` juzhe.zhong
2023-10-09 13:13     ` Li, Pan2

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).