[PATCH] x86:Add 3-instruction subroutine vector shift for V16QI in ix86_expand_vec_perm_const

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH] x86:Add 3-instruction subroutine vector shift for V16QI in ix86_expand_vec_perm_const_1 [PR107563]
@ 2024-05-08  2:42 Levy Hsu
  2024-05-08  7:07 ` Uros Bizjak
  0 siblings, 1 reply; 6+ messages in thread
From: Levy Hsu @ 2024-05-08  2:42 UTC (permalink / raw)
  To: gcc-patches; +Cc: admin, liwei.xu, crazylht

        PR target/107563

gcc/ChangeLog:

	* config/i386/i386-expand.cc (expand_vec_perm_psrlw_psllw_por): New
	subroutine.
	(ix86_expand_vec_perm_const_1): New Entry.

gcc/testsuite/ChangeLog:

	* g++.target/i386/pr107563.C: New test.
---
 gcc/config/i386/i386-expand.cc           | 64 ++++++++++++++++++++++++
 gcc/testsuite/g++.target/i386/pr107563.C | 23 +++++++++
 2 files changed, 87 insertions(+)
 create mode 100755 gcc/testsuite/g++.target/i386/pr107563.C

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 2f27bfb484c..2718b0acb87 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -22362,6 +22362,67 @@ expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
   return true;
 }
 
+/* A subroutine of ix86_expand_vec_perm_const_1.  Implement the one-operand
+   permutation that swaps the two bytes of every 16-bit lane, e.g.
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6);
+   as (x >> 8) | (x << 8) on HImode lanes (psrlw, psllw, por).  The right
+   shift must be logical; psraw would OR sign-bit copies into the result.  */
+
+static bool
+expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
+{
+  rtx (*gen_shr) (rtx, rtx, rtx);
+  rtx (*gen_shl) (rtx, rtx, rtx);
+  rtx (*gen_or) (rtx, rtx, rtx);
+  machine_mode mode = VOIDmode;
+
+  if (!TARGET_SSE2 || !d->one_operand_p)
+    return false;
+
+  switch (d->vmode)
+    {
+    case E_V8QImode:
+      if (!TARGET_MMX_WITH_SSE)
+	return false;
+      mode = V4HImode;
+      gen_shr = gen_lshrv4hi3;
+      gen_shl = gen_ashlv4hi3;
+      gen_or = gen_iorv4hi3;
+      break;
+    case E_V16QImode:
+      mode = V8HImode;
+      gen_shr = gen_lshrv8hi3;
+      gen_shl = gen_ashlv8hi3;
+      gen_or = gen_iorv8hi3;
+      break;
+    default: return false;
+    }
+
+  if (!rtx_equal_p (d->op0, d->op1))
+    return false;
+
+  for (unsigned i = 0; i < d->nelt; i += 2)
+    if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  rtx tmp1 = gen_reg_rtx (mode);
+  rtx tmp2 = gen_reg_rtx (mode);
+  rtx op0 = force_reg (d->vmode, d->op0);
+
+  emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
+  emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
+  emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
+  emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
+  emit_insn (gen_or (tmp1, tmp1, tmp2));
+  emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
    permutation using two vperm2f128, followed by a vshufpd insn blending
    the two vectors together.  */
@@ -23781,6 +23842,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 
   if (expand_vec_perm_2perm_pblendv (d, false))
     return true;
+
+  if (expand_vec_perm_psrlw_psllw_por (d))
+    return true;
 
   /* Try sequences of four instructions.  */
 
diff --git a/gcc/testsuite/g++.target/i386/pr107563.C b/gcc/testsuite/g++.target/i386/pr107563.C
new file mode 100755
index 00000000000..5b0c648e8f1
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr107563.C
@@ -0,0 +1,23 @@
+/* PR target/107563 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-final { scan-assembler-not "movzbl" } } */
+/* { dg-final { scan-assembler-not "salq" } } */
+/* { dg-final { scan-assembler-not "orq" } } */
+/* { dg-final { scan-assembler-not "punpcklqdq" } } */
+/* { dg-final { scan-assembler-times "psllw" 2 } } */
+/* { dg-final { scan-assembler-times "psrlw" 2 } } */
+/* { dg-final { scan-assembler-not "psraw" } } */
+/* { dg-final { scan-assembler-times "por" 2 } } */
+
+using temp_vec_type [[__gnu__::__vector_size__ (16)]] = char;
+void foo (temp_vec_type& v) noexcept
+{
+  v = __builtin_shufflevector(v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
+}
+
+using temp_vec_type2 [[__gnu__::__vector_size__ (8)]] = char;
+void foo2 (temp_vec_type2& v) noexcept
+{
+  v=__builtin_shufflevector(v,v,1,0,3,2,5,4,7,6);
+}
-- 
2.31.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] x86:Add 3-instruction subroutine vector shift for V16QI in ix86_expand_vec_perm_const_1 [PR107563]
  2024-05-08  2:42 [PATCH] x86:Add 3-instruction subroutine vector shift for V16QI in ix86_expand_vec_perm_const_1 [PR107563] Levy Hsu
@ 2024-05-08  7:07 ` Uros Bizjak
  2024-05-09  9:09   ` [PATCH 1/1] " Levy Hsu
  0 siblings, 1 reply; 6+ messages in thread
From: Uros Bizjak @ 2024-05-08  7:07 UTC (permalink / raw)
  To: Levy Hsu; +Cc: gcc-patches, liwei.xu, crazylht

On Wed, May 8, 2024 at 4:44 AM Levy Hsu <admin@levyhsu.com> wrote:
>
>         PR target/107563
>
> gcc/ChangeLog:
>
>         * config/i386/i386-expand.cc (expand_vec_perm_psrlw_psllw_por): New
>         subroutine.
>         (ix86_expand_vec_perm_const_1): New Entry.
>
> gcc/testsuite/ChangeLog:
>
>         * g++.target/i386/pr107563.C: New test.
> ---
>  gcc/config/i386/i386-expand.cc           | 64 ++++++++++++++++++++++++
>  gcc/testsuite/g++.target/i386/pr107563.C | 23 +++++++++
>  2 files changed, 87 insertions(+)
>  create mode 100755 gcc/testsuite/g++.target/i386/pr107563.C
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index 2f27bfb484c..2718b0acb87 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -22362,6 +22362,67 @@ expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
>    return true;
>  }
>
> +/* A subroutine of ix86_expand_vec_perm_const_1.
> +   Implement a permutation with psrlw, psllw and por.
> +   It handles case:
> +   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
> +   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6); */
> +
> +static bool
> +expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
> +{
> +  unsigned i;
> +  rtx (*gen_shr) (rtx, rtx, rtx);
> +  rtx (*gen_shl) (rtx, rtx, rtx);
> +  rtx (*gen_or) (rtx, rtx, rtx);
> +  machine_mode mode = VOIDmode;
> +
> +  if (!TARGET_SSE2 || !d->one_operand_p)
> +    return false;
> +
> +  switch (d->vmode)
> +    {
> +    case E_V8QImode:
> +      if (!TARGET_MMX_WITH_SSE)
> +       return false;
> +      mode = V4HImode;
> +      gen_shr = gen_ashrv4hi3;
> +      gen_shl = gen_ashlv4hi3;
> +      gen_or = gen_iorv4hi3;
> +      break;
> +    case E_V16QImode:
> +      mode = V8HImode;
> +      gen_shr = gen_vlshrv8hi3;
> +      gen_shl = gen_vashlv8hi3;
> +      gen_or = gen_iorv8hi3;
> +      break;
> +    default: return false;
> +    }
> +
> +  if (!rtx_equal_p (d->op0, d->op1))
> +    return false;
> +
> +  for (i = 0; i < d->nelt; i += 2)
> +    if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
> +      return false;
> +
> +  if (d->testing_p)
> +    return true;
> +
> +  rtx tmp1 = gen_reg_rtx (mode);
> +  rtx tmp2 = gen_reg_rtx (mode);
> +  rtx op0 = force_reg (d->vmode, d->op0);
> +
> +  emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
> +  emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
> +  emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
> +  emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
> +  emit_insn (gen_or (tmp1, tmp1, tmp2));
> +  emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
> +
> +  return true;
> +}
> +
>  /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
>     permutation using two vperm2f128, followed by a vshufpd insn blending
>     the two vectors together.  */
> @@ -23781,6 +23842,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>
>    if (expand_vec_perm_2perm_pblendv (d, false))
>      return true;
> +
> +  if (expand_vec_perm_psrlw_psllw_por (d))
> +    return true;
>
>    /* Try sequences of four instructions.  */
>
> diff --git a/gcc/testsuite/g++.target/i386/pr107563.C b/gcc/testsuite/g++.target/i386/pr107563.C
> new file mode 100755
> index 00000000000..5b0c648e8f1
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/i386/pr107563.C
> @@ -0,0 +1,23 @@
> +/* PR target/107563.C */
> +/* { dg-do compile { target { ! ia32 } } } */

Please split the testcase to two files, one (e.g. pr107563-a.C)
testing 8-byte vectors and the other (e.g. pr107563-b.C) using 16-byte
vectors. The latter can also be tested with 32-bit targets.

Uros.

> +/* { dg-options "-std=c++2b -O3 -msse2" } */
> +/* { dg-final { scan-assembler-not "movzbl" } } */
> +/* { dg-final { scan-assembler-not "salq" } } */
> +/* { dg-final { scan-assembler-not "orq" } } */
> +/* { dg-final { scan-assembler-not "punpcklqdq" } } */
> +/* { dg-final { scan-assembler-times "psllw" 2 } } */
> +/* { dg-final { scan-assembler-times "psrlw" 1 } } */
> +/* { dg-final { scan-assembler-times "psraw" 1 } } */
> +/* { dg-final { scan-assembler-times "por" 2 } } */
> +
> +using temp_vec_type [[__gnu__::__vector_size__ (16)]] = char;
> +void foo (temp_vec_type& v) noexcept
> +{
> +  v = __builtin_shufflevector(v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
> +}
> +
> +using temp_vec_type2 [[__gnu__::__vector_size__ (8)]] = char;
> +void foo2 (temp_vec_type2& v) noexcept
> +{
> +  v=__builtin_shufflevector(v,v,1,0,3,2,5,4,7,6);
> +}
> --
> 2.31.1
>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 1/1] [PATCH] x86:Add 3-instruction subroutine vector shift for V16QI in ix86_expand_vec_perm_const_1 [PR107563]
  2024-05-08  7:07 ` Uros Bizjak
@ 2024-05-09  9:09   ` Levy Hsu
  2024-05-14  6:05     ` Uros Bizjak
  0 siblings, 1 reply; 6+ messages in thread
From: Levy Hsu @ 2024-05-09  9:09 UTC (permalink / raw)
  To: gcc-patches; +Cc: liwei.xu, admin, crazylht

Hi All

This patch adds a new subroutine, called from ix86_expand_vec_perm_const_1,
for the one-operand V16QI and V8QI permutations that swap the two bytes of
every 16-bit lane (indices 1,0,3,2,...).
It uses a three-instruction sequence psrlw, psllw, and por
to handle these specific vector shuffle operations more efficiently.
The V16QI case needs only SSE2; the V8QI case also needs TARGET_MMX_WITH_SSE.

Bootstrapped and tested on x86_64-linux-gnu, OK for trunk?

Best
Levy

gcc/ChangeLog:

	PR target/107563
	* config/i386/i386-expand.cc (expand_vec_perm_psrlw_psllw_por): New
	subroutine.
	(ix86_expand_vec_perm_const_1): New Entry.

gcc/testsuite/ChangeLog:

	PR target/107563
	* g++.target/i386/pr107563-a.C: New test.
	* g++.target/i386/pr107563-b.C: New test.
---
 gcc/config/i386/i386-expand.cc             | 64 ++++++++++++++++++++++
 gcc/testsuite/g++.target/i386/pr107563-a.C | 13 +++++
 gcc/testsuite/g++.target/i386/pr107563-b.C | 12 ++++
 3 files changed, 89 insertions(+)
 create mode 100755 gcc/testsuite/g++.target/i386/pr107563-a.C
 create mode 100755 gcc/testsuite/g++.target/i386/pr107563-b.C

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 2f27bfb484c..5098d2886bb 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -22362,6 +22362,67 @@ expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
   return true;
 }
 
+/* A subroutine of ix86_expand_vec_perm_const_1.  Implement the one-operand
+   permutation that swaps the two bytes of every 16-bit lane, e.g.
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6);
+   as (x >> 8) | (x << 8) on HImode lanes (psrlw, psllw, por).  The right
+   shift must be logical; psraw would OR sign-bit copies into the result.  */
+
+static bool
+expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
+{
+  rtx (*gen_shr) (rtx, rtx, rtx);
+  rtx (*gen_shl) (rtx, rtx, rtx);
+  rtx (*gen_or) (rtx, rtx, rtx);
+  machine_mode mode = VOIDmode;
+
+  if (!TARGET_SSE2 || !d->one_operand_p)
+    return false;
+
+  switch (d->vmode)
+    {
+    case E_V8QImode:
+      if (!TARGET_MMX_WITH_SSE)
+	return false;
+      mode = V4HImode;
+      gen_shr = gen_lshrv4hi3;
+      gen_shl = gen_ashlv4hi3;
+      gen_or = gen_iorv4hi3;
+      break;
+    case E_V16QImode:
+      mode = V8HImode;
+      gen_shr = gen_lshrv8hi3;
+      gen_shl = gen_ashlv8hi3;
+      gen_or = gen_iorv8hi3;
+      break;
+    default: return false;
+    }
+
+  if (!rtx_equal_p (d->op0, d->op1))
+    return false;
+
+  for (unsigned i = 0; i < d->nelt; i += 2)
+    if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  rtx tmp1 = gen_reg_rtx (mode);
+  rtx tmp2 = gen_reg_rtx (mode);
+  rtx op0 = force_reg (d->vmode, d->op0);
+
+  emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
+  emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
+  emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
+  emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
+  emit_insn (gen_or (tmp1, tmp1, tmp2));
+  emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
    permutation using two vperm2f128, followed by a vshufpd insn blending
    the two vectors together.  */
@@ -23782,6 +23843,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_2perm_pblendv (d, false))
     return true;
 
+  if (expand_vec_perm_psrlw_psllw_por (d))
+    return true;
+
   /* Try sequences of four instructions.  */
 
   if (expand_vec_perm_even_odd_trunc (d))
diff --git a/gcc/testsuite/g++.target/i386/pr107563-a.C b/gcc/testsuite/g++.target/i386/pr107563-a.C
new file mode 100755
index 00000000000..605c1bdf814
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr107563-a.C
@@ -0,0 +1,13 @@
+/* PR target/107563 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-final { scan-assembler-times "psllw" 1 } } */
+/* { dg-final { scan-assembler-times "psrlw" 1 } } */
+/* { dg-final { scan-assembler-times "por" 1 } } */
+
+using temp_vec_type2 [[__gnu__::__vector_size__(8)]] = char;
+
+void foo2(temp_vec_type2& v) noexcept
+{
+  v = __builtin_shufflevector(v, v, 1, 0, 3, 2, 5, 4, 7, 6);
+}
diff --git a/gcc/testsuite/g++.target/i386/pr107563-b.C b/gcc/testsuite/g++.target/i386/pr107563-b.C
new file mode 100755
index 00000000000..0ce3e8263bb
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr107563-b.C
@@ -0,0 +1,12 @@
+/* PR target/107563 */
+/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-final { scan-assembler-times "psllw" 1 } } */
+/* { dg-final { scan-assembler-times "psrlw" 1 } } */
+/* { dg-final { scan-assembler-times "por" 1 } } */
+
+using temp_vec_type [[__gnu__::__vector_size__(16)]] = char;
+
+void foo(temp_vec_type& v) noexcept
+{
+  v = __builtin_shufflevector(v, v, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+}
-- 
2.31.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 1/1] [PATCH] x86:Add 3-instruction subroutine vector shift for V16QI in ix86_expand_vec_perm_const_1 [PR107563]
  2024-05-09  9:09   ` [PATCH 1/1] " Levy Hsu
@ 2024-05-14  6:05     ` Uros Bizjak
  2024-05-14  9:04       ` [PATCH] x86: Add " Levy Hsu
  0 siblings, 1 reply; 6+ messages in thread
From: Uros Bizjak @ 2024-05-14  6:05 UTC (permalink / raw)
  To: Levy Hsu; +Cc: gcc-patches, liwei.xu, crazylht

On Thu, May 9, 2024 at 11:12 AM Levy Hsu <admin@levyhsu.com> wrote:
>
> Hi All
>
> We've introduced a new subroutine in ix86_expand_vec_perm_const_1
> to optimize vector shifting for the V16QI type on x86.
> This patch uses a three-instruction sequence psrlw, psllw, and por
> to handle specific vector shuffle operations more efficiently.
> The change aims to improve assembly code generation for configurations
> supporting SSE2.
>
> Bootstrapped and tested on x86_64-linux-gnu, OK for trunk?
>
> Best
> Levy
>
> gcc/ChangeLog:
>
>         PR target/107563
>         * config/i386/i386-expand.cc (expand_vec_perm_psrlw_psllw_por): New
>         subroutine.
>         (ix86_expand_vec_perm_const_1): New Entry.

Please say (ix86_expand_vec_perm_const_1): Call expand_vec_perm_psrlw_psllw_por.

>
> gcc/testsuite/ChangeLog:
>
>         PR target/107563
>         * g++.target/i386/pr107563-a.C: New test.
>         * g++.target/i386/pr107563-b.C: New test.

OK with the above adjustment.

Thanks,
Uros.

> ---
>  gcc/config/i386/i386-expand.cc             | 64 ++++++++++++++++++++++
>  gcc/testsuite/g++.target/i386/pr107563-a.C | 13 +++++
>  gcc/testsuite/g++.target/i386/pr107563-b.C | 12 ++++
>  3 files changed, 89 insertions(+)
>  create mode 100755 gcc/testsuite/g++.target/i386/pr107563-a.C
>  create mode 100755 gcc/testsuite/g++.target/i386/pr107563-b.C
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index 2f27bfb484c..5098d2886bb 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -22362,6 +22362,67 @@ expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
>    return true;
>  }
>
> +/* A subroutine of ix86_expand_vec_perm_const_1.
> +   Implement a permutation with psrlw, psllw and por.
> +   It handles case:
> +   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
> +   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6); */
> +
> +static bool
> +expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
> +{
> +  unsigned i;
> +  rtx (*gen_shr) (rtx, rtx, rtx);
> +  rtx (*gen_shl) (rtx, rtx, rtx);
> +  rtx (*gen_or) (rtx, rtx, rtx);
> +  machine_mode mode = VOIDmode;
> +
> +  if (!TARGET_SSE2 || !d->one_operand_p)
> +    return false;
> +
> +  switch (d->vmode)
> +    {
> +    case E_V8QImode:
> +      if (!TARGET_MMX_WITH_SSE)
> +       return false;
> +      mode = V4HImode;
> +      gen_shr = gen_ashrv4hi3;
> +      gen_shl = gen_ashlv4hi3;
> +      gen_or = gen_iorv4hi3;
> +      break;
> +    case E_V16QImode:
> +      mode = V8HImode;
> +      gen_shr = gen_vlshrv8hi3;
> +      gen_shl = gen_vashlv8hi3;
> +      gen_or = gen_iorv8hi3;
> +      break;
> +    default: return false;
> +    }
> +
> +  if (!rtx_equal_p (d->op0, d->op1))
> +    return false;
> +
> +  for (i = 0; i < d->nelt; i += 2)
> +    if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
> +      return false;
> +
> +  if (d->testing_p)
> +    return true;
> +
> +  rtx tmp1 = gen_reg_rtx (mode);
> +  rtx tmp2 = gen_reg_rtx (mode);
> +  rtx op0 = force_reg (d->vmode, d->op0);
> +
> +  emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
> +  emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
> +  emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
> +  emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
> +  emit_insn (gen_or (tmp1, tmp1, tmp2));
> +  emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
> +
> +  return true;
> +}
> +
>  /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
>     permutation using two vperm2f128, followed by a vshufpd insn blending
>     the two vectors together.  */
> @@ -23782,6 +23843,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>    if (expand_vec_perm_2perm_pblendv (d, false))
>      return true;
>
> +  if (expand_vec_perm_psrlw_psllw_por (d))
> +    return true;
> +
>    /* Try sequences of four instructions.  */
>
>    if (expand_vec_perm_even_odd_trunc (d))
> diff --git a/gcc/testsuite/g++.target/i386/pr107563-a.C b/gcc/testsuite/g++.target/i386/pr107563-a.C
> new file mode 100755
> index 00000000000..605c1bdf814
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/i386/pr107563-a.C
> @@ -0,0 +1,13 @@
> +/* PR target/107563.C */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-std=c++2b -O3 -msse2" } */
> +/* { dg-final { scan-assembler-times "psllw" 1 } } */
> +/* { dg-final { scan-assembler-times "psraw" 1 } } */
> +/* { dg-final { scan-assembler-times "por" 1 } } */
> +
> +using temp_vec_type2 [[__gnu__::__vector_size__(8)]] = char;
> +
> +void foo2(temp_vec_type2& v) noexcept
> +{
> +  v = __builtin_shufflevector(v, v, 1, 0, 3, 2, 5, 4, 7, 6);
> +}
> diff --git a/gcc/testsuite/g++.target/i386/pr107563-b.C b/gcc/testsuite/g++.target/i386/pr107563-b.C
> new file mode 100755
> index 00000000000..0ce3e8263bb
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/i386/pr107563-b.C
> @@ -0,0 +1,12 @@
> +/* PR target/107563.C */
> +/* { dg-options "-std=c++2b -O3 -msse2" } */
> +/* { dg-final { scan-assembler-times "psllw" 1 } } */
> +/* { dg-final { scan-assembler-times "psrlw" 1 } } */
> +/* { dg-final { scan-assembler-times "por" 1 } } */
> +
> +using temp_vec_type [[__gnu__::__vector_size__(16)]] = char;
> +
> +void foo(temp_vec_type& v) noexcept
> +{
> +  v = __builtin_shufflevector(v, v, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
> +}
> --
> 2.31.1
>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH] x86: Add 3-instruction subroutine vector shift for V16QI in ix86_expand_vec_perm_const_1 [PR107563]
  2024-05-14  6:05     ` Uros Bizjak
@ 2024-05-14  9:04       ` Levy Hsu
  0 siblings, 0 replies; 6+ messages in thread
From: Levy Hsu @ 2024-05-14  9:04 UTC (permalink / raw)
  To: gcc-patches; +Cc: liwei.xu, admin, crazylht

Hi All

This patch adds a new subroutine, called from ix86_expand_vec_perm_const_1,
for the one-operand V16QI and V8QI permutations that swap the two bytes of
every 16-bit lane (indices 1,0,3,2,...).
It uses a three-instruction sequence psrlw, psllw, and por
to handle these specific vector shuffle operations more efficiently.
The V16QI case needs only SSE2; the V8QI case also needs TARGET_MMX_WITH_SSE.

Bootstrapped and tested on x86_64-linux-gnu, OK for trunk?

Best
Levy

gcc/ChangeLog:

	PR target/107563
	* config/i386/i386-expand.cc (expand_vec_perm_psrlw_psllw_por): New
	subroutine.
	(ix86_expand_vec_perm_const_1): Call expand_vec_perm_psrlw_psllw_por.

gcc/testsuite/ChangeLog:

	PR target/107563
	* g++.target/i386/pr107563-a.C: New test.
	* g++.target/i386/pr107563-b.C: New test.
---
 gcc/config/i386/i386-expand.cc             | 64 ++++++++++++++++++++++
 gcc/testsuite/g++.target/i386/pr107563-a.C | 13 +++++
 gcc/testsuite/g++.target/i386/pr107563-b.C | 12 ++++
 3 files changed, 89 insertions(+)
 create mode 100755 gcc/testsuite/g++.target/i386/pr107563-a.C
 create mode 100755 gcc/testsuite/g++.target/i386/pr107563-b.C

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 2f27bfb484c..5098d2886bb 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -22362,6 +22362,67 @@ expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
   return true;
 }
 
+/* A subroutine of ix86_expand_vec_perm_const_1.  Implement the one-operand
+   permutation that swaps the two bytes of every 16-bit lane, e.g.
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6);
+   as (x >> 8) | (x << 8) on HImode lanes (psrlw, psllw, por).  The right
+   shift must be logical; psraw would OR sign-bit copies into the result.  */
+
+static bool
+expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
+{
+  rtx (*gen_shr) (rtx, rtx, rtx);
+  rtx (*gen_shl) (rtx, rtx, rtx);
+  rtx (*gen_or) (rtx, rtx, rtx);
+  machine_mode mode = VOIDmode;
+
+  if (!TARGET_SSE2 || !d->one_operand_p)
+    return false;
+
+  switch (d->vmode)
+    {
+    case E_V8QImode:
+      if (!TARGET_MMX_WITH_SSE)
+	return false;
+      mode = V4HImode;
+      gen_shr = gen_lshrv4hi3;
+      gen_shl = gen_ashlv4hi3;
+      gen_or = gen_iorv4hi3;
+      break;
+    case E_V16QImode:
+      mode = V8HImode;
+      gen_shr = gen_lshrv8hi3;
+      gen_shl = gen_ashlv8hi3;
+      gen_or = gen_iorv8hi3;
+      break;
+    default: return false;
+    }
+
+  if (!rtx_equal_p (d->op0, d->op1))
+    return false;
+
+  for (unsigned i = 0; i < d->nelt; i += 2)
+    if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  rtx tmp1 = gen_reg_rtx (mode);
+  rtx tmp2 = gen_reg_rtx (mode);
+  rtx op0 = force_reg (d->vmode, d->op0);
+
+  emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
+  emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
+  emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
+  emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
+  emit_insn (gen_or (tmp1, tmp1, tmp2));
+  emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
    permutation using two vperm2f128, followed by a vshufpd insn blending
    the two vectors together.  */
@@ -23782,6 +23843,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_2perm_pblendv (d, false))
     return true;
 
+  if (expand_vec_perm_psrlw_psllw_por (d))
+    return true;
+
   /* Try sequences of four instructions.  */
 
   if (expand_vec_perm_even_odd_trunc (d))
diff --git a/gcc/testsuite/g++.target/i386/pr107563-a.C b/gcc/testsuite/g++.target/i386/pr107563-a.C
new file mode 100755
index 00000000000..605c1bdf814
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr107563-a.C
@@ -0,0 +1,13 @@
+/* PR target/107563 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-final { scan-assembler-times "psllw" 1 } } */
+/* { dg-final { scan-assembler-times "psrlw" 1 } } */
+/* { dg-final { scan-assembler-times "por" 1 } } */
+
+using temp_vec_type2 [[__gnu__::__vector_size__(8)]] = char;
+
+void foo2(temp_vec_type2& v) noexcept
+{
+  v = __builtin_shufflevector(v, v, 1, 0, 3, 2, 5, 4, 7, 6);
+}
diff --git a/gcc/testsuite/g++.target/i386/pr107563-b.C b/gcc/testsuite/g++.target/i386/pr107563-b.C
new file mode 100755
index 00000000000..0ce3e8263bb
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr107563-b.C
@@ -0,0 +1,12 @@
+/* PR target/107563 */
+/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-final { scan-assembler-times "psllw" 1 } } */
+/* { dg-final { scan-assembler-times "psrlw" 1 } } */
+/* { dg-final { scan-assembler-times "por" 1 } } */
+
+using temp_vec_type [[__gnu__::__vector_size__(16)]] = char;
+
+void foo(temp_vec_type& v) noexcept
+{
+  v = __builtin_shufflevector(v, v, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
+}
-- 
2.31.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH] x86:Add 3-instruction subroutine vector shift for V16QI in ix86_expand_vec_perm_const_1 [PR107563]
@ 2024-05-08  3:04 Levy Hsu
  0 siblings, 0 replies; 6+ messages in thread
From: Levy Hsu @ 2024-05-08  3:04 UTC (permalink / raw)
  To: gcc-patches; +Cc: admin, liwei.xu, crazylht

Hi All

This patch adds a new subroutine, called from ix86_expand_vec_perm_const_1,
for the one-operand V16QI and V8QI permutations that swap the two bytes of
every 16-bit lane (indices 1,0,3,2,...).
It uses a three-instruction sequence psrlw, psllw, and por
to handle these specific vector shuffle operations more efficiently.
The V16QI case needs only SSE2; the V8QI case also needs TARGET_MMX_WITH_SSE.
This update addresses the issue detailed in Bugzilla report 107563.

Bootstrapped and tested on x86_64-linux-gnu, OK for trunk?

BRs,
Levy

gcc/ChangeLog:

	* config/i386/i386-expand.cc (expand_vec_perm_psrlw_psllw_por): New
	subroutine.
	(ix86_expand_vec_perm_const_1): New Entry.

gcc/testsuite/ChangeLog:

	* g++.target/i386/pr107563.C: New test.
---
 gcc/config/i386/i386-expand.cc           | 64 ++++++++++++++++++++++++
 gcc/testsuite/g++.target/i386/pr107563.C | 23 +++++++++
 2 files changed, 87 insertions(+)
 create mode 100755 gcc/testsuite/g++.target/i386/pr107563.C

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 2f27bfb484c..2718b0acb87 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -22362,6 +22362,67 @@ expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
   return true;
 }
 
+/* A subroutine of ix86_expand_vec_perm_const_1.  Implement the one-operand
+   permutation that swaps the two bytes of every 16-bit lane, e.g.
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6);
+   as (x >> 8) | (x << 8) on HImode lanes (psrlw, psllw, por).  The right
+   shift must be logical; psraw would OR sign-bit copies into the result.  */
+
+static bool
+expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
+{
+  rtx (*gen_shr) (rtx, rtx, rtx);
+  rtx (*gen_shl) (rtx, rtx, rtx);
+  rtx (*gen_or) (rtx, rtx, rtx);
+  machine_mode mode = VOIDmode;
+
+  if (!TARGET_SSE2 || !d->one_operand_p)
+    return false;
+
+  switch (d->vmode)
+    {
+    case E_V8QImode:
+      if (!TARGET_MMX_WITH_SSE)
+	return false;
+      mode = V4HImode;
+      gen_shr = gen_lshrv4hi3;
+      gen_shl = gen_ashlv4hi3;
+      gen_or = gen_iorv4hi3;
+      break;
+    case E_V16QImode:
+      mode = V8HImode;
+      gen_shr = gen_lshrv8hi3;
+      gen_shl = gen_ashlv8hi3;
+      gen_or = gen_iorv8hi3;
+      break;
+    default: return false;
+    }
+
+  if (!rtx_equal_p (d->op0, d->op1))
+    return false;
+
+  for (unsigned i = 0; i < d->nelt; i += 2)
+    if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  rtx tmp1 = gen_reg_rtx (mode);
+  rtx tmp2 = gen_reg_rtx (mode);
+  rtx op0 = force_reg (d->vmode, d->op0);
+
+  emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
+  emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
+  emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
+  emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
+  emit_insn (gen_or (tmp1, tmp1, tmp2));
+  emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
    permutation using two vperm2f128, followed by a vshufpd insn blending
    the two vectors together.  */
@@ -23781,6 +23842,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 
   if (expand_vec_perm_2perm_pblendv (d, false))
     return true;
+
+  if (expand_vec_perm_psrlw_psllw_por (d))
+    return true;
 
   /* Try sequences of four instructions.  */
 
diff --git a/gcc/testsuite/g++.target/i386/pr107563.C b/gcc/testsuite/g++.target/i386/pr107563.C
new file mode 100755
index 00000000000..5b0c648e8f1
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr107563.C
@@ -0,0 +1,23 @@
+/* PR target/107563 */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-final { scan-assembler-not "movzbl" } } */
+/* { dg-final { scan-assembler-not "salq" } } */
+/* { dg-final { scan-assembler-not "orq" } } */
+/* { dg-final { scan-assembler-not "punpcklqdq" } } */
+/* { dg-final { scan-assembler-times "psllw" 2 } } */
+/* { dg-final { scan-assembler-times "psrlw" 2 } } */
+/* { dg-final { scan-assembler-not "psraw" } } */
+/* { dg-final { scan-assembler-times "por" 2 } } */
+
+using temp_vec_type [[__gnu__::__vector_size__ (16)]] = char;
+void foo (temp_vec_type& v) noexcept
+{
+  v = __builtin_shufflevector(v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
+}
+
+using temp_vec_type2 [[__gnu__::__vector_size__ (8)]] = char;
+void foo2 (temp_vec_type2& v) noexcept
+{
+  v=__builtin_shufflevector(v,v,1,0,3,2,5,4,7,6);
+}
-- 
2.31.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2024-05-14  9:04 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-08  2:42 [PATCH] x86:Add 3-instruction subroutine vector shift for V16QI in ix86_expand_vec_perm_const_1 [PR107563] Levy Hsu
2024-05-08  7:07 ` Uros Bizjak
2024-05-09  9:09   ` [PATCH 1/1] " Levy Hsu
2024-05-14  6:05     ` Uros Bizjak
2024-05-14  9:04       ` [PATCH] x86: Add " Levy Hsu
2024-05-08  3:04 [PATCH] x86:Add " Levy Hsu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).