public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [x86 PATCH] PR target/107548: Handle vec_select in STV.
@ 2022-12-22 23:19 Roger Sayle
  2022-12-23  7:38 ` Uros Bizjak
  0 siblings, 1 reply; 2+ messages in thread
From: Roger Sayle @ 2022-12-22 23:19 UTC (permalink / raw)
  To: 'GCC Patches'; +Cc: 'Uros Bizjak'

[-- Attachment #1: Type: text/plain, Size: 2168 bytes --]


This patch enhances x86's STV pass to handle VEC_SELECT during general
scalar chain conversion, performing SImode scalar extraction from V4SI
and DImode scalar extraction from V2DI vector registers.

The motivating test case from bugzilla is:

typedef unsigned int v4si __attribute__((vector_size(16)));

unsigned int f (v4si a, v4si b)
{
  a[0] += b[0];
  return a[0] + a[1];
}

currently with -O2 -march=znver2 this generates:

        vpextrd $1, %xmm0, %edx
        vmovd   %xmm0, %eax
        addl    %edx, %eax
        vmovd   %xmm1, %edx
        addl    %edx, %eax
        ret

which performs three transfers from the vector unit to the scalar unit,
and performs the two additions there.  With this patch, we now generate:

        vmovdqa %xmm0, %xmm2
        vpshufd $85, %xmm0, %xmm0
        vpaddd  %xmm0, %xmm2, %xmm0
        vpaddd  %xmm1, %xmm0, %xmm0
        vmovd   %xmm0, %eax
        ret

which performs the two additions in the vector unit, and then transfers
the result to the scalar unit.  Technically the (cheap) movdqa isn't
needed with better register allocation (or this could be cleaned up
during peephole2), but even so this transform is still a win.

This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32},
with no new failures.  Ok for mainline?


2022-12-22  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
        PR target/107548
        * config/i386/i386-features.cc (scalar_chain::add_insn): The
        operands of a VEC_SELECT don't need to added to the scalar chain.
        (general_scalar_chain::compute_convert_gain) <case VEC_SELECT>:
        Provide gains for performing STV on a VEC_SELECT.
        (general_scalar_chain::convert_insn): Convert VEC_SELECT to pshufd,
        psrldq or no-op.
        (general_scalar_to_vector_candidate_p): Handle VEC_SELECT of a
        single element from a vector register to a scalar register.

gcc/testsuite/ChangeLog
        PR target/107548
        * gcc.target/i386/pr107548-1.c: New test V4SI case.
        * gcc.target/i386/pr107548-1.c: New test V2DI case.


Thanks in advance,
Roger
--


[-- Attachment #2: patchvs.txt --]
[-- Type: text/plain, Size: 4210 bytes --]

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index fd212262..cb21d3b 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -429,6 +429,11 @@ scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
   for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
     if (!HARD_REGISTER_P (DF_REF_REG (ref)))
       analyze_register_chain (candidates, ref);
+
+  /* The operand(s) of VEC_SELECT don't need to be converted/convertible.  */
+  if (def_set && GET_CODE (SET_SRC (def_set)) == VEC_SELECT)
+    return;
+
   for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
     if (!DF_REF_REG_MEM_P (ref))
       analyze_register_chain (candidates, ref);
@@ -629,6 +634,23 @@ general_scalar_chain::compute_convert_gain ()
 	      }
 	    break;
 
+	  case VEC_SELECT:
+	    if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
+	      {
+		// movd (4 bytes) replaced with movdqa (4 bytes).
+		if (!optimize_insn_for_size_p ())
+		  igain += ix86_cost->sse_to_integer - ix86_cost->xmm_move;
+	      }
+	    else
+	      {
+		// pshufd; movd replaced with pshufd.
+		if (optimize_insn_for_size_p ())
+		  igain += COSTS_N_BYTES (4);
+		else
+		  igain += ix86_cost->sse_to_integer;
+	      }
+	    break;
+
 	  default:
 	    gcc_unreachable ();
 	  }
@@ -1167,6 +1189,24 @@ general_scalar_chain::convert_insn (rtx_insn *insn)
       convert_op (&src, insn);
       break;
 
+    case VEC_SELECT:
+      if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
+	src = XEXP (src, 0);
+      else if (smode == DImode)
+	{
+	  rtx tmp = gen_lowpart (V1TImode, XEXP (src, 0));
+	  dst = gen_lowpart (V1TImode, dst);
+	  src = gen_rtx_LSHIFTRT (V1TImode, tmp, GEN_INT (64));
+	}
+      else
+	{
+	  rtx tmp = XVECEXP (XEXP (src, 1), 0, 0);
+	  rtvec vec = gen_rtvec (4, tmp, tmp, tmp, tmp);
+	  rtx par = gen_rtx_PARALLEL (VOIDmode, vec);
+	  src = gen_rtx_VEC_SELECT (vmode, XEXP (src, 0), par);
+	}
+      break;
+
     default:
       gcc_unreachable ();
     }
@@ -1917,6 +1957,16 @@ general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
     case CONST_INT:
       return REG_P (dst);
 
+    case VEC_SELECT:
+      /* Excluding MEM_P (dst) avoids intefering with vpextr[dq].  */
+      return REG_P (dst)
+	     && REG_P (XEXP (src, 0))
+	     && GET_MODE (XEXP (src, 0)) == (mode == DImode ? V2DImode
+							    : V4SImode)
+	     && GET_CODE (XEXP (src, 1)) == PARALLEL
+	     && XVECLEN (XEXP (src, 1), 0) == 1
+	     && CONST_INT_P (XVECEXP (XEXP (src, 1), 0, 0));
+
     default:
       return false;
     }
diff --git a/gcc/testsuite/gcc.target/i386/pr107548-1.c b/gcc/testsuite/gcc.target/i386/pr107548-1.c
new file mode 100644
index 0000000..da78f75
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107548-1.c
@@ -0,0 +1,25 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mstv -mno-stackrealign" } */
+typedef unsigned int v4si __attribute__((vector_size(16)));
+
+unsigned int foo1 (v4si a, v4si b)
+{
+  a[0] += b[0];
+  return a[0] + a[1];
+}
+
+unsigned int foo2 (v4si a, v4si b)
+{
+  a[0] += b[0];
+  return a[0] + a[2];
+}
+
+unsigned int foo3 (v4si a, v4si b)
+{
+  a[0] += b[0];
+  return a[0] + a[3];
+}
+
+/* { dg-final { scan-assembler-times "\tmovd\t" 3 } } */
+/* { dg-final { scan-assembler-times "paddd" 6 } } */
+/* { dg-final { scan-assembler-not "addl" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr107548-2.c b/gcc/testsuite/gcc.target/i386/pr107548-2.c
new file mode 100644
index 0000000..b57594e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr107548-2.c
@@ -0,0 +1,13 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -mstv -mno-stackrealign" } */
+typedef unsigned long long v2di __attribute__((vector_size(16)));
+
+unsigned long long foo(v2di a, v2di b)
+{
+  a[0] += b[0];
+  return a[0] + a[1];
+}
+
+/* { dg-final { scan-assembler-not "\taddq\t" } } */
+/* { dg-final { scan-assembler-times "paddq" 2 } } */
+/* { dg-final { scan-assembler "psrldq" } } */

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [x86 PATCH] PR target/107548: Handle vec_select in STV.
  2022-12-22 23:19 [x86 PATCH] PR target/107548: Handle vec_select in STV Roger Sayle
@ 2022-12-23  7:38 ` Uros Bizjak
  0 siblings, 0 replies; 2+ messages in thread
From: Uros Bizjak @ 2022-12-23  7:38 UTC (permalink / raw)
  To: Roger Sayle; +Cc: GCC Patches

On Fri, Dec 23, 2022 at 12:19 AM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> This patch enhances x86's STV pass to handle VEC_SELECT during general
> scalar chain conversion, performing SImode scalar extraction from V4SI
> and DImode scalar extraction from V2DI vector registers.
>
> The motivating test case from bugzilla is:
>
> typedef unsigned int v4si __attribute__((vector_size(16)));
>
> unsigned int f (v4si a, v4si b)
> {
>   a[0] += b[0];
>   return a[0] + a[1];
> }
>
> currently with -O2 -march=znver2 this generates:
>
>         vpextrd $1, %xmm0, %edx
>         vmovd   %xmm0, %eax
>         addl    %edx, %eax
>         vmovd   %xmm1, %edx
>         addl    %edx, %eax
>         ret
>
> which performs three transfers from the vector unit to the scalar unit,
> and performs the two additions there.  With this patch, we now generate:
>
>         vmovdqa %xmm0, %xmm2
>         vpshufd $85, %xmm0, %xmm0
>         vpaddd  %xmm0, %xmm2, %xmm0
>         vpaddd  %xmm1, %xmm0, %xmm0
>         vmovd   %xmm0, %eax
>         ret
>
> which performs the two additions in the vector unit, and then transfers
> the result to the scalar unit.  Technically the (cheap) movdqa isn't
> needed with better register allocation (or this could be cleaned up
> during peephole2), but even so this transform is still a win.
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, both with and without --target_board=unix{-m32},
> with no new failures.  Ok for mainline?
>
>
> 2022-12-22  Roger Sayle  <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
>         PR target/107548
>         * config/i386/i386-features.cc (scalar_chain::add_insn): The
>         operands of a VEC_SELECT don't need to added to the scalar chain.
>         (general_scalar_chain::compute_convert_gain) <case VEC_SELECT>:
>         Provide gains for performing STV on a VEC_SELECT.
>         (general_scalar_chain::convert_insn): Convert VEC_SELECT to pshufd,
>         psrldq or no-op.
>         (general_scalar_to_vector_candidate_p): Handle VEC_SELECT of a
>         single element from a vector register to a scalar register.
>
> gcc/testsuite/ChangeLog
>         PR target/107548
>         * gcc.target/i386/pr107548-1.c: New test V4SI case.
>         * gcc.target/i386/pr107548-1.c: New test V2DI case.

LGTM.

Thanks,
Uros.

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2022-12-23  7:38 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-12-22 23:19 [x86 PATCH] PR target/107548: Handle vec_select in STV Roger Sayle
2022-12-23  7:38 ` Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).