[PATCH] Add *vec_concatv4sf_0 and *vec_concatv4si

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH] Add *vec_concatv4sf_0 and *vec_concatv4si_0 insns (PR target/88278)
@ 2018-11-30 21:14 Jakub Jelinek
  2018-12-02 19:23 ` Uros Bizjak
  0 siblings, 1 reply; 4+ messages in thread
From: Jakub Jelinek @ 2018-11-30 21:14 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: gcc-patches

Hi!

As mentioned in the PR, while in vec_concatv2df or vec_concatv2di we have
alternatives where the lower half is low 64-bit part of a xmm reg or 64-bit
memory and upper half is zero using movq/vmovq (or movq2dq), for
vec_concatv4sf and vec_concatv4si we don't have anything similar, although
the operations do pretty much the same thing.  I'm not advocating to put
in alternatives with GPR operands as having V2SF or V2SI in a GPR is too
weird, but for the cases where a simple movq or vmovq instruction can move
64-bits and clear upper 64-bits these patterns look useful to me.

I had to tweak the pr53759.c testcase because it started FAILing, but only
because it changed:
-	vxorps	%xmm0, %xmm0, %xmm0
-	vmovlps	(%rsi), %xmm0, %xmm0
+	vmovq	(%rsi), %xmm0
 	vaddps	%xmm0, %xmm0, %xmm0
 	vmovaps	%xmm0, (%rdi)
 	ret
I've added a variant of that testcase that tests its original purpose by
using a register there not known to be all zeros.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2018-11-30  Jakub Jelinek  <jakub@redhat.com>

	PR target/88278
	* config/i386/sse.md (*vec_concatv4sf_0, *vec_concatv4si_0): New insns.

	* gcc.target/i386/pr88278.c: New test.
	* gcc.target/i386/pr53759.c: Don't expect vmovlps insn, expect vmovq
	instead.
	* gcc.target/i386/pr53759-2.c: New test.

--- gcc/config/i386/sse.md.jj	2018-11-30 21:36:22.277762263 +0100
+++ gcc/config/i386/sse.md	2018-11-30 21:38:02.261120768 +0100
@@ -7248,6 +7248,17 @@ (define_insn "*vec_concatv4sf"
    (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex")
    (set_attr "mode" "V4SF,V4SF,V2SF,V2SF")])
 
+(define_insn "*vec_concatv4sf_0"
+  [(set (match_operand:V4SF 0 "register_operand"       "=v")
+	(vec_concat:V4SF
+	  (match_operand:V2SF 1 "nonimmediate_operand" "xm")
+	  (match_operand:V2SF 2 "const0_operand"       " C")))]
+  "TARGET_SSE2"
+  "%vmovq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex")
+   (set_attr "mode" "DF")])
+
 ;; Avoid combining registers from different units in a single alternative,
 ;; see comment above inline_secondary_memory_needed function in i386.c
 (define_insn "vec_set<mode>_0"
@@ -14409,6 +14420,23 @@ (define_insn "*vec_concatv4si"
    (set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex")
    (set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")])
 
+(define_insn "*vec_concatv4si_0"
+  [(set (match_operand:V4SI 0 "register_operand"       "=v,x")
+	(vec_concat:V4SI
+	  (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y")
+	  (match_operand:V2SI 2 "const0_operand"       " C,C")))]
+  "TARGET_SSE2"
+  "@
+   %vmovq\t{%1, %0|%0, %1}
+   movq2dq\t{%1, %0|%0, %1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix" "maybe_vex,orig")
+   (set_attr "mode" "TI")
+   (set (attr "preferred_for_speed")
+     (if_then_else (eq_attr "alternative" "1")
+       (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC")
+       (symbol_ref "true")))])
+
 ;; movd instead of movq is required to handle broken assemblers.
 (define_insn "vec_concatv2di"
   [(set (match_operand:V2DI 0 "register_operand"
--- gcc/testsuite/gcc.target/i386/pr88278.c.jj	2018-11-30 21:38:02.261120768 +0100
+++ gcc/testsuite/gcc.target/i386/pr88278.c	2018-11-30 21:38:02.261120768 +0100
@@ -0,0 +1,34 @@
+/* PR target/88278 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-sse3 -fgimple -masm=att" } */
+/* { dg-final { scan-assembler-times "movq\[ \t]+\\(" 2 } } */
+/* { dg-final { scan-assembler-not "punpcklqdq\[ \t]+" } } */
+/* { dg-final { scan-assembler-not "pxor\[ \t]+" } } */
+
+typedef unsigned char v16qi __attribute__((vector_size(16)));
+typedef unsigned char v8qi __attribute__((vector_size(8)));
+typedef unsigned int v4si __attribute__((vector_size(16)));
+typedef unsigned int v2si __attribute__((vector_size(8)));
+
+v16qi __GIMPLE foo (unsigned char *p)
+{
+  v8qi _2;
+  v16qi _3;
+
+bb_2:
+  _2 = __MEM <v8qi, 8> (p_1(D));
+  _3 = _Literal (v16qi) { _2, _Literal (v8qi) { _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0 } };
+  return _3;
+}
+
+
+v4si __GIMPLE bar (unsigned int *p)
+{
+  v2si _2;
+  v4si _3;
+
+bb_2:
+  _2 = __MEM <v2si, 32> (p_1(D));
+  _3 = _Literal (v4si) { _2, _Literal (v2si) { 0u, 0u } };
+  return _3;
+}
--- gcc/testsuite/gcc.target/i386/pr53759.c.jj	2016-05-22 12:20:04.184034591 +0200
+++ gcc/testsuite/gcc.target/i386/pr53759.c	2018-11-30 22:04:56.432806470 +0100
@@ -12,5 +12,6 @@ foo (__m128 *x, __m64 *y)
   *x = _mm_add_ps (b, b);
 }
 
-/* { dg-final { scan-assembler "vmovlps\[ \\t\]" } } */
+/* { dg-final { scan-assembler "vmovq\[ \\t\]" } } */
+/* { dg-final { scan-assembler-not "vmovlps\[ \\t\]" } } */
 /* { dg-final { scan-assembler-not "vshufps\[ \\t\]" } } */
--- gcc/testsuite/gcc.target/i386/pr53759-2.c.jj	2018-11-30 22:05:06.932657374 +0100
+++ gcc/testsuite/gcc.target/i386/pr53759-2.c	2018-11-30 22:05:42.568151361 +0100
@@ -0,0 +1,16 @@
+/* PR target/53759 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+
+#include <xmmintrin.h>
+
+void
+foo (__m128 *x, __m64 *y)
+{
+  __m128 a = _mm_add_ps (x[1], x[2]);
+  __m128 b = _mm_loadl_pi (a, y);
+  *x = _mm_add_ps (b, b);
+}
+
+/* { dg-final { scan-assembler "vmovlps\[ \\t\]" } } */
+/* { dg-final { scan-assembler-not "vshufps\[ \\t\]" } } */

	Jakub

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] Add *vec_concatv4sf_0 and *vec_concatv4si_0 insns (PR target/88278)
  2018-11-30 21:14 [PATCH] Add *vec_concatv4sf_0 and *vec_concatv4si_0 insns (PR target/88278) Jakub Jelinek
@ 2018-12-02 19:23 ` Uros Bizjak
  2018-12-02 19:48   ` Jakub Jelinek
  0 siblings, 1 reply; 4+ messages in thread
From: Uros Bizjak @ 2018-12-02 19:23 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: gcc-patches

On Fri, Nov 30, 2018 at 10:14 PM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> As mentioned in the PR, while in vec_concatv2df or vec_concatv2di we have
> alternatives where the lower half is low 64-bit part of a xmm reg or 64-bit
> memory and upper half is zero using movq/vmovq (or movq2dq), for
> vec_concatv4sf and vec_concatv4si we don't have anything similar, although
> the operations do pretty much the same thing.  I'm not advocating to put
> in alternatives with GPR operands as having V2SF or V2SI in a GPR is too
> weird, but for the cases where a simple movq or vmovq instruction can move
> 64-bits and clear upper 64-bits these patterns look useful to me.
>
> I had to tweak the pr53759.c testcase because it started FAILing, but only
> because it changed:
> -       vxorps  %xmm0, %xmm0, %xmm0
> -       vmovlps (%rsi), %xmm0, %xmm0
> +       vmovq   (%rsi), %xmm0
>         vaddps  %xmm0, %xmm0, %xmm0
>         vmovaps %xmm0, (%rdi)
>         ret
> I've added a variant of that testcase that tests its original purpose by
> using a register there not known to be all zeros.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2018-11-30  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/88278
>         * config/i386/sse.md (*vec_concatv4sf_0, *vec_concatv4si_0): New insns.
>
>         * gcc.target/i386/pr88278.c: New test.
>         * gcc.target/i386/pr53759.c: Don't expect vmovlps insn, expect vmovq
>         instead.
>         * gcc.target/i386/pr53759-2.c: New test.

OK with a constraint adjustment below.

Thanks,
Uros.

>
> --- gcc/config/i386/sse.md.jj   2018-11-30 21:36:22.277762263 +0100
> +++ gcc/config/i386/sse.md      2018-11-30 21:38:02.261120768 +0100
> @@ -7248,6 +7248,17 @@ (define_insn "*vec_concatv4sf"
>     (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex")
>     (set_attr "mode" "V4SF,V4SF,V2SF,V2SF")])
>
> +(define_insn "*vec_concatv4sf_0"
> +  [(set (match_operand:V4SF 0 "register_operand"       "=v")
> +       (vec_concat:V4SF
> +         (match_operand:V2SF 1 "nonimmediate_operand" "xm")

The constraint here can be "vm". There is no limitation on register
set for vmovq insn.

> +         (match_operand:V2SF 2 "const0_operand"       " C")))]
> +  "TARGET_SSE2"
> +  "%vmovq\t{%1, %0|%0, %1}"
> +  [(set_attr "type" "ssemov")
> +   (set_attr "prefix" "maybe_vex")
> +   (set_attr "mode" "DF")])
> +
>  ;; Avoid combining registers from different units in a single alternative,
>  ;; see comment above inline_secondary_memory_needed function in i386.c
>  (define_insn "vec_set<mode>_0"
> @@ -14409,6 +14420,23 @@ (define_insn "*vec_concatv4si"
>     (set_attr "prefix" "orig,maybe_evex,orig,orig,maybe_evex")
>     (set_attr "mode" "TI,TI,V4SF,V2SF,V2SF")])
>
> +(define_insn "*vec_concatv4si_0"
> +  [(set (match_operand:V4SI 0 "register_operand"       "=v,x")
> +       (vec_concat:V4SI
> +         (match_operand:V2SI 1 "nonimmediate_operand" "vm,?!*y")
> +         (match_operand:V2SI 2 "const0_operand"       " C,C")))]
> +  "TARGET_SSE2"
> +  "@
> +   %vmovq\t{%1, %0|%0, %1}
> +   movq2dq\t{%1, %0|%0, %1}"
> +  [(set_attr "type" "ssemov")
> +   (set_attr "prefix" "maybe_vex,orig")
> +   (set_attr "mode" "TI")
> +   (set (attr "preferred_for_speed")
> +     (if_then_else (eq_attr "alternative" "1")
> +       (symbol_ref "TARGET_INTER_UNIT_MOVES_FROM_VEC")
> +       (symbol_ref "true")))])
> +
>  ;; movd instead of movq is required to handle broken assemblers.
>  (define_insn "vec_concatv2di"
>    [(set (match_operand:V2DI 0 "register_operand"
> --- gcc/testsuite/gcc.target/i386/pr88278.c.jj  2018-11-30 21:38:02.261120768 +0100
> +++ gcc/testsuite/gcc.target/i386/pr88278.c     2018-11-30 21:38:02.261120768 +0100
> @@ -0,0 +1,34 @@
> +/* PR target/88278 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2 -mno-sse3 -fgimple -masm=att" } */
> +/* { dg-final { scan-assembler-times "movq\[ \t]+\\(" 2 } } */
> +/* { dg-final { scan-assembler-not "punpcklqdq\[ \t]+" } } */
> +/* { dg-final { scan-assembler-not "pxor\[ \t]+" } } */
> +
> +typedef unsigned char v16qi __attribute__((vector_size(16)));
> +typedef unsigned char v8qi __attribute__((vector_size(8)));
> +typedef unsigned int v4si __attribute__((vector_size(16)));
> +typedef unsigned int v2si __attribute__((vector_size(8)));
> +
> +v16qi __GIMPLE foo (unsigned char *p)
> +{
> +  v8qi _2;
> +  v16qi _3;
> +
> +bb_2:
> +  _2 = __MEM <v8qi, 8> (p_1(D));
> +  _3 = _Literal (v16qi) { _2, _Literal (v8qi) { _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0, _Literal (unsigned char) 0 } };
> +  return _3;
> +}
> +
> +
> +v4si __GIMPLE bar (unsigned int *p)
> +{
> +  v2si _2;
> +  v4si _3;
> +
> +bb_2:
> +  _2 = __MEM <v2si, 32> (p_1(D));
> +  _3 = _Literal (v4si) { _2, _Literal (v2si) { 0u, 0u } };
> +  return _3;
> +}
> --- gcc/testsuite/gcc.target/i386/pr53759.c.jj  2016-05-22 12:20:04.184034591 +0200
> +++ gcc/testsuite/gcc.target/i386/pr53759.c     2018-11-30 22:04:56.432806470 +0100
> @@ -12,5 +12,6 @@ foo (__m128 *x, __m64 *y)
>    *x = _mm_add_ps (b, b);
>  }
>
> -/* { dg-final { scan-assembler "vmovlps\[ \\t\]" } } */
> +/* { dg-final { scan-assembler "vmovq\[ \\t\]" } } */
> +/* { dg-final { scan-assembler-not "vmovlps\[ \\t\]" } } */
>  /* { dg-final { scan-assembler-not "vshufps\[ \\t\]" } } */
> --- gcc/testsuite/gcc.target/i386/pr53759-2.c.jj        2018-11-30 22:05:06.932657374 +0100
> +++ gcc/testsuite/gcc.target/i386/pr53759-2.c   2018-11-30 22:05:42.568151361 +0100
> @@ -0,0 +1,16 @@
> +/* PR target/53759 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +
> +#include <xmmintrin.h>
> +
> +void
> +foo (__m128 *x, __m64 *y)
> +{
> +  __m128 a = _mm_add_ps (x[1], x[2]);
> +  __m128 b = _mm_loadl_pi (a, y);
> +  *x = _mm_add_ps (b, b);
> +}
> +
> +/* { dg-final { scan-assembler "vmovlps\[ \\t\]" } } */
> +/* { dg-final { scan-assembler-not "vshufps\[ \\t\]" } } */
>
>         Jakub

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] Add *vec_concatv4sf_0 and *vec_concatv4si_0 insns (PR target/88278)
  2018-12-02 19:23 ` Uros Bizjak
@ 2018-12-02 19:48   ` Jakub Jelinek
  2018-12-02 20:04     ` Uros Bizjak
  0 siblings, 1 reply; 4+ messages in thread
From: Jakub Jelinek @ 2018-12-02 19:48 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: gcc-patches

On Sun, Dec 02, 2018 at 08:23:21PM +0100, Uros Bizjak wrote:
> OK with a constraint adjustment below.
> 
> > +(define_insn "*vec_concatv4sf_0"
> > +  [(set (match_operand:V4SF 0 "register_operand"       "=v")
> > +       (vec_concat:V4SF
> > +         (match_operand:V2SF 1 "nonimmediate_operand" "xm")
> 
> The constraint here can be "vm". There is no limitation on register
> set for vmovq insn.

That is what vec_concatv2df has also:
(define_insn "vec_concatv2df"
  [(set (match_operand:V2DF 0 "register_operand"     "=x,x,v,x,v,x,x, v,x,x")
        (vec_concat:V2DF
          (match_operand:DF 1 "nonimmediate_operand" " 0,x,v,m,m,0,x,xm,0,0")
          (match_operand:DF 2 "nonimm_or_0_operand"  " x,x,v,1,1,m,m, C,x,m")))]

I believe ix86_hard_regno_mode_ok just doesn't allow V2SFmode in xmm16+ regs:
  if (SSE_REGNO_P (regno))
    {
...
      /* xmm16-xmm31 are only available for AVX-512.  */
      if (EXT_REX_SSE_REGNO_P (regno))
        return false;

and V2SFmode is a VALID_MMX_REG_MODE, which is not mentioned anywhere before
the the above conditional, only after it.
While vmovq doesn't have this limitation, I believe RA will not try to put
anything V2SFmode in those regs if lucky, otherwise it could be upset about
it.  Though, DFmode in the vec_concatv2df is different, because
ix86_hard_regno_mode_ok should allow that in those registers, as DFmode is
VALID_AVX512F_SCALAR_MODE.

	Jakub

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] Add *vec_concatv4sf_0 and *vec_concatv4si_0 insns (PR target/88278)
  2018-12-02 19:48   ` Jakub Jelinek
@ 2018-12-02 20:04     ` Uros Bizjak
  0 siblings, 0 replies; 4+ messages in thread
From: Uros Bizjak @ 2018-12-02 20:04 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: gcc-patches

On Sun, Dec 2, 2018 at 8:48 PM Jakub Jelinek <jakub@redhat.com> wrote:
>
> On Sun, Dec 02, 2018 at 08:23:21PM +0100, Uros Bizjak wrote:
> > OK with a constraint adjustment below.
> >
> > > +(define_insn "*vec_concatv4sf_0"
> > > +  [(set (match_operand:V4SF 0 "register_operand"       "=v")
> > > +       (vec_concat:V4SF
> > > +         (match_operand:V2SF 1 "nonimmediate_operand" "xm")
> >
> > The constraint here can be "vm". There is no limitation on register
> > set for vmovq insn.
>
> That is what vec_concatv2df has also:
> (define_insn "vec_concatv2df"
>   [(set (match_operand:V2DF 0 "register_operand"     "=x,x,v,x,v,x,x, v,x,x")
>         (vec_concat:V2DF
>           (match_operand:DF 1 "nonimmediate_operand" " 0,x,v,m,m,0,x,xm,0,0")
>           (match_operand:DF 2 "nonimm_or_0_operand"  " x,x,v,1,1,m,m, C,x,m")))]
>
> I believe ix86_hard_regno_mode_ok just doesn't allow V2SFmode in xmm16+ regs:
>   if (SSE_REGNO_P (regno))
>     {
> ...
>       /* xmm16-xmm31 are only available for AVX-512.  */
>       if (EXT_REX_SSE_REGNO_P (regno))
>         return false;

IIRC, RA just won't allocate a register with a regno that won't
satisfy hard_regno_mode_ok. The quoted vec_concatv2df looks like an
unintended oversight (DFmode is supported in AVX512 regs through
VALID_AVX512F_SCALAR_MODE). But we can stay on the safe side and leave
MMX modes with "x" constraint.

So, patch is OK as is.

Thanks,
Uros.

> and V2SFmode is a VALID_MMX_REG_MODE, which is not mentioned anywhere before
> the the above conditional, only after it.
> While vmovq doesn't have this limitation, I believe RA will not try to put
> anything V2SFmode in those regs if lucky, otherwise it could be upset about
> it.  Though, DFmode in the vec_concatv2df is different, because
> ix86_hard_regno_mode_ok should allow that in those registers, as DFmode is
> VALID_AVX512F_SCALAR_MODE.
>
>         Jakub

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2018-12-02 20:04 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-11-30 21:14 [PATCH] Add *vec_concatv4sf_0 and *vec_concatv4si_0 insns (PR target/88278) Jakub Jelinek
2018-12-02 19:23 ` Uros Bizjak
2018-12-02 19:48   ` Jakub Jelinek
2018-12-02 20:04     ` Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).