[PATCH] i386: Add standard scalar operation patterns

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH] i386: Add standard scalar operation patterns
@ 2019-02-07 17:49 H.J. Lu
  2019-05-15 19:16 ` PING^1: " H.J. Lu
  0 siblings, 1 reply; 7+ messages in thread
From: H.J. Lu @ 2019-02-07 17:49 UTC (permalink / raw)
  To: gcc-patches; +Cc: Uros Bizjak, Marc Glisse, Richard Sandiford

Standard scalar operation patterns which preserve the rest of the vector
look like

     (vec_merge:V2DF
       (vec_duplicate:V2DF
         (op:DF (vec_select:DF (reg/v:V2DF 85 [ x ])
                (parallel [ (const_int 0 [0])]))
         (reg:DF 87))
       (reg/v:V2DF 85 [ x ])
       (const_int 1 [0x1])]))

Add such pattens to i386 backend and convert VEC_CONCAT patterns to
standard standard scalar operation patterns.

gcc/

	PR target/54855
	* simplify-rtx.c (simplify_binary_operation_1): Convert
	VEC_CONCAT patterns to standard standard scalar operation
	patterns.
	* config/i386/sse.md (*<sse>_vm<plusminus_insn><mode>3): New.
	(*<sse>_vm<multdiv_mnemonic><mode>3): Likewise.

gcc/testsuite/

	PR target/54855
	* gcc.target/i386/pr54855-1.c: New test.
	* gcc.target/i386/pr54855-2.c: Likewise.
	* gcc.target/i386/pr54855-3.c: Likewise.
	* gcc.target/i386/pr54855-4.c: Likewise.
	* gcc.target/i386/pr54855-5.c: Likewise.
	* gcc.target/i386/pr54855-6.c: Likewise.
	* gcc.target/i386/pr54855-7.c: Likewise.
---
 gcc/config/i386/sse.md                    | 45 +++++++++++++++++++++
 gcc/simplify-rtx.c                        | 49 +++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr54855-1.c | 16 ++++++++
 gcc/testsuite/gcc.target/i386/pr54855-2.c | 15 +++++++
 gcc/testsuite/gcc.target/i386/pr54855-3.c | 14 +++++++
 gcc/testsuite/gcc.target/i386/pr54855-4.c | 14 +++++++
 gcc/testsuite/gcc.target/i386/pr54855-5.c | 16 ++++++++
 gcc/testsuite/gcc.target/i386/pr54855-6.c | 14 +++++++
 gcc/testsuite/gcc.target/i386/pr54855-7.c | 14 +++++++
 9 files changed, 197 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr54855-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr54855-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr54855-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr54855-4.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr54855-5.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr54855-6.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr54855-7.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 5dc0930ac1f..03b6f3369fc 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1719,6 +1719,28 @@
    (set_attr "type" "sseadd")
    (set_attr "mode" "<MODE>")])
 
+;; Standard scalar operation patterns which preserve the rest of the
+;; vector for combiner.
+(define_insn "*<sse>_vm<plusminus_insn><mode>3"
+  [(set (match_operand:VF_128 0 "register_operand" "=x,v")
+	(vec_merge:VF_128
+	  (vec_duplicate:VF_128
+	    (plusminus:<ssescalarmode>
+	      (vec_select:<ssescalarmode>
+	        (match_operand:VF_128 1 "register_operand" "0,v")
+		(parallel [(const_int 0)]))
+	      (match_operand:<ssescalarmode> 2 "nonimmediate_operand" "xm,vm")))
+	  (match_dup 1)
+	  (const_int 1)))]
+  "TARGET_SSE"
+  "@
+   <plusminus_mnemonic><ssescalarmodesuffix>\t{%2, %0|%0, %<iptr>2}
+   v<plusminus_mnemonic><ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %<iptr>2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sseadd")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "<ssescalarmode>")])
+
 (define_insn "<sse>_vm<plusminus_insn><mode>3<mask_scalar_name><round_scalar_name>"
   [(set (match_operand:VF_128 0 "register_operand" "=x,v")
 	(vec_merge:VF_128
@@ -1773,6 +1795,29 @@
    (set_attr "type" "ssemul")
    (set_attr "mode" "<MODE>")])
 
+;; Standard scalar operation patterns which preserve the rest of the
+;; vector for combiner.
+(define_insn "*<sse>_vm<multdiv_mnemonic><mode>3"
+  [(set (match_operand:VF_128 0 "register_operand" "=x,v")
+	(vec_merge:VF_128
+	  (vec_duplicate:VF_128
+	    (multdiv:<ssescalarmode>
+	      (vec_select:<ssescalarmode>
+	        (match_operand:VF_128 1 "register_operand" "0,v")
+		(parallel [(const_int 0)]))
+	      (match_operand:<ssescalarmode> 2 "nonimmediate_operand" "xm,vm")))
+	  (match_dup 1)
+	  (const_int 1)))]
+  "TARGET_SSE"
+  "@
+   <multdiv_mnemonic><ssescalarmodesuffix>\t{%2, %0|%0, %<iptr>2}
+   v<multdiv_mnemonic><ssescalarmodesuffix>\t{%2, %1, %0|%0, %1, %<iptr>2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sse<multdiv_mnemonic>")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "btver2_decode" "direct,double")
+   (set_attr "mode" "<ssescalarmode>")])
+
 (define_insn "<sse>_vm<multdiv_mnemonic><mode>3<mask_scalar_name><round_scalar_name>"
   [(set (match_operand:VF_128 0 "register_operand" "=x,v")
 	(vec_merge:VF_128
diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
index 83580a259f3..c32544381d0 100644
--- a/gcc/simplify-rtx.c
+++ b/gcc/simplify-rtx.c
@@ -4023,6 +4023,55 @@ simplify_binary_operation_1 (enum rtx_code code, machine_mode mode,
 	    return simplify_gen_binary (VEC_SELECT, mode, XEXP (trueop0, 0),
 					gen_rtx_PARALLEL (VOIDmode, vec));
 	  }
+
+	/* Turn
+
+	   (vec_concat:V2DF
+	     (op:DF (vec_select:DF (reg/v:V2DF 85 [ x ])
+				   (parallel [ (const_int 0 [0])]))
+		       (reg:DF 87))
+	     (vec_select:DF (reg/v:V2DF 85 [ x ])
+			    (parallel [ (const_int 1 [0x1])])))
+
+	   into standard scalar operation patterns which preserve the
+	   rest of the vector:
+
+	   (vec_merge:V2DF
+	     (vec_duplicate:V2DF
+	       (op:DF (vec_select:DF (reg/v:V2DF 85 [ x ])
+				     (parallel [ (const_int 0 [0])]))
+			 (reg:DF 87))
+	     (reg/v:V2DF 85 [ x ])
+	     (const_int 1 [0x1])]))
+
+           */
+	if (GET_CODE (trueop1) == VEC_SELECT
+	    && XVECLEN (XEXP (trueop1, 1), 0) == 1
+	    && INTVAL (XVECEXP (XEXP (trueop1, 1), 0, 0)) == 1
+	    && GET_MODE (XEXP (trueop1, 0)) == mode
+	    && op0_mode == GET_MODE_INNER (mode)
+	    && (GET_CODE (trueop0) == PLUS
+		|| GET_CODE (trueop0) == MINUS
+		|| GET_CODE (trueop0) == MULT
+		|| GET_CODE (trueop0) == DIV)
+	    && GET_CODE (XEXP (trueop0, 0)) == VEC_SELECT
+	    && rtx_equal_p (XEXP (trueop1, 0), XEXP (XEXP (trueop0, 0), 0))
+	    && XVECLEN (XEXP (XEXP (trueop0, 0), 1), 0) == 1
+	    && INTVAL (XVECEXP (XEXP (XEXP (trueop0, 0), 1), 0, 0)) == 0)
+	  {
+	    op0 = XEXP (trueop1, 0);
+	    op1 = XEXP (trueop0, 1);
+	    rtvec vec = rtvec_alloc (1);
+	    RTVEC_ELT (vec, 0) = const0_rtx;
+	    rtx op2 = simplify_gen_binary (VEC_SELECT, op0_mode, op0,
+					   gen_rtx_PARALLEL (VOIDmode,
+							     vec));
+	    op2 = simplify_gen_binary (GET_CODE (trueop0),
+				       op0_mode, op2, op1);
+	    op2 = gen_rtx_VEC_DUPLICATE (mode, op2);
+	    return simplify_gen_ternary (VEC_MERGE, mode, mode, op2,
+					 op0, GEN_INT (1));
+	  }
       }
       return 0;
 
diff --git a/gcc/testsuite/gcc.target/i386/pr54855-1.c b/gcc/testsuite/gcc.target/i386/pr54855-1.c
new file mode 100644
index 00000000000..693aafa09ab
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr54855-1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mfpmath=sse" } */
+/* { dg-final { scan-assembler-times "addsd" 1 } } */
+/* { dg-final { scan-assembler-not "movapd" } } */
+/* { dg-final { scan-assembler-not "movsd" } } */
+
+typedef double __v2df __attribute__ ((__vector_size__ (16)));
+typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
+
+__m128d
+_mm_add_sd (__m128d x, __m128d y)
+{
+  __m128d z =  __extension__ (__m128d)(__v2df)
+    { (((__v2df) x)[0] + ((__v2df) y)[0]), ((__v2df) x)[1] };
+  return z;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr54855-2.c b/gcc/testsuite/gcc.target/i386/pr54855-2.c
new file mode 100644
index 00000000000..20c6f8eb529
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr54855-2.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mfpmath=sse" } */
+/* { dg-final { scan-assembler-times "mulsd" 1 } } */
+/* { dg-final { scan-assembler-not "movapd" } } */
+/* { dg-final { scan-assembler-not "movsd" } } */
+
+typedef double __v2df __attribute__ ((__vector_size__ (16)));
+
+__v2df
+_mm_mul_sd (__v2df x, __v2df y)
+{
+  __v2df z = x;
+  z[0] = x[0] * y[0];
+  return z;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr54855-3.c b/gcc/testsuite/gcc.target/i386/pr54855-3.c
new file mode 100644
index 00000000000..3c15dfc93d1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr54855-3.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mfpmath=sse" } */
+/* { dg-final { scan-assembler-times "subsd" 1 } } */
+/* { dg-final { scan-assembler-not "movapd" } } */
+/* { dg-final { scan-assembler-not "movsd" } } */
+
+typedef double vec __attribute__((vector_size(16)));
+
+vec
+foo (vec x)
+{
+  x[0] -= 1.;
+  return x;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr54855-4.c b/gcc/testsuite/gcc.target/i386/pr54855-4.c
new file mode 100644
index 00000000000..32eb28e852a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr54855-4.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mfpmath=sse" } */
+/* { dg-final { scan-assembler-times "subsd" 1 } } */
+/* { dg-final { scan-assembler-not "movapd" } } */
+/* { dg-final { scan-assembler-not "movsd" } } */
+
+typedef double vec __attribute__((vector_size(16)));
+
+vec
+foo (vec x, double a)
+{
+  x[0] -= a;
+  return x;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr54855-5.c b/gcc/testsuite/gcc.target/i386/pr54855-5.c
new file mode 100644
index 00000000000..e06999074e0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr54855-5.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mfpmath=sse" } */
+/* { dg-final { scan-assembler-times "subsd" 1 } } */
+/* { dg-final { scan-assembler-times "mulpd" 1 } } */
+/* { dg-final { scan-assembler-not "movapd" } } */
+/* { dg-final { scan-assembler-not "movsd" } } */
+
+typedef double __v2df __attribute__ ((__vector_size__ (16)));
+
+__v2df
+foo (__v2df x, __v2df y)
+{
+  x[0] -= y[0];
+  x *= y;
+  return x;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr54855-6.c b/gcc/testsuite/gcc.target/i386/pr54855-6.c
new file mode 100644
index 00000000000..8f44d17b6d8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr54855-6.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mfpmath=sse" } */
+/* { dg-final { scan-assembler-times "divss" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movss" } } */
+
+typedef float vec __attribute__((vector_size(16)));
+
+vec
+foo (vec x, float f)
+{
+  x[0] /= f;
+  return x;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr54855-7.c b/gcc/testsuite/gcc.target/i386/pr54855-7.c
new file mode 100644
index 00000000000..a551bd5c92f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr54855-7.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse -mfpmath=sse" } */
+/* { dg-final { scan-assembler-times "divss" 1 } } */
+/* { dg-final { scan-assembler-not "movaps" } } */
+/* { dg-final { scan-assembler-not "movss" } } */
+
+typedef float vec __attribute__((vector_size(16)));
+
+vec
+foo (vec x)
+{
+  x[0] /= 2.1f;
+  return x;
+}
-- 
2.20.1

^ permalink raw reply	[flat|nested] 7+ messages in thread

* PING^1: [PATCH] i386: Add standard scalar operation patterns
  2019-02-07 17:49 [PATCH] i386: Add standard scalar operation patterns H.J. Lu
@ 2019-05-15 19:16 ` H.J. Lu
  2019-05-15 21:29   ` Richard Sandiford
  0 siblings, 1 reply; 7+ messages in thread
From: H.J. Lu @ 2019-05-15 19:16 UTC (permalink / raw)
  To: GCC Patches; +Cc: Uros Bizjak, Marc Glisse, Richard Sandiford

On Thu, Feb 7, 2019 at 9:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> Standard scalar operation patterns which preserve the rest of the vector
> look like
>
>      (vec_merge:V2DF
>        (vec_duplicate:V2DF
>          (op:DF (vec_select:DF (reg/v:V2DF 85 [ x ])
>                 (parallel [ (const_int 0 [0])]))
>          (reg:DF 87))
>        (reg/v:V2DF 85 [ x ])
>        (const_int 1 [0x1])]))
>
> Add such pattens to i386 backend and convert VEC_CONCAT patterns to
> standard standard scalar operation patterns.
>
> gcc/
>
>         PR target/54855
>         * simplify-rtx.c (simplify_binary_operation_1): Convert
>         VEC_CONCAT patterns to standard standard scalar operation
>         patterns.
>         * config/i386/sse.md (*<sse>_vm<plusminus_insn><mode>3): New.
>         (*<sse>_vm<multdiv_mnemonic><mode>3): Likewise.
>
> gcc/testsuite/
>
>         PR target/54855
>         * gcc.target/i386/pr54855-1.c: New test.
>         * gcc.target/i386/pr54855-2.c: Likewise.
>         * gcc.target/i386/pr54855-3.c: Likewise.
>         * gcc.target/i386/pr54855-4.c: Likewise.
>         * gcc.target/i386/pr54855-5.c: Likewise.
>         * gcc.target/i386/pr54855-6.c: Likewise.
>         * gcc.target/i386/pr54855-7.c: Likewise.

PING:

https://gcc.gnu.org/ml/gcc-patches/2019-02/msg00398.html


-- 
H.J.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: PING^1: [PATCH] i386: Add standard scalar operation patterns
  2019-05-15 19:16 ` PING^1: " H.J. Lu
@ 2019-05-15 21:29   ` Richard Sandiford
  2019-05-21 15:54     ` [PATCH] i386: Generate standard floating point " H.J. Lu
  0 siblings, 1 reply; 7+ messages in thread
From: Richard Sandiford @ 2019-05-15 21:29 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GCC Patches, Uros Bizjak, Marc Glisse

"H.J. Lu" <hjl.tools@gmail.com> writes:
> On Thu, Feb 7, 2019 at 9:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>>
>> Standard scalar operation patterns which preserve the rest of the vector
>> look like
>>
>>      (vec_merge:V2DF
>>        (vec_duplicate:V2DF
>>          (op:DF (vec_select:DF (reg/v:V2DF 85 [ x ])
>>                 (parallel [ (const_int 0 [0])]))
>>          (reg:DF 87))
>>        (reg/v:V2DF 85 [ x ])
>>        (const_int 1 [0x1])]))
>>
>> Add such pattens to i386 backend and convert VEC_CONCAT patterns to
>> standard standard scalar operation patterns.

It looks like there's some variety in the patterns used, e.g.:

(define_insn "<sse>_vm<code><mode>3<mask_scalar_name><round_saeonly_scalar_name>"
  [(set (match_operand:VF_128 0 "register_operand" "=x,v")
	(vec_merge:VF_128
	  (smaxmin:VF_128
	    (match_operand:VF_128 1 "register_operand" "0,v")
	    (match_operand:VF_128 2 "vector_operand" "xBm,<round_saeonly_scalar_constraint>"))
	 (match_dup 1)
	 (const_int 1)))]
  "TARGET_SSE"
  "@
   <maxmin_float><ssescalarmodesuffix>\t{%2, %0|%0, %<iptr>2}
   v<maxmin_float><ssescalarmodesuffix>\t{<round_saeonly_scalar_mask_op3>%2, %1, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %1, %<iptr>2<round_saeonly_scalar_mask_op3>}"
  [(set_attr "isa" "noavx,avx")
   (set_attr "type" "sse")
   (set_attr "btver2_sse_attr" "maxmin")
   (set_attr "prefix" "<round_saeonly_scalar_prefix>")
   (set_attr "mode" "<ssescalarmode>")])

makes the operand a full vector operation, which seems simpler.
The above would then be:

      (vec_merge:V2DF
	(op:V2DF
	  (reg:V2DF 85)
          (vec_duplicate:V2DF (reg:DF 87)))
        (reg/v:V2DF 85 [ x ])
        (const_int 1 [0x1])]))

I guess technically the two have different faulting behaviour though,
since the smaxmin gets applied to all elements, not just element 0.

The patch seems very specific.  E.g. why just PLUS, MINUS, MULT and DIV?

Thanks,
Richard


>>
>> gcc/
>>
>>         PR target/54855
>>         * simplify-rtx.c (simplify_binary_operation_1): Convert
>>         VEC_CONCAT patterns to standard standard scalar operation
>>         patterns.
>>         * config/i386/sse.md (*<sse>_vm<plusminus_insn><mode>3): New.
>>         (*<sse>_vm<multdiv_mnemonic><mode>3): Likewise.
>>
>> gcc/testsuite/
>>
>>         PR target/54855
>>         * gcc.target/i386/pr54855-1.c: New test.
>>         * gcc.target/i386/pr54855-2.c: Likewise.
>>         * gcc.target/i386/pr54855-3.c: Likewise.
>>         * gcc.target/i386/pr54855-4.c: Likewise.
>>         * gcc.target/i386/pr54855-5.c: Likewise.
>>         * gcc.target/i386/pr54855-6.c: Likewise.
>>         * gcc.target/i386/pr54855-7.c: Likewise.
>
> PING:
>
> https://gcc.gnu.org/ml/gcc-patches/2019-02/msg00398.html

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH] i386: Generate standard floating point scalar operation patterns
  2019-05-15 21:29   ` Richard Sandiford
@ 2019-05-21 15:54     ` H.J. Lu
  2019-06-03 22:51       ` PING^1: " H.J. Lu
  0 siblings, 1 reply; 7+ messages in thread
From: H.J. Lu @ 2019-05-21 15:54 UTC (permalink / raw)
  To: GCC Patches, Uros Bizjak, Marc Glisse, Richard Sandiford

[-- Attachment #1: Type: text/plain, Size: 4036 bytes --]

On Wed, May 15, 2019 at 2:29 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> "H.J. Lu" <hjl.tools@gmail.com> writes:
> > On Thu, Feb 7, 2019 at 9:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >>
> >> Standard scalar operation patterns which preserve the rest of the vector
> >> look like
> >>
> >>      (vec_merge:V2DF
> >>        (vec_duplicate:V2DF
> >>          (op:DF (vec_select:DF (reg/v:V2DF 85 [ x ])
> >>                 (parallel [ (const_int 0 [0])]))
> >>          (reg:DF 87))
> >>        (reg/v:V2DF 85 [ x ])
> >>        (const_int 1 [0x1])]))
> >>
> >> Add such pattens to i386 backend and convert VEC_CONCAT patterns to
> >> standard standard scalar operation patterns.
>
> It looks like there's some variety in the patterns used, e.g.:
>
> (define_insn "<sse>_vm<code><mode>3<mask_scalar_name><round_saeonly_scalar_name>"
>   [(set (match_operand:VF_128 0 "register_operand" "=x,v")
>         (vec_merge:VF_128
>           (smaxmin:VF_128
>             (match_operand:VF_128 1 "register_operand" "0,v")
>             (match_operand:VF_128 2 "vector_operand" "xBm,<round_saeonly_scalar_constraint>"))
>          (match_dup 1)
>          (const_int 1)))]
>   "TARGET_SSE"
>   "@
>    <maxmin_float><ssescalarmodesuffix>\t{%2, %0|%0, %<iptr>2}
>    v<maxmin_float><ssescalarmodesuffix>\t{<round_saeonly_scalar_mask_op3>%2, %1, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %1, %<iptr>2<round_saeonly_scalar_mask_op3>}"
>   [(set_attr "isa" "noavx,avx")
>    (set_attr "type" "sse")
>    (set_attr "btver2_sse_attr" "maxmin")
>    (set_attr "prefix" "<round_saeonly_scalar_prefix>")
>    (set_attr "mode" "<ssescalarmode>")])
>
> makes the operand a full vector operation, which seems simpler.

This pattern is used to implement scalar smaxmin intrinsics.

> The above would then be:
>
>       (vec_merge:V2DF
>         (op:V2DF
>           (reg:V2DF 85)
>           (vec_duplicate:V2DF (reg:DF 87)))
>         (reg/v:V2DF 85 [ x ])
>         (const_int 1 [0x1])]))
>
> I guess technically the two have different faulting behaviour though,
> since the smaxmin gets applied to all elements, not just element 0.

This is the issue.   We don't use the correct mode for scalar instructions:

---
#include <immintrin.h>

__m128d
foo1 (__m128d x, double *p)
{
  __m128d y = _mm_load_sd (p);
  return _mm_max_pd (x, y);
}
---

movq (%rdi), %xmm1
maxpd %xmm1, %xmm0
ret


Here is the updated patch to add standard floating point scalar
operation patterns to i386 backend.    Then we can do

---
#include <immintrin.h>

extern __inline __m128d __attribute__((__gnu_inline__,
__always_inline__, __artificial__))
_new_mm_max_pd (__m128d __A, __m128d __B)
{
  __A[0] = __A[0] > __B[0] ? __A[0] : __B[0];
  return __A;
}

__m128d
foo2 (__m128d x, double *p)
{
  __m128d y = _mm_load_sd (p);
  return _new_mm_max_pd (x, y);
}

maxsd (%rdi), %xmm0
ret

We should use generic vector operations to implement i386 intrinsics
as much as we can.

> The patch seems very specific.  E.g. why just PLUS, MINUS, MULT and DIV?

This patch only adds  +, -, *, /, > and <.    We can add more if there
are testcases
for them.

> Thanks,
> Richard
>
>
> >>
> >> gcc/
> >>
> >>         PR target/54855
> >>         * simplify-rtx.c (simplify_binary_operation_1): Convert
> >>         VEC_CONCAT patterns to standard standard scalar operation
> >>         patterns.
> >>         * config/i386/sse.md (*<sse>_vm<plusminus_insn><mode>3): New.
> >>         (*<sse>_vm<multdiv_mnemonic><mode>3): Likewise.
> >>
> >> gcc/testsuite/
> >>
> >>         PR target/54855
> >>         * gcc.target/i386/pr54855-1.c: New test.
> >>         * gcc.target/i386/pr54855-2.c: Likewise.
> >>         * gcc.target/i386/pr54855-3.c: Likewise.
> >>         * gcc.target/i386/pr54855-4.c: Likewise.
> >>         * gcc.target/i386/pr54855-5.c: Likewise.
> >>         * gcc.target/i386/pr54855-6.c: Likewise.
> >>         * gcc.target/i386/pr54855-7.c: Likewise.
> >
> > PING:
> >
> > https://gcc.gnu.org/ml/gcc-patches/2019-02/msg00398.html

Thanks.

-- 
H.J.

[-- Attachment #2: 0001-i386-Generate-standard-floating-point-scalar-operati.patch --]
[-- Type: application/x-patch, Size: 14321 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* PING^1: [PATCH] i386: Generate standard floating point scalar operation patterns
  2019-05-21 15:54     ` [PATCH] i386: Generate standard floating point " H.J. Lu
@ 2019-06-03 22:51       ` H.J. Lu
  2019-06-18 16:01         ` PING^2: " H.J. Lu
  2019-06-19 19:21         ` PING^1: " Jeff Law
  0 siblings, 2 replies; 7+ messages in thread
From: H.J. Lu @ 2019-06-03 22:51 UTC (permalink / raw)
  To: GCC Patches, Marc Glisse, Richard Sandiford; +Cc: Uros Bizjak, Jan Hubicka

On Tue, May 21, 2019 at 8:54 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, May 15, 2019 at 2:29 PM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
> >
> > "H.J. Lu" <hjl.tools@gmail.com> writes:
> > > On Thu, Feb 7, 2019 at 9:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >>
> > >> Standard scalar operation patterns which preserve the rest of the vector
> > >> look like
> > >>
> > >>      (vec_merge:V2DF
> > >>        (vec_duplicate:V2DF
> > >>          (op:DF (vec_select:DF (reg/v:V2DF 85 [ x ])
> > >>                 (parallel [ (const_int 0 [0])]))
> > >>          (reg:DF 87))
> > >>        (reg/v:V2DF 85 [ x ])
> > >>        (const_int 1 [0x1])]))
> > >>
> > >> Add such pattens to i386 backend and convert VEC_CONCAT patterns to
> > >> standard standard scalar operation patterns.
> >
> > It looks like there's some variety in the patterns used, e.g.:
> >
> > (define_insn "<sse>_vm<code><mode>3<mask_scalar_name><round_saeonly_scalar_name>"
> >   [(set (match_operand:VF_128 0 "register_operand" "=x,v")
> >         (vec_merge:VF_128
> >           (smaxmin:VF_128
> >             (match_operand:VF_128 1 "register_operand" "0,v")
> >             (match_operand:VF_128 2 "vector_operand" "xBm,<round_saeonly_scalar_constraint>"))
> >          (match_dup 1)
> >          (const_int 1)))]
> >   "TARGET_SSE"
> >   "@
> >    <maxmin_float><ssescalarmodesuffix>\t{%2, %0|%0, %<iptr>2}
> >    v<maxmin_float><ssescalarmodesuffix>\t{<round_saeonly_scalar_mask_op3>%2, %1, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %1, %<iptr>2<round_saeonly_scalar_mask_op3>}"
> >   [(set_attr "isa" "noavx,avx")
> >    (set_attr "type" "sse")
> >    (set_attr "btver2_sse_attr" "maxmin")
> >    (set_attr "prefix" "<round_saeonly_scalar_prefix>")
> >    (set_attr "mode" "<ssescalarmode>")])
> >
> > makes the operand a full vector operation, which seems simpler.
>
> This pattern is used to implement scalar smaxmin intrinsics.
>
> > The above would then be:
> >
> >       (vec_merge:V2DF
> >         (op:V2DF
> >           (reg:V2DF 85)
> >           (vec_duplicate:V2DF (reg:DF 87)))
> >         (reg/v:V2DF 85 [ x ])
> >         (const_int 1 [0x1])]))
> >
> > I guess technically the two have different faulting behaviour though,
> > since the smaxmin gets applied to all elements, not just element 0.
>
> This is the issue.   We don't use the correct mode for scalar instructions:
>
> ---
> #include <immintrin.h>
>
> __m128d
> foo1 (__m128d x, double *p)
> {
>   __m128d y = _mm_load_sd (p);
>   return _mm_max_pd (x, y);
> }
> ---
>
> movq (%rdi), %xmm1
> maxpd %xmm1, %xmm0
> ret
>
>
> Here is the updated patch to add standard floating point scalar
> operation patterns to i386 backend.    Then we can do
>
> ---
> #include <immintrin.h>
>
> extern __inline __m128d __attribute__((__gnu_inline__,
> __always_inline__, __artificial__))
> _new_mm_max_pd (__m128d __A, __m128d __B)
> {
>   __A[0] = __A[0] > __B[0] ? __A[0] : __B[0];
>   return __A;
> }
>
> __m128d
> foo2 (__m128d x, double *p)
> {
>   __m128d y = _mm_load_sd (p);
>   return _new_mm_max_pd (x, y);
> }
>
> maxsd (%rdi), %xmm0
> ret
>
> We should use generic vector operations to implement i386 intrinsics
> as much as we can.
>
> > The patch seems very specific.  E.g. why just PLUS, MINUS, MULT and DIV?
>
> This patch only adds  +, -, *, /, > and <.    We can add more if there
> are testcases
> for them.
>
> > Thanks,
> > Richard
> >
> >
> > >>
> > >> gcc/
> > >>
> > >>         PR target/54855
> > >>         * simplify-rtx.c (simplify_binary_operation_1): Convert
> > >>         VEC_CONCAT patterns to standard standard scalar operation
> > >>         patterns.
> > >>         * config/i386/sse.md (*<sse>_vm<plusminus_insn><mode>3): New.
> > >>         (*<sse>_vm<multdiv_mnemonic><mode>3): Likewise.
> > >>
> > >> gcc/testsuite/
> > >>
> > >>         PR target/54855
> > >>         * gcc.target/i386/pr54855-1.c: New test.
> > >>         * gcc.target/i386/pr54855-2.c: Likewise.
> > >>         * gcc.target/i386/pr54855-3.c: Likewise.
> > >>         * gcc.target/i386/pr54855-4.c: Likewise.
> > >>         * gcc.target/i386/pr54855-5.c: Likewise.
> > >>         * gcc.target/i386/pr54855-6.c: Likewise.
> > >>         * gcc.target/i386/pr54855-7.c: Likewise.
> > >
> > > PING:
> > >
> > > https://gcc.gnu.org/ml/gcc-patches/2019-02/msg00398.html
>
> Thanks.
>

PING:

https://gcc.gnu.org/ml/gcc-patches/2019-05/msg01416.html

-- 
H.J.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* PING^2: [PATCH] i386: Generate standard floating point scalar operation patterns
  2019-06-03 22:51       ` PING^1: " H.J. Lu
@ 2019-06-18 16:01         ` H.J. Lu
  2019-06-19 19:21         ` PING^1: " Jeff Law
  1 sibling, 0 replies; 7+ messages in thread
From: H.J. Lu @ 2019-06-18 16:01 UTC (permalink / raw)
  To: GCC Patches, Marc Glisse, Richard Sandiford
  Cc: Uros Bizjak, Jan Hubicka, Jakub Jelinek, Jeffrey Law

On Mon, Jun 3, 2019 at 3:50 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, May 21, 2019 at 8:54 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Wed, May 15, 2019 at 2:29 PM Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> > >
> > > "H.J. Lu" <hjl.tools@gmail.com> writes:
> > > > On Thu, Feb 7, 2019 at 9:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >>
> > > >> Standard scalar operation patterns which preserve the rest of the vector
> > > >> look like
> > > >>
> > > >>      (vec_merge:V2DF
> > > >>        (vec_duplicate:V2DF
> > > >>          (op:DF (vec_select:DF (reg/v:V2DF 85 [ x ])
> > > >>                 (parallel [ (const_int 0 [0])]))
> > > >>          (reg:DF 87))
> > > >>        (reg/v:V2DF 85 [ x ])
> > > >>        (const_int 1 [0x1])]))
> > > >>
> > > >> Add such pattens to i386 backend and convert VEC_CONCAT patterns to
> > > >> standard standard scalar operation patterns.
> > >
> > > It looks like there's some variety in the patterns used, e.g.:
> > >
> > > (define_insn "<sse>_vm<code><mode>3<mask_scalar_name><round_saeonly_scalar_name>"
> > >   [(set (match_operand:VF_128 0 "register_operand" "=x,v")
> > >         (vec_merge:VF_128
> > >           (smaxmin:VF_128
> > >             (match_operand:VF_128 1 "register_operand" "0,v")
> > >             (match_operand:VF_128 2 "vector_operand" "xBm,<round_saeonly_scalar_constraint>"))
> > >          (match_dup 1)
> > >          (const_int 1)))]
> > >   "TARGET_SSE"
> > >   "@
> > >    <maxmin_float><ssescalarmodesuffix>\t{%2, %0|%0, %<iptr>2}
> > >    v<maxmin_float><ssescalarmodesuffix>\t{<round_saeonly_scalar_mask_op3>%2, %1, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %1, %<iptr>2<round_saeonly_scalar_mask_op3>}"
> > >   [(set_attr "isa" "noavx,avx")
> > >    (set_attr "type" "sse")
> > >    (set_attr "btver2_sse_attr" "maxmin")
> > >    (set_attr "prefix" "<round_saeonly_scalar_prefix>")
> > >    (set_attr "mode" "<ssescalarmode>")])
> > >
> > > makes the operand a full vector operation, which seems simpler.
> >
> > This pattern is used to implement scalar smaxmin intrinsics.
> >
> > > The above would then be:
> > >
> > >       (vec_merge:V2DF
> > >         (op:V2DF
> > >           (reg:V2DF 85)
> > >           (vec_duplicate:V2DF (reg:DF 87)))
> > >         (reg/v:V2DF 85 [ x ])
> > >         (const_int 1 [0x1])]))
> > >
> > > I guess technically the two have different faulting behaviour though,
> > > since the smaxmin gets applied to all elements, not just element 0.
> >
> > This is the issue.   We don't use the correct mode for scalar instructions:
> >
> > ---
> > #include <immintrin.h>
> >
> > __m128d
> > foo1 (__m128d x, double *p)
> > {
> >   __m128d y = _mm_load_sd (p);
> >   return _mm_max_pd (x, y);
> > }
> > ---
> >
> > movq (%rdi), %xmm1
> > maxpd %xmm1, %xmm0
> > ret
> >
> >
> > Here is the updated patch to add standard floating point scalar
> > operation patterns to i386 backend.    Then we can do
> >
> > ---
> > #include <immintrin.h>
> >
> > extern __inline __m128d __attribute__((__gnu_inline__,
> > __always_inline__, __artificial__))
> > _new_mm_max_pd (__m128d __A, __m128d __B)
> > {
> >   __A[0] = __A[0] > __B[0] ? __A[0] : __B[0];
> >   return __A;
> > }
> >
> > __m128d
> > foo2 (__m128d x, double *p)
> > {
> >   __m128d y = _mm_load_sd (p);
> >   return _new_mm_max_pd (x, y);
> > }
> >
> > maxsd (%rdi), %xmm0
> > ret
> >
> > We should use generic vector operations to implement i386 intrinsics
> > as much as we can.
> >
> > > The patch seems very specific.  E.g. why just PLUS, MINUS, MULT and DIV?
> >
> > This patch only adds  +, -, *, /, > and <.    We can add more if there
> > are testcases
> > for them.
> >
> > > Thanks,
> > > Richard
> > >
> > >
> > > >>
> > > >> gcc/
> > > >>
> > > >>         PR target/54855
> > > >>         * simplify-rtx.c (simplify_binary_operation_1): Convert
> > > >>         VEC_CONCAT patterns to standard standard scalar operation
> > > >>         patterns.
> > > >>         * config/i386/sse.md (*<sse>_vm<plusminus_insn><mode>3): New.
> > > >>         (*<sse>_vm<multdiv_mnemonic><mode>3): Likewise.
> > > >>
> > > >> gcc/testsuite/
> > > >>
> > > >>         PR target/54855
> > > >>         * gcc.target/i386/pr54855-1.c: New test.
> > > >>         * gcc.target/i386/pr54855-2.c: Likewise.
> > > >>         * gcc.target/i386/pr54855-3.c: Likewise.
> > > >>         * gcc.target/i386/pr54855-4.c: Likewise.
> > > >>         * gcc.target/i386/pr54855-5.c: Likewise.
> > > >>         * gcc.target/i386/pr54855-6.c: Likewise.
> > > >>         * gcc.target/i386/pr54855-7.c: Likewise.
> > > >
> > > > PING:
> > > >
> > > > https://gcc.gnu.org/ml/gcc-patches/2019-02/msg00398.html
> >
> > Thanks.
> >
>
> PING:
>
> https://gcc.gnu.org/ml/gcc-patches/2019-05/msg01416.html
>

PING.

-- 
H.J.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: PING^1: [PATCH] i386: Generate standard floating point scalar operation patterns
  2019-06-03 22:51       ` PING^1: " H.J. Lu
  2019-06-18 16:01         ` PING^2: " H.J. Lu
@ 2019-06-19 19:21         ` Jeff Law
  1 sibling, 0 replies; 7+ messages in thread
From: Jeff Law @ 2019-06-19 19:21 UTC (permalink / raw)
  To: H.J. Lu, GCC Patches, Marc Glisse, Richard Sandiford
  Cc: Uros Bizjak, Jan Hubicka

On 6/3/19 4:50 PM, H.J. Lu wrote:
> On Tue, May 21, 2019 at 8:54 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>>
>> On Wed, May 15, 2019 at 2:29 PM Richard Sandiford
>> <richard.sandiford@arm.com> wrote:
>>>
>>> "H.J. Lu" <hjl.tools@gmail.com> writes:
>>>> On Thu, Feb 7, 2019 at 9:49 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>>>>>
>>>>> Standard scalar operation patterns which preserve the rest of the vector
>>>>> look like
>>>>>
>>>>>      (vec_merge:V2DF
>>>>>        (vec_duplicate:V2DF
>>>>>          (op:DF (vec_select:DF (reg/v:V2DF 85 [ x ])
>>>>>                 (parallel [ (const_int 0 [0])]))
>>>>>          (reg:DF 87))
>>>>>        (reg/v:V2DF 85 [ x ])
>>>>>        (const_int 1 [0x1])]))
>>>>>
>>>>> Add such pattens to i386 backend and convert VEC_CONCAT patterns to
>>>>> standard standard scalar operation patterns.
>>>
>>> It looks like there's some variety in the patterns used, e.g.:
>>>
>>> (define_insn "<sse>_vm<code><mode>3<mask_scalar_name><round_saeonly_scalar_name>"
>>>   [(set (match_operand:VF_128 0 "register_operand" "=x,v")
>>>         (vec_merge:VF_128
>>>           (smaxmin:VF_128
>>>             (match_operand:VF_128 1 "register_operand" "0,v")
>>>             (match_operand:VF_128 2 "vector_operand" "xBm,<round_saeonly_scalar_constraint>"))
>>>          (match_dup 1)
>>>          (const_int 1)))]
>>>   "TARGET_SSE"
>>>   "@
>>>    <maxmin_float><ssescalarmodesuffix>\t{%2, %0|%0, %<iptr>2}
>>>    v<maxmin_float><ssescalarmodesuffix>\t{<round_saeonly_scalar_mask_op3>%2, %1, %0<mask_scalar_operand3>|%0<mask_scalar_operand3>, %1, %<iptr>2<round_saeonly_scalar_mask_op3>}"
>>>   [(set_attr "isa" "noavx,avx")
>>>    (set_attr "type" "sse")
>>>    (set_attr "btver2_sse_attr" "maxmin")
>>>    (set_attr "prefix" "<round_saeonly_scalar_prefix>")
>>>    (set_attr "mode" "<ssescalarmode>")])
>>>
>>> makes the operand a full vector operation, which seems simpler.
>>
>> This pattern is used to implement scalar smaxmin intrinsics.
>>
>>> The above would then be:
>>>
>>>       (vec_merge:V2DF
>>>         (op:V2DF
>>>           (reg:V2DF 85)
>>>           (vec_duplicate:V2DF (reg:DF 87)))
>>>         (reg/v:V2DF 85 [ x ])
>>>         (const_int 1 [0x1])]))
>>>
>>> I guess technically the two have different faulting behaviour though,
>>> since the smaxmin gets applied to all elements, not just element 0.
>>
>> This is the issue.   We don't use the correct mode for scalar instructions:
>>
>> ---
>> #include <immintrin.h>
>>
>> __m128d
>> foo1 (__m128d x, double *p)
>> {
>>   __m128d y = _mm_load_sd (p);
>>   return _mm_max_pd (x, y);
>> }
>> ---
>>
>> movq (%rdi), %xmm1
>> maxpd %xmm1, %xmm0
>> ret
>>
>>
>> Here is the updated patch to add standard floating point scalar
>> operation patterns to i386 backend.    Then we can do
>>
>> ---
>> #include <immintrin.h>
>>
>> extern __inline __m128d __attribute__((__gnu_inline__,
>> __always_inline__, __artificial__))
>> _new_mm_max_pd (__m128d __A, __m128d __B)
>> {
>>   __A[0] = __A[0] > __B[0] ? __A[0] : __B[0];
>>   return __A;
>> }
>>
>> __m128d
>> foo2 (__m128d x, double *p)
>> {
>>   __m128d y = _mm_load_sd (p);
>>   return _new_mm_max_pd (x, y);
>> }
>>
>> maxsd (%rdi), %xmm0
>> ret
>>
>> We should use generic vector operations to implement i386 intrinsics
>> as much as we can.
>>
>>> The patch seems very specific.  E.g. why just PLUS, MINUS, MULT and DIV?
>>
>> This patch only adds  +, -, *, /, > and <.    We can add more if there
>> are testcases
>> for them.
>>
>>> Thanks,
>>> Richard
>>>
>>>
>>>>>
>>>>> gcc/
>>>>>
>>>>>         PR target/54855
>>>>>         * simplify-rtx.c (simplify_binary_operation_1): Convert
>>>>>         VEC_CONCAT patterns to standard standard scalar operation
>>>>>         patterns.
>>>>>         * config/i386/sse.md (*<sse>_vm<plusminus_insn><mode>3): New.
>>>>>         (*<sse>_vm<multdiv_mnemonic><mode>3): Likewise.
>>>>>
>>>>> gcc/testsuite/
>>>>>
>>>>>         PR target/54855
>>>>>         * gcc.target/i386/pr54855-1.c: New test.
>>>>>         * gcc.target/i386/pr54855-2.c: Likewise.
>>>>>         * gcc.target/i386/pr54855-3.c: Likewise.
>>>>>         * gcc.target/i386/pr54855-4.c: Likewise.
>>>>>         * gcc.target/i386/pr54855-5.c: Likewise.
>>>>>         * gcc.target/i386/pr54855-6.c: Likewise.
>>>>>         * gcc.target/i386/pr54855-7.c: Likewise.
>>>>
>>>> PING:
>>>>
>>>> https://gcc.gnu.org/ml/gcc-patches/2019-02/msg00398.html
>>
>> Thanks.
>>
> 
> PING:
> 
> https://gcc.gnu.org/ml/gcc-patches/2019-05/msg01416.html
The simplify-rtx changes are OK as are the x86 backend changes (either
the original version that just handled basic arithmetic operators or the
subsequent one that added support for minmax and setv2df_0.

Jeff

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2019-06-19 19:21 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-02-07 17:49 [PATCH] i386: Add standard scalar operation patterns H.J. Lu
2019-05-15 19:16 ` PING^1: " H.J. Lu
2019-05-15 21:29   ` Richard Sandiford
2019-05-21 15:54     ` [PATCH] i386: Generate standard floating point " H.J. Lu
2019-06-03 22:51       ` PING^1: " H.J. Lu
2019-06-18 16:01         ` PING^2: " H.J. Lu
2019-06-19 19:21         ` PING^1: " Jeff Law

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).