public inbox for gcc-patches@gcc.gnu.org
* [PATCH] match.pd: Improve fneg/fadd optimization [PR109240]
@ 2023-04-18  8:50 Jakub Jelinek
  2023-04-18  8:57 ` Richard Biener
  0 siblings, 1 reply; 2+ messages in thread
From: Jakub Jelinek @ 2023-04-18  8:50 UTC (permalink / raw)
  To: Richard Biener, Richard Sandiford; +Cc: gcc-patches

Hi!

match.pd has, mostly for AArch64, an optimization which turns certain
forms of __builtin_shuffle of x + y and x - y vectors into an fneg
using twice as wide an element type, so that every other sign is
changed, followed by an fadd.
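
As an illustration (not from the patch; it is f1 from the new tests
below with the two operations swapped), the form handled so far adds
the even lanes and subtracts the odd ones:

  void
  addsub (float *restrict a, float *restrict b, float *restrict res, int n)
  {
    for (int i = 0; i < (n & -4); i += 2)
      {
        res[i + 0] = a[i + 0] + b[i + 0];  /* even lanes: add */
        res[i + 1] = a[i + 1] - b[i + 1];  /* odd lanes: sub */
      }
  }

With -Ofast on little-endian AArch64, b can be reinterpreted as a
vector of doubles and fneg-ed; that flips only the sign bit in the high
half of each double, i.e. of every other float lane, and a single fadd
then yields the add/sub interleave.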

The following patch extends that optimization so that it can handle
other forms as well, using the same fneg but an fsub instead of an fadd.
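
For example, f1 in the new addsub_2.c tests below is exactly this
shape:

  void f1 (float *restrict a, float *restrict b, float *res, int n)
  {
     for (int i = 0; i < (n & -4); i+=2)
      {
        res[i+0] = a[i+0] - b[i+0];
        res[i+1] = a[i+1] + b[i+1];
      }
  }

and now compiles to the wide fneg followed by a single fsub rather than
a vec_perm of the two result vectors.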

As plus is commutative and minus is not, and I want to handle vec_perm
with both plus/minus and minus/plus operand order, preferably in one
pattern, I had to do the matching operand checks by hand.
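
(E.g. for a V4SF result the permute mask matched is { 0, 5, 2, 7 }:
even result lanes are taken from the first vec_perm operand and odd
ones from the second, and the operand_equal_p checks verify that the
plus and the minus operate on the same two vectors, in either order.)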

Bootstrapped/regtested on aarch64-linux, x86_64-linux and i686-linux,
ok for trunk?

2023-04-18  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/109240
	* match.pd (fneg/fadd): Rewrite such that it handles both plus as
	first vec_perm operand and minus as second using fneg/fadd and
	minus as first vec_perm operand and plus as second using fneg/fsub.

	* gcc.target/aarch64/simd/addsub_2.c: New test.
	* gcc.target/aarch64/sve/addsub_2.c: New test.

--- gcc/match.pd.jj	2023-03-21 19:59:40.209634256 +0100
+++ gcc/match.pd	2023-03-22 10:17:25.344772636 +0100
@@ -8074,63 +8074,76 @@ and,
    under IEEE 754 the fneg of the wider type will negate every even entry
    and when doing an add we get a sub of the even and add of every odd
    elements.  */
-(simplify
- (vec_perm (plus:c @0 @1) (minus @0 @1) VECTOR_CST@2)
- (if (!VECTOR_INTEGER_TYPE_P (type)
-      && !FLOAT_WORDS_BIG_ENDIAN)
-  (with
-   {
-     /* Build a vector of integers from the tree mask.  */
-     vec_perm_builder builder;
-   }
-   (if (tree_to_vec_perm_builder (&builder, @2))
-    (with
-     {
-       /* Create a vec_perm_indices for the integer vector.  */
-       poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
-       vec_perm_indices sel (builder, 2, nelts);
-       machine_mode vec_mode = TYPE_MODE (type);
-       machine_mode wide_mode;
-       scalar_mode wide_elt_mode;
-       poly_uint64 wide_nunits;
-       scalar_mode inner_mode = GET_MODE_INNER (vec_mode);
-     }
-     (if (sel.series_p (0, 2, 0, 2)
-	  && sel.series_p (1, 2, nelts + 1, 2)
-	  && GET_MODE_2XWIDER_MODE (inner_mode).exists (&wide_elt_mode)
-	  && multiple_p (GET_MODE_NUNITS (vec_mode), 2, &wide_nunits)
-	  && related_vector_mode (vec_mode, wide_elt_mode,
-				  wide_nunits).exists (&wide_mode))
-	(with
-	 {
-	   tree stype
-	     = lang_hooks.types.type_for_mode (GET_MODE_INNER (wide_mode),
-					       TYPE_UNSIGNED (type));
-	   tree ntype = build_vector_type_for_mode (stype, wide_mode);
+(for plusminus (plus minus)
+     minusplus (minus plus)
+ (simplify
+  (vec_perm (plusminus @0 @1) (minusplus @2 @3) VECTOR_CST@4)
+   (if (!VECTOR_INTEGER_TYPE_P (type)
+	&& !FLOAT_WORDS_BIG_ENDIAN
+        /* plus is commutative, while minus is not, so :c can't be used.
+	   Do equality comparisons by hand and at the end pick the operands
+	   from the minus.  */
+	&& (operand_equal_p (@0, @2, 0)
+	    ? operand_equal_p (@1, @3, 0)
+	    : operand_equal_p (@0, @3, 0) && operand_equal_p (@1, @2, 0)))
+   (with
+    {
+      /* Build a vector of integers from the tree mask.  */
+      vec_perm_builder builder;
+    }
+    (if (tree_to_vec_perm_builder (&builder, @4))
+     (with
+      {
+	/* Create a vec_perm_indices for the integer vector.  */
+	poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
+	vec_perm_indices sel (builder, 2, nelts);
+	machine_mode vec_mode = TYPE_MODE (type);
+	machine_mode wide_mode;
+	scalar_mode wide_elt_mode;
+	poly_uint64 wide_nunits;
+	scalar_mode inner_mode = GET_MODE_INNER (vec_mode);
+      }
+      (if (sel.series_p (0, 2, 0, 2)
+	   && sel.series_p (1, 2, nelts + 1, 2)
+	   && GET_MODE_2XWIDER_MODE (inner_mode).exists (&wide_elt_mode)
+	   && multiple_p (GET_MODE_NUNITS (vec_mode), 2, &wide_nunits)
+	   && related_vector_mode (vec_mode, wide_elt_mode,
+				   wide_nunits).exists (&wide_mode))
+       (with
+	{
+	  tree stype
+	    = lang_hooks.types.type_for_mode (GET_MODE_INNER (wide_mode),
+					      TYPE_UNSIGNED (type));
+	  tree ntype = build_vector_type_for_mode (stype, wide_mode);
 
-	   /* The format has to be a non-extended ieee format.  */
-	   const struct real_format *fmt_old = FLOAT_MODE_FORMAT (vec_mode);
-	   const struct real_format *fmt_new = FLOAT_MODE_FORMAT (wide_mode);
-	 }
-	 (if (TYPE_MODE (stype) != BLKmode
-	      && VECTOR_TYPE_P (ntype)
-	      && fmt_old != NULL
-	      && fmt_new != NULL)
-	  (with
-	   {
-	     /* If the target doesn't support v1xx vectors, try using
-		scalar mode xx instead.  */
+	  /* The format has to be a non-extended ieee format.  */
+	  const struct real_format *fmt_old = FLOAT_MODE_FORMAT (vec_mode);
+	  const struct real_format *fmt_new = FLOAT_MODE_FORMAT (wide_mode);
+	}
+	(if (TYPE_MODE (stype) != BLKmode
+	     && VECTOR_TYPE_P (ntype)
+	     && fmt_old != NULL
+	     && fmt_new != NULL)
+	 (with
+	  {
+	    /* If the target doesn't support v1xx vectors, try using
+	       scalar mode xx instead.  */
 	    if (known_eq (GET_MODE_NUNITS (wide_mode), 1)
 		&& !target_supports_op_p (ntype, NEGATE_EXPR, optab_vector))
 	      ntype = stype;
-	   }
-	   (if (fmt_new->signbit_rw
-	        == fmt_old->signbit_rw + GET_MODE_UNIT_BITSIZE (vec_mode)
-		&& fmt_new->signbit_rw == fmt_new->signbit_ro
-		&& targetm.can_change_mode_class (TYPE_MODE (ntype), TYPE_MODE (type), ALL_REGS)
-		&& ((optimize_vectors_before_lowering_p () && VECTOR_TYPE_P (ntype))
-		    || target_supports_op_p (ntype, NEGATE_EXPR, optab_vector)))
-	    (plus (view_convert:type (negate (view_convert:ntype @1))) @0)))))))))))
+	  }
+	  (if (fmt_new->signbit_rw
+	       == fmt_old->signbit_rw + GET_MODE_UNIT_BITSIZE (vec_mode)
+	       && fmt_new->signbit_rw == fmt_new->signbit_ro
+	       && targetm.can_change_mode_class (TYPE_MODE (ntype),
+						 TYPE_MODE (type), ALL_REGS)
+	       && ((optimize_vectors_before_lowering_p ()
+		    && VECTOR_TYPE_P (ntype))
+		   || target_supports_op_p (ntype, NEGATE_EXPR, optab_vector)))
+	   (if (plusminus == PLUS_EXPR)
+	    (plus (view_convert:type (negate (view_convert:ntype @3))) @2)
+	    (minus @0 (view_convert:type
+			(negate (view_convert:ntype @1))))))))))))))))
 
 (simplify
  (vec_perm @0 @1 VECTOR_CST@2)
--- gcc/testsuite/gcc.target/aarch64/simd/addsub_2.c.jj	2023-03-22 10:22:57.324017790 +0100
+++ gcc/testsuite/gcc.target/aarch64/simd/addsub_2.c	2023-03-22 10:23:54.482199126 +0100
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
+/* { dg-options "-Ofast" } */
+/* { dg-add-options arm_v8_2a_fp16_neon } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#pragma GCC target "+nosve"
+
+/* 
+** f1:
+** ...
+**	fneg	v[0-9]+.2d, v[0-9]+.2d
+**	fsub	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+** ...
+*/
+void f1 (float *restrict a, float *restrict b, float *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+    {
+      res[i+0] = a[i+0] - b[i+0];
+      res[i+1] = a[i+1] + b[i+1];
+    }
+}
+
+/* 
+** d1:
+** ...
+** 	fneg	v[0-9]+.4s, v[0-9]+.4s
+** 	fsub	v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
+** ...
+*/
+void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
+{
+   for (int i = 0; i < (n & -8); i+=2)
+    {
+      res[i+0] = a[i+0] - b[i+0];
+      res[i+1] = a[i+1] + b[i+1];
+    }
+}
+
+/* 
+** e1:
+** ...
+** 	fsub	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** 	fadd	v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
+** 	ins	v[0-9]+.d\[1\], v[0-9]+.d\[1\]
+** ...
+*/
+void e1 (double *restrict a, double *restrict b, double *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+    {
+      res[i+0] = a[i+0] - b[i+0];
+      res[i+1] = a[i+1] + b[i+1];
+    }
+}
--- gcc/testsuite/gcc.target/aarch64/sve/addsub_2.c.jj	2023-03-22 10:24:14.169917153 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/addsub_2.c	2023-03-22 10:25:05.414183194 +0100
@@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-Ofast" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+/*
+** f1:
+** ...
+** 	fneg	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
+** 	fsub	z[0-9]+.s, z[0-9]+.s, z[0-9]+.s
+** ...
+*/
+void f1 (float *restrict a, float *restrict b, float *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+    {
+      res[i+0] = a[i+0] - b[i+0];
+      res[i+1] = a[i+1] + b[i+1];
+    }
+}
+
+/* 
+** d1:
+** ...
+** 	fneg	z[0-9]+.s, p[0-9]+/m, z[0-9]+.s
+** 	fsub	z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
+** ...
+*/ 
+void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
+{
+   for (int i = 0; i < (n & -8); i+=2)
+    {
+      res[i+0] = a[i+0] - b[i+0];
+      res[i+1] = a[i+1] + b[i+1];
+    }
+}
+
+/*
+** e1:
+** ...
+** 	fadd	z[0-9]+.d, z[0-9]+.d, z[0-9]+.d
+** 	movprfx	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
+** 	fsub	z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
+** ...
+*/
+void e1 (double *restrict a, double *restrict b, double *res, int n)
+{
+   for (int i = 0; i < (n & -4); i+=2)
+    {
+      res[i+0] = a[i+0] - b[i+0];
+      res[i+1] = a[i+1] + b[i+1];
+    }
+}

	Jakub



* Re: [PATCH] match.pd: Improve fneg/fadd optimization [PR109240]
  2023-04-18  8:50 [PATCH] match.pd: Improve fneg/fadd optimization [PR109240] Jakub Jelinek
@ 2023-04-18  8:57 ` Richard Biener
  0 siblings, 0 replies; 2+ messages in thread
From: Richard Biener @ 2023-04-18  8:57 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Richard Sandiford, gcc-patches

On Tue, 18 Apr 2023, Jakub Jelinek wrote:

> Hi!
> 
> match.pd has, mostly for AArch64, an optimization which turns certain
> forms of __builtin_shuffle of x + y and x - y vectors into an fneg
> using twice as wide an element type, so that every other sign is
> changed, followed by an fadd.
> 
> The following patch extends that optimization so that it can handle
> other forms as well, using the same fneg but an fsub instead of an fadd.
> 
> As plus is commutative and minus is not, and I want to handle vec_perm
> with both plus/minus and minus/plus operand order, preferably in one
> pattern, I had to do the matching operand checks by hand.
> 
> Bootstrapped/regtested on aarch64-linux, x86_64-linux and i686-linux,
> ok for trunk?

OK.

Thanks,
Richard.

> [full patch snipped]

-- 
Richard Biener <rguenther@suse.de>
SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg,
Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien Moerman;
HRB 36809 (AG Nuernberg)
