public inbox for gcc-patches@gcc.gnu.org
* [PATCH 1/2] aarch64: Use fmov s/d/hN, FP_CST for some vector CST [PR113856]
@ 2024-02-24  3:18 Andrew Pinski
  2024-02-24  3:18 ` [PATCH 2/2] aarch64: Support `{1.0f, 1.0f, 0.0, 0.0}` CST forming with fmov with a smaller vector type Andrew Pinski
  2024-03-07 16:26 ` [PATCH 1/2] aarch64: Use fmov s/d/hN, FP_CST for some vector CST [PR113856] Richard Sandiford
  0 siblings, 2 replies; 5+ messages in thread
From: Andrew Pinski @ 2024-02-24  3:18 UTC (permalink / raw)
  To: gcc-patches; +Cc: Andrew Pinski

AArch64 can form some floating-point CSTs via the fmov instructions.
These instructions also zero out the upper parts of the register, so they can
be used for vector CSTs whose only non-zero element is the first one, provided
that element can be formed via fmov.

This implements this "small" optimization so these vector CSTs don't need to be
loaded from memory.
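
For example (this mirrors the new tests added below; the function name is
only illustrative):

#define vect128 __attribute__((vector_size(16) ))

/* Only the first element is non-zero and 1.0f is fmov-representable, so
   instead of loading the constant from the literal pool (ldr q0, .LC...)
   we can emit "fmov s0, 1.0e+0", which also zeroes the upper lanes.  */
vect128 float f ()
{
  return (vect128 float){1.0f, 0.0f, 0.0f, 0.0f};
}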

Built and tested on aarch64-linux-gnu with no regressions.

	PR target/113856

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (struct simd_immediate_info):
	Add FMOV_SDH to insn_type. For scalar_float_mode constructor
	add insn_in.
	(aarch64_simd_valid_immediate): Catch `{fp, 0...}` vector_cst
	and return a simd_immediate_info which uses FMOV_SDH.
	(aarch64_output_simd_mov_immediate): Support outputting
	fmov for FMOV_SDH.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/fmov-zero-cst-1.c: New test.
	* gcc.target/aarch64/fmov-zero-cst-2.c: New test.

Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
---
 gcc/config/aarch64/aarch64.cc                 | 48 ++++++++++++++---
 .../gcc.target/aarch64/fmov-zero-cst-1.c      | 52 +++++++++++++++++++
 .../gcc.target/aarch64/fmov-zero-cst-2.c      | 19 +++++++
 3 files changed, 111 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-1.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-2.c

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 5dd0814f198..c4386591a9b 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -126,11 +126,11 @@ constexpr auto AARCH64_STATE_OUT = 1U << 2;
 /* Information about a legitimate vector immediate operand.  */
 struct simd_immediate_info
 {
-  enum insn_type { MOV, MVN, INDEX, PTRUE };
+  enum insn_type { MOV, FMOV_SDH, MVN, INDEX, PTRUE };
   enum modifier_type { LSL, MSL };
 
   simd_immediate_info () {}
-  simd_immediate_info (scalar_float_mode, rtx);
+  simd_immediate_info (scalar_float_mode, rtx, insn_type = MOV);
   simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
 		       insn_type = MOV, modifier_type = LSL,
 		       unsigned int = 0);
@@ -145,7 +145,7 @@ struct simd_immediate_info
 
   union
   {
-    /* For MOV and MVN.  */
+    /* For MOV, FMOV_SDH and MVN.  */
     struct
     {
       /* The value of each element.  */
@@ -173,9 +173,10 @@ struct simd_immediate_info
 /* Construct a floating-point immediate in which each element has mode
    ELT_MODE_IN and value VALUE_IN.  */
 inline simd_immediate_info
-::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
-  : elt_mode (elt_mode_in), insn (MOV)
+::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in, insn_type insn_in)
+  : elt_mode (elt_mode_in), insn (insn_in)
 {
+  gcc_assert (insn_in == MOV || insn_in == FMOV_SDH);
   u.mov.value = value_in;
   u.mov.modifier = LSL;
   u.mov.shift = 0;
@@ -22932,6 +22933,35 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
 	  return true;
 	}
     }
+  /* See if we can use fmov d0/s0/h0 ... for the constant. */
+  if (n_elts >= 1
+      && (vec_flags & VEC_ADVSIMD)
+      && is_a <scalar_float_mode> (elt_mode, &elt_float_mode)
+      && !CONST_VECTOR_DUPLICATE_P (op))
+    {
+      rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
+      if (aarch64_float_const_zero_rtx_p (elt)
+	  || aarch64_float_const_representable_p (elt))
+	{
+	  bool valid = true;
+	  for (unsigned int i = 1; i < n_elts; i++)
+	    {
+	      rtx elt1 = CONST_VECTOR_ENCODED_ELT (op, i);
+	      if (!aarch64_float_const_zero_rtx_p (elt1))
+		{
+		  valid = false;
+		  break;
+		}
+	    }
+	  if (valid)
+	    {
+	      if (info)
+		*info = simd_immediate_info (elt_float_mode, elt,
+					     simd_immediate_info::FMOV_SDH);
+	      return true;
+	    }
+	}
+    }
 
   /* If all elements in an SVE vector have the same value, we have a free
      choice between using the element mode and using the container mode.
@@ -25121,7 +25151,8 @@ aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
 
   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
     {
-      gcc_assert (info.insn == simd_immediate_info::MOV
+      gcc_assert ((info.insn == simd_immediate_info::MOV
+		   || info.insn == simd_immediate_info::FMOV_SDH)
 		  && info.u.mov.shift == 0);
       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
 	 move immediate path.  */
@@ -25134,8 +25165,9 @@ aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
 	  real_to_decimal_for_mode (float_buf,
 				    CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
 				    buf_size, buf_size, 1, info.elt_mode);
-
-	  if (lane_count == 1)
+	  if (info.insn == simd_immediate_info::FMOV_SDH)
+	    snprintf (templ, sizeof (templ), "fmov\t%%%c0, %s", element_char, float_buf);
+	  else if (lane_count == 1)
 	    snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
 	  else
 	    snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-1.c b/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-1.c
new file mode 100644
index 00000000000..9b13ef7b1ef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-1.c
@@ -0,0 +1,52 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcmodel=tiny" } */
+/* { dg-final { check-function-bodies "**" "" ""  } } */
+/* PR target/113856 */
+
+#define vect64 __attribute__((vector_size(8) ))
+#define vect128 __attribute__((vector_size(16) ))
+
+/*
+** f1:
+**	fmov	s0, 1.0e\+0
+**	ret
+*/
+vect64  float f1()
+{
+  return (vect64 float){1.0f, 0};
+}
+
+/*
+** f2:
+**	fmov	s0, 1.0e\+0
+**	ret
+*/
+vect128 float f2()
+{
+  return (vect128 float){1.0f, 0, 0, 0};
+}
+
+/* f3 should only be done for -ffast-math. */
+/*
+** f3:
+**	ldr	q0, .LC[0-9]+
+**	ret
+*/
+vect128 float f3()
+{
+  return (vect128 float){1.0f, 0, -0.0f, 0.0f};
+}
+
+/* f4 cannot use fmov here.
+   Note this is checked here because {1.0, 0.0, 1.0, 0.0}
+   has CONST_VECTOR_DUPLICATE_P set
+   and is represented internally as {1.0, 0.0}.  */
+/*
+** f4:
+**	ldr	q0, .LC[0-9]+
+**	ret
+*/
+vect128 float f4()
+{
+  return (vect128 float){1.0f, 0, 1.0f, 0.0f};
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-2.c b/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-2.c
new file mode 100644
index 00000000000..c97d85b68a9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-2.c
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcmodel=tiny -ffast-math" } */
+/* { dg-final { check-function-bodies "**" "" ""  } } */
+/* PR target/113856 */
+
+#define vect64 __attribute__((vector_size(8) ))
+#define vect128 __attribute__((vector_size(16) ))
+
+/* f3 can be done with -ffast-math. */
+/*
+** f3:
+**	fmov	s0, 1.0e\+0
+**	ret
+*/
+vect128 float f3()
+{
+  return (vect128 float){1.0f, 0, -0.0f, 0.0f};
+}
+
-- 
2.43.0



* [PATCH 2/2] aarch64: Support `{1.0f, 1.0f, 0.0, 0.0}` CST forming with fmov with a smaller vector type.
  2024-02-24  3:18 [PATCH 1/2] aarch64: Use fmov s/d/hN, FP_CST for some vector CST [PR113856] Andrew Pinski
@ 2024-02-24  3:18 ` Andrew Pinski
  2024-03-07 16:41   ` Richard Sandiford
  2024-03-07 16:26 ` [PATCH 1/2] aarch64: Use fmov s/d/hN, FP_CST for some vector CST [PR113856] Richard Sandiford
  1 sibling, 1 reply; 5+ messages in thread
From: Andrew Pinski @ 2024-02-24  3:18 UTC (permalink / raw)
  To: gcc-patches; +Cc: Andrew Pinski

This enables construction of V4SF CSTs like `{1.0f, 1.0f, 0.0f, 0.0f}`
(and other FP CSTs) by using `fmov v0.2s, 1.0`, since the instruction
zeroes out the upper bits of the register.
This is a small extension on top of the code that creates an fmov for the case
where all elements other than the first are zero.
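
For example (this mirrors the new fmov-zero-cst-3.c test below; the function
name is only illustrative):

#define vect128 __attribute__((vector_size(16) ))

/* The low half is a duplicated fmov-representable value and the high half
   is zero, so the whole constant can be formed with "fmov v0.2s, 1.0e+0",
   which zeroes the upper 64 bits of the register.  */
vect128 float g ()
{
  return (vect128 float){1.0f, 1.0f, 0.0f, 0.0f};
}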

Built and tested for aarch64-linux-gnu with no regressions.

	PR target/113856

gcc/ChangeLog:

	* config/aarch64/aarch64.cc (simd_immediate_info): Add bool to the
	float mode constructor. Document modifier field for FMOV_SDH.
	(aarch64_simd_valid_immediate): Recognize where the first half
	of the const float vect is the same.
	(aarch64_output_simd_mov_immediate): Handle the case where insn is
	FMOV_SDH and modifier is MSL.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/fmov-zero-cst-3.c: New test.

Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
---
 gcc/config/aarch64/aarch64.cc                 | 34 ++++++++++++++++---
 .../gcc.target/aarch64/fmov-zero-cst-3.c      | 28 +++++++++++++++
 2 files changed, 57 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-3.c

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index c4386591a9b..89bd0c5e5a6 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -130,7 +130,7 @@ struct simd_immediate_info
   enum modifier_type { LSL, MSL };
 
   simd_immediate_info () {}
-  simd_immediate_info (scalar_float_mode, rtx, insn_type = MOV);
+  simd_immediate_info (scalar_float_mode, rtx, insn_type = MOV, bool = false);
   simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
 		       insn_type = MOV, modifier_type = LSL,
 		       unsigned int = 0);
@@ -153,6 +153,8 @@ struct simd_immediate_info
 
       /* The kind of shift modifier to use, and the number of bits to shift.
 	 This is (LSL, 0) if no shift is needed.  */
+      /* For FMOV_SDH, LSL means a scalar (s/d/h) fmov while MSL
+	 means an fmov of the lower half of the vector (.4h/.2s).  */
       modifier_type modifier;
       unsigned int shift;
     } mov;
@@ -173,12 +175,12 @@ struct simd_immediate_info
 /* Construct a floating-point immediate in which each element has mode
    ELT_MODE_IN and value VALUE_IN.  */
 inline simd_immediate_info
-::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in, insn_type insn_in)
+::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in, insn_type insn_in, bool firsthalfsame)
   : elt_mode (elt_mode_in), insn (insn_in)
 {
   gcc_assert (insn_in == MOV || insn_in == FMOV_SDH);
   u.mov.value = value_in;
-  u.mov.modifier = LSL;
+  u.mov.modifier = firsthalfsame ? MSL : LSL;
   u.mov.shift = 0;
 }
 
@@ -22944,10 +22946,23 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
 	  || aarch64_float_const_representable_p (elt))
 	{
 	  bool valid = true;
+	  bool firsthalfsame = false;
 	  for (unsigned int i = 1; i < n_elts; i++)
 	    {
 	      rtx elt1 = CONST_VECTOR_ENCODED_ELT (op, i);
 	      if (!aarch64_float_const_zero_rtx_p (elt1))
+		{
+		  if (i == 1)
+		    firsthalfsame = true;
+		  if (!firsthalfsame
+		      || i >= n_elts/2
+		      || !rtx_equal_p (elt, elt1))
+		    {
+		      valid = false;
+		      break;
+		    }
+		}
+	      else if (firsthalfsame && i < n_elts/2)
 		{
 		  valid = false;
 		  break;
@@ -22957,7 +22972,8 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
 	    {
 	      if (info)
 		*info = simd_immediate_info (elt_float_mode, elt,
-					     simd_immediate_info::FMOV_SDH);
+					     simd_immediate_info::FMOV_SDH,
+					     firsthalfsame);
 	      return true;
 	    }
 	}
@@ -25165,8 +25181,16 @@ aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
 	  real_to_decimal_for_mode (float_buf,
 				    CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
 				    buf_size, buf_size, 1, info.elt_mode);
-	  if (info.insn == simd_immediate_info::FMOV_SDH)
+	  if (info.insn == simd_immediate_info::FMOV_SDH
+	      && info.u.mov.modifier == simd_immediate_info::LSL)
 	    snprintf (templ, sizeof (templ), "fmov\t%%%c0, %s", element_char, float_buf);
+	  else if (info.insn == simd_immediate_info::FMOV_SDH
+	      && info.u.mov.modifier == simd_immediate_info::MSL)
+	    {
+	      gcc_assert (element_char != 'd');
+	      gcc_assert (lane_count > 2);
+	      snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s", lane_count/2, element_char, float_buf);
+	    }
 	  else if (lane_count == 1)
 	    snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
 	  else
diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-3.c b/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-3.c
new file mode 100644
index 00000000000..7a78b6d3caf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-3.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcmodel=tiny" } */
+/* { dg-final { check-function-bodies "**" "" ""  } } */
+/* PR target/113856 */
+
+#define vect64 __attribute__((vector_size(8) ))
+#define vect128 __attribute__((vector_size(16) ))
+
+/*
+** f2:
+**	fmov	v0.2s, 1.0e\+0
+**	ret
+*/
+vect128 float f2()
+{
+  return (vect128 float){1.0f, 1.0f, 0, 0};
+}
+
+/*
+** f3:
+**	ldr	q0, \.LC[0-9]+
+**	ret
+*/
+vect128 float f3()
+{
+  return (vect128 float){1.0f, 1.0f, 1.0f, 0.0};
+}
+
-- 
2.43.0



* Re: [PATCH 1/2] aarch64: Use fmov s/d/hN, FP_CST for some vector CST [PR113856]
  2024-02-24  3:18 [PATCH 1/2] aarch64: Use fmov s/d/hN, FP_CST for some vector CST [PR113856] Andrew Pinski
  2024-02-24  3:18 ` [PATCH 2/2] aarch64: Support `{1.0f, 1.0f, 0.0, 0.0}` CST forming with fmov with a smaller vector type Andrew Pinski
@ 2024-03-07 16:26 ` Richard Sandiford
  2024-03-07 16:33   ` Richard Sandiford
  1 sibling, 1 reply; 5+ messages in thread
From: Richard Sandiford @ 2024-03-07 16:26 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: gcc-patches

Andrew Pinski <quic_apinski@quicinc.com> writes:
> AArch64 can form some floating-point CSTs via the fmov instructions.
> These instructions also zero out the upper parts of the register, so they can
> be used for vector CSTs whose only non-zero element is the first one, provided
> that element can be formed via fmov.
>
> This implements this "small" optimization so these vector CSTs don't need to be
> loaded from memory.
>
> Built and tested on aarch64-linux-gnu with no regressions.
>
> 	PR target/113856
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.cc (struct simd_immediate_info):
> 	Add FMOV_SDH to insn_type. For scalar_float_mode constructor
> 	add insn_in.
> 	(aarch64_simd_valid_immediate): Catch `{fp, 0...}` vector_cst
> 	and return a simd_immediate_info which uses FMOV_SDH.
> 	(aarch64_output_simd_mov_immediate): Support outputting
> 	fmov for FMOV_SDH.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/fmov-zero-cst-1.c: New test.
> 	* gcc.target/aarch64/fmov-zero-cst-2.c: New test.
>
> Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
> ---
>  gcc/config/aarch64/aarch64.cc                 | 48 ++++++++++++++---
>  .../gcc.target/aarch64/fmov-zero-cst-1.c      | 52 +++++++++++++++++++
>  .../gcc.target/aarch64/fmov-zero-cst-2.c      | 19 +++++++
>  3 files changed, 111 insertions(+), 8 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-1.c
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-2.c
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 5dd0814f198..c4386591a9b 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -126,11 +126,11 @@ constexpr auto AARCH64_STATE_OUT = 1U << 2;
>  /* Information about a legitimate vector immediate operand.  */
>  struct simd_immediate_info
>  {
> -  enum insn_type { MOV, MVN, INDEX, PTRUE };
> +  enum insn_type { MOV, FMOV_SDH, MVN, INDEX, PTRUE };
>    enum modifier_type { LSL, MSL };
>  
>    simd_immediate_info () {}
> -  simd_immediate_info (scalar_float_mode, rtx);
> +  simd_immediate_info (scalar_float_mode, rtx, insn_type = MOV);
>    simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
>  		       insn_type = MOV, modifier_type = LSL,
>  		       unsigned int = 0);
> @@ -145,7 +145,7 @@ struct simd_immediate_info
>  
>    union
>    {
> -    /* For MOV and MVN.  */
> +    /* For MOV, FMOV_SDH and MVN.  */
>      struct
>      {
>        /* The value of each element.  */
> @@ -173,9 +173,10 @@ struct simd_immediate_info
>  /* Construct a floating-point immediate in which each element has mode
>     ELT_MODE_IN and value VALUE_IN.  */
>  inline simd_immediate_info
> -::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
> -  : elt_mode (elt_mode_in), insn (MOV)
> +::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in, insn_type insn_in)

Nit: long line.

> +  : elt_mode (elt_mode_in), insn (insn_in)
>  {
> +  gcc_assert (insn_in == MOV || insn_in == FMOV_SDH);
>    u.mov.value = value_in;
>    u.mov.modifier = LSL;
>    u.mov.shift = 0;
> @@ -22932,6 +22933,35 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
>  	  return true;
>  	}
>      }
> +  /* See if we can use fmov d0/s0/h0 ... for the constant. */
> +  if (n_elts >= 1

This condition seems unnecessary.  n_elts can't be zero.

> +      && (vec_flags & VEC_ADVSIMD)
> +      && is_a <scalar_float_mode> (elt_mode, &elt_float_mode)
> +      && !CONST_VECTOR_DUPLICATE_P (op))

I think we should also drop this.  I guess it's to undo:

  if (CONST_VECTOR_P (op)
      && CONST_VECTOR_DUPLICATE_P (op))
    n_elts = CONST_VECTOR_NPATTERNS (op);

but we can use GET_MODE_NUNITS (mode) directly instead.

> +    {
> +      rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
> +      if (aarch64_float_const_zero_rtx_p (elt)
> +	  || aarch64_float_const_representable_p (elt))

What's the effect of including aarch64_float_const_zero_rtx_p for the
first element?  Does it change the code we generate for any cases
involving +0.0?  Or is it more for -0.0?

> +	{
> +	  bool valid = true;
> +	  for (unsigned int i = 1; i < n_elts; i++)
> +	    {
> +	      rtx elt1 = CONST_VECTOR_ENCODED_ELT (op, i);
> +	      if (!aarch64_float_const_zero_rtx_p (elt1))
> +		{
> +		  valid = false;
> +		  break;
> +		}
> +	    }
> +	  if (valid)
> +	    {
> +	      if (info)
> +		*info = simd_immediate_info (elt_float_mode, elt,
> +					     simd_immediate_info::FMOV_SDH);
> +	      return true;
> +	    }
> +	}
> +    }
>  
>    /* If all elements in an SVE vector have the same value, we have a free
>       choice between using the element mode and using the container mode.
> @@ -25121,7 +25151,8 @@ aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
>  
>    if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
>      {
> -      gcc_assert (info.insn == simd_immediate_info::MOV
> +      gcc_assert ((info.insn == simd_immediate_info::MOV
> +		   || info.insn == simd_immediate_info::FMOV_SDH)
>  		  && info.u.mov.shift == 0);
>        /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
>  	 move immediate path.  */
> @@ -25134,8 +25165,9 @@ aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
>  	  real_to_decimal_for_mode (float_buf,
>  				    CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
>  				    buf_size, buf_size, 1, info.elt_mode);
> -
> -	  if (lane_count == 1)
> +	  if (info.insn == simd_immediate_info::FMOV_SDH)
> +	    snprintf (templ, sizeof (templ), "fmov\t%%%c0, %s", element_char, float_buf);

Long line.  I think this is more general than the lane_count == 1
handling (which would need to change if we ever added 32-bit vectors),
so it's probably better to add "|| lane_count == 1" to this if rather
than have an else if.
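
i.e. something like (untested sketch of the suggestion):

  if (info.insn == simd_immediate_info::FMOV_SDH || lane_count == 1)
    snprintf (templ, sizeof (templ), "fmov\t%%%c0, %s",
	      element_char, float_buf);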

> +	  else if (lane_count == 1)
>  	    snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
>  	  else
>  	    snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
> diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-1.c b/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-1.c
> new file mode 100644
> index 00000000000..9b13ef7b1ef
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-1.c
> @@ -0,0 +1,52 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mcmodel=tiny" } */
> +/* { dg-final { check-function-bodies "**" "" ""  } } */
> +/* PR target/113856 */

The tests look specific to little-endian, so I think they need to be
guarded with aarch64_little_endian.

Thanks,
Richard

> +
> +#define vect64 __attribute__((vector_size(8) ))
> +#define vect128 __attribute__((vector_size(16) ))
> +
> +/*
> +** f1:
> +**	fmov	s0, 1.0e\+0
> +**	ret
> +*/
> +vect64  float f1()
> +{
> +  return (vect64 float){1.0f, 0};
> +}
> +
> +/*
> +** f2:
> +**	fmov	s0, 1.0e\+0
> +**	ret
> +*/
> +vect128 float f2()
> +{
> +  return (vect128 float){1.0f, 0, 0, 0};
> +}
> +
> +/* f3 should only be done for -ffast-math. */
> +/*
> +** f3:
> +**	ldr	q0, .LC[0-9]+
> +**	ret
> +*/
> +vect128 float f3()
> +{
> +  return (vect128 float){1.0f, 0, -0.0f, 0.0f};
> +}
> +
> +/* f4 cannot use fmov here.
> +   Note this is checked here because {1.0, 0.0, 1.0, 0.0}
> +   has CONST_VECTOR_DUPLICATE_P set
> +   and is represented internally as {1.0, 0.0}.  */
> +/*
> +** f4:
> +**	ldr	q0, .LC[0-9]+
> +**	ret
> +*/
> +vect128 float f4()
> +{
> +  return (vect128 float){1.0f, 0, 1.0f, 0.0f};
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-2.c b/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-2.c
> new file mode 100644
> index 00000000000..c97d85b68a9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-2.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mcmodel=tiny -ffast-math" } */
> +/* { dg-final { check-function-bodies "**" "" ""  } } */
> +/* PR target/113856 */
> +
> +#define vect64 __attribute__((vector_size(8) ))
> +#define vect128 __attribute__((vector_size(16) ))
> +
> +/* f3 can be done with -ffast-math. */
> +/*
> +** f3:
> +**	fmov	s0, 1.0e\+0
> +**	ret
> +*/
> +vect128 float f3()
> +{
> +  return (vect128 float){1.0f, 0, -0.0f, 0.0f};
> +}
> +


* Re: [PATCH 1/2] aarch64: Use fmov s/d/hN, FP_CST for some vector CST [PR113856]
  2024-03-07 16:26 ` [PATCH 1/2] aarch64: Use fmov s/d/hN, FP_CST for some vector CST [PR113856] Richard Sandiford
@ 2024-03-07 16:33   ` Richard Sandiford
  0 siblings, 0 replies; 5+ messages in thread
From: Richard Sandiford @ 2024-03-07 16:33 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: gcc-patches

Richard Sandiford <richard.sandiford@arm.com> writes:
> Andrew Pinski <quic_apinski@quicinc.com> writes:
>> AArch64 can form some floating-point CSTs via the fmov instructions.
>> These instructions also zero out the upper parts of the register, so they can
>> be used for vector CSTs whose only non-zero element is the first one, provided
>> that element can be formed via fmov.
>>
>> This implements this "small" optimization so these vector CSTs don't need to be
>> loaded from memory.
>>
>> Built and tested on aarch64-linux-gnu with no regressions.
>>
>> 	PR target/113856
>>
>> gcc/ChangeLog:
>>
>> 	* config/aarch64/aarch64.cc (struct simd_immediate_info):
>> 	Add FMOV_SDH to insn_type. For scalar_float_mode constructor
>> 	add insn_in.
>> 	(aarch64_simd_valid_immediate): Catch `{fp, 0...}` vector_cst
>> 	and return a simd_immediate_info which uses FMOV_SDH.
>> 	(aarch64_output_simd_mov_immediate): Support outputting
>> 	fmov for FMOV_SDH.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 	* gcc.target/aarch64/fmov-zero-cst-1.c: New test.
>> 	* gcc.target/aarch64/fmov-zero-cst-2.c: New test.
>>
>> Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
>> ---
>>  gcc/config/aarch64/aarch64.cc                 | 48 ++++++++++++++---
>>  .../gcc.target/aarch64/fmov-zero-cst-1.c      | 52 +++++++++++++++++++
>>  .../gcc.target/aarch64/fmov-zero-cst-2.c      | 19 +++++++
>>  3 files changed, 111 insertions(+), 8 deletions(-)
>>  create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-1.c
>>  create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-2.c
>>
>> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>> index 5dd0814f198..c4386591a9b 100644
>> --- a/gcc/config/aarch64/aarch64.cc
>> +++ b/gcc/config/aarch64/aarch64.cc
>> @@ -126,11 +126,11 @@ constexpr auto AARCH64_STATE_OUT = 1U << 2;
>>  /* Information about a legitimate vector immediate operand.  */
>>  struct simd_immediate_info
>>  {
>> -  enum insn_type { MOV, MVN, INDEX, PTRUE };
>> +  enum insn_type { MOV, FMOV_SDH, MVN, INDEX, PTRUE };
>>    enum modifier_type { LSL, MSL };
>>  
>>    simd_immediate_info () {}
>> -  simd_immediate_info (scalar_float_mode, rtx);
>> +  simd_immediate_info (scalar_float_mode, rtx, insn_type = MOV);
>>    simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
>>  		       insn_type = MOV, modifier_type = LSL,
>>  		       unsigned int = 0);
>> @@ -145,7 +145,7 @@ struct simd_immediate_info
>>  
>>    union
>>    {
>> -    /* For MOV and MVN.  */
>> +    /* For MOV, FMOV_SDH and MVN.  */
>>      struct
>>      {
>>        /* The value of each element.  */
>> @@ -173,9 +173,10 @@ struct simd_immediate_info
>>  /* Construct a floating-point immediate in which each element has mode
>>     ELT_MODE_IN and value VALUE_IN.  */
>>  inline simd_immediate_info
>> -::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
>> -  : elt_mode (elt_mode_in), insn (MOV)
>> +::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in, insn_type insn_in)
>
> Nit: long line.
>
>> +  : elt_mode (elt_mode_in), insn (insn_in)
>>  {
>> +  gcc_assert (insn_in == MOV || insn_in == FMOV_SDH);
>>    u.mov.value = value_in;
>>    u.mov.modifier = LSL;
>>    u.mov.shift = 0;
>> @@ -22932,6 +22933,35 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
>>  	  return true;
>>  	}
>>      }
>> +  /* See if we can use fmov d0/s0/h0 ... for the constant. */
>> +  if (n_elts >= 1
>
> This condition seems unnecessary.  n_elts can't be zero.
>
>> +      && (vec_flags & VEC_ADVSIMD)
>> +      && is_a <scalar_float_mode> (elt_mode, &elt_float_mode)
>> +      && !CONST_VECTOR_DUPLICATE_P (op))
>
> I think we should also drop this.  I guess it's to undo:
>
>   if (CONST_VECTOR_P (op)
>       && CONST_VECTOR_DUPLICATE_P (op))
>     n_elts = CONST_VECTOR_NPATTERNS (op);
>
> but we can use GET_MODE_NUNITS (mode) directly instead.
>
>> +    {
>> +      rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
>> +      if (aarch64_float_const_zero_rtx_p (elt)
>> +	  || aarch64_float_const_representable_p (elt))
>
> What's the effect of including aarch64_float_const_zero_rtx_p for the
> first element?  Does it change the code we generate for any cases
> involving +0.0?  Or is it more for -0.0?
>
>> +	{
>> +	  bool valid = true;
>> +	  for (unsigned int i = 1; i < n_elts; i++)
>> +	    {
>> +	      rtx elt1 = CONST_VECTOR_ENCODED_ELT (op, i);
>> +	      if (!aarch64_float_const_zero_rtx_p (elt1))
>> +		{
>> +		  valid = false;
>> +		  break;
>> +		}
>> +	    }
>> +	  if (valid)
>> +	    {
>> +	      if (info)
>> +		*info = simd_immediate_info (elt_float_mode, elt,
>> +					     simd_immediate_info::FMOV_SDH);
>> +	      return true;
>> +	    }
>> +	}
>> +    }

Sorry to reply to myself almost immediately, but did you consider extending:

  scalar_float_mode elt_float_mode;
  if (n_elts == 1
      && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
    {
      rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
      if (aarch64_float_const_zero_rtx_p (elt)
	  || aarch64_float_const_representable_p (elt))
	{
	  if (info)
	    *info = simd_immediate_info (elt_float_mode, elt);
	  return true;
	}
    }

rather than adding a new move type?

I think the same trick works for SVE as well as Advanced SIMD.

Richard


* Re: [PATCH 2/2] aarch64: Support `{1.0f, 1.0f, 0.0, 0.0}` CST forming with fmov with a smaller vector type.
  2024-02-24  3:18 ` [PATCH 2/2] aarch64: Support `{1.0f, 1.0f, 0.0, 0.0}` CST forming with fmov with a smaller vector type Andrew Pinski
@ 2024-03-07 16:41   ` Richard Sandiford
  0 siblings, 0 replies; 5+ messages in thread
From: Richard Sandiford @ 2024-03-07 16:41 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: gcc-patches

Andrew Pinski <quic_apinski@quicinc.com> writes:
> This enables construction of V4SF CSTs like `{1.0f, 1.0f, 0.0f, 0.0f}`
> (and other FP CSTs) by using `fmov v0.2s, 1.0`, since the instruction
> zeroes out the upper bits of the register.
> This is a small extension on top of the code that creates an fmov for the case
> where all elements other than the first are zero.

Similarly to the second reply to 1/2, I think we should handle this
by detecting when only the low 64 bits are nonzero, and then try to
construct a simd_immediate_info for the low 64 bits.  The technique
is more general than just floats.

The same thing would work for SVE too (if TARGET_SIMD).
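
As a rough standalone sketch of that check (plain C, not GCC internals; the
names are made up for illustration):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* A 128-bit constant can reuse the 64-bit immediate paths whenever its
   high 64 bits are all zero, because the instructions that write the low
   half of the register implicitly zero the rest.  */
static int
high64_is_zero (const float elts[4])
{
  uint64_t high;
  memcpy (&high, &elts[2], sizeof high);	/* bytes 8..15, little-endian view */
  return high == 0;
}

int
main (void)
{
  float a[4] = { 1.0f, 1.0f, 0.0f, 0.0f };	/* candidate: fmov v0.2s, 1.0 */
  float b[4] = { 1.0f, 1.0f, 1.0f, 0.0f };	/* not a candidate */
  printf ("%d %d\n", high64_is_zero (a), high64_is_zero (b));
  return 0;
}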

Thanks,
Richard

> Built and tested for aarch64-linux-gnu with no regressions.
>
> 	PR target/113856
>
> gcc/ChangeLog:
>
> 	* config/aarch64/aarch64.cc (simd_immediate_info): Add bool to the
> 	float mode constructor. Document modifier field for FMOV_SDH.
> 	(aarch64_simd_valid_immediate): Recognize where the first half
> 	of the const float vect is the same.
> 	(aarch64_output_simd_mov_immediate): Handle the case where insn is
> 	FMOV_SDH and modifier is MSL.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/fmov-zero-cst-3.c: New test.
>
> Signed-off-by: Andrew Pinski <quic_apinski@quicinc.com>
> ---
>  gcc/config/aarch64/aarch64.cc                 | 34 ++++++++++++++++---
>  .../gcc.target/aarch64/fmov-zero-cst-3.c      | 28 +++++++++++++++
>  2 files changed, 57 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-3.c
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index c4386591a9b..89bd0c5e5a6 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -130,7 +130,7 @@ struct simd_immediate_info
>    enum modifier_type { LSL, MSL };
>  
>    simd_immediate_info () {}
> -  simd_immediate_info (scalar_float_mode, rtx, insn_type = MOV);
> +  simd_immediate_info (scalar_float_mode, rtx, insn_type = MOV, bool = false);
>    simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
>  		       insn_type = MOV, modifier_type = LSL,
>  		       unsigned int = 0);
> @@ -153,6 +153,8 @@ struct simd_immediate_info
>  
>        /* The kind of shift modifier to use, and the number of bits to shift.
>  	 This is (LSL, 0) if no shift is needed.  */
> +      /* For FMOV_SDH, LSL means a scalar (s/d/h) fmov while MSL
> +	 means an fmov of the lower half of the vector (.4h/.2s).  */
>        modifier_type modifier;
>        unsigned int shift;
>      } mov;
> @@ -173,12 +175,12 @@ struct simd_immediate_info
>  /* Construct a floating-point immediate in which each element has mode
>     ELT_MODE_IN and value VALUE_IN.  */
>  inline simd_immediate_info
> -::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in, insn_type insn_in)
> +::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in, insn_type insn_in, bool firsthalfsame)
>    : elt_mode (elt_mode_in), insn (insn_in)
>  {
>    gcc_assert (insn_in == MOV || insn_in == FMOV_SDH);
>    u.mov.value = value_in;
> -  u.mov.modifier = LSL;
> +  u.mov.modifier = firsthalfsame ? MSL : LSL;
>    u.mov.shift = 0;
>  }
>  
> @@ -22944,10 +22946,23 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
>  	  || aarch64_float_const_representable_p (elt))
>  	{
>  	  bool valid = true;
> +	  bool firsthalfsame = false;
>  	  for (unsigned int i = 1; i < n_elts; i++)
>  	    {
>  	      rtx elt1 = CONST_VECTOR_ENCODED_ELT (op, i);
>  	      if (!aarch64_float_const_zero_rtx_p (elt1))
> +		{
> +		  if (i == 1)
> +		    firsthalfsame = true;
> +		  if (!firsthalfsame
> +		      || i >= n_elts/2
> +		      || !rtx_equal_p (elt, elt1))
> +		    {
> +		      valid = false;
> +		      break;
> +		    }
> +		}
> +	      else if (firsthalfsame && i < n_elts/2)
>  		{
>  		  valid = false;
>  		  break;
> @@ -22957,7 +22972,8 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
>  	    {
>  	      if (info)
>  		*info = simd_immediate_info (elt_float_mode, elt,
> -					     simd_immediate_info::FMOV_SDH);
> +					     simd_immediate_info::FMOV_SDH,
> +					     firsthalfsame);
>  	      return true;
>  	    }
>  	}
> @@ -25165,8 +25181,16 @@ aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
>  	  real_to_decimal_for_mode (float_buf,
>  				    CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
>  				    buf_size, buf_size, 1, info.elt_mode);
> -	  if (info.insn == simd_immediate_info::FMOV_SDH)
> +	  if (info.insn == simd_immediate_info::FMOV_SDH
> +	      && info.u.mov.modifier == simd_immediate_info::LSL)
>  	    snprintf (templ, sizeof (templ), "fmov\t%%%c0, %s", element_char, float_buf);
> +	  else if (info.insn == simd_immediate_info::FMOV_SDH
> +	      && info.u.mov.modifier == simd_immediate_info::MSL)
> +	    {
> +	      gcc_assert (element_char != 'd');
> +	      gcc_assert (lane_count > 2);
> +	      snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s", lane_count/2, element_char, float_buf);
> +	    }
>  	  else if (lane_count == 1)
>  	    snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
>  	  else
> diff --git a/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-3.c b/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-3.c
> new file mode 100644
> index 00000000000..7a78b6d3caf
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/fmov-zero-cst-3.c
> @@ -0,0 +1,28 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mcmodel=tiny" } */
> +/* { dg-final { check-function-bodies "**" "" ""  } } */
> +/* PR target/113856 */
> +
> +#define vect64 __attribute__((vector_size(8) ))
> +#define vect128 __attribute__((vector_size(16) ))
> +
> +/*
> +** f2:
> +**	fmov	v0.2s, 1.0e\+0
> +**	ret
> +*/
> +vect128 float f2()
> +{
> +  return (vect128 float){1.0f, 1.0f, 0, 0};
> +}
> +
> +/*
> +** f3:
> +**	ldr	q0, \.LC[0-9]+
> +**	ret
> +*/
> +vect128 float f3()
> +{
> +  return (vect128 float){1.0f, 1.0f, 1.0f, 0.0};
> +}
> +

