public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH, rs6000] Add store fusion support for Power10
@ 2021-08-02 20:19 Pat Haugen
  2021-08-04 14:23 ` Bill Schmidt
  0 siblings, 1 reply; 5+ messages in thread
From: Pat Haugen @ 2021-08-02 20:19 UTC (permalink / raw)
  To: GCC Patches; +Cc: Segher Boessenkool, David Edelsohn, Peter Bergner

Enable store fusion on Power10.

Use the SCHED_REORDER hook to implement Power10 specific ready list reordering.
As of now, pairing stores for store fusion is the only function being
performed.

Bootstrap/regtest on powerpc64le(Power10) with no new regressions. Ok for
master?

-Pat


2021-08-02  Pat Haugen  <pthaugen@linux.ibm.com>

gcc/ChangeLog:

	* config/rs6000/rs6000-cpus.def (ISA_3_1_MASKS_SERVER): Add new flag.
	(POWERPC_MASKS): Likewise.
	* config/rs6000/rs6000.c (rs6000_option_override_internal): Enable
	store fusion for Power10.
	(is_load_insn1): Verify destination is a register.
	(is_store_insn1): Verify source is a register.
	(is_fusable_store): New.
	(power10_sched_reorder): Likewise.
	(rs6000_sched_reorder): Do Power10 specific reordering.
	(rs6000_sched_reorder2): Likewise.
	* config/rs6000/rs6000.opt: Add new option.



diff --git a/gcc/config/rs6000/rs6000-cpus.def b/gcc/config/rs6000/rs6000-cpus.def
index 6758296c0fd..f5812da0184 100644
--- a/gcc/config/rs6000/rs6000-cpus.def
+++ b/gcc/config/rs6000/rs6000-cpus.def
@@ -90,7 +90,8 @@
 				 | OPTION_MASK_P10_FUSION_2LOGICAL	\
 				 | OPTION_MASK_P10_FUSION_LOGADD 	\
 				 | OPTION_MASK_P10_FUSION_ADDLOG	\
-				 | OPTION_MASK_P10_FUSION_2ADD)
+				 | OPTION_MASK_P10_FUSION_2ADD		\
+				 | OPTION_MASK_P10_FUSION_2STORE)
 
 /* Flags that need to be turned off if -mno-power9-vector.  */
 #define OTHER_P9_VECTOR_MASKS	(OPTION_MASK_FLOAT128_HW		\
@@ -143,6 +144,7 @@
 				 | OPTION_MASK_P10_FUSION_LOGADD 	\
 				 | OPTION_MASK_P10_FUSION_ADDLOG	\
 				 | OPTION_MASK_P10_FUSION_2ADD    	\
+				 | OPTION_MASK_P10_FUSION_2STORE	\
 				 | OPTION_MASK_HTM			\
 				 | OPTION_MASK_ISEL			\
 				 | OPTION_MASK_MFCRF			\
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index 279f00cc648..1460a0d7c5c 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -4490,6 +4490,10 @@ rs6000_option_override_internal (bool global_init_p)
       && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2ADD) == 0)
     rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2ADD;
 
+  if (TARGET_POWER10
+      && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2STORE) == 0)
+    rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2STORE;
+
   /* Turn off vector pair/mma options on non-power10 systems.  */
   else if (!TARGET_POWER10 && TARGET_MMA)
     {
@@ -18357,7 +18361,7 @@ is_load_insn1 (rtx pat, rtx *load_mem)
   if (!pat || pat == NULL_RTX)
     return false;
 
-  if (GET_CODE (pat) == SET)
+  if (GET_CODE (pat) == SET && REG_P (SET_DEST (pat)))
     return find_mem_ref (SET_SRC (pat), load_mem);
 
   if (GET_CODE (pat) == PARALLEL)
@@ -18394,7 +18398,8 @@ is_store_insn1 (rtx pat, rtx *str_mem)
   if (!pat || pat == NULL_RTX)
     return false;
 
-  if (GET_CODE (pat) == SET)
+  if (GET_CODE (pat) == SET
+      && (REG_P (SET_SRC (pat)) || SUBREG_P (SET_SRC (pat))))
     return find_mem_ref (SET_DEST (pat), str_mem);
 
   if (GET_CODE (pat) == PARALLEL)
@@ -18859,6 +18864,96 @@ power9_sched_reorder2 (rtx_insn **ready, int lastpos)
   return cached_can_issue_more;
 }
 
+/* Determine if INSN is a store to memory that can be fused with a similar
+   adjacent store.  */
+
+static bool
+is_fusable_store (rtx_insn *insn, rtx *str_mem)
+{
+  /* Exit early if not doing store fusion.  */
+  if (!(TARGET_P10_FUSION && TARGET_P10_FUSION_2STORE))
+    return false;
+
+  /* Insn must be a non-prefixed base+disp form store.  */
+  if (is_store_insn (insn, str_mem)
+      && get_attr_prefixed (insn) == PREFIXED_NO
+      && get_attr_update (insn) == UPDATE_NO
+      && get_attr_indexed (insn) == INDEXED_NO)
+    {
+      /* Further restictions by mode and size.  */
+      machine_mode mode = GET_MODE (*str_mem);
+      HOST_WIDE_INT size;
+      if MEM_SIZE_KNOWN_P (*str_mem)
+	size = MEM_SIZE (*str_mem);
+      else
+	return false;
+
+      if INTEGRAL_MODE_P (mode)
+	{
+	  /* Must be word or dword size.  */
+	  return (size == 4 || size == 8);
+	}
+      else if FLOAT_MODE_P (mode)
+	{
+	  /* Must be dword size.  */
+	  return (size == 8);
+	}
+    }
+
+  return false;
+}
+
+/* Do Power10 specific reordering of the ready list.  */
+
+static int
+power10_sched_reorder (rtx_insn **ready, int lastpos)
+{
+  int pos;
+  rtx mem1, mem2;
+
+  /* Do store fusion during sched2 only.  */
+  if (!reload_completed)
+    return cached_can_issue_more;
+
+  /* If the prior insn finished off a store fusion pair then simply
+     reset the counter and return, nothing more to do.  */
+  if (load_store_pendulum != 0)
+    {
+      load_store_pendulum = 0;
+      return cached_can_issue_more;
+    }
+
+  /* Try to pair certain store insns to adjacent memory locations
+     so that the hardware will fuse them to a single operation.  */
+  if (is_fusable_store (last_scheduled_insn, &mem1))
+    {
+      /* A fusable store was just scheduled.  Scan the ready list for another
+	 store that it can fuse with.  */
+      pos = lastpos;
+      while (pos >= 0)
+	{
+	  /* GPR stores can be ascending or descending offsets, FPR/VSR stores
+	     must be ascending only.  */
+	  if (is_fusable_store (ready[pos], &mem2)
+	      && ((INTEGRAL_MODE_P (GET_MODE (mem1))
+		   && adjacent_mem_locations (mem1, mem2))
+		  || (FLOAT_MODE_P (GET_MODE (mem1))
+		   && (adjacent_mem_locations (mem1, mem2) == mem1))))
+	    {
+	      /* Found a fusable store.  Move it to the end of the ready list
+		 so it is scheduled next.  */
+	      move_to_end_of_ready (ready, pos, lastpos);
+
+	      load_store_pendulum = -1;
+	      break;
+	    }
+	  pos--;
+	}
+    }
+
+  return cached_can_issue_more;
+}
+
 /* We are about to begin issuing insns for this clock cycle. */
 
 static int
@@ -18885,6 +18980,10 @@ rs6000_sched_reorder (FILE *dump ATTRIBUTE_UNUSED, int sched_verbose,
   if (rs6000_tune == PROCESSOR_POWER6)
     load_store_pendulum = 0;
 
+  /* Do Power10 dependent reordering.  */
+  if (rs6000_tune == PROCESSOR_POWER10 && last_scheduled_insn)
+    power10_sched_reorder (ready, *pn_ready - 1);
+
   return rs6000_issue_rate ();
 }
 
@@ -18906,6 +19005,10 @@ rs6000_sched_reorder2 (FILE *dump, int sched_verbose, rtx_insn **ready,
       && recog_memoized (last_scheduled_insn) >= 0)
     return power9_sched_reorder2 (ready, *pn_ready - 1);
 
+  /* Do Power10 dependent reordering.  */
+  if (rs6000_tune == PROCESSOR_POWER10 && last_scheduled_insn)
+    return power10_sched_reorder (ready, *pn_ready - 1);
+
   return cached_can_issue_more;
 }
 
diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
index 0538db387dc..3753de19557 100644
--- a/gcc/config/rs6000/rs6000.opt
+++ b/gcc/config/rs6000/rs6000.opt
@@ -514,6 +514,10 @@ mpower10-fusion-2add
 Target Undocumented Mask(P10_FUSION_2ADD) Var(rs6000_isa_flags)
 Fuse dependent pairs of add or vaddudm instructions for better performance on power10.
 
+mpower10-fusion-2store
+Target Undocumented Mask(P10_FUSION_2STORE) Var(rs6000_isa_flags)
+Fuse certain store operations together for better performance on power10.
+
 mcrypto
 Target Mask(CRYPTO) Var(rs6000_isa_flags)
 Use ISA 2.07 Category:Vector.AES and Category:Vector.SHA2 instructions.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH, rs6000] Add store fusion support for Power10
  2021-08-02 20:19 [PATCH, rs6000] Add store fusion support for Power10 Pat Haugen
@ 2021-08-04 14:23 ` Bill Schmidt
  2021-08-04 16:12   ` Segher Boessenkool
  2021-08-04 21:16   ` Pat Haugen
  0 siblings, 2 replies; 5+ messages in thread
From: Bill Schmidt @ 2021-08-04 14:23 UTC (permalink / raw)
  To: Pat Haugen, GCC Patches; +Cc: Peter Bergner, David Edelsohn, Segher Boessenkool

Hi Pat,

Good stuff!  Comments below.

On 8/2/21 3:19 PM, Pat Haugen via Gcc-patches wrote:
> Enable store fusion on Power10.
>
> Use the SCHED_REORDER hook to implement Power10 specific ready list reordering.
> As of now, pairing stores for store fusion is the only function being
> performed.
>
> Bootstrap/regtest on powerpc64le(Power10) with no new regressions. Ok for
> master?
>
> -Pat
>
>
> 2021-08-02  Pat Haugen  <pthaugen@linux.ibm.com>
>
> gcc/ChangeLog:
>
> 	* config/rs6000/rs6000-cpus.def (ISA_3_1_MASKS_SERVER): Add new flag.
> 	(POWERPC_MASKS): Likewise.
> 	* config/rs6000/rs6000.c (rs6000_option_override_internal): Enable
> 	store fusion for Power10.
> 	(is_load_insn1): Verify destination is a register.
> 	(is_store_insn1): Verify source is a register.
> 	(is_fusable_store): New.
> 	(power10_sched_reorder): Likewise.
> 	(rs6000_sched_reorder): Do Power10 specific reordering.
> 	(rs6000_sched_reorder2): Likewise.
> 	* config/rs6000/rs6000.opt: Add new option.
>
>
>
> diff --git a/gcc/config/rs6000/rs6000-cpus.def b/gcc/config/rs6000/rs6000-cpus.def
> index 6758296c0fd..f5812da0184 100644
> --- a/gcc/config/rs6000/rs6000-cpus.def
> +++ b/gcc/config/rs6000/rs6000-cpus.def
> @@ -90,7 +90,8 @@
>   				 | OPTION_MASK_P10_FUSION_2LOGICAL	\
>   				 | OPTION_MASK_P10_FUSION_LOGADD 	\
>   				 | OPTION_MASK_P10_FUSION_ADDLOG	\
> -				 | OPTION_MASK_P10_FUSION_2ADD)
> +				 | OPTION_MASK_P10_FUSION_2ADD		\
> +				 | OPTION_MASK_P10_FUSION_2STORE)


This is all fine for now; as we've discussed elsewhere, we probably want 
to eventually consolidate all these fusion flags into one.

>
>   /* Flags that need to be turned off if -mno-power9-vector.  */
>   #define OTHER_P9_VECTOR_MASKS	(OPTION_MASK_FLOAT128_HW		\
> @@ -143,6 +144,7 @@
>   				 | OPTION_MASK_P10_FUSION_LOGADD 	\
>   				 | OPTION_MASK_P10_FUSION_ADDLOG	\
>   				 | OPTION_MASK_P10_FUSION_2ADD    	\
> +				 | OPTION_MASK_P10_FUSION_2STORE	\
>   				 | OPTION_MASK_HTM			\
>   				 | OPTION_MASK_ISEL			\
>   				 | OPTION_MASK_MFCRF			\
> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
> index 279f00cc648..1460a0d7c5c 100644
> --- a/gcc/config/rs6000/rs6000.c
> +++ b/gcc/config/rs6000/rs6000.c
> @@ -4490,6 +4490,10 @@ rs6000_option_override_internal (bool global_init_p)
>         && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2ADD) == 0)
>       rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2ADD;
>
> +  if (TARGET_POWER10
> +      && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2STORE) == 0)
> +    rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2STORE;
> +
>     /* Turn off vector pair/mma options on non-power10 systems.  */
>     else if (!TARGET_POWER10 && TARGET_MMA)
>       {
> @@ -18357,7 +18361,7 @@ is_load_insn1 (rtx pat, rtx *load_mem)
>     if (!pat || pat == NULL_RTX)
>       return false;
>
> -  if (GET_CODE (pat) == SET)
> +  if (GET_CODE (pat) == SET && REG_P (SET_DEST (pat)))
>       return find_mem_ref (SET_SRC (pat), load_mem);
Looks like this is just an optimization to quickly discard stores, right?
>     if (GET_CODE (pat) == PARALLEL)
> @@ -18394,7 +18398,8 @@ is_store_insn1 (rtx pat, rtx *str_mem)
>     if (!pat || pat == NULL_RTX)
>       return false;
>
> -  if (GET_CODE (pat) == SET)
> +  if (GET_CODE (pat) == SET
> +      && (REG_P (SET_SRC (pat)) || SUBREG_P (SET_SRC (pat))))
>       return find_mem_ref (SET_DEST (pat), str_mem);


Similar question.

>
>     if (GET_CODE (pat) == PARALLEL)
> @@ -18859,6 +18864,96 @@ power9_sched_reorder2 (rtx_insn **ready, int lastpos)
>     return cached_can_issue_more;
>   }
>
> +/* Determine if INSN is a store to memory that can be fused with a similar
> +   adjacent store.  */
> +
> +static bool
> +is_fusable_store (rtx_insn *insn, rtx *str_mem)
> +{
> +  /* Exit early if not doing store fusion.  */
> +  if (!(TARGET_P10_FUSION && TARGET_P10_FUSION_2STORE))
> +    return false;
> +
> +  /* Insn must be a non-prefixed base+disp form store.  */
> +  if (is_store_insn (insn, str_mem)
> +      && get_attr_prefixed (insn) == PREFIXED_NO
> +      && get_attr_update (insn) == UPDATE_NO
> +      && get_attr_indexed (insn) == INDEXED_NO)
> +    {
> +      /* Further restictions by mode and size.  */
> +      machine_mode mode = GET_MODE (*str_mem);
> +      HOST_WIDE_INT size;
> +      if MEM_SIZE_KNOWN_P (*str_mem)
> +	size = MEM_SIZE (*str_mem);
> +      else
> +	return false;
> +
> +      if INTEGRAL_MODE_P (mode)
> +	{
> +	  /* Must be word or dword size.  */
> +	  return (size == 4 || size == 8);
> +	}
> +      else if FLOAT_MODE_P (mode)
> +	{
> +	  /* Must be dword size.  */
> +	  return (size == 8);
> +	}
> +    }
> +
> +  return false;
> +}
> +
> +/* Do Power10 specific reordering of the ready list.  */
> +
> +static int
> +power10_sched_reorder (rtx_insn **ready, int lastpos)
> +{
> +  int pos;
> +  rtx mem1, mem2;
> +
> +  /* Do store fusion during sched2 only.  */
> +  if (!reload_completed)
> +    return cached_can_issue_more;
> +
> +  /* If the prior insn finished off a store fusion pair then simply
> +     reset the counter and return, nothing more to do.  */


Good comments throughout, thanks!

> +  if (load_store_pendulum != 0)
> +    {
> +      load_store_pendulum = 0;
> +      return cached_can_issue_more;
> +    }
> +
> +  /* Try to pair certain store insns to adjacent memory locations
> +     so that the hardware will fuse them to a single operation.  */
> +  if (is_fusable_store (last_scheduled_insn, &mem1))
> +    {
> +      /* A fusable store was just scheduled.  Scan the ready list for another
> +	 store that it can fuse with.  */
> +      pos = lastpos;
> +      while (pos >= 0)
> +	{
> +	  /* GPR stores can be ascending or descending offsets, FPR/VSR stores
VSR?  I don't see how that applies here.
> +	     must be ascending only.  */
> +	  if (is_fusable_store (ready[pos], &mem2)
> +	      && ((INTEGRAL_MODE_P (GET_MODE (mem1))
> +		   && adjacent_mem_locations (mem1, mem2))
> +		  || (FLOAT_MODE_P (GET_MODE (mem1))
> +		   && (adjacent_mem_locations (mem1, mem2) == mem1))))
> +	    {
> +	      /* Found a fusable store.  Move it to the end of the ready list
> +		 so it is scheduled next.  */
> +	      move_to_end_of_ready (ready, pos, lastpos);
> +
> +	      load_store_pendulum = -1;
> +	      break;
> +	    }
> +	  pos--;
> +	}
> +    }
> +
> +  return cached_can_issue_more;
> +}
> +
>   /* We are about to begin issuing insns for this clock cycle. */
>
>   static int
> @@ -18885,6 +18980,10 @@ rs6000_sched_reorder (FILE *dump ATTRIBUTE_UNUSED, int sched_verbose,
>     if (rs6000_tune == PROCESSOR_POWER6)
>       load_store_pendulum = 0;
>
> +  /* Do Power10 dependent reordering.  */
> +  if (rs6000_tune == PROCESSOR_POWER10 && last_scheduled_insn)
> +    power10_sched_reorder (ready, *pn_ready - 1);
> +


I happened to notice that pn_ready is marked as ATTRIBUTE_UNUSED.  This 
predates your patch, but maybe you could clean that up too.

>     return rs6000_issue_rate ();
>   }
>
> @@ -18906,6 +19005,10 @@ rs6000_sched_reorder2 (FILE *dump, int sched_verbose, rtx_insn **ready,
>         && recog_memoized (last_scheduled_insn) >= 0)
>       return power9_sched_reorder2 (ready, *pn_ready - 1);
>
> +  /* Do Power10 dependent reordering.  */
> +  if (rs6000_tune == PROCESSOR_POWER10 && last_scheduled_insn)
> +    return power10_sched_reorder (ready, *pn_ready - 1);
> +
>     return cached_can_issue_more;
>   }
>
> diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
> index 0538db387dc..3753de19557 100644
> --- a/gcc/config/rs6000/rs6000.opt
> +++ b/gcc/config/rs6000/rs6000.opt
> @@ -514,6 +514,10 @@ mpower10-fusion-2add
>   Target Undocumented Mask(P10_FUSION_2ADD) Var(rs6000_isa_flags)
>   Fuse dependent pairs of add or vaddudm instructions for better performance on power10.
>
> +mpower10-fusion-2store
> +Target Undocumented Mask(P10_FUSION_2STORE) Var(rs6000_isa_flags)
> +Fuse certain store operations together for better performance on power10.
> +
>   mcrypto
>   Target Mask(CRYPTO) Var(rs6000_isa_flags)
>   Use ISA 2.07 Category:Vector.AES and Category:Vector.SHA2 instructions.

Can you think of any test cases we can use to demonstrate store fusion?

Otherwise looks good to me.  Thanks!

Bill


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH, rs6000] Add store fusion support for Power10
  2021-08-04 14:23 ` Bill Schmidt
@ 2021-08-04 16:12   ` Segher Boessenkool
  2021-08-04 21:16   ` Pat Haugen
  1 sibling, 0 replies; 5+ messages in thread
From: Segher Boessenkool @ 2021-08-04 16:12 UTC (permalink / raw)
  To: Bill Schmidt; +Cc: Pat Haugen, GCC Patches, Peter Bergner, David Edelsohn

On Wed, Aug 04, 2021 at 09:23:13AM -0500, Bill Schmidt wrote:
> On 8/2/21 3:19 PM, Pat Haugen via Gcc-patches wrote:

(I reviewed this elsewhere instead of on the list...  Not good, since
the patch was on the list already.  Sorry.)

> >@@ -18885,6 +18980,10 @@ rs6000_sched_reorder (FILE *dump 
> >ATTRIBUTE_UNUSED, int sched_verbose,
> >    if (rs6000_tune == PROCESSOR_POWER6)
> >      load_store_pendulum = 0;
> >
> >+  /* Do Power10 dependent reordering.  */
> >+  if (rs6000_tune == PROCESSOR_POWER10 && last_scheduled_insn)
> >+    power10_sched_reorder (ready, *pn_ready - 1);
> >+
> 
> I happened to notice that pn_ready is marked as ATTRIBUTE_UNUSED.  This 
> predates your patch, but maybe you could clean that up too.

*All* ATTRIBUTE_UNUSED instances should go away (or almost all, only
those that really mean "maybe unused" can stay).  The preferred way to
in C++ say some argument is unused is by not naming it (or only in a
comment).  So instead of

void f (int a ATTRIBUTE_UNUSED, int b ATTRIBUTE_UNUSED) { ... }

you can say

void f (int, int) { ... }

or

void f (int/*a*/, int/*b*/) { ... }

> Can you think of any test cases we can use to demonstrate store fusion?

It may be possible to make that reliable by doing a *really* simple
testcase?  But then it isn't testing very much :-(


Segher

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH, rs6000] Add store fusion support for Power10
  2021-08-04 14:23 ` Bill Schmidt
  2021-08-04 16:12   ` Segher Boessenkool
@ 2021-08-04 21:16   ` Pat Haugen
  2021-08-04 21:55     ` Segher Boessenkool
  1 sibling, 1 reply; 5+ messages in thread
From: Pat Haugen @ 2021-08-04 21:16 UTC (permalink / raw)
  To: wschmidt, GCC Patches; +Cc: Peter Bergner, David Edelsohn, Segher Boessenkool

On 8/4/21 9:23 AM, Bill Schmidt wrote:
> Hi Pat,
> 
> Good stuff!  Comments below.
> 
> On 8/2/21 3:19 PM, Pat Haugen via Gcc-patches wrote:
>> diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
>> index 279f00cc648..1460a0d7c5c 100644
>> --- a/gcc/config/rs6000/rs6000.c
>> +++ b/gcc/config/rs6000/rs6000.c
>> @@ -4490,6 +4490,10 @@ rs6000_option_override_internal (bool global_init_p)
>>         && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2ADD) == 0)
>>       rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2ADD;
>>
>> +  if (TARGET_POWER10
>> +      && (rs6000_isa_flags_explicit & OPTION_MASK_P10_FUSION_2STORE) == 0)
>> +    rs6000_isa_flags |= OPTION_MASK_P10_FUSION_2STORE;
>> +
>>     /* Turn off vector pair/mma options on non-power10 systems.  */
>>     else if (!TARGET_POWER10 && TARGET_MMA)
>>       {
>> @@ -18357,7 +18361,7 @@ is_load_insn1 (rtx pat, rtx *load_mem)
>>     if (!pat || pat == NULL_RTX)
>>       return false;
>>
>> -  if (GET_CODE (pat) == SET)
>> +  if (GET_CODE (pat) == SET && REG_P (SET_DEST (pat)))
>>       return find_mem_ref (SET_SRC (pat), load_mem);
> Looks like this is just an optimization to quickly discard stores, right?

Additional verification check to make sure destination is REG, yes. This will become a separate patch.

>>     if (GET_CODE (pat) == PARALLEL)
>> @@ -18394,7 +18398,8 @@ is_store_insn1 (rtx pat, rtx *str_mem)
>>     if (!pat || pat == NULL_RTX)
>>       return false;
>>
>> -  if (GET_CODE (pat) == SET)
>> +  if (GET_CODE (pat) == SET
>> +      && (REG_P (SET_SRC (pat)) || SUBREG_P (SET_SRC (pat))))
>>       return find_mem_ref (SET_DEST (pat), str_mem);
> 
> 
> Similar question.

Similar answer. :)
> 
>>
>>     if (GET_CODE (pat) == PARALLEL)
>> @@ -18859,6 +18864,96 @@ power9_sched_reorder2 (rtx_insn **ready, int lastpos)
>>     return cached_can_issue_more;
>>   }
>>
>> +/* Determine if INSN is a store to memory that can be fused with a similar
>> +   adjacent store.  */
>> +
>> +static bool
>> +is_fusable_store (rtx_insn *insn, rtx *str_mem)
>> +{
>> +  /* Exit early if not doing store fusion.  */
>> +  if (!(TARGET_P10_FUSION && TARGET_P10_FUSION_2STORE))
>> +    return false;
>> +
>> +  /* Insn must be a non-prefixed base+disp form store.  */
>> +  if (is_store_insn (insn, str_mem)
>> +      && get_attr_prefixed (insn) == PREFIXED_NO
>> +      && get_attr_update (insn) == UPDATE_NO
>> +      && get_attr_indexed (insn) == INDEXED_NO)
>> +    {
>> +      /* Further restictions by mode and size.  */
>> +      machine_mode mode = GET_MODE (*str_mem);
>> +      HOST_WIDE_INT size;
>> +      if MEM_SIZE_KNOWN_P (*str_mem)
>> +    size = MEM_SIZE (*str_mem);
>> +      else
>> +    return false;
>> +
>> +      if INTEGRAL_MODE_P (mode)
>> +    {
>> +      /* Must be word or dword size.  */
>> +      return (size == 4 || size == 8);
>> +    }
>> +      else if FLOAT_MODE_P (mode)
>> +    {
>> +      /* Must be dword size.  */
>> +      return (size == 8);
>> +    }
>> +    }
>> +
>> +  return false;
>> +}
>> +
>> +/* Do Power10 specific reordering of the ready list.  */
>> +
>> +static int
>> +power10_sched_reorder (rtx_insn **ready, int lastpos)
>> +{
>> +  int pos;
>> +  rtx mem1, mem2;
>> +
>> +  /* Do store fusion during sched2 only.  */
>> +  if (!reload_completed)
>> +    return cached_can_issue_more;
>> +
>> +  /* If the prior insn finished off a store fusion pair then simply
>> +     reset the counter and return, nothing more to do.  */
> 
> 
> Good comments throughout, thanks!
> 
>> +  if (load_store_pendulum != 0)
>> +    {
>> +      load_store_pendulum = 0;
>> +      return cached_can_issue_more;
>> +    }
>> +
>> +  /* Try to pair certain store insns to adjacent memory locations
>> +     so that the hardware will fuse them to a single operation.  */
>> +  if (is_fusable_store (last_scheduled_insn, &mem1))
>> +    {
>> +      /* A fusable store was just scheduled.  Scan the ready list for another
>> +     store that it can fuse with.  */
>> +      pos = lastpos;
>> +      while (pos >= 0)
>> +    {
>> +      /* GPR stores can be ascending or descending offsets, FPR/VSR stores
> VSR?  I don't see how that applies here.

Scalar floating point store from VSX reg (i.e. stxsd).

>> +         must be ascending only.  */
>> +      if (is_fusable_store (ready[pos], &mem2)
>> +          && ((INTEGRAL_MODE_P (GET_MODE (mem1))
>> +           && adjacent_mem_locations (mem1, mem2))
>> +          || (FLOAT_MODE_P (GET_MODE (mem1))
>> +           && (adjacent_mem_locations (mem1, mem2) == mem1))))
>> +        {
>> +          /* Found a fusable store.  Move it to the end of the ready list
>> +         so it is scheduled next.  */
>> +          move_to_end_of_ready (ready, pos, lastpos);
>> +
>> +          load_store_pendulum = -1;
>> +          break;
>> +        }
>> +      pos--;
>> +    }
>> +    }
>> +
>> +  return cached_can_issue_more;
>> +}
>> +
>>   /* We are about to begin issuing insns for this clock cycle. */
>>
>>   static int
>> @@ -18885,6 +18980,10 @@ rs6000_sched_reorder (FILE *dump ATTRIBUTE_UNUSED, int sched_verbose,
>>     if (rs6000_tune == PROCESSOR_POWER6)
>>       load_store_pendulum = 0;
>>
>> +  /* Do Power10 dependent reordering.  */
>> +  if (rs6000_tune == PROCESSOR_POWER10 && last_scheduled_insn)
>> +    power10_sched_reorder (ready, *pn_ready - 1);
>> +
> 
> 
> I happened to notice that pn_ready is marked as ATTRIBUTE_UNUSED.  This predates your patch, but maybe you could clean that up too.

Will do as a separate cleanup.

> 
>>     return rs6000_issue_rate ();
>>   }
>>
>> @@ -18906,6 +19005,10 @@ rs6000_sched_reorder2 (FILE *dump, int sched_verbose, rtx_insn **ready,
>>         && recog_memoized (last_scheduled_insn) >= 0)
>>       return power9_sched_reorder2 (ready, *pn_ready - 1);
>>
>> +  /* Do Power10 dependent reordering.  */
>> +  if (rs6000_tune == PROCESSOR_POWER10 && last_scheduled_insn)
>> +    return power10_sched_reorder (ready, *pn_ready - 1);
>> +
>>     return cached_can_issue_more;
>>   }
>>
>> diff --git a/gcc/config/rs6000/rs6000.opt b/gcc/config/rs6000/rs6000.opt
>> index 0538db387dc..3753de19557 100644
>> --- a/gcc/config/rs6000/rs6000.opt
>> +++ b/gcc/config/rs6000/rs6000.opt
>> @@ -514,6 +514,10 @@ mpower10-fusion-2add
>>   Target Undocumented Mask(P10_FUSION_2ADD) Var(rs6000_isa_flags)
>>   Fuse dependent pairs of add or vaddudm instructions for better performance on power10.
>>
>> +mpower10-fusion-2store
>> +Target Undocumented Mask(P10_FUSION_2STORE) Var(rs6000_isa_flags)
>> +Fuse certain store operations together for better performance on power10.
>> +
>>   mcrypto
>>   Target Mask(CRYPTO) Var(rs6000_isa_flags)
>>   Use ISA 2.07 Category:Vector.AES and Category:Vector.SHA2 instructions.
> 
> Can you think of any test cases we can use to demonstrate store fusion?

Yeah, should be able to come up with something to verify two adjacent stores.

Thanks for the review.
> 
> Otherwise looks good to me.  Thanks!
> 
> Bill
> 


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH, rs6000] Add store fusion support for Power10
  2021-08-04 21:16   ` Pat Haugen
@ 2021-08-04 21:55     ` Segher Boessenkool
  0 siblings, 0 replies; 5+ messages in thread
From: Segher Boessenkool @ 2021-08-04 21:55 UTC (permalink / raw)
  To: Pat Haugen; +Cc: wschmidt, GCC Patches, Peter Bergner, David Edelsohn

Hi!

On Wed, Aug 04, 2021 at 04:16:45PM -0500, Pat Haugen wrote:
> On 8/4/21 9:23 AM, Bill Schmidt wrote:
> >> +      /* GPR stores can be ascending or descending offsets, FPR/VSR stores
> > VSR?  I don't see how that applies here.

Almost all scalar FP insns have a VR alternative, but unfortunately not
(always) with a unified or consistent syntax.

> > Can you think of any test cases we can use to demonstrate store fusion?
> 
> Yeah, should be able to come up with something to verify two adjacent stores.

The testcase should also stop working when the fusion does.  This might
mean you need more than one testcase.  The number one goal of our
testsuite is to warn of regressions, so it really should be more than
just a single two-liner scanning for one assembler insn.  Maybe do a
handful of cases?  :-)


Segher

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2021-08-04 21:56 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-02 20:19 [PATCH, rs6000] Add store fusion support for Power10 Pat Haugen
2021-08-04 14:23 ` Bill Schmidt
2021-08-04 16:12   ` Segher Boessenkool
2021-08-04 21:16   ` Pat Haugen
2021-08-04 21:55     ` Segher Boessenkool

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).