public inbox for gcc-patches@gcc.gnu.org
* [PATCH] dse: Remove partial load after full store for high part access[PR71309]
@ 2020-07-21 10:54 Xiong Hu Luo
  2020-07-21 15:30 ` Richard Sandiford
  2020-07-21 21:54 ` [PATCH] " Segher Boessenkool
  0 siblings, 2 replies; 21+ messages in thread
From: Xiong Hu Luo @ 2020-07-21 10:54 UTC (permalink / raw)
  To: gcc-patches; +Cc: segher, wschmidt, luoxhu, richard.sandiford, dberlin, jakub

This patch could optimize (works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8
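
(Here "r119:TI#8" in the dump stands for a subreg of r119 at byte offset 8,
roughly (subreg:DI (reg:TI 119) 8), i.e. the high 64 bits of the TImode
register on little-endian, so the reload from the just-stored memory is
replaced by a subreg of the register that was stored.)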

Final ASM will be as below without partial load after full store(stxv+ld):
  ld 10,16(3)
  mr 9,3
  ld 3,24(3)
  std 10,0(9)
  std 3,8(9)
  blr

It could achieve ~25% performance improvement for typical cases on
Power9.  Bootstrap and regression tested on Power9-LE.

BTW, for AArch64, one ldr is replaced by mov with this patch, though
no performance change observed...

ldp     x2, x3, [x0, 16]
stp     x2, x3, [x0]
ldr     x0, [x0, 8]

=>

mov     x1, x0
ldp     x2, x0, [x0, 16]
stp     x2, x0, [x1]

gcc/ChangeLog:

2020-07-21  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR rtl-optimization/71309
	* dse.c (get_stored_val): Use subreg before extract if shifting
	from high part.

gcc/testsuite/ChangeLog:

2020-07-21  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR rtl-optimization/71309
	* gcc.target/powerpc/pr71309.c: New test.
	* gcc.target/powerpc/fold-vec-extract-short.p7.c: Add -mbig.
---
 gcc/dse.c                                     | 26 ++++++++++++---
 .../powerpc/fold-vec-extract-short.p7.c       |  2 +-
 gcc/testsuite/gcc.target/powerpc/pr71309.c    | 33 +++++++++++++++++++
 3 files changed, 56 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..13f952ee5ff 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1855,7 +1855,7 @@ get_stored_val (store_info *store_info, machine_mode read_mode,
 {
   machine_mode store_mode = GET_MODE (store_info->mem);
   poly_int64 gap;
-  rtx read_reg;
+  rtx read_reg = NULL;
 
   /* To get here the read is within the boundaries of the write so
      shift will never be negative.  Start out with the shift being in
@@ -1872,9 +1872,27 @@ get_stored_val (store_info *store_info, machine_mode read_mode,
     {
       poly_int64 shift = gap * BITS_PER_UNIT;
       poly_int64 access_size = GET_MODE_SIZE (read_mode) + gap;
-      read_reg = find_shift_sequence (access_size, store_info, read_mode,
-				      shift, optimize_bb_for_speed_p (bb),
-				      require_cst);
+      rtx rhs_subreg = NULL;
+
+      if (known_eq (GET_MODE_BITSIZE (store_mode), shift * 2))
+	{
+	  scalar_int_mode inner_mode = smallest_int_mode_for_size (shift);
+	  poly_uint64 sub_off
+	    = ((!BYTES_BIG_ENDIAN)
+		 ? GET_MODE_SIZE (store_mode) - GET_MODE_SIZE (inner_mode)
+		 : 0);
+
+	  rhs_subreg = simplify_gen_subreg (inner_mode, store_info->rhs,
+					    store_mode, sub_off);
+	  if (rhs_subreg)
+	    read_reg
+	      = extract_low_bits (read_mode, inner_mode, copy_rtx (rhs_subreg));
+	}
+
+      if (read_reg == NULL)
+	read_reg
+	  = find_shift_sequence (access_size, store_info, read_mode, shift,
+				 optimize_bb_for_speed_p (bb), require_cst);
     }
   else if (store_mode == BLKmode)
     {
diff --git a/gcc/testsuite/gcc.target/powerpc/fold-vec-extract-short.p7.c b/gcc/testsuite/gcc.target/powerpc/fold-vec-extract-short.p7.c
index 8616e7b11ad..b5cefe7dc12 100644
--- a/gcc/testsuite/gcc.target/powerpc/fold-vec-extract-short.p7.c
+++ b/gcc/testsuite/gcc.target/powerpc/fold-vec-extract-short.p7.c
@@ -3,7 +3,7 @@
 
 /* { dg-do compile { target { powerpc*-*-linux* } } } */
 /* { dg-require-effective-target powerpc_vsx_ok } */
-/* { dg-options "-mdejagnu-cpu=power7 -O2" } */
+/* { dg-options "-mdejagnu-cpu=power7 -O2 -mbig" } */
 
 // six tests total. Targeting P7 BE.
 // p7 (be) vars:                 li, addi,              stxvw4x, rldic, addi, lhax/lhzx
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c b/gcc/testsuite/gcc.target/powerpc/pr71309.c
new file mode 100644
index 00000000000..94d727a8ed9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define TYPE void*
+#define TYPE2 void*
+
+struct path {
+    TYPE2 mnt;
+    TYPE dentry;
+};
+
+struct nameidata {
+    struct path path;
+    struct path root;
+};
+
+__attribute__ ((noinline))
+TYPE foo(struct nameidata *nd)
+{
+  TYPE d;
+  TYPE2 d2;
+
+  nd->path = nd->root;
+  d = nd->path.dentry;
+  d2 = nd->path.mnt;
+  return d;
+}
+
+/* { dg-final { scan-assembler-not {\mlxv\M} } } */
+/* { dg-final { scan-assembler-not {\mstxv\M} } } */
+/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */
-- 
2.27.0.90.geebb51ba8c



* Re: [PATCH] dse: Remove partial load after full store for high part access[PR71309]
  2020-07-21 10:54 [PATCH] dse: Remove partial load after full store for high part access[PR71309] Xiong Hu Luo
@ 2020-07-21 15:30 ` Richard Sandiford
  2020-07-22  9:18   ` [PATCH v2] " luoxhu
  2020-07-21 21:54 ` [PATCH] " Segher Boessenkool
  1 sibling, 1 reply; 21+ messages in thread
From: Richard Sandiford @ 2020-07-21 15:30 UTC (permalink / raw)
  To: Xiong Hu Luo; +Cc: gcc-patches, segher, wschmidt, dberlin, jakub

Xiong Hu Luo <luoxhu@linux.ibm.com> writes:
> This patch could optimize (works for char/short/int/void*):
>
> 6: r119:TI=[r118:DI+0x10]
> 7: [r118:DI]=r119:TI
> 8: r121:DI=[r118:DI+0x8]
>
> =>
>
> 6: r119:TI=[r118:DI+0x10]
> 16: r122:DI=r119:TI#8
>
> Final ASM will be as below without partial load after full store(stxv+ld):
>   ld 10,16(3)
>   mr 9,3
>   ld 3,24(3)
>   std 10,0(9)
>   std 3,8(9)
>   blr
>
> It could achieve ~25% performance improvement for typical cases on
> Power9.  Bootstrap and regression tested on Power9-LE.
>
> BTW, for AArch64, one ldr is replaced by mov with this patch, though
> no performance change observed...
>
> ldp     x2, x3, [x0, 16]
> stp     x2, x3, [x0]
> ldr     x0, [x0, 8]
>
> =>
>
> mov     x1, x0
> ldp     x2, x0, [x0, 16]
> stp     x2, x0, [x1]

Looks like a nice optimisation!

> gcc/ChangeLog:
>
> 2020-07-21  Xionghu Luo  <luoxhu@linux.ibm.com>
>
> 	PR rtl-optimization/71309
> 	* dse.c (get_stored_val): Use subreg before extract if shifting
> 	from high part.
>
> gcc/testsuite/ChangeLog:
>
> 2020-07-21  Xionghu Luo  <luoxhu@linux.ibm.com>
>
> 	PR rtl-optimization/71309
> 	* gcc.target/powerpc/pr71309.c: New test.
> 	* gcc.target/powerpc/fold-vec-extract-short.p7.c: Add -mbig.
> ---
>  gcc/dse.c                                     | 26 ++++++++++++---
>  .../powerpc/fold-vec-extract-short.p7.c       |  2 +-
>  gcc/testsuite/gcc.target/powerpc/pr71309.c    | 33 +++++++++++++++++++
>  3 files changed, 56 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c
>
> diff --git a/gcc/dse.c b/gcc/dse.c
> index bbe792e48e8..13f952ee5ff 100644
> --- a/gcc/dse.c
> +++ b/gcc/dse.c
> @@ -1855,7 +1855,7 @@ get_stored_val (store_info *store_info, machine_mode read_mode,
>  {
>    machine_mode store_mode = GET_MODE (store_info->mem);
>    poly_int64 gap;
> -  rtx read_reg;
> +  rtx read_reg = NULL;
>  
>    /* To get here the read is within the boundaries of the write so
>       shift will never be negative.  Start out with the shift being in
> @@ -1872,9 +1872,27 @@ get_stored_val (store_info *store_info, machine_mode read_mode,
>      {
>        poly_int64 shift = gap * BITS_PER_UNIT;
>        poly_int64 access_size = GET_MODE_SIZE (read_mode) + gap;
> -      read_reg = find_shift_sequence (access_size, store_info, read_mode,
> -				      shift, optimize_bb_for_speed_p (bb),
> -				      require_cst);
> +      rtx rhs_subreg = NULL;
> +
> +      if (known_eq (GET_MODE_BITSIZE (store_mode), shift * 2))
> +	{
> +	  scalar_int_mode inner_mode = smallest_int_mode_for_size (shift);
> +	  poly_uint64 sub_off
> +	    = ((!BYTES_BIG_ENDIAN)
> +		 ? GET_MODE_SIZE (store_mode) - GET_MODE_SIZE (inner_mode)
> +		 : 0);
> +
> +	  rhs_subreg = simplify_gen_subreg (inner_mode, store_info->rhs,
> +					    store_mode, sub_off);
> +	  if (rhs_subreg)
> +	    read_reg
> +	      = extract_low_bits (read_mode, inner_mode, copy_rtx (rhs_subreg));
> +	}
> +
> +      if (read_reg == NULL)
> +	read_reg
> +	  = find_shift_sequence (access_size, store_info, read_mode, shift,
> +				 optimize_bb_for_speed_p (bb), require_cst);

Did you consider doing this in find_shift_sequence instead?
ISTM that this is really using subregs to optimise:

      /* In theory we could also check for an ashr.  Ian Taylor knows
	 of one dsp where the cost of these two was not the same.  But
	 this really is a rare case anyway.  */
      target = expand_binop (new_mode, lshr_optab, new_reg,
			     gen_int_shift_amount (new_mode, shift),
			     new_reg, 1, OPTAB_DIRECT);

I think everything up to:

      /* Also try a wider mode if the necessary punning is either not
	 desirable or not possible.  */
      if (!CONSTANT_P (store_info->rhs)
	  && !targetm.modes_tieable_p (new_mode, store_mode))
	continue;

is either neutral or helpful for the subreg case too, so maybe
we could just add the optimisation after that.  (It probably isn't
worth reusing any of the later loop body code though, since the
subreg case is much simpler.)

I don't think we need to restrict this case to modes of size
shift * 2.  We can just check whether the shift is a multiple of
the new_mode calculated by find_shift_sequence (using multiple_p).

An easier way of converting the shift to a subreg byte offset
is to use subreg_offset_from_lsb, which also handles
BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN.
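
For the motivating case above (store_mode == TImode, shift == 64,
new_mode == DImode), I'd expect subreg_offset_from_lsb to give byte
offset 8 on little-endian and 0 on big-endian, so the result would be
something like:

  (subreg:DI (reg:TI 119) 8)

which is what insn 16 in your description shows.  (Just a sketch of
the expected values, I haven't checked them.)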

Thanks,
Richard


>      }
>    else if (store_mode == BLKmode)
>      {
> diff --git a/gcc/testsuite/gcc.target/powerpc/fold-vec-extract-short.p7.c b/gcc/testsuite/gcc.target/powerpc/fold-vec-extract-short.p7.c
> index 8616e7b11ad..b5cefe7dc12 100644
> --- a/gcc/testsuite/gcc.target/powerpc/fold-vec-extract-short.p7.c
> +++ b/gcc/testsuite/gcc.target/powerpc/fold-vec-extract-short.p7.c
> @@ -3,7 +3,7 @@
>  
>  /* { dg-do compile { target { powerpc*-*-linux* } } } */
>  /* { dg-require-effective-target powerpc_vsx_ok } */
> -/* { dg-options "-mdejagnu-cpu=power7 -O2" } */
> +/* { dg-options "-mdejagnu-cpu=power7 -O2 -mbig" } */
>  
>  // six tests total. Targeting P7 BE.
>  // p7 (be) vars:                 li, addi,              stxvw4x, rldic, addi, lhax/lhzx
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c b/gcc/testsuite/gcc.target/powerpc/pr71309.c
> new file mode 100644
> index 00000000000..94d727a8ed9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target powerpc_p9vector_ok } */
> +/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
> +
> +#define TYPE void*
> +#define TYPE2 void*
> +
> +struct path {
> +    TYPE2 mnt;
> +    TYPE dentry;
> +};
> +
> +struct nameidata {
> +    struct path path;
> +    struct path root;
> +};
> +
> +__attribute__ ((noinline))
> +TYPE foo(struct nameidata *nd)
> +{
> +  TYPE d;
> +  TYPE2 d2;
> +
> +  nd->path = nd->root;
> +  d = nd->path.dentry;
> +  d2 = nd->path.mnt;
> +  return d;
> +}
> +
> +/* { dg-final { scan-assembler-not {\mlxv\M} } } */
> +/* { dg-final { scan-assembler-not {\mstxv\M} } } */
> +/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
> +/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */


* Re: [PATCH] dse: Remove partial load after full store for high part access[PR71309]
  2020-07-21 10:54 [PATCH] dse: Remove partial load after full store for high part access[PR71309] Xiong Hu Luo
  2020-07-21 15:30 ` Richard Sandiford
@ 2020-07-21 21:54 ` Segher Boessenkool
  2020-07-21 22:37   ` David Edelsohn
  1 sibling, 1 reply; 21+ messages in thread
From: Segher Boessenkool @ 2020-07-21 21:54 UTC (permalink / raw)
  To: Xiong Hu Luo, dje.gcc
  Cc: gcc-patches, wschmidt, richard.sandiford, dberlin, jakub

Hi!

On Tue, Jul 21, 2020 at 05:54:27AM -0500, Xiong Hu Luo wrote:
> --- a/gcc/testsuite/gcc.target/powerpc/fold-vec-extract-short.p7.c
> +++ b/gcc/testsuite/gcc.target/powerpc/fold-vec-extract-short.p7.c
> @@ -3,7 +3,7 @@
>  
>  /* { dg-do compile { target { powerpc*-*-linux* } } } */
>  /* { dg-require-effective-target powerpc_vsx_ok } */
> -/* { dg-options "-mdejagnu-cpu=power7 -O2" } */
> +/* { dg-options "-mdejagnu-cpu=power7 -O2 -mbig" } */

-mbig and friends do not exist on all powerpc configurations, but since
this testcase requires *-*-linux anyway, it will work.  But we should
probably have a selector for this, or alternatively, allow the option
always (the target cannot run the resulting code, but we have many other
options like that, starting with -mcpu=).  David, what is your
preference?

The rs6000 parts of this patch are fine for trunk.  Thanks!


Segher


* Re: [PATCH] dse: Remove partial load after full store for high part access[PR71309]
  2020-07-21 21:54 ` [PATCH] " Segher Boessenkool
@ 2020-07-21 22:37   ` David Edelsohn
  2020-07-21 23:12     ` Segher Boessenkool
  0 siblings, 1 reply; 21+ messages in thread
From: David Edelsohn @ 2020-07-21 22:37 UTC (permalink / raw)
  To: Segher Boessenkool, Xiong Hu Luo
  Cc: GCC Patches, Bill Schmidt, Richard Sandiford, Daniel Berlin,
	Jakub Jelinek

On Tue, Jul 21, 2020 at 5:54 PM Segher Boessenkool
<segher@kernel.crashing.org> wrote:
>
> Hi!
>
> On Tue, Jul 21, 2020 at 05:54:27AM -0500, Xiong Hu Luo wrote:
> > --- a/gcc/testsuite/gcc.target/powerpc/fold-vec-extract-short.p7.c
> > +++ b/gcc/testsuite/gcc.target/powerpc/fold-vec-extract-short.p7.c
> > @@ -3,7 +3,7 @@
> >
> >  /* { dg-do compile { target { powerpc*-*-linux* } } } */
> >  /* { dg-require-effective-target powerpc_vsx_ok } */
> > -/* { dg-options "-mdejagnu-cpu=power7 -O2" } */
> > +/* { dg-options "-mdejagnu-cpu=power7 -O2 -mbig" } */
>
> -mbig and friends do not exist on all powerpc configurations, but since
> this testcase requires *-*-linux anyway, it will work.  But we should
> probably have a selector for this, or alternatively, allow the option
> always (the target cannot run the resulting code, but we have many other
> options like that, starting with -mcpu=).  David, what is your
> preference?
>
> The rs6000 parts of this patch are fine for trunk.  Thanks!

There already is a { target be } selector.  This set of testcases
shouldn't be limited to powerpc-linux.  Ideally the scan-assembler
strings should have "be" and "le" versions to confirm the correct
results for either.
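
For example (just a sketch; which mnemonic to expect on each endianness
would need checking against the generated code):

  /* { dg-final { scan-assembler {\mlhax\M} { target be } } } */
  /* { dg-final { scan-assembler {\mlhzx\M} { target le } } } */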

Thanks, David


* Re: [PATCH] dse: Remove partial load after full store for high part access[PR71309]
  2020-07-21 22:37   ` David Edelsohn
@ 2020-07-21 23:12     ` Segher Boessenkool
  0 siblings, 0 replies; 21+ messages in thread
From: Segher Boessenkool @ 2020-07-21 23:12 UTC (permalink / raw)
  To: David Edelsohn
  Cc: Xiong Hu Luo, GCC Patches, Bill Schmidt, Richard Sandiford,
	Daniel Berlin, Jakub Jelinek

On Tue, Jul 21, 2020 at 06:37:29PM -0400, David Edelsohn wrote:
> On Tue, Jul 21, 2020 at 5:54 PM Segher Boessenkool
> <segher@kernel.crashing.org> wrote:
> > always (the target cannot run the resulting code, but we have many other
> > options like that, starting with -mcpu=).  David, what is your
> > preference?
> >
> > The rs6000 parts of this patch are fine for trunk.  Thanks!
> 
> There already is a { target be } selector.  This set of testcases
> shouldn't be limited to powerpc-linux.  Ideally the scan-assembler
> strings should have "be" and "le" versions to confirm the correct
> results for either.

There are some testcases that are really just for BE (or LE); but yeah,
we can just not scan the assembler on the oppositely endianed targets.

Very few tests are run tests that only work on one endianness, but that
is easy to adjust to as well (wrap some #ifdef around it), and
infrequent enough.  Okay, thanks, I worry too much :-)


Segher


* Re: [PATCH v2] dse: Remove partial load after full store for high part access[PR71309]
  2020-07-21 15:30 ` Richard Sandiford
@ 2020-07-22  9:18   ` luoxhu
  2020-07-22 11:05     ` Richard Sandiford
  0 siblings, 1 reply; 21+ messages in thread
From: luoxhu @ 2020-07-22  9:18 UTC (permalink / raw)
  To: gcc-patches, segher, wschmidt, dberlin, jakub, richard.sandiford

Hi,

On 2020/7/21 23:30, Richard Sandiford wrote:
> Xiong Hu Luo <luoxhu@linux.ibm.com> writes:
>> @@ -1872,9 +1872,27 @@ get_stored_val (store_info *store_info, machine_mode read_mode,
>>       {
>>         poly_int64 shift = gap * BITS_PER_UNIT;
>>         poly_int64 access_size = GET_MODE_SIZE (read_mode) + gap;
>> -      read_reg = find_shift_sequence (access_size, store_info, read_mode,
>> -				      shift, optimize_bb_for_speed_p (bb),
>> -				      require_cst);
>> +      rtx rhs_subreg = NULL;
>> +
>> +      if (known_eq (GET_MODE_BITSIZE (store_mode), shift * 2))
>> +	{
>> +	  scalar_int_mode inner_mode = smallest_int_mode_for_size (shift);
>> +	  poly_uint64 sub_off
>> +	    = ((!BYTES_BIG_ENDIAN)
>> +		 ? GET_MODE_SIZE (store_mode) - GET_MODE_SIZE (inner_mode)
>> +		 : 0);
>> +
>> +	  rhs_subreg = simplify_gen_subreg (inner_mode, store_info->rhs,
>> +					    store_mode, sub_off);
>> +	  if (rhs_subreg)
>> +	    read_reg
>> +	      = extract_low_bits (read_mode, inner_mode, copy_rtx (rhs_subreg));
>> +	}
>> +
>> +      if (read_reg == NULL)
>> +	read_reg
>> +	  = find_shift_sequence (access_size, store_info, read_mode, shift,
>> +				 optimize_bb_for_speed_p (bb), require_cst);
> 
> Did you consider doing this in find_shift_sequence instead?
> ISTM that this is really using subregs to optimise:
> 
>        /* In theory we could also check for an ashr.  Ian Taylor knows
> 	 of one dsp where the cost of these two was not the same.  But
> 	 this really is a rare case anyway.  */
>        target = expand_binop (new_mode, lshr_optab, new_reg,
> 			     gen_int_shift_amount (new_mode, shift),
> 			     new_reg, 1, OPTAB_DIRECT);
> 
> I think everything up to:
> 
>        /* Also try a wider mode if the necessary punning is either not
> 	 desirable or not possible.  */
>        if (!CONSTANT_P (store_info->rhs)
> 	  && !targetm.modes_tieable_p (new_mode, store_mode))
> 	continue;
> 
> is either neutral or helpful for the subreg case too, so maybe
> we could just add the optimisation after that.  (It probably isn't
> worth reusing any of the later loop body code though, since the
> subreg case is much simpler.)
> 
> I don't think we need to restrict this case to modes of size
> shift * 2.  We can just check whether the shift is a multiple of
> the new_mode calculated by find_shift_sequence (using multiple_p).
> 
> An easier way of converting the shift to a subreg byte offset
> is to use subreg_offset_from_lsb, which also handles
> BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN.
> 

Thanks, I've updated the patch by moving it into find_shift_sequence.
Not sure whether it meets your comments precisely, though it still works :)
There is a comment that mentions:

  /* Some machines like the x86 have shift insns for each size of
     operand.  Other machines like the ppc or the ia-64 may only have
     shift insns that shift values within 32 or 64 bit registers.
     This loop tries to find the smallest shift insn that will right
     justify the value we want to read but is available in one insn on
     the machine.  */

So it will break early without an additional check, as the new_mode is
TImode here:

      if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
	break;
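
(For the PR71309 case the read is DImode at gap 8, so access_size is
8 + 8 = 16 bytes and the loop starts at smallest_int_mode_for_size (128),
i.e. TImode, whose bitsize is already bigger than BITS_PER_WORD on a
64-bit target, hence the immediate break.  Just spelling out the numbers
as I understand them.)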



[PATCH v2] dse: Remove partial load after full store for high part access[PR71309]


This patch could optimize (works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
18: r122:TI=r119:TI
16: r123:TI#0=r122:TI#8 0>>0
17: r123:TI#8=0
19: r124:DI=r123:TI#0
7: [r118:DI]=r119:TI
8: r121:DI=r124:DI

Final ASM will be as below without partial load after full store(stxv+ld):
  mr 9,3
  ld 3,24(3)
  ld 10,16(3)
  std 3,8(9)
  std 10,0(9)
  blr

It could achieve ~25% performance improvement for typical cases on
Power9.  Bootstrap and regression tested on Power9-LE.

For AArch64, one ldr is replaced by mov:

ldp     x2, x3, [x0, 16]
stp     x2, x3, [x0]
ldr     x0, [x0, 8]

=>

mov     x1, x0
ldp     x2, x0, [x0, 16]
stp     x2, x0, [x1]

gcc/ChangeLog:

2020-07-22  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR rtl-optimization/71309
	* dse.c (find_shift_sequence): Use subreg of shifted from high part
	register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-07-22  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR rtl-optimization/71309
	* gcc.target/powerpc/pr71309.c: New test.
---
 gcc/dse.c                                  | 15 +++++++++-
 gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++++++++++++++++++++++
 2 files changed, 47 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..e06a9fbb0cd 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1736,7 +1736,8 @@ find_shift_sequence (poly_int64 access_size,
       int cost;
 
       new_mode = new_mode_iter.require ();
-      if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
+      if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD
+	  && !multiple_p (GET_MODE_BITSIZE (new_mode), shift))
 	break;
 
       /* If a constant was stored into memory, try to simplify it here,
@@ -1793,6 +1794,18 @@ find_shift_sequence (poly_int64 access_size,
       shift_seq = get_insns ();
       end_sequence ();
 
+      /* Use subreg shifting from high part to avoid full store followed by
+	 partial load.  See PR71309.  */
+      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift) && shift_seq)
+	{
+	  new_lhs = extract_low_bits (new_mode, store_mode,
+				      copy_rtx (store_info->rhs));
+	  emit_move_insn (new_reg, new_lhs);
+	  emit_insn (shift_seq);
+	  read_reg = extract_low_bits (read_mode, new_mode, target);
+	  break;
+	}
+
       if (target != new_reg || shift_seq == NULL)
 	continue;
 
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c b/gcc/testsuite/gcc.target/powerpc/pr71309.c
new file mode 100644
index 00000000000..94d727a8ed9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define TYPE void*
+#define TYPE2 void*
+
+struct path {
+    TYPE2 mnt;
+    TYPE dentry;
+};
+
+struct nameidata {
+    struct path path;
+    struct path root;
+};
+
+__attribute__ ((noinline))
+TYPE foo(struct nameidata *nd)
+{
+  TYPE d;
+  TYPE2 d2;
+
+  nd->path = nd->root;
+  d = nd->path.dentry;
+  d2 = nd->path.mnt;
+  return d;
+}
+
+/* { dg-final { scan-assembler-not {\mlxv\M} } } */
+/* { dg-final { scan-assembler-not {\mstxv\M} } } */
+/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */
-- 
2.27.0.90.geebb51ba8c




* Re: [PATCH v2] dse: Remove partial load after full store for high part access[PR71309]
  2020-07-22  9:18   ` [PATCH v2] " luoxhu
@ 2020-07-22 11:05     ` Richard Sandiford
  2020-07-22 15:22       ` [PATCH v3] " luoxhu
  0 siblings, 1 reply; 21+ messages in thread
From: Richard Sandiford @ 2020-07-22 11:05 UTC (permalink / raw)
  To: luoxhu; +Cc: gcc-patches, segher, wschmidt, dberlin, jakub

luoxhu <luoxhu@linux.ibm.com> writes:
> Hi,
>
> On 2020/7/21 23:30, Richard Sandiford wrote:
>> Xiong Hu Luo <luoxhu@linux.ibm.com> writes:
>>> @@ -1872,9 +1872,27 @@ get_stored_val (store_info *store_info, machine_mode read_mode,
>>>       {
>>>         poly_int64 shift = gap * BITS_PER_UNIT;
>>>         poly_int64 access_size = GET_MODE_SIZE (read_mode) + gap;
>>> -      read_reg = find_shift_sequence (access_size, store_info, read_mode,
>>> -				      shift, optimize_bb_for_speed_p (bb),
>>> -				      require_cst);
>>> +      rtx rhs_subreg = NULL;
>>> +
>>> +      if (known_eq (GET_MODE_BITSIZE (store_mode), shift * 2))
>>> +	{
>>> +	  scalar_int_mode inner_mode = smallest_int_mode_for_size (shift);
>>> +	  poly_uint64 sub_off
>>> +	    = ((!BYTES_BIG_ENDIAN)
>>> +		 ? GET_MODE_SIZE (store_mode) - GET_MODE_SIZE (inner_mode)
>>> +		 : 0);
>>> +
>>> +	  rhs_subreg = simplify_gen_subreg (inner_mode, store_info->rhs,
>>> +					    store_mode, sub_off);
>>> +	  if (rhs_subreg)
>>> +	    read_reg
>>> +	      = extract_low_bits (read_mode, inner_mode, copy_rtx (rhs_subreg));
>>> +	}
>>> +
>>> +      if (read_reg == NULL)
>>> +	read_reg
>>> +	  = find_shift_sequence (access_size, store_info, read_mode, shift,
>>> +				 optimize_bb_for_speed_p (bb), require_cst);
>> 
>> Did you consider doing this in find_shift_sequence instead?
>> ISTM that this is really using subregs to optimise:
>> 
>>        /* In theory we could also check for an ashr.  Ian Taylor knows
>> 	 of one dsp where the cost of these two was not the same.  But
>> 	 this really is a rare case anyway.  */
>>        target = expand_binop (new_mode, lshr_optab, new_reg,
>> 			     gen_int_shift_amount (new_mode, shift),
>> 			     new_reg, 1, OPTAB_DIRECT);
>> 
>> I think everything up to:
>> 
>>        /* Also try a wider mode if the necessary punning is either not
>> 	 desirable or not possible.  */
>>        if (!CONSTANT_P (store_info->rhs)
>> 	  && !targetm.modes_tieable_p (new_mode, store_mode))
>> 	continue;
>> 
>> is either neutral or helpful for the subreg case too, so maybe
>> we could just add the optimisation after that.  (It probably isn't
>> worth reusing any of the later loop body code though, since the
>> subreg case is much simpler.)
>> 
>> I don't think we need to restrict this case to modes of size
>> shift * 2.  We can just check whether the shift is a multiple of
>> the new_mode calculated by find_shift_sequence (using multiple_p).
>> 
>> An easier way of converting the shift to a subreg byte offset
>> is to use subreg_offset_from_lsb, which also handles
>> BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN.
>> 
>
> Thanks, I've updated the patch by moving it into find_shift_sequence.
> Not sure whether it meets your comments precisely, though it still works :)
> There is a comment that mentions:
>
>   /* Some machines like the x86 have shift insns for each size of
>      operand.  Other machines like the ppc or the ia-64 may only have
>      shift insns that shift values within 32 or 64 bit registers.
>      This loop tries to find the smallest shift insn that will right
>      justify the value we want to read but is available in one insn on
>      the machine.  */
>
> So it will break early without an additional check, as the new_mode is
> TImode here:
>
>       if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
> 	break;
>
>
>
> [PATCH v2] dse: Remove partial load after full store for high part access[PR71309]
>
>
> This patch could optimize (works for char/short/int/void*):
>
> 6: r119:TI=[r118:DI+0x10]
> 7: [r118:DI]=r119:TI
> 8: r121:DI=[r118:DI+0x8]
>
> =>
>
> 6: r119:TI=[r118:DI+0x10]
> 18: r122:TI=r119:TI
> 16: r123:TI#0=r122:TI#8 0>>0
> 17: r123:TI#8=0
> 19: r124:DI=r123:TI#0
> 7: [r118:DI]=r119:TI
> 8: r121:DI=r124:DI
>
> Final ASM will be as below without partial load after full store(stxv+ld):
>   mr 9,3
>   ld 3,24(3)
>   ld 10,16(3)
>   std 3,8(9)
>   std 10,0(9)
>   blr
>
> It could achieve ~25% performance improvement for typical cases on
> Power9.  Bootstrap and regression tested on Power9-LE.
>
> For AArch64, one ldr is replaced by mov:
>
> ldp     x2, x3, [x0, 16]
> stp     x2, x3, [x0]
> ldr     x0, [x0, 8]
>
> =>
>
> mov     x1, x0
> ldp     x2, x0, [x0, 16]
> stp     x2, x0, [x1]
>
> gcc/ChangeLog:
>
> 2020-07-22  Xionghu Luo  <luoxhu@linux.ibm.com>
>
> 	PR rtl-optimization/71309
> 	* dse.c (find_shift_sequence): Use subreg of shifted from high part
> 	register to avoid loading from address.
>
> gcc/testsuite/ChangeLog:
>
> 2020-07-22  Xionghu Luo  <luoxhu@linux.ibm.com>
>
> 	PR rtl-optimization/71309
> 	* gcc.target/powerpc/pr71309.c: New test.
> ---
>  gcc/dse.c                                  | 15 +++++++++-
>  gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++++++++++++++++++++++
>  2 files changed, 47 insertions(+), 1 deletion(-)
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c
>
> diff --git a/gcc/dse.c b/gcc/dse.c
> index bbe792e48e8..e06a9fbb0cd 100644
> --- a/gcc/dse.c
> +++ b/gcc/dse.c
> @@ -1736,7 +1736,8 @@ find_shift_sequence (poly_int64 access_size,
>        int cost;
>  
>        new_mode = new_mode_iter.require ();
> -      if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
> +      if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD
> +	  && !multiple_p (GET_MODE_BITSIZE (new_mode), shift))
>  	break;
>  
>        /* If a constant was stored into memory, try to simplify it here,
> @@ -1793,6 +1794,18 @@ find_shift_sequence (poly_int64 access_size,
>        shift_seq = get_insns ();
>        end_sequence ();
>  
> +      /* Use subreg shifting from high part to avoid full store followed by
> +	 partial load.  See PR71309.  */
> +      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift) && shift_seq)
> +	{
> +	  new_lhs = extract_low_bits (new_mode, store_mode,
> +				      copy_rtx (store_info->rhs));
> +	  emit_move_insn (new_reg, new_lhs);
> +	  emit_insn (shift_seq);
> +	  read_reg = extract_low_bits (read_mode, new_mode, target);
> +	  break;
> +	}
> +

This wasn't really what I meant.  Using subregs is fine, but I was
thinking of:

      /* Also try a wider mode if the necessary punning is either not
	 desirable or not possible.  */
      if (!CONSTANT_P (store_info->rhs)
	  && !targetm.modes_tieable_p (new_mode, store_mode))
	continue;

      if (multiple_p (shift, GET_MODE_BITSIZE (new_mode)))
	{
	  /* Try to implement the shift using a subreg.  */
	  poly_int64 offset = subreg_offset_from_lsb (new_mode, store_mode,
						      shift);
	  rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
					    store_mode, offset);
	  if (rhs_subreg)
	    {
	      ...
	      break;
	    }
	}

where the rhs_subreg is from your original patch.

The multiple_p should be that way round: the shift needs to be a
multiple of the new_mode for the subreg to be valid.

I think this should also avoid the BITS_PER_WORD problem.  On the
other hand, I agree BITS_PER_UNIT isn't a very sensible limit if
we're using subregs, so maybe moving it to after the multiple_p
if block would still make sense.

Thanks,
Richard


* Re: [PATCH v3] dse: Remove partial load after full store for high part access[PR71309]
  2020-07-22 11:05     ` Richard Sandiford
@ 2020-07-22 15:22       ` luoxhu
  2020-07-22 17:28         ` Richard Sandiford
  0 siblings, 1 reply; 21+ messages in thread
From: luoxhu @ 2020-07-22 15:22 UTC (permalink / raw)
  To: gcc-patches, segher, wschmidt, dberlin, jakub, richard.sandiford

Hi,

On 2020/7/22 19:05, Richard Sandiford wrote:
> This wasn't really what I meant.  Using subregs is fine, but I was
> thinking of:
> 
>        /* Also try a wider mode if the necessary punning is either not
> 	 desirable or not possible.  */
>        if (!CONSTANT_P (store_info->rhs)
> 	  && !targetm.modes_tieable_p (new_mode, store_mode))
> 	continue;
> 
>        if (multiple_p (shift, GET_MODE_BITSIZE (new_mode)))
> 	{
> 	  /* Try to implement the shift using a subreg.  */
> 	  poly_int64 offset = subreg_offset_from_lsb (new_mode, store_mode,
> 						      shift);
> 	  rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
> 					    store_mode, offset);
> 	  if (rhs_subreg)
> 	    {
> 	      ...
> 	      break;
> 	    }
> 	}
> 
> where the rhs_subreg is from your original patch.
> 
> The multiple_p should be that way round: the shift needs to be a
> multiple of the new_mode for the subreg to be valid.
> 
> I think this should also avoid the BITS_PER_WORD problem.  On the
> other hand, I agree BITS_PER_UNIT isn't a very sensible limit if
> we're using subregs, so maybe moving it to after the multiple_p
> if block would still make sense.
> 

Thanks, I took that rhs_subreg part back for the v3 patch and updated it a bit
based on your prototype; shift should be put in op1, as multiple_p requires
op0 >= op1.

Then, new_mode is still TImode, the same as store_mode, and offset will return 8
when shift is 64.  simplify_gen_subreg needs an additional inner_mode (DImode)
generated from "smallest_int_mode_for_size (shift)" to get rhs_subreg,
otherwise it will return NULL if new_mode is equal to store_mode.

Lastly, I moved the BITS_PER_UNIT check after multiple_p, as it still needs to
generate shift_seq for other circumstances. :)


[PATCH v3] dse: Remove partial load after full store for high part access[PR71309]


This patch could optimize (works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8

Final ASM will be as below without partial load after full store(stxv+ld):
  ld 10,16(3)
  mr 9,3
  ld 3,24(3)
  std 10,0(9)
  std 3,8(9)
  blr

It could achieve ~25% performance improvement for typical cases on
Power9.  Bootstrap and regression tested on Power9-LE.

For AArch64, one ldr is replaced by mov with this patch:

ldp     x2, x3, [x0, 16]
stp     x2, x3, [x0]
ldr     x0, [x0, 8]

=>

mov     x1, x0
ldp     x2, x0, [x0, 16]
stp     x2, x0, [x1]

gcc/ChangeLog:

2020-07-22  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR rtl-optimization/71309
	* dse.c (find_shift_sequence): Use subreg of shifted from high part
	register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-07-22  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR rtl-optimization/71309
	* gcc.target/powerpc/pr71309.c: New test.
---
 gcc/dse.c                                  | 21 ++++++++++++--
 gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++++++++++++++++++++++
 2 files changed, 52 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..aaa161237c3 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1736,8 +1736,6 @@ find_shift_sequence (poly_int64 access_size,
       int cost;
 
       new_mode = new_mode_iter.require ();
-      if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
-	break;
 
       /* If a constant was stored into memory, try to simplify it here,
 	 otherwise the cost of the shift might preclude this optimization
@@ -1779,6 +1777,25 @@ find_shift_sequence (poly_int64 access_size,
 	  && !targetm.modes_tieable_p (new_mode, store_mode))
 	continue;
 
+      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift))
+	{
+	  /* Try to implement the shift using a subreg.  */
+	  scalar_int_mode inner_mode = smallest_int_mode_for_size (shift);
+	  poly_int64 offset
+	    = subreg_offset_from_lsb (new_mode, store_mode, shift);
+	  rtx rhs_subreg = simplify_gen_subreg (inner_mode, store_info->rhs,
+						store_mode, offset);
+	  if (rhs_subreg)
+	    {
+	      read_reg = extract_low_bits (read_mode, inner_mode,
+					   copy_rtx (rhs_subreg));
+	      break;
+	    }
+	}
+
+      if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
+	break;
+
       new_reg = gen_reg_rtx (new_mode);
 
       start_sequence ();
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c b/gcc/testsuite/gcc.target/powerpc/pr71309.c
new file mode 100644
index 00000000000..94d727a8ed9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define TYPE void*
+#define TYPE2 void*
+
+struct path {
+    TYPE2 mnt;
+    TYPE dentry;
+};
+
+struct nameidata {
+    struct path path;
+    struct path root;
+};
+
+__attribute__ ((noinline))
+TYPE foo(struct nameidata *nd)
+{
+  TYPE d;
+  TYPE2 d2;
+
+  nd->path = nd->root;
+  d = nd->path.dentry;
+  d2 = nd->path.mnt;
+  return d;
+}
+
+/* { dg-final { scan-assembler-not {\mlxv\M} } } */
+/* { dg-final { scan-assembler-not {\mstxv\M} } } */
+/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */
-- 
2.27.0.90.geebb51ba8c





* Re: [PATCH v3] dse: Remove partial load after full store for high part access[PR71309]
  2020-07-22 15:22       ` [PATCH v3] " luoxhu
@ 2020-07-22 17:28         ` Richard Sandiford
  2020-07-22 20:30           ` Richard Sandiford
  0 siblings, 1 reply; 21+ messages in thread
From: Richard Sandiford @ 2020-07-22 17:28 UTC (permalink / raw)
  To: luoxhu; +Cc: gcc-patches, segher, wschmidt, dberlin, jakub

luoxhu <luoxhu@linux.ibm.com> writes:
> Hi,
>
> On 2020/7/22 19:05, Richard Sandiford wrote:
>> This wasn't really what I meant.  Using subregs is fine, but I was
>> thinking of:
>> 
>>        /* Also try a wider mode if the necessary punning is either not
>> 	 desirable or not possible.  */
>>        if (!CONSTANT_P (store_info->rhs)
>> 	  && !targetm.modes_tieable_p (new_mode, store_mode))
>> 	continue;
>> 
>>        if (multiple_p (shift, GET_MODE_BITSIZE (new_mode)))
>> 	{
>> 	  /* Try to implement the shift using a subreg.  */
>> 	  poly_int64 offset = subreg_offset_from_lsb (new_mode, store_mode,
>> 						      shift);
>> 	  rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
>> 					    store_mode, offset);
>> 	  if (rhs_subreg)
>> 	    {
>> 	      ...
>> 	      break;
>> 	    }
>> 	}
>> 
>> where the rhs_subreg is from your original patch.
>> 
>> The multiple_p should be that way round: the shift needs to be a
>> multiple of the new_mode for the subreg to be valid.
>> 
>> I think this should also avoid the BITS_PER_WORD problem.  On the
>> other hand, I agree BITS_PER_UNIT isn't a very sensible limit if
>> we're using subregs, so maybe moving it to after the multiple_p
>> if block would still make sense.
>> 
>
> Thanks, I took that rhs_subreg part back for the v3 patch and updated it a bit
> based on your prototype; shift should be put in op1, as multiple_p requires
> op0 >= op1.
>
> Then, new_mode is still TImode, the same as store_mode, and offset will return 8
> when shift is 64.  simplify_gen_subreg needs an additional inner_mode (DImode)
> generated from "smallest_int_mode_for_size (shift)" to get rhs_subreg,
> otherwise it will return NULL if new_mode is equal to store_mode.
>
> Lastly, I moved the BITS_PER_UNIT check after multiple_p, as it still needs to
> generate shift_seq for other circumstances. :)

I don't understand why my version doesn't work though.  The point
is that we're using the containing:

  FOR_EACH_MODE_FROM (new_mode_iter,
		      smallest_int_mode_for_size (access_size * BITS_PER_UNIT))

to find a suitable mode.  In the existing code it's searching for a mode
that is suitable for the shift.  In the new code it's finding a mode that
is suitable for the outer mode of the subreg (hence using new_mode as the
first argument to simplify_gen_subreg above).  It shouldn't be necessary
to use smallest_int_mode_for_size to find a different mode.

That's also why the multiple_p is the way round it is above.  The idea
is that the shift amount must be a multiple of the size of the outer mode
(here new_mode) in order for the subreg to be valid.

So in your example, the loop should be finding new_mode == DImode,
seeing that the shift amount of 64 is a multiple of the size of DImode,
and trying to convert that shift amount into a DImode subreg of the
TImode value.
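
In other words it should be able to form something like
(subreg:DI (reg:TI 119) 8) directly, without needing
smallest_int_mode_for_size at all (going by insn 16 in your
description; sketch only).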

Thanks,
Richard

> [PATCH v3] dse: Remove partial load after full store for high part access[PR71309]
>
>
> This patch could optimize (works for char/short/int/void*):
>
> 6: r119:TI=[r118:DI+0x10]
> 7: [r118:DI]=r119:TI
> 8: r121:DI=[r118:DI+0x8]
>
> =>
>
> 6: r119:TI=[r118:DI+0x10]
> 16: r122:DI=r119:TI#8
>
> Final ASM will be as below without partial load after full store(stxv+ld):
>   ld 10,16(3)
>   mr 9,3
>   ld 3,24(3)
>   std 10,0(9)
>   std 3,8(9)
>   blr
>
> It could achieve ~25% performance improvement for typical cases on
> Power9.  Bootstrap and regression tested on Power9-LE.
>
> For AArch64, one ldr is replaced by mov with this patch:
>
> ldp     x2, x3, [x0, 16]
> stp     x2, x3, [x0]
> ldr     x0, [x0, 8]
>
> =>
>
> mov     x1, x0
> ldp     x2, x0, [x0, 16]
> stp     x2, x0, [x1]
>
> gcc/ChangeLog:
>
> 2020-07-22  Xionghu Luo  <luoxhu@linux.ibm.com>
>
> 	PR rtl-optimization/71309
> 	* dse.c (find_shift_sequence): Use subreg of shifted from high part
> 	register to avoid loading from address.
>
> gcc/testsuite/ChangeLog:
>
> 2020-07-22  Xionghu Luo  <luoxhu@linux.ibm.com>
>
> 	PR rtl-optimization/71309
> 	* gcc.target/powerpc/pr71309.c: New test.
> ---
>  gcc/dse.c                                  | 21 ++++++++++++--
>  gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++++++++++++++++++++++
>  2 files changed, 52 insertions(+), 2 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c
>
> diff --git a/gcc/dse.c b/gcc/dse.c
> index bbe792e48e8..aaa161237c3 100644
> --- a/gcc/dse.c
> +++ b/gcc/dse.c
> @@ -1736,8 +1736,6 @@ find_shift_sequence (poly_int64 access_size,
>        int cost;
>  
>        new_mode = new_mode_iter.require ();
> -      if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
> -	break;
>  
>        /* If a constant was stored into memory, try to simplify it here,
>  	 otherwise the cost of the shift might preclude this optimization
> @@ -1779,6 +1777,25 @@ find_shift_sequence (poly_int64 access_size,
>  	  && !targetm.modes_tieable_p (new_mode, store_mode))
>  	continue;
>  
> +      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift))
> +	{
> +	  /* Try to implement the shift using a subreg.  */
> +	  scalar_int_mode inner_mode = smallest_int_mode_for_size (shift);
> +	  poly_int64 offset
> +	    = subreg_offset_from_lsb (new_mode, store_mode, shift);
> +	  rtx rhs_subreg = simplify_gen_subreg (inner_mode, store_info->rhs,
> +						store_mode, offset);
> +	  if (rhs_subreg)
> +	    {
> +	      read_reg = extract_low_bits (read_mode, inner_mode,
> +					   copy_rtx (rhs_subreg));
> +	      break;
> +	    }
> +	}
> +
> +      if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
> +	break;
> +
>        new_reg = gen_reg_rtx (new_mode);
>  
>        start_sequence ();
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c b/gcc/testsuite/gcc.target/powerpc/pr71309.c
> new file mode 100644
> index 00000000000..94d727a8ed9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target powerpc_p9vector_ok } */
> +/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
> +
> +#define TYPE void*
> +#define TYPE2 void*
> +
> +struct path {
> +    TYPE2 mnt;
> +    TYPE dentry;
> +};
> +
> +struct nameidata {
> +    struct path path;
> +    struct path root;
> +};
> +
> +__attribute__ ((noinline))
> +TYPE foo(struct nameidata *nd)
> +{
> +  TYPE d;
> +  TYPE2 d2;
> +
> +  nd->path = nd->root;
> +  d = nd->path.dentry;
> +  d2 = nd->path.mnt;
> +  return d;
> +}
> +
> +/* { dg-final { scan-assembler-not {\mlxv\M} } } */
> +/* { dg-final { scan-assembler-not {\mstxv\M} } } */
> +/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
> +/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */


* Re: [PATCH v3] dse: Remove partial load after full store for high part access[PR71309]
  2020-07-22 17:28         ` Richard Sandiford
@ 2020-07-22 20:30           ` Richard Sandiford
  2020-07-23 12:35             ` luoxhu
  0 siblings, 1 reply; 21+ messages in thread
From: Richard Sandiford @ 2020-07-22 20:30 UTC (permalink / raw)
  To: luoxhu; +Cc: gcc-patches, segher, wschmidt, dberlin, jakub

Richard Sandiford <richard.sandiford@arm.com> writes:
> luoxhu <luoxhu@linux.ibm.com> writes:
>> Hi,
>>
>> On 2020/7/22 19:05, Richard Sandiford wrote:
>>> This wasn't really what I meant.  Using subregs is fine, but I was
>>> thinking of:
>>> 
>>>        /* Also try a wider mode if the necessary punning is either not
>>> 	 desirable or not possible.  */
>>>        if (!CONSTANT_P (store_info->rhs)
>>> 	  && !targetm.modes_tieable_p (new_mode, store_mode))
>>> 	continue;
>>> 
>>>        if (multiple_p (shift, GET_MODE_BITSIZE (new_mode)))
>>> 	{
>>> 	  /* Try to implement the shift using a subreg.  */
>>> 	  poly_int64 offset = subreg_offset_from_lsb (new_mode, store_mode,
>>> 						      shift);
>>> 	  rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
>>> 					    store_mode, offset);
>>> 	  if (rhs_subreg)
>>> 	    {
>>> 	      ...
>>> 	      break;
>>> 	    }
>>> 	}
>>> 
>>> where the rhs_subreg is from your original patch.
>>> 
>>> The multiple_p should be that way round: the shift needs to be a
>>> multiple of the new_mode for the subreg to be valid.
>>> 
>>> I think this should also avoid the BITS_PER_WORD problem.  On the
>>> other hand, I agree BITS_PER_UNIT isn't a very sensible limit if
>>> we're using subregs, so maybe moving it to after the multiple_p
>>> if block would still make sense.
>>> 
>>
>> Thanks, I took that rhs_subreg part back for the v3 patch and updated it a bit
>> based on your prototype; shift should be put in op1, as multiple_p requires
>> op0 >= op1.
>>
>> Then, new_mode is still TImode, the same as store_mode, and offset will return 8
>> when shift is 64.  simplify_gen_subreg needs an additional inner_mode (DImode)
>> generated from "smallest_int_mode_for_size (shift)" to get rhs_subreg,
>> otherwise it will return NULL if new_mode is equal to store_mode.
>>
>> Lastly, I moved the BITS_PER_UNIT check after multiple_p, as it still needs to
>> generate shift_seq for other circumstances. :)
>
> I don't understand why my version doesn't work though.  The point
> is that we're using the containing:
>
>   FOR_EACH_MODE_FROM (new_mode_iter,
> 		      smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
>
> to find a suitable mode.  In the existing code it's searching for a mode
> that is suitable for the shift.  In the new code it's finding a mode that
> is suitable for the outer mode of the subreg (hence using new_mode as the
> first argument to simplify_gen_subreg above).  It shouldn't be necessary
> to use smallest_int_mode_for_size to find a different mode.

I now realise the reason is that the starting mode is too wide.
I think we should fix that by doing:

  FOR_EACH_MODE_IN_CLASS (new_mode_iter, MODE_INT)
    {
      …

and then add:

      if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
        continue;

after your optimisation, so that the shift code proper still only
considers modes that are wide enough to hold the unshifted value.

I don't think there are any efficiency concerns with that, since
smallest_int_mode_for_size does its own similar iteration internally.

Sorry for not picking up on that first time.

Thanks,
Richard


* Re: [PATCH v3] dse: Remove partial load after full store for high part access[PR71309]
  2020-07-22 20:30           ` Richard Sandiford
@ 2020-07-23 12:35             ` luoxhu
  2020-07-24 10:47               ` [PATCH v4] " luoxhu
  0 siblings, 1 reply; 21+ messages in thread
From: luoxhu @ 2020-07-23 12:35 UTC (permalink / raw)
  To: gcc-patches, segher, wschmidt, dberlin, jakub, richard.sandiford



On 2020/7/23 04:30, Richard Sandiford wrote:
> 
> I now realise the reason is that the starting mode is too wide.
> I think we should fix that by doing:
> 
>    FOR_EACH_MODE_IN_CLASS (new_mode_iter, MODE_INT)
>      {
>        …
> 
> and then add:
> 
>        if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
>          continue;
> 
> after your optimisation, so that the shift code proper still only
> considers modes that are wide enough to hold the unshifted value.
> 
> I don't think there are any efficiency concerns with that, since
> smallest_int_mode_for_size does its own similar iteration internally.
> 
> Sorry for not picking up on that first time.
> 

Thanks :), I didn't make it clear that it starts from TImode first...

The updated patch uses "FOR_EACH_MODE_IN_CLASS (new_mode_iter, MODE_INT)"
to iterate from QImode now, but it still needs the size check
"new_mode <= store_mode" to get a legal subreg, otherwise
"gfortran.dg/shift-kind.f90" will fail.
Even so, the execution failures below remain when running the regression tests:

gfortran.fortran-torture/execute/equiv_2.f90 execution,  -O2
gfortran.fortran-torture/execute/equiv_2.f90 execution,  -O2 -fbounds-check
gfortran.fortran-torture/execute/equiv_2.f90 execution,  -O2 -fomit-frame-pointer -finline-functions
gfortran.fortran-torture/execute/equiv_2.f90 execution,  -O2 -fomit-frame-pointer -finline-functions -funroll-loops
gfortran.fortran-torture/execute/equiv_2.f90 execution,  -O3 -g
gfortran.fortran-torture/execute/equiv_2.f90 execution, -O2 -ftree-vectorize -maltivec 

Any clue about why the execution outputs "STOP 2", please? Still investigating.

PS: if the order of GET_MODE_BITSIZE (new_mode) and shift in multiple_p is
switched, it generates incorrect ASM for PR71309.


This patch could optimize (works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8

Final ASM will be as below without partial load after full store(stxv+ld):
  ld 10,16(3)
  mr 9,3
  ld 3,24(3)
  std 10,0(9)
  std 3,8(9)
  blr

It could achieve ~25% performance improvement for typical cases on
Power9.  Bootstrap and regression tested on Power9-LE.

For AArch64, one ldr is replaced by mov with this patch:

ldp     x2, x3, [x0, 16]
stp     x2, x3, [x0]
ldr     x0, [x0, 8]

=>

mov     x1, x0
ldp     x2, x0, [x0, 16]
stp     x2, x0, [x1]

gcc/ChangeLog:

2020-07-23  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR rtl-optimization/71309
	* dse.c (find_shift_sequence): Use subreg of shifted from high part
	register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-07-23  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR rtl-optimization/71309
	* gcc.target/powerpc/pr71309.c: New test.
---
 gcc/dse.c                                  | 22 +++++++++++++--
 gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++++++++++++++++++++++
 2 files changed, 53 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..afc6ad30623 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1728,8 +1728,7 @@ find_shift_sequence (poly_int64 access_size,
      the machine.  */
 
   opt_scalar_int_mode new_mode_iter;
-  FOR_EACH_MODE_FROM (new_mode_iter,
-		      smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
+  FOR_EACH_MODE_IN_CLASS (new_mode_iter, MODE_INT)
     {
       rtx target, new_reg, new_lhs;
       rtx_insn *shift_seq, *insn;
@@ -1779,6 +1778,25 @@ find_shift_sequence (poly_int64 access_size,
 	  && !targetm.modes_tieable_p (new_mode, store_mode))
 	continue;
 
+      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)
+	  && known_le (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode)))
+	{
+	  /* Try to implement the shift using a subreg.  */
+	  poly_int64 offset
+	    = subreg_offset_from_lsb (new_mode, store_mode, shift);
+	  rtx rhs_subreg
+	    = simplify_gen_subreg (new_mode, store_info->rhs, store_mode, offset);
+	  if (rhs_subreg)
+	    {
+	      read_reg
+		= extract_low_bits (read_mode, new_mode, copy_rtx (rhs_subreg));
+	      break;
+	    }
+	}
+
+      if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
+	continue;
+
       new_reg = gen_reg_rtx (new_mode);
 
       start_sequence ();
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c b/gcc/testsuite/gcc.target/powerpc/pr71309.c
new file mode 100644
index 00000000000..94d727a8ed9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define TYPE void*
+#define TYPE2 void*
+
+struct path {
+    TYPE2 mnt;
+    TYPE dentry;
+};
+
+struct nameidata {
+    struct path path;
+    struct path root;
+};
+
+__attribute__ ((noinline))
+TYPE foo(struct nameidata *nd)
+{
+  TYPE d;
+  TYPE2 d2;
+
+  nd->path = nd->root;
+  d = nd->path.dentry;
+  d2 = nd->path.mnt;
+  return d;
+}
+
+/* { dg-final { scan-assembler-not {\mlxv\M} } } */
+/* { dg-final { scan-assembler-not {\mstxv\M} } } */
+/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */
-- 
2.27.0.90.geebb51ba8c



* [PATCH v4] dse: Remove partial load after full store for high part access[PR71309]
  2020-07-23 12:35             ` luoxhu
@ 2020-07-24 10:47               ` luoxhu
  2020-07-28 23:06                 ` luoxhu
  2020-07-31 16:29                 ` Richard Sandiford
  0 siblings, 2 replies; 21+ messages in thread
From: luoxhu @ 2020-07-24 10:47 UTC (permalink / raw)
  To: gcc-patches, segher, wschmidt, dberlin, jakub, richard.sandiford

Hi Richard,

This is the updated version that passes all regression tests on
Power9-LE.
It just needs another "maybe_lt (GET_MODE_SIZE (new_mode), access_size)"
check before generating the shift for store_info->const_rhs, to ensure the
correct constant is generated; take testsuite/gfortran1/equiv_2.x for example:

equiv_2.x-equiv_2.f90.264r.cse2:
5: r119:DI=0x6867666564636261
6: [sfp:DI+0x20]=r119:DI
18: r123:SI=0x65646362
19: r124:SI=[sfp:DI+0x21]
20: r125:CC=cmp(r124:SI,r123:SI)

Bad code: taking the SImode subreg at offset 0 and shifting right by 8 bits
gives 0x646362 in insn #32, equiv_2.x-equiv_2.f90.265r.dse1:
5:  r119:DI=0x6867666564636261
32: r126:SI=0x646362
6:  [sfp:DI+0x20]=r119:DI
18: r123:SI=0x65646362
19: r124:SI=r126:SI
20: r125:CC=cmp(r124:SI,r123:SI)

Good code: 0x65646362 is obtained in insn #32, equiv_2.x-equiv_2.f90.265r.dse1:
5:  r119:DI=0x6867666564636261
32: r126:SI=0x65646362
6:  [sfp:DI+0x20]=r119:DI
18: r123:SI=0x65646362
19: r124:SI=r126:SI
20: r125:CC=cmp(r124:SI,r123:SI)

So the new_mode size should also be greater than or equal to access_size for
the const_rhs shift.  It seems a bit more complicated now...
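
A minimal C sketch of the arithmetic, with the values taken from the dumps
above (little-endian byte numbering; the variable names are only for
illustration):

  unsigned long long stored = 0x6867666564636261ULL;   /* [sfp:DI+0x20] */

  /* Correct SImode value at [sfp:DI+0x21]: bytes 1..4 of the store.  */
  unsigned int good = (unsigned int) (stored >> 8);     /* 0x65646362 */

  /* What the too-narrow subreg path computes: low SImode part first,
     then a shift within SImode, which loses the top byte.  */
  unsigned int bad = ((unsigned int) stored) >> 8;      /* 0x00646362 */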



[PATCH v4] dse: Remove partial load after full store for high part access[PR71309]


This patch could optimize (works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8

Final ASM will be as below without partial load after full store(stxv+ld):
  ld 10,16(3)
  mr 9,3
  ld 3,24(3)
  std 10,0(9)
  std 3,8(9)
  blr

It could achieve ~25% performance improvement for typical cases on
Power9.  Bootstrap and regression tested on Power9-LE.

For AArch64, one ldr is replaced by mov with this patch:

ldp     x2, x3, [x0, 16]
stp     x2, x3, [x0]
ldr     x0, [x0, 8]

=>

mov     x1, x0
ldp     x2, x0, [x0, 16]
stp     x2, x0, [x1]

gcc/ChangeLog:

2020-07-24  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR rtl-optimization/71309
	* dse.c (find_shift_sequence): Use subreg of shifted from high part
	register to avoid loading from address.

gcc/testsuite/ChangeLog:

2020-07-24  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR rtl-optimization/71309
	* gcc.target/powerpc/pr71309.c: New test.
---
 gcc/dse.c                                  | 25 ++++++++++++++--
 gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++++++++++++++++++++++
 2 files changed, 56 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..a7cdc43a182 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1728,8 +1728,7 @@ find_shift_sequence (poly_int64 access_size,
      the machine.  */
 
   opt_scalar_int_mode new_mode_iter;
-  FOR_EACH_MODE_FROM (new_mode_iter,
-		      smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
+  FOR_EACH_MODE_IN_CLASS (new_mode_iter, MODE_INT)
     {
       rtx target, new_reg, new_lhs;
       rtx_insn *shift_seq, *insn;
@@ -1744,6 +1743,9 @@ find_shift_sequence (poly_int64 access_size,
 	 e.g. at -Os, even when no actual shift will be needed.  */
       if (store_info->const_rhs)
 	{
+	  if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
+	    continue;
+
 	  poly_uint64 byte = subreg_lowpart_offset (new_mode, store_mode);
 	  rtx ret = simplify_subreg (new_mode, store_info->const_rhs,
 				     store_mode, byte);
@@ -1779,6 +1781,25 @@ find_shift_sequence (poly_int64 access_size,
 	  && !targetm.modes_tieable_p (new_mode, store_mode))
 	continue;
 
+      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)
+	  && known_le (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode)))
+	{
+	  /* Try to implement the shift using a subreg.  */
+	  poly_int64 offset = subreg_offset_from_lsb (new_mode,
+						      store_mode, shift);
+	  rtx rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
+						store_mode, offset);
+	  if (rhs_subreg)
+	    {
+	      read_reg
+		= extract_low_bits (read_mode, new_mode, copy_rtx (rhs_subreg));
+	      break;
+	    }
+	}
+
+      if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
+	continue;
+
       new_reg = gen_reg_rtx (new_mode);
 
       start_sequence ();
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c b/gcc/testsuite/gcc.target/powerpc/pr71309.c
new file mode 100644
index 00000000000..94d727a8ed9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define TYPE void*
+#define TYPE2 void*
+
+struct path {
+    TYPE2 mnt;
+    TYPE dentry;
+};
+
+struct nameidata {
+    struct path path;
+    struct path root;
+};
+
+__attribute__ ((noinline))
+TYPE foo(struct nameidata *nd)
+{
+  TYPE d;
+  TYPE2 d2;
+
+  nd->path = nd->root;
+  d = nd->path.dentry;
+  d2 = nd->path.mnt;
+  return d;
+}
+
+/* { dg-final { scan-assembler-not {\mlxv\M} } } */
+/* { dg-final { scan-assembler-not {\mstxv\M} } } */
+/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */
-- 
2.27.0.90.geebb51ba8c



^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4] dse: Remove partial load after full store for high part access[PR71309]
  2020-07-24 10:47               ` [PATCH v4] " luoxhu
@ 2020-07-28 23:06                 ` luoxhu
  2020-07-31 16:29                 ` Richard Sandiford
  1 sibling, 0 replies; 21+ messages in thread
From: luoxhu @ 2020-07-28 23:06 UTC (permalink / raw)
  To: gcc-patches, segher, wschmidt, dberlin, jakub, richard.sandiford

Gentle ping in case this mail was missed.  Thanks :)

https://gcc.gnu.org/pipermail/gcc-patches/2020-July/550602.html

Xionghu


On 2020/7/24 18:47, luoxhu via Gcc-patches wrote:
> Hi Richard,
> 
> This is the updated version that could pass all regression test on
> Power9-LE.
> Just need another  "maybe_lt (GET_MODE_SIZE (new_mode), access_size)"
> before generating shift for store_info->const_rhs to ensure correct
> constant is generated, take testsuite/gfortran1/equiv_2.x for example:
> 
> equiv_2.x-equiv_2.f90.264r.cse2:
> 5: r119:DI=0x6867666564636261
> 6: [sfp:DI+0x20]=r119:DI
> 18: r123:SI=0x65646362
> 19: r124:SI=[sfp:DI+0x21]
> 20: r125:CC=cmp(r124:SI,r123:SI)
> 
> Bad code due to get SImode subreg0 and shift right 8 bits got 0x646362 in
> insn #32, equiv_2.x-equiv_2.f90.265r.dse1:
> 5:  r119:DI=0x6867666564636261
> 32: r126:SI=0x646362
> 6:  [sfp:DI+0x20]=r119:DI
> 18: r123:SI=0x65646362
> 19: r124:SI=r126:SI
> 20: r125:CC=cmp(r124:SI,r123:SI)
> 
> Good code, get 0x65646362 in insn #32, equiv_2.x-equiv_2.f90.265r.dse1:
> 5:  r119:DI=0x6867666564636261
> 32: r126:SI=0x65646362
> 6:  [sfp:DI+0x20]=r119:DI
> 18: r123:SI=0x65646362
> 19: r124:SI=r126:SI
> 20: r125:CC=cmp(r124:SI,r123:SI)
> 
> So the new_mode size should also be GE than access_size for const_rhs shift.
> It seems a bit more complicated now...
> 
> 
> 
> [PATCH v4] dse: Remove partial load after full store for high part access[PR71309]
> 
> 
> This patch could optimize(works for char/short/int/void*):
> 
> 6: r119:TI=[r118:DI+0x10]
> 7: [r118:DI]=r119:TI
> 8: r121:DI=[r118:DI+0x8]
> 
> =>
> 
> 6: r119:TI=[r118:DI+0x10]
> 16: r122:DI=r119:TI#8
> 
> Final ASM will be as below without partial load after full store(stxv+ld):
>    ld 10,16(3)
>    mr 9,3
>    ld 3,24(3)
>    std 10,0(9)
>    std 3,8(9)
>    blr
> 
> It could achieve ~25% performance improvement for typical cases on
> Power9.  Bootstrap and regression tested on Power9-LE.
> 
> For AArch64, one ldr is replaced by mov with this patch:
> 
> ldp     x2, x3, [x0, 16]
> stp     x2, x3, [x0]
> ldr     x0, [x0, 8]
> 
> =>
> 
> mov     x1, x0
> ldp     x2, x0, [x0, 16]
> stp     x2, x0, [x1]
> 
> gcc/ChangeLog:
> 
> 2020-07-24  Xionghu Luo  <luoxhu@linux.ibm.com>
> 
> 	PR rtl-optimization/71309
> 	* dse.c (find_shift_sequence): Use subreg of shifted from high part
> 	register to avoid loading from address.
> 
> gcc/testsuite/ChangeLog:
> 
> 2020-07-24  Xionghu Luo  <luoxhu@linux.ibm.com>
> 
> 	PR rtl-optimization/71309
> 	* gcc.target/powerpc/pr71309.c: New test.
> ---
>   gcc/dse.c                                  | 25 ++++++++++++++--
>   gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 ++++++++++++++++++++++
>   2 files changed, 56 insertions(+), 2 deletions(-)
>   create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c
> 
> diff --git a/gcc/dse.c b/gcc/dse.c
> index bbe792e48e8..a7cdc43a182 100644
> --- a/gcc/dse.c
> +++ b/gcc/dse.c
> @@ -1728,8 +1728,7 @@ find_shift_sequence (poly_int64 access_size,
>        the machine.  */
> 
>     opt_scalar_int_mode new_mode_iter;
> -  FOR_EACH_MODE_FROM (new_mode_iter,
> -		      smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
> +  FOR_EACH_MODE_IN_CLASS (new_mode_iter, MODE_INT)
>       {
>         rtx target, new_reg, new_lhs;
>         rtx_insn *shift_seq, *insn;
> @@ -1744,6 +1743,9 @@ find_shift_sequence (poly_int64 access_size,
>   	 e.g. at -Os, even when no actual shift will be needed.  */
>         if (store_info->const_rhs)
>   	{
> +	  if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
> +	    continue;
> +
>   	  poly_uint64 byte = subreg_lowpart_offset (new_mode, store_mode);
>   	  rtx ret = simplify_subreg (new_mode, store_info->const_rhs,
>   				     store_mode, byte);
> @@ -1779,6 +1781,25 @@ find_shift_sequence (poly_int64 access_size,
>   	  && !targetm.modes_tieable_p (new_mode, store_mode))
>   	continue;
> 
> +      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)
> +	  && known_le (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode)))
> +	{
> +	  /* Try to implement the shift using a subreg.  */
> +	  poly_int64 offset = subreg_offset_from_lsb (new_mode,
> +						      store_mode, shift);
> +	  rtx rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
> +						store_mode, offset);
> +	  if (rhs_subreg)
> +	    {
> +	      read_reg
> +		= extract_low_bits (read_mode, new_mode, copy_rtx (rhs_subreg));
> +	      break;
> +	    }
> +	}
> +
> +      if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
> +	continue;
> +
>         new_reg = gen_reg_rtx (new_mode);
> 
>         start_sequence ();
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c b/gcc/testsuite/gcc.target/powerpc/pr71309.c
> new file mode 100644
> index 00000000000..94d727a8ed9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target powerpc_p9vector_ok } */
> +/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
> +
> +#define TYPE void*
> +#define TYPE2 void*
> +
> +struct path {
> +    TYPE2 mnt;
> +    TYPE dentry;
> +};
> +
> +struct nameidata {
> +    struct path path;
> +    struct path root;
> +};
> +
> +__attribute__ ((noinline))
> +TYPE foo(struct nameidata *nd)
> +{
> +  TYPE d;
> +  TYPE2 d2;
> +
> +  nd->path = nd->root;
> +  d = nd->path.dentry;
> +  d2 = nd->path.mnt;
> +  return d;
> +}
> +
> +/* { dg-final { scan-assembler-not {\mlxv\M} } } */
> +/* { dg-final { scan-assembler-not {\mstxv\M} } } */
> +/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
> +/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v4] dse: Remove partial load after full store for high part access[PR71309]
  2020-07-24 10:47               ` [PATCH v4] " luoxhu
  2020-07-28 23:06                 ` luoxhu
@ 2020-07-31 16:29                 ` Richard Sandiford
  2020-08-03  6:58                   ` [PATCH v5] " luoxhu
  1 sibling, 1 reply; 21+ messages in thread
From: Richard Sandiford @ 2020-07-31 16:29 UTC (permalink / raw)
  To: luoxhu; +Cc: gcc-patches, segher, wschmidt, dberlin, jakub

Sorry for the slow reply.

luoxhu <luoxhu@linux.ibm.com> writes:
> Hi Richard,
>
> This is the updated version that could pass all regression test on
> Power9-LE. 
> Just need another  "maybe_lt (GET_MODE_SIZE (new_mode), access_size)"
> before generating shift for store_info->const_rhs to ensure correct
> constant is generated, take testsuite/gfortran1/equiv_2.x for example:
>
> equiv_2.x-equiv_2.f90.264r.cse2:
> 5: r119:DI=0x6867666564636261
> 6: [sfp:DI+0x20]=r119:DI
> 18: r123:SI=0x65646362
> 19: r124:SI=[sfp:DI+0x21]
> 20: r125:CC=cmp(r124:SI,r123:SI)
>
> Bad code due to get SImode subreg0 and shift right 8 bits got 0x646362 in
> insn #32, equiv_2.x-equiv_2.f90.265r.dse1:
> 5:  r119:DI=0x6867666564636261
> 32: r126:SI=0x646362
> 6:  [sfp:DI+0x20]=r119:DI
> 18: r123:SI=0x65646362
> 19: r124:SI=r126:SI
> 20: r125:CC=cmp(r124:SI,r123:SI)
>
> Good code, get 0x65646362 in insn #32, equiv_2.x-equiv_2.f90.265r.dse1:
> 5:  r119:DI=0x6867666564636261
> 32: r126:SI=0x65646362
> 6:  [sfp:DI+0x20]=r119:DI
> 18: r123:SI=0x65646362
> 19: r124:SI=r126:SI
> 20: r125:CC=cmp(r124:SI,r123:SI)
>
> So the new_mode size should also be GE than access_size for const_rhs shift.
> It seems a bit more complicated now...

Yeah.  The structure of this code makes it hard to make changes to it. :-/

Since the const_rhs code is only trying to fold the operation at compile
time, without regard for what the target supports at runtime, I think it
*does* make sense to use smallest_int_mode_for_size there.  I.e. we should
be able to hoist the code out of the loop and do:

  if (store_info->const_rhs)
    {
      auto new_mode = smallest_int_mode_for_size (access_size * BITS_PER_UNIT);
      auto byte = subreg_lowpart_offset (new_mode, store_mode);
      rtx ret = simplify_subreg (new_mode, store_info->const_rhs,
                                 store_mode, byte);
      if (ret && CONSTANT_P (ret))
        …
    }

  if (require_cst)
    return NULL_RTX;

  …the current loop…

I don't see any reason to restrict the const_rhs case to BITS_PER_WORD.
And if the simplify_subreg above fails for new_mode, I can't see how it
would succeed for a wider mode (which would have to provide *more*
constant data than the failed subreg).
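
For concreteness, filling that outline in with the const_rhs handling that
is currently inside the loop would give something like the untested sketch
below (it just hoists the existing calls ahead of the loop):

  if (store_info->const_rhs)
    {
      auto new_mode = smallest_int_mode_for_size (access_size * BITS_PER_UNIT);
      auto byte = subreg_lowpart_offset (new_mode, store_mode);
      rtx ret = simplify_subreg (new_mode, store_info->const_rhs,
                                 store_mode, byte);
      if (ret && CONSTANT_P (ret))
        {
          /* Constant-fold the shift and narrow to READ_MODE, as the loop
             body does now.  */
          rtx shift_rtx = gen_int_shift_amount (new_mode, shift);
          ret = simplify_const_binary_operation (LSHIFTRT, new_mode,
                                                 ret, shift_rtx);
          if (ret && CONSTANT_P (ret))
            {
              byte = subreg_lowpart_offset (read_mode, new_mode);
              ret = simplify_subreg (read_mode, ret, new_mode, byte);
              if (ret && CONSTANT_P (ret)
                  && (set_src_cost (ret, read_mode, speed)
                      <= COSTS_N_INSNS (1)))
                return ret;
            }
        }
    }

  if (require_cst)
    return NULL_RTX;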

> @@ -1728,8 +1728,7 @@ find_shift_sequence (poly_int64 access_size,
>       the machine.  */
>  
>    opt_scalar_int_mode new_mode_iter;
> -  FOR_EACH_MODE_FROM (new_mode_iter,
> -		      smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
> +  FOR_EACH_MODE_IN_CLASS (new_mode_iter, MODE_INT)

Sorry, I now realise it would be better to start at:

smallest_int_mode_for_size (GET_MODE_BITSIZE (read_mode))

Thanks,
Richard

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v5] dse: Remove partial load after full store for high part access[PR71309]
  2020-07-31 16:29                 ` Richard Sandiford
@ 2020-08-03  6:58                   ` luoxhu
  2020-08-03 14:01                     ` Richard Sandiford
  2020-08-03 23:17                     ` Segher Boessenkool
  0 siblings, 2 replies; 21+ messages in thread
From: luoxhu @ 2020-08-03  6:58 UTC (permalink / raw)
  To: gcc-patches, segher, wschmidt, dberlin, jakub, richard.sandiford

Thanks, v5 is updated as per the comments:
 1. Move the const_rhs shift out of the loop;
 2. Iterate from the smallest int mode for read_mode's size.


This patch could optimize (works for char/short/int/void*):

6: r119:TI=[r118:DI+0x10]
7: [r118:DI]=r119:TI
8: r121:DI=[r118:DI+0x8]

=>

6: r119:TI=[r118:DI+0x10]
16: r122:DI=r119:TI#8

Final ASM will be as below, without a partial load after the full store (stxv+ld):
  ld 10,16(3)
  mr 9,3
  ld 3,24(3)
  std 10,0(9)
  std 3,8(9)
  blr

It could achieve ~25% performance improvement for typical cases on
Power9.  Bootstrap and regression tested on Power9-LE.

For AArch64, one ldr is replaced by mov with this patch:

ldp     x2, x3, [x0, 16]
stp     x2, x3, [x0]
ldr     x0, [x0, 8]

=>

mov     x1, x0
ldp     x2, x0, [x0, 16]
stp     x2, x0, [x1]

gcc/ChangeLog:

2020-08-03  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR rtl-optimization/71309
	* dse.c (find_shift_sequence): Use a subreg of the shifted high-part
	register to avoid loading from the address.

gcc/testsuite/ChangeLog:

2020-08-03  Xionghu Luo  <luoxhu@linux.ibm.com>

	PR rtl-optimization/71309
	* gcc.target/powerpc/pr71309.c: New test.
---
 gcc/dse.c                                  | 78 ++++++++++++++--------
 gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 +++++++++
 2 files changed, 82 insertions(+), 29 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c

diff --git a/gcc/dse.c b/gcc/dse.c
index bbe792e48e8..b67b5c2ba35 100644
--- a/gcc/dse.c
+++ b/gcc/dse.c
@@ -1720,6 +1720,35 @@ find_shift_sequence (poly_int64 access_size,
   scalar_int_mode new_mode;
   rtx read_reg = NULL;
 
+  /* If a constant was stored into memory, try to simplify it here,
+     otherwise the cost of the shift might preclude this optimization
+     e.g. at -Os, even when no actual shift will be needed.  */
+  if (store_info->const_rhs)
+    {
+      auto new_mode = smallest_int_mode_for_size (access_size * BITS_PER_UNIT);
+      auto byte = subreg_lowpart_offset (new_mode, store_mode);
+      rtx ret
+	= simplify_subreg (new_mode, store_info->const_rhs, store_mode, byte);
+      if (ret && CONSTANT_P (ret))
+	{
+	  rtx shift_rtx = gen_int_shift_amount (new_mode, shift);
+	  ret = simplify_const_binary_operation (LSHIFTRT, new_mode, ret,
+						 shift_rtx);
+	  if (ret && CONSTANT_P (ret))
+	    {
+	      byte = subreg_lowpart_offset (read_mode, new_mode);
+	      ret = simplify_subreg (read_mode, ret, new_mode, byte);
+	      if (ret && CONSTANT_P (ret)
+		  && (set_src_cost (ret, read_mode, speed)
+		      <= COSTS_N_INSNS (1)))
+		return ret;
+	    }
+	}
+    }
+
+  if (require_cst)
+    return NULL_RTX;
+
   /* Some machines like the x86 have shift insns for each size of
      operand.  Other machines like the ppc or the ia-64 may only have
      shift insns that shift values within 32 or 64 bit registers.
@@ -1729,7 +1758,7 @@ find_shift_sequence (poly_int64 access_size,
 
   opt_scalar_int_mode new_mode_iter;
   FOR_EACH_MODE_FROM (new_mode_iter,
-		      smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
+		      smallest_int_mode_for_size (GET_MODE_BITSIZE (read_mode)))
     {
       rtx target, new_reg, new_lhs;
       rtx_insn *shift_seq, *insn;
@@ -1739,34 +1768,6 @@ find_shift_sequence (poly_int64 access_size,
       if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
 	break;
 
-      /* If a constant was stored into memory, try to simplify it here,
-	 otherwise the cost of the shift might preclude this optimization
-	 e.g. at -Os, even when no actual shift will be needed.  */
-      if (store_info->const_rhs)
-	{
-	  poly_uint64 byte = subreg_lowpart_offset (new_mode, store_mode);
-	  rtx ret = simplify_subreg (new_mode, store_info->const_rhs,
-				     store_mode, byte);
-	  if (ret && CONSTANT_P (ret))
-	    {
-	      rtx shift_rtx = gen_int_shift_amount (new_mode, shift);
-	      ret = simplify_const_binary_operation (LSHIFTRT, new_mode,
-						     ret, shift_rtx);
-	      if (ret && CONSTANT_P (ret))
-		{
-		  byte = subreg_lowpart_offset (read_mode, new_mode);
-		  ret = simplify_subreg (read_mode, ret, new_mode, byte);
-		  if (ret && CONSTANT_P (ret)
-		      && (set_src_cost (ret, read_mode, speed)
-			  <= COSTS_N_INSNS (1)))
-		    return ret;
-		}
-	    }
-	}
-
-      if (require_cst)
-	return NULL_RTX;
-
       /* Try a wider mode if truncating the store mode to NEW_MODE
 	 requires a real instruction.  */
       if (maybe_lt (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode))
@@ -1779,6 +1780,25 @@ find_shift_sequence (poly_int64 access_size,
 	  && !targetm.modes_tieable_p (new_mode, store_mode))
 	continue;
 
+      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)
+	  && known_le (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode)))
+	{
+	  /* Try to implement the shift using a subreg.  */
+	  poly_int64 offset = subreg_offset_from_lsb (new_mode,
+						      store_mode, shift);
+	  rtx rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
+						store_mode, offset);
+	  if (rhs_subreg)
+	    {
+	      read_reg
+		= extract_low_bits (read_mode, new_mode, copy_rtx (rhs_subreg));
+	      break;
+	    }
+	}
+
+      if (maybe_lt (GET_MODE_SIZE (new_mode), access_size))
+	continue;
+
       new_reg = gen_reg_rtx (new_mode);
 
       start_sequence ();
diff --git a/gcc/testsuite/gcc.target/powerpc/pr71309.c b/gcc/testsuite/gcc.target/powerpc/pr71309.c
new file mode 100644
index 00000000000..94d727a8ed9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
@@ -0,0 +1,33 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9" } */
+
+#define TYPE void*
+#define TYPE2 void*
+
+struct path {
+    TYPE2 mnt;
+    TYPE dentry;
+};
+
+struct nameidata {
+    struct path path;
+    struct path root;
+};
+
+__attribute__ ((noinline))
+TYPE foo(struct nameidata *nd)
+{
+  TYPE d;
+  TYPE2 d2;
+
+  nd->path = nd->root;
+  d = nd->path.dentry;
+  d2 = nd->path.mnt;
+  return d;
+}
+
+/* { dg-final { scan-assembler-not {\mlxv\M} } } */
+/* { dg-final { scan-assembler-not {\mstxv\M} } } */
+/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */
-- 
2.25.1


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v5] dse: Remove partial load after full store for high part access[PR71309]
  2020-08-03  6:58                   ` [PATCH v5] " luoxhu
@ 2020-08-03 14:01                     ` Richard Sandiford
  2020-08-04  3:28                       ` luoxhu
  2020-08-05 23:30                       ` luoxhu
  2020-08-03 23:17                     ` Segher Boessenkool
  1 sibling, 2 replies; 21+ messages in thread
From: Richard Sandiford @ 2020-08-03 14:01 UTC (permalink / raw)
  To: luoxhu; +Cc: gcc-patches, segher, wschmidt, dberlin, jakub

luoxhu <luoxhu@linux.ibm.com> writes:
> Thanks, the v5 update as comments:
>  1. Move const_rhs shift out of loop;
>  2. Iterate from int size for read_mode.
>
>
> This patch could optimize(works for char/short/int/void*):
>
> 6: r119:TI=[r118:DI+0x10]
> 7: [r118:DI]=r119:TI
> 8: r121:DI=[r118:DI+0x8]
>
> =>
>
> 6: r119:TI=[r118:DI+0x10]
> 16: r122:DI=r119:TI#8
>
> Final ASM will be as below without partial load after full store(stxv+ld):
>   ld 10,16(3)
>   mr 9,3
>   ld 3,24(3)
>   std 10,0(9)
>   std 3,8(9)
>   blr
>
> It could achieve ~25% performance improvement for typical cases on
> Power9.  Bootstrap and regression tested on Power9-LE.
>
> For AArch64, one ldr is replaced by mov with this patch:
>
> ldp     x2, x3, [x0, 16]
> stp     x2, x3, [x0]
> ldr     x0, [x0, 8]
>
> =>
>
> mov     x1, x0
> ldp     x2, x0, [x0, 16]
> stp     x2, x0, [x1]
>
> gcc/ChangeLog:
>
> 2020-08-03  Xionghu Luo  <luoxhu@linux.ibm.com>
>
> 	PR rtl-optimization/71309
> 	* dse.c (find_shift_sequence): Use subreg of shifted from high part
> 	register to avoid loading from address.
>
> gcc/testsuite/ChangeLog:
>
> 2020-08-03  Xionghu Luo  <luoxhu@linux.ibm.com>
>
> 	PR rtl-optimization/71309
> 	* gcc.target/powerpc/pr71309.c: New test.
> ---
>  gcc/dse.c                                  | 78 ++++++++++++++--------
>  gcc/testsuite/gcc.target/powerpc/pr71309.c | 33 +++++++++
>  2 files changed, 82 insertions(+), 29 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/powerpc/pr71309.c
>
> diff --git a/gcc/dse.c b/gcc/dse.c
> index bbe792e48e8..b67b5c2ba35 100644
> --- a/gcc/dse.c
> +++ b/gcc/dse.c
> @@ -1720,6 +1720,35 @@ find_shift_sequence (poly_int64 access_size,
>    scalar_int_mode new_mode;
>    rtx read_reg = NULL;
>  
> +  /* If a constant was stored into memory, try to simplify it here,
> +     otherwise the cost of the shift might preclude this optimization
> +     e.g. at -Os, even when no actual shift will be needed.  */
> +  if (store_info->const_rhs)
> +    {
> +      auto new_mode = smallest_int_mode_for_size (access_size * BITS_PER_UNIT);
> +      auto byte = subreg_lowpart_offset (new_mode, store_mode);
> +      rtx ret
> +	= simplify_subreg (new_mode, store_info->const_rhs, store_mode, byte);
> +      if (ret && CONSTANT_P (ret))
> +	{
> +	  rtx shift_rtx = gen_int_shift_amount (new_mode, shift);
> +	  ret = simplify_const_binary_operation (LSHIFTRT, new_mode, ret,
> +						 shift_rtx);
> +	  if (ret && CONSTANT_P (ret))
> +	    {
> +	      byte = subreg_lowpart_offset (read_mode, new_mode);
> +	      ret = simplify_subreg (read_mode, ret, new_mode, byte);
> +	      if (ret && CONSTANT_P (ret)
> +		  && (set_src_cost (ret, read_mode, speed)
> +		      <= COSTS_N_INSNS (1)))
> +		return ret;
> +	    }
> +	}
> +    }
> +
> +  if (require_cst)
> +    return NULL_RTX;
> +
>    /* Some machines like the x86 have shift insns for each size of
>       operand.  Other machines like the ppc or the ia-64 may only have
>       shift insns that shift values within 32 or 64 bit registers.
> @@ -1729,7 +1758,7 @@ find_shift_sequence (poly_int64 access_size,
>  
>    opt_scalar_int_mode new_mode_iter;
>    FOR_EACH_MODE_FROM (new_mode_iter,
> -		      smallest_int_mode_for_size (access_size * BITS_PER_UNIT))
> +		      smallest_int_mode_for_size (GET_MODE_BITSIZE (read_mode)))
>      {
>        rtx target, new_reg, new_lhs;
>        rtx_insn *shift_seq, *insn;
> @@ -1739,34 +1768,6 @@ find_shift_sequence (poly_int64 access_size,
>        if (GET_MODE_BITSIZE (new_mode) > BITS_PER_WORD)
>  	break;
>  
> -      /* If a constant was stored into memory, try to simplify it here,
> -	 otherwise the cost of the shift might preclude this optimization
> -	 e.g. at -Os, even when no actual shift will be needed.  */
> -      if (store_info->const_rhs)
> -	{
> -	  poly_uint64 byte = subreg_lowpart_offset (new_mode, store_mode);
> -	  rtx ret = simplify_subreg (new_mode, store_info->const_rhs,
> -				     store_mode, byte);
> -	  if (ret && CONSTANT_P (ret))
> -	    {
> -	      rtx shift_rtx = gen_int_shift_amount (new_mode, shift);
> -	      ret = simplify_const_binary_operation (LSHIFTRT, new_mode,
> -						     ret, shift_rtx);
> -	      if (ret && CONSTANT_P (ret))
> -		{
> -		  byte = subreg_lowpart_offset (read_mode, new_mode);
> -		  ret = simplify_subreg (read_mode, ret, new_mode, byte);
> -		  if (ret && CONSTANT_P (ret)
> -		      && (set_src_cost (ret, read_mode, speed)
> -			  <= COSTS_N_INSNS (1)))
> -		    return ret;
> -		}
> -	    }
> -	}
> -
> -      if (require_cst)
> -	return NULL_RTX;
> -
>        /* Try a wider mode if truncating the store mode to NEW_MODE
>  	 requires a real instruction.  */
>        if (maybe_lt (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode))
> @@ -1779,6 +1780,25 @@ find_shift_sequence (poly_int64 access_size,
>  	  && !targetm.modes_tieable_p (new_mode, store_mode))
>  	continue;
>  
> +      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)

Like I mentioned before, this should be the other way around:

    multiple_p (shift, GET_MODE_BITSIZE (new_mode))

i.e. for the subreg to be valid, the shift amount must be a multiple
of the outer mode size in bits.
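
In context, the condition would then read roughly as follows (the rest of
the hunk unchanged):

      if (multiple_p (shift, GET_MODE_BITSIZE (new_mode))
          && known_le (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode)))
        {
          /* Try to implement the shift using a subreg.  */
          poly_int64 offset = subreg_offset_from_lsb (new_mode,
                                                      store_mode, shift);
          rtx rhs_subreg = simplify_gen_subreg (new_mode, store_info->rhs,
                                                store_mode, offset);
          if (rhs_subreg)
            {
              read_reg
                = extract_low_bits (read_mode, new_mode, copy_rtx (rhs_subreg));
              break;
            }
        }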

OK with that change, thanks, and sorry for the multiple review cycles.

Richard

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v5] dse: Remove partial load after full store for high part access[PR71309]
  2020-08-03  6:58                   ` [PATCH v5] " luoxhu
  2020-08-03 14:01                     ` Richard Sandiford
@ 2020-08-03 23:17                     ` Segher Boessenkool
  1 sibling, 0 replies; 21+ messages in thread
From: Segher Boessenkool @ 2020-08-03 23:17 UTC (permalink / raw)
  To: luoxhu; +Cc: gcc-patches, wschmidt, dberlin, jakub, richard.sandiford

Hi Xiong Hu,

Very sorry I noticed this only just now...

On Mon, Aug 03, 2020 at 02:58:45PM +0800, luoxhu wrote:
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr71309.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target powerpc_p9vector_ok } */

This needs

/* { dg-require-effective-target lp64 } */

as well:

> +/* { dg-final { scan-assembler-times {\mld\M} 2 } } */
> +/* { dg-final { scan-assembler-times {\mstd\M} 2 } } */

... or those insns will not be generated.
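
I.e. the test's directives would then look something like this (sketch):

  /* { dg-do compile } */
  /* { dg-require-effective-target powerpc_p9vector_ok } */
  /* { dg-require-effective-target lp64 } */
  /* { dg-options "-O2 -mdejagnu-cpu=power9" } */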


Well done getting this fixed!  Thanks,


Segher

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v5] dse: Remove partial load after full store for high part access[PR71309]
  2020-08-03 14:01                     ` Richard Sandiford
@ 2020-08-04  3:28                       ` luoxhu
  2020-08-05 23:30                       ` luoxhu
  1 sibling, 0 replies; 21+ messages in thread
From: luoxhu @ 2020-08-04  3:28 UTC (permalink / raw)
  To: gcc-patches, segher, wschmidt, dberlin, jakub, richard.sandiford


On 2020/8/3 22:01, Richard Sandiford wrote:

>>         /* Try a wider mode if truncating the store mode to NEW_MODE
>>   	 requires a real instruction.  */
>>         if (maybe_lt (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode))
>> @@ -1779,6 +1780,25 @@ find_shift_sequence (poly_int64 access_size,
>>   	  && !targetm.modes_tieable_p (new_mode, store_mode))
>>   	continue;
>>   
>> +      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)
> 
> Like I mentioned before, this should be the other way around:
> 
>      multiple_p (shift, GET_MODE_BITSIZE (new_mode))
> 
> i.e. for the subreg to be valid, the shift amount must be a multiple
> of the outer mode size in bits.
> 
> OK with that change, thanks, and sorry for the multiple review cycles.

I appreciate your time and patience in making it better :).
I updated the change, added the lp64 requirement from Segher, and committed
it as r11-2526-g265d817b1eb4644c7a9613ad6920315d98e2e0a4.  Thank you all.

Xionghu

> 
> Richard
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v5] dse: Remove partial load after full store for high part access[PR71309]
  2020-08-03 14:01                     ` Richard Sandiford
  2020-08-04  3:28                       ` luoxhu
@ 2020-08-05 23:30                       ` luoxhu
  2020-08-06  5:23                         ` Richard Sandiford
  1 sibling, 1 reply; 21+ messages in thread
From: luoxhu @ 2020-08-05 23:30 UTC (permalink / raw)
  To: gcc-patches, segher, wschmidt, dberlin, jakub, richard.sandiford

Hi Richard,

On 2020/8/3 22:01, Richard Sandiford wrote:
>>         /* Try a wider mode if truncating the store mode to NEW_MODE
>>   	 requires a real instruction.  */
>>         if (maybe_lt (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode))
>> @@ -1779,6 +1780,25 @@ find_shift_sequence (poly_int64 access_size,
>>   	  && !targetm.modes_tieable_p (new_mode, store_mode))
>>   	continue;
>>   
>> +      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)
> 
> Like I mentioned before, this should be the other way around:
> 
>      multiple_p (shift, GET_MODE_BITSIZE (new_mode))
> 
> i.e. for the subreg to be valid, the shift amount must be a multiple
> of the outer mode size in bits.
> 
> OK with that change, thanks, and sorry for the multiple review cycles.

Just one more question: do we want to backport this patch to the gcc-10
and gcc-9 branches? :)


Xionghu


> 
> Richard
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v5] dse: Remove partial load after full store for high part access[PR71309]
  2020-08-05 23:30                       ` luoxhu
@ 2020-08-06  5:23                         ` Richard Sandiford
  2020-08-24 19:09                           ` Jeff Law
  0 siblings, 1 reply; 21+ messages in thread
From: Richard Sandiford @ 2020-08-06  5:23 UTC (permalink / raw)
  To: luoxhu; +Cc: gcc-patches, segher, wschmidt, dberlin, jakub

luoxhu <luoxhu@linux.ibm.com> writes:
> Hi Richard,
>
> On 2020/8/3 22:01, Richard Sandiford wrote:
>>>         /* Try a wider mode if truncating the store mode to NEW_MODE
>>>   	 requires a real instruction.  */
>>>         if (maybe_lt (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode))
>>> @@ -1779,6 +1780,25 @@ find_shift_sequence (poly_int64 access_size,
>>>   	  && !targetm.modes_tieable_p (new_mode, store_mode))
>>>   	continue;
>>>   
>>> +      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)
>> 
>> Like I mentioned before, this should be the other way around:
>> 
>>      multiple_p (shift, GET_MODE_BITSIZE (new_mode))
>> 
>> i.e. for the subreg to be valid, the shift amount must be a multiple
>> of the outer mode size in bits.
>> 
>> OK with that change, thanks, and sorry for the multiple review cycles.
>
> Just had another question is do we want to backport this patch to gcc-10
> and gcc-9 branches? :)

One for the RMs, but it looks like the PR was reported against GCC 7 and
isn't a regression.  Given that it's also “just” a missed optimisation,
I'm guessing it should be trunk only.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v5] dse: Remove partial load after full store for high part access[PR71309]
  2020-08-06  5:23                         ` Richard Sandiford
@ 2020-08-24 19:09                           ` Jeff Law
  0 siblings, 0 replies; 21+ messages in thread
From: Jeff Law @ 2020-08-24 19:09 UTC (permalink / raw)
  To: Richard Sandiford, luoxhu; +Cc: jakub, wschmidt, dberlin, gcc-patches, segher

On Thu, 2020-08-06 at 06:23 +0100, Richard Sandiford wrote:
> luoxhu <luoxhu@linux.ibm.com> writes:
> > Hi Richard,
> > 
> > On 2020/8/3 22:01, Richard Sandiford wrote:
> > > >         /* Try a wider mode if truncating the store mode to NEW_MODE
> > > >   	 requires a real instruction.  */
> > > >         if (maybe_lt (GET_MODE_SIZE (new_mode), GET_MODE_SIZE (store_mode))
> > > > @@ -1779,6 +1780,25 @@ find_shift_sequence (poly_int64 access_size,
> > > >   	  && !targetm.modes_tieable_p (new_mode, store_mode))
> > > >   	continue;
> > > >   
> > > > +      if (multiple_p (GET_MODE_BITSIZE (new_mode), shift)
> > > 
> > > Like I mentioned before, this should be the other way around:
> > > 
> > >      multiple_p (shift, GET_MODE_BITSIZE (new_mode))
> > > 
> > > i.e. for the subreg to be valid, the shift amount must be a multiple
> > > of the outer mode size in bits.
> > > 
> > > OK with that change, thanks, and sorry for the multiple review cycles.
> > 
> > Just had another question is do we want to backport this patch to gcc-10
> > and gcc-9 branches? :)
> 
> One for the RMs, but it looks like the PR was reported against GCC 7 and
> isn't a regression.  Given that it's also “just” a missed optimisation,
> I'm guessing it should be trunk only.
Not a release manager, but I agree, let's keep this on the trunk.

jeff
> 


^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2020-08-24 19:09 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-21 10:54 [PATCH] dse: Remove partial load after full store for high part access[PR71309] Xiong Hu Luo
2020-07-21 15:30 ` Richard Sandiford
2020-07-22  9:18   ` [PATCH v2] " luoxhu
2020-07-22 11:05     ` Richard Sandiford
2020-07-22 15:22       ` [PATCH v3] " luoxhu
2020-07-22 17:28         ` Richard Sandiford
2020-07-22 20:30           ` Richard Sandiford
2020-07-23 12:35             ` luoxhu
2020-07-24 10:47               ` [PATCH v4] " luoxhu
2020-07-28 23:06                 ` luoxhu
2020-07-31 16:29                 ` Richard Sandiford
2020-08-03  6:58                   ` [PATCH v5] " luoxhu
2020-08-03 14:01                     ` Richard Sandiford
2020-08-04  3:28                       ` luoxhu
2020-08-05 23:30                       ` luoxhu
2020-08-06  5:23                         ` Richard Sandiford
2020-08-24 19:09                           ` Jeff Law
2020-08-03 23:17                     ` Segher Boessenkool
2020-07-21 21:54 ` [PATCH] " Segher Boessenkool
2020-07-21 22:37   ` David Edelsohn
2020-07-21 23:12     ` Segher Boessenkool

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).