public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH, AArch64] Disable reg offset in quad-word store for Falkor.
@ 2017-09-22 15:49 Jim Wilson
  2017-09-22 15:59 ` Jim Wilson
  0 siblings, 1 reply; 10+ messages in thread
From: Jim Wilson @ 2017-09-22 15:49 UTC (permalink / raw)
  To: gcc-patches; +Cc: Jim Wilson, wilson

On Falkor, because of an idiosyncracy of how the pipelines are designed, a
quad-word store using a reg+reg addressing mode is almost twice as slow as an
add followed by a quad-word store with a single reg addressing mode.  So we
get better performance if we disallow addressing modes using register offsets
with quad-word stores.

Using lmbench compiled with -O2 -ftree-vectorize as my benchmark, I see a 13%
performance increase on stream copy using this patch, and a 16% performance
increase on stream scale using this patch.  I also see a small performance
increase on SPEC CPU2006 of around 0.2% for int and 0.4% for FP at -O3.

	gcc/
	* config/aarch64/aarch64-protos.h (aarch64_movti_target_operand_p):
	New.
	* config/aarch64/aarch64-simd.md (aarch64_simd_mov<mode>): Use Utf.
	* config/aarch64/aarch64-tuning-flags.def
	(SLOW_REGOFFSET_QUADWORD_STORE): New.
	* config/aarch64/aarch64.c (qdf24xx_tunings): Add
	SLOW_REGOFFSET_QUADWORD_STORE to tuning flags.
	(aarch64_movti_target_operand_p): New.
	* config/aarch64/aarch64.md (movti_aarch64): Use Utf.
	(movtf_aarch64): Likewise.
	* config/aarch64/constraints.md (Utf): New.
---
 gcc/config/aarch64/aarch64-protos.h         |  1 +
 gcc/config/aarch64/aarch64-simd.md          |  4 ++--
 gcc/config/aarch64/aarch64-tuning-flags.def |  4 ++++
 gcc/config/aarch64/aarch64.c                | 14 +++++++++++++-
 gcc/config/aarch64/aarch64.md               |  6 +++---
 gcc/config/aarch64/constraints.md           |  6 ++++++
 6 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index e67c2ed..2dfd057 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -379,6 +379,7 @@ const char *aarch64_output_move_struct (rtx *operands);
 rtx aarch64_return_addr (int, rtx);
 rtx aarch64_simd_gen_const_vector_dup (machine_mode, HOST_WIDE_INT);
 bool aarch64_simd_mem_operand_p (rtx);
+bool aarch64_movti_target_operand_p (rtx);
 rtx aarch64_simd_vect_par_cnst_half (machine_mode, bool);
 rtx aarch64_tls_get_addr (void);
 tree aarch64_fold_builtin (tree, int, tree *, bool);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 70e9339..88bf210 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -133,9 +133,9 @@
 
 (define_insn "*aarch64_simd_mov<mode>"
   [(set (match_operand:VQ 0 "nonimmediate_operand"
-		"=w, Umq,  m,  w, ?r, ?w, ?r, w")
+		"=w, Umq,  Utf,  w, ?r, ?w, ?r, w")
 	(match_operand:VQ 1 "general_operand"
-		"m,  Dz, w,  w,  w,  r,  r, Dn"))]
+		"m,  Dz,   w,    w,  w,  r,  r, Dn"))]
   "TARGET_SIMD
    && (register_operand (operands[0], <MODE>mode)
        || aarch64_simd_reg_or_zero (operands[1], <MODE>mode))"
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def
index f48642c..7d0b104 100644
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -41,4 +41,8 @@ AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW)
    are not considered cheap.  */
 AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND)
 
+/* Don't use a register offset in a memory address for a quad-word store.  */
+AARCH64_EXTRA_TUNING_OPTION ("slow_regoffset_quadword_store",
+			     SLOW_REGOFFSET_QUADWORD_STORE)
+
 #undef AARCH64_EXTRA_TUNING_OPTION
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 5e26cb7..d6f1133 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -818,7 +818,7 @@ static const struct tune_params qdf24xx_tunings =
   2,	/* min_div_recip_mul_df.  */
   0,	/* max_case_values.  */
   tune_params::AUTOPREFETCHER_STRONG,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE),	/* tune_flags.  */
   &qdf24xx_prefetch_tune
 };
 
@@ -11821,6 +11821,18 @@ aarch64_simd_mem_operand_p (rtx op)
 			|| REG_P (XEXP (op, 0)));
 }
 
+/* Return TRUE if OP uses an efficient memory address for quad-word target.  */
+bool
+aarch64_movti_target_operand_p (rtx op)
+{
+  if (! optimize_size
+      && (aarch64_tune_params.extra_tuning_flags
+	  & AARCH64_EXTRA_TUNE_SLOW_REGOFFSET_QUADWORD_STORE))
+    return MEM_P (op) && ! (GET_CODE (XEXP (op, 0)) == PLUS
+			    && ! CONST_INT_P (XEXP (XEXP (op, 0), 1)));
+  return MEM_P (op);
+}
+
 /* Emit a register copy from operand to operand, taking care not to
    early-clobber source registers in the process.
 
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index f8cdb06..9c7e356 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -1023,7 +1023,7 @@
 
 (define_insn "*movti_aarch64"
   [(set (match_operand:TI 0
-	 "nonimmediate_operand"  "=r, w,r,w,r,m,m,w,m")
+	 "nonimmediate_operand"  "=r, w,r,w,r,m,m,w,Utf")
 	(match_operand:TI 1
 	 "aarch64_movti_operand" " rn,r,w,w,m,r,Z,m,w"))]
   "(register_operand (operands[0], TImode)
@@ -1170,9 +1170,9 @@
 
 (define_insn "*movtf_aarch64"
   [(set (match_operand:TF 0
-	 "nonimmediate_operand" "=w,?&r,w ,?r,w,?w,w,m,?r,m ,m")
+	 "nonimmediate_operand" "=w,?&r,w ,?r,w,?w,w,Utf,?r,m ,m")
 	(match_operand:TF 1
-	 "general_operand"      " w,?r, ?r,w ,Y,Y ,m,w,m ,?r,Y"))]
+	 "general_operand"      " w,?r, ?r,w ,Y,Y ,m,w  ,m ,?r,Y"))]
   "TARGET_FLOAT && (register_operand (operands[0], TFmode)
     || aarch64_reg_or_fp_zero (operands[1], TFmode))"
   "@
diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
index 3649fb4..b1defb6 100644
--- a/gcc/config/aarch64/constraints.md
+++ b/gcc/config/aarch64/constraints.md
@@ -171,6 +171,12 @@
        (match_test "aarch64_legitimate_address_p (GET_MODE (op), XEXP (op, 0),
 						  PARALLEL, 1)")))
 
+(define_memory_constraint "Utf"
+  "@iternal
+   An efficient memory address for a quad-word target operand."
+  (and (match_code "mem")
+       (match_test "aarch64_movti_target_operand_p (op)")))
+
 (define_memory_constraint "Utv"
   "@internal
    An address valid for loading/storing opaque structure
-- 
2.7.4

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH, AArch64] Disable reg offset in quad-word store for Falkor.
  2017-09-22 15:49 [PATCH, AArch64] Disable reg offset in quad-word store for Falkor Jim Wilson
@ 2017-09-22 15:59 ` Jim Wilson
  2017-09-22 17:59   ` Andrew Pinski
  0 siblings, 1 reply; 10+ messages in thread
From: Jim Wilson @ 2017-09-22 15:59 UTC (permalink / raw)
  To: gcc-patches; +Cc: Jim Wilson, Jim Wilson

On Fri, Sep 22, 2017 at 8:49 AM, Jim Wilson <jim.wilson@linaro.org> wrote:
> On Falkor, because of an idiosyncracy of how the pipelines are designed, a
> quad-word store using a reg+reg addressing mode is almost twice as slow as an
> add followed by a quad-word store with a single reg addressing mode.  So we
> get better performance if we disallow addressing modes using register offsets
> with quad-word stores.

This was tested with a bootstrap and make check of course.  Also,
gcc.c-torture/compile/20021212-1.c compiled with -O3 -mcpu=falkor
makes a nice testcase to demonstrate that the patch works.

OK?

Jim

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH, AArch64] Disable reg offset in quad-word store for Falkor.
  2017-09-22 15:59 ` Jim Wilson
@ 2017-09-22 17:59   ` Andrew Pinski
  2017-09-22 18:39     ` Jim Wilson
  0 siblings, 1 reply; 10+ messages in thread
From: Andrew Pinski @ 2017-09-22 17:59 UTC (permalink / raw)
  To: Jim Wilson; +Cc: gcc-patches, Jim Wilson

On Fri, Sep 22, 2017 at 8:59 AM, Jim Wilson <jim.wilson@linaro.org> wrote:
> On Fri, Sep 22, 2017 at 8:49 AM, Jim Wilson <jim.wilson@linaro.org> wrote:
>> On Falkor, because of an idiosyncracy of how the pipelines are designed, a
>> quad-word store using a reg+reg addressing mode is almost twice as slow as an
>> add followed by a quad-word store with a single reg addressing mode.  So we
>> get better performance if we disallow addressing modes using register offsets
>> with quad-word stores.
>
> This was tested with a bootstrap and make check of course.  Also,
> gcc.c-torture/compile/20021212-1.c compiled with -O3 -mcpu=falkor
> makes a nice testcase to demonstrate that the patch works.
>
> OK?

Two overall comments:
* What about splitting register_offset into two different elements,
one for non 128bit modes and one for 128bit (and more; OI, etc.) modes
so you get better address generation right away for the simd load
cases rather than having LRA/reload having to reload the address into
a register.
* Maybe adding a testcase to the testsuite to show this change.

One extra comment:
* should we change the generic tuning to avoid reg+reg for 128bit modes?

Thanks,
Andrew

>
> Jim

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH, AArch64] Disable reg offset in quad-word store for Falkor.
  2017-09-22 17:59   ` Andrew Pinski
@ 2017-09-22 18:39     ` Jim Wilson
  2017-09-22 21:11       ` Andrew Pinski
  0 siblings, 1 reply; 10+ messages in thread
From: Jim Wilson @ 2017-09-22 18:39 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: gcc-patches, Jim Wilson

On Fri, Sep 22, 2017 at 10:58 AM, Andrew Pinski <pinskia@gmail.com> wrote:
> Two overall comments:
> * What about splitting register_offset into two different elements,
> one for non 128bit modes and one for 128bit (and more; OI, etc.) modes
> so you get better address generation right away for the simd load
> cases rather than having LRA/reload having to reload the address into
> a register.

I'm not sure if changing register_offset cost would make a difference,
since costs are usually used during optimization, not during address
generation.  This is something that I didn't think to try though.  I
can try taking a look at this.

I did try writing a patch to modify predicates to disallow reg offset
for 128bit modes, and that got complicated, as I had to split apart a
number of patterns in the aarch64-simd.md file that accept both VD and
VQ modes.  I ended up with a patch 3-4 times as big as the one I
submitted, without any additional performance improvement, so it
wasn't worth the trouble.

> * Maybe adding a testcase to the testsuite to show this change.

Yes, I can add a testcase.

> One extra comment:
> * should we change the generic tuning to avoid reg+reg for 128bit modes?

Are there other targets with a similar problem?  I only know that it
is a problem for Falkor.  It might be a loss for some targets as it is
replacing one instruction with two.

Jim

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH, AArch64] Disable reg offset in quad-word store for Falkor.
  2017-09-22 18:39     ` Jim Wilson
@ 2017-09-22 21:11       ` Andrew Pinski
  2017-10-12 21:54         ` Jim Wilson
  0 siblings, 1 reply; 10+ messages in thread
From: Andrew Pinski @ 2017-09-22 21:11 UTC (permalink / raw)
  To: Jim Wilson; +Cc: gcc-patches, Jim Wilson

On Fri, Sep 22, 2017 at 11:39 AM, Jim Wilson <jim.wilson@linaro.org> wrote:
> On Fri, Sep 22, 2017 at 10:58 AM, Andrew Pinski <pinskia@gmail.com> wrote:
>> Two overall comments:
>> * What about splitting register_offset into two different elements,
>> one for non 128bit modes and one for 128bit (and more; OI, etc.) modes
>> so you get better address generation right away for the simd load
>> cases rather than having LRA/reload having to reload the address into
>> a register.
>
> I'm not sure if changing register_offset cost would make a difference,
> since costs are usually used during optimization, not during address
> generation.  This is something that I didn't think to try though.  I
> can try taking a look at this.

It does taken into account when fwprop is propagating the addition into
the MEM (the tree level is always a_1 = POINTER_PLUS_EXPR; MEM_REF(a_1)).
IV-OPTS will produce much better code if the address_cost is correct.

It looks like no other pass (combine, etc.) would take that into
account except for postreload CSE but maybe they should.

>
> I did try writing a patch to modify predicates to disallow reg offset
> for 128bit modes, and that got complicated, as I had to split apart a
> number of patterns in the aarch64-simd.md file that accept both VD and
> VQ modes.  I ended up with a patch 3-4 times as big as the one I
> submitted, without any additional performance improvement, so it
> wasn't worth the trouble.
>
>> * Maybe adding a testcase to the testsuite to show this change.
>
> Yes, I can add a testcase.
>
>> One extra comment:
>> * should we change the generic tuning to avoid reg+reg for 128bit modes?
>
> Are there other targets with a similar problem?  I only know that it
> is a problem for Falkor.  It might be a loss for some targets as it is
> replacing one instruction with two.

Well that is why I was suggesting the address cost model change.
Because the cost model change actually might provide better code in
the first place and still allow for reasonable generic code to be
produced.

Thanks,
Andrew

>
> Jim

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH, AArch64] Disable reg offset in quad-word store for Falkor.
  2017-09-22 21:11       ` Andrew Pinski
@ 2017-10-12 21:54         ` Jim Wilson
  2017-10-31  3:40           ` Kugan Vivekanandarajah
  0 siblings, 1 reply; 10+ messages in thread
From: Jim Wilson @ 2017-10-12 21:54 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: gcc-patches

On Fri, 2017-09-22 at 14:11 -0700, Andrew Pinski wrote:
> On Fri, Sep 22, 2017 at 11:39 AM, Jim Wilson <jim.wilson@linaro.org>
> wrote:
> > 
> > On Fri, Sep 22, 2017 at 10:58 AM, Andrew Pinski <pinskia@gmail.com>
> > wrote:
> > > 
> > > Two overall comments:
> > > * What about splitting register_offset into two different
> > > elements,
> > > one for non 128bit modes and one for 128bit (and more; OI, etc.)
> > > modes
> > > so you get better address generation right away for the simd load
> > > cases rather than having LRA/reload having to reload the address
> > > into
> > > a register.
> > I'm not sure if changing register_offset cost would make a
> > difference,
> > since costs are usually used during optimization, not during
> > address
> > generation.  This is something that I didn't think to try
> > though.  I
> > can try taking a look at this.
> It does taken into account when fwprop is propagating the addition
> into
> the MEM (the tree level is always a_1 = POINTER_PLUS_EXPR;
> MEM_REF(a_1)).
> IV-OPTS will produce much better code if the address_cost is correct.
> 
> It looks like no other pass (combine, etc.) would take that into
> account except for postreload CSE but maybe they should.

I tried increasing the cost of register_offset.  This got rid of the
reg+reg addressing mode in the middle of the main loop for lmbench
stream copy, but did not eliminate it after the main loop.

The tree optimized dump has 
  _52 = a_15 + _51;
  _53 = c_17 + _51;
  _54 = *_52;
  *_53 = _54;
and the RTL expand dump has
(insn 64 63 65 10 (set (reg:DF 96 [ _54 ])
        (mem:DF (plus:DI (reg/v/f:DI 78 [ a ])
                (reg:DI 93 [ _51 ])) [3 *_52+0 S8 A64])) "stream.c":223
-1
     (nil))
(insn 65 64 66 10 (set (mem:DF (plus:DI (reg/v/f:DI 79 [ c ])
                (reg:DI 93 [ _51 ])) [3 *_53+0 S8 A64])
        (reg:DF 96 [ _54 ])) "stream.c":223 -1
     (nil))

That may be fixable, but there is a bigger problem here which is that
increasing the costs of register_offset affects both loads and stores.
 On falkor, it is only quad-word stores that are inefficient with a
reg+reg address.  Quad-word loads with a reg+reg address are faster
than the equivalent add/ldr.  Disabling reg+reg address for quad-word
loads will hurt performance.

Since the address cost stuff makes no distinction between loads and
stores, I see no way to get the result I need by using address costs.
 I can only get the result I need by modifying the md file.

> > I did try writing a patch to modify predicates to disallow reg
> > offset
> > for 128bit modes, and that got complicated, as I had to split apart
> > a
> > number of patterns in the aarch64-simd.md file that accept both VD
> > and
> > VQ modes.  I ended up with a patch 3-4 times as big as the one I
> > submitted, without any additional performance improvement, so it
> > wasn't worth the trouble.
> > 
> > > 
> > > * Maybe adding a testcase to the testsuite to show this change.
> > Yes, I can add a testcase.
> > 
> > > 
> > > One extra comment:
> > > * should we change the generic tuning to avoid reg+reg for 128bit
> > > modes?
> > Are there other targets with a similar problem?  I only know that
> > it
> > is a problem for Falkor.  It might be a loss for some targets as it
> > is
> > replacing one instruction with two.
> Well that is why I was suggesting the address cost model change.
> Because the cost model change actually might provide better code in
> the first place and still allow for reasonable generic code to be
> produced.

The patch I posted only affects Falkor.  It doesn't change generic
code.  I don't know of any reason why we need to change generic code
here.

The Falkor core has out-of-order execution and multiple function units,
so there isn't any noticeable performance gain from trying to fix this
earlier.  Fixing this with a md file change gives optimal performance
for the testcases I've looked at.

Since I'm no longer at Linaro, I expect that someone else will take
over this patch submission.  I will create a bug report to document the
issue, to make it easier to track it and hand off to someone else.

Jim

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH, AArch64] Disable reg offset in quad-word store for Falkor.
  2017-10-12 21:54         ` Jim Wilson
@ 2017-10-31  3:40           ` Kugan Vivekanandarajah
  2017-10-31 16:18             ` Jim Wilson
  0 siblings, 1 reply; 10+ messages in thread
From: Kugan Vivekanandarajah @ 2017-10-31  3:40 UTC (permalink / raw)
  To: Jim Wilson; +Cc: Andrew Pinski, gcc-patches

Ping ?

I see that Jim has clarified the comments from Andrew.

Thanks,
Kugan

On 13 October 2017 at 08:48, Jim Wilson <wilson@tuliptree.org> wrote:
> On Fri, 2017-09-22 at 14:11 -0700, Andrew Pinski wrote:
>> On Fri, Sep 22, 2017 at 11:39 AM, Jim Wilson <jim.wilson@linaro.org>
>> wrote:
>> >
>> > On Fri, Sep 22, 2017 at 10:58 AM, Andrew Pinski <pinskia@gmail.com>
>> > wrote:
>> > >
>> > > Two overall comments:
>> > > * What about splitting register_offset into two different
>> > > elements,
>> > > one for non 128bit modes and one for 128bit (and more; OI, etc.)
>> > > modes
>> > > so you get better address generation right away for the simd load
>> > > cases rather than having LRA/reload having to reload the address
>> > > into
>> > > a register.
>> > I'm not sure if changing register_offset cost would make a
>> > difference,
>> > since costs are usually used during optimization, not during
>> > address
>> > generation.  This is something that I didn't think to try
>> > though.  I
>> > can try taking a look at this.
>> It does taken into account when fwprop is propagating the addition
>> into
>> the MEM (the tree level is always a_1 = POINTER_PLUS_EXPR;
>> MEM_REF(a_1)).
>> IV-OPTS will produce much better code if the address_cost is correct.
>>
>> It looks like no other pass (combine, etc.) would take that into
>> account except for postreload CSE but maybe they should.
>
> I tried increasing the cost of register_offset.  This got rid of the
> reg+reg addressing mode in the middle of the main loop for lmbench
> stream copy, but did not eliminate it after the main loop.
>
> The tree optimized dump has
>   _52 = a_15 + _51;
>   _53 = c_17 + _51;
>   _54 = *_52;
>   *_53 = _54;
> and the RTL expand dump has
> (insn 64 63 65 10 (set (reg:DF 96 [ _54 ])
>         (mem:DF (plus:DI (reg/v/f:DI 78 [ a ])
>                 (reg:DI 93 [ _51 ])) [3 *_52+0 S8 A64])) "stream.c":223
> -1
>      (nil))
> (insn 65 64 66 10 (set (mem:DF (plus:DI (reg/v/f:DI 79 [ c ])
>                 (reg:DI 93 [ _51 ])) [3 *_53+0 S8 A64])
>         (reg:DF 96 [ _54 ])) "stream.c":223 -1
>      (nil))
>
> That may be fixable, but there is a bigger problem here which is that
> increasing the costs of register_offset affects both loads and stores.
>  On falkor, it is only quad-word stores that are inefficient with a
> reg+reg address.  Quad-word loads with a reg+reg address are faster
> than the equivalent add/ldr.  Disabling reg+reg address for quad-word
> loads will hurt performance.
>
> Since the address cost stuff makes no distinction between loads and
> stores, I see no way to get the result I need by using address costs.
>  I can only get the result I need by modifying the md file.
>
>> > I did try writing a patch to modify predicates to disallow reg
>> > offset
>> > for 128bit modes, and that got complicated, as I had to split apart
>> > a
>> > number of patterns in the aarch64-simd.md file that accept both VD
>> > and
>> > VQ modes.  I ended up with a patch 3-4 times as big as the one I
>> > submitted, without any additional performance improvement, so it
>> > wasn't worth the trouble.
>> >
>> > >
>> > > * Maybe adding a testcase to the testsuite to show this change.
>> > Yes, I can add a testcase.
>> >
>> > >
>> > > One extra comment:
>> > > * should we change the generic tuning to avoid reg+reg for 128bit
>> > > modes?
>> > Are there other targets with a similar problem?  I only know that
>> > it
>> > is a problem for Falkor.  It might be a loss for some targets as it
>> > is
>> > replacing one instruction with two.
>> Well that is why I was suggesting the address cost model change.
>> Because the cost model change actually might provide better code in
>> the first place and still allow for reasonable generic code to be
>> produced.
>
> The patch I posted only affects Falkor.  It doesn't change generic
> code.  I don't know of any reason why we need to change generic code
> here.
>
> The Falkor core has out-of-order execution and multiple function units,
> so there isn't any noticeable performance gain from trying to fix this
> earlier.  Fixing this with a md file change gives optimal performance
> for the testcases I've looked at.
>
> Since I'm no longer at Linaro, I expect that someone else will take
> over this patch submission.  I will create a bug report to document the
> issue, to make it easier to track it and hand off to someone else.
>
> Jim
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH, AArch64] Disable reg offset in quad-word store for Falkor.
  2017-10-31  3:40           ` Kugan Vivekanandarajah
@ 2017-10-31 16:18             ` Jim Wilson
  2017-11-01 22:59               ` Kugan Vivekanandarajah
  0 siblings, 1 reply; 10+ messages in thread
From: Jim Wilson @ 2017-10-31 16:18 UTC (permalink / raw)
  To: Kugan Vivekanandarajah; +Cc: Andrew Pinski, gcc-patches

On Tue, 2017-10-31 at 14:35 +1100, Kugan Vivekanandarajah wrote:
> Ping ?
> 
> I see that Jim has clarified the comments from Andrew.

Andrew also suggested that we add a testcase to the testsuite.  I
didn't do that.  I did put a testcase to reproduce in the bug report.
 See
    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82533
It should be a relatively simple matter of taking that testcase,
compiling to assembly with -mcpu=falkor, then doing a pattern search
for a str q [r+r] instruction, and fail if it is present.

Jim

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH, AArch64] Disable reg offset in quad-word store for Falkor.
  2017-10-31 16:18             ` Jim Wilson
@ 2017-11-01 22:59               ` Kugan Vivekanandarajah
  0 siblings, 0 replies; 10+ messages in thread
From: Kugan Vivekanandarajah @ 2017-11-01 22:59 UTC (permalink / raw)
  To: Jim Wilson; +Cc: Andrew Pinski, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 718 bytes --]

Hi,

On 1 November 2017 at 03:12, Jim Wilson <wilson@tuliptree.org> wrote:
> On Tue, 2017-10-31 at 14:35 +1100, Kugan Vivekanandarajah wrote:
>> Ping ?
>>
>> I see that Jim has clarified the comments from Andrew.
>
> Andrew also suggested that we add a testcase to the testsuite.  I
> didn't do that.  I did put a testcase to reproduce in the bug report.
>  See
>     https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82533
> It should be a relatively simple matter of taking that testcase,
> compiling to assembly with -mcpu=falkor, then doing a pattern search
> for a str q [r+r] instruction, and fail if it is present.
>
> Jim
>

Here is the testcase. Is the original patch OK with the testcase attached.

Thanks,
Kugan

[-- Attachment #2: test.txt --]
[-- Type: text/plain, Size: 499 bytes --]

diff --git a/gcc/testsuite/gcc.target/aarch64/pr82533.c b/gcc/testsuite/gcc.target/aarch64/pr82533.c
new file mode 100644
index 0000000..fa28ffa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr82533.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-mcpu=falkor -O2 -ftree-vectorize" } */
+
+void
+copy (int N, double *c, double *a)
+{
+  for (int i = 0; i < N; ++i)
+    c[i] = a[i];
+}
+
+/* { dg-final { scan-assembler-not "str\tq\[0-9\]+, \\\[x\[0-9\]+, x\[0-9\]+\\\]" } } */

[-- Attachment #3: test.log --]
[-- Type: text/x-log, Size: 143 bytes --]

gcc/testsuite/ChangeLog:

2017-11-02  Kugan Vivekanandarajah  <kugan.vivekanandarajah@linaro.org>

	* gcc.target/aarch64/pr82533.c: New test.


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH, AArch64] Disable reg offset in quad-word store for Falkor.
@ 2017-09-22 20:06 Wilco Dijkstra
  0 siblings, 0 replies; 10+ messages in thread
From: Wilco Dijkstra @ 2017-09-22 20:06 UTC (permalink / raw)
  To: jim.wilson, Andrew.pinski; +Cc: nd, GCC Patches

Hi Jim,

This looks like a general issue with reg+reg addressing modes being generated in
cases where it is not correct. I haven't looked at lmbench for a while, but it generated
absolutely horrible code like:

add x1, x0, #120
ldr v0, [x2, x1]
add x1, x0, #128
ldr v1, [x2, x1]

If this is still happening, we need to fix this in a general way as this is bad for any
CPU even if reg+reg is fast. Reduced testcases for this would be welcome as it's
likely many other targets are affected. A while ago I posted a patch to reassociate
(x + C1) * C2 -> x * C2 + C3 to improve cases like this.

Note we already support adjusting addressing costs. There are several other CPUs
which increase the reg+reg costs. So that's where I would start - assuming reg+reg
addressing mode was correctly used.

Finally, if you really want to completely disable a particular addressing mode, it's
best done in classify_address rather than changing the md patterns. My experience
is that if you use anything other than the standard 'm' constraint, GCC reload starts
to generate inefficient code even if the pattern should still apply. I have posted
several patches to use 'm' more to get better spill code and efficient expansions
if the offset happens to be too large.

Wilco

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2017-11-01 22:59 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-09-22 15:49 [PATCH, AArch64] Disable reg offset in quad-word store for Falkor Jim Wilson
2017-09-22 15:59 ` Jim Wilson
2017-09-22 17:59   ` Andrew Pinski
2017-09-22 18:39     ` Jim Wilson
2017-09-22 21:11       ` Andrew Pinski
2017-10-12 21:54         ` Jim Wilson
2017-10-31  3:40           ` Kugan Vivekanandarajah
2017-10-31 16:18             ` Jim Wilson
2017-11-01 22:59               ` Kugan Vivekanandarajah
2017-09-22 20:06 Wilco Dijkstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).