public inbox for gcc-patches@gcc.gnu.org
* RE: [PATCH][AArch64] Add TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
@ 2015-12-15 10:54 Wilco Dijkstra
  2015-12-16  9:54 ` James Greenhalgh
  0 siblings, 1 reply; 8+ messages in thread
From: Wilco Dijkstra @ 2015-12-15 10:54 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: gcc-patches, nd

ping

> -----Original Message-----
> From: Wilco Dijkstra [mailto:Wilco.Dijkstra@arm.com]
> Sent: 06 November 2015 20:06
> To: 'gcc-patches@gcc.gnu.org'
> Subject: [PATCH][AArch64] Add TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
> 
> This patch adds support for the TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS hook. When the cost of GENERAL_REGS and
> FP_REGS is identical, the register allocator always uses ALL_REGS even when it has a much higher cost. The hook changes the class to
> either FP_REGS or GENERAL_REGS depending on the mode of the register. This results in better register allocation overall, fewer spills
> and reduced codesize - particularly in SPEC2006 gamess.
> 
> GCC regression passes with several minor fixes.
> 
> OK for commit?
> 
> ChangeLog:
> 2015-11-06  Wilco Dijkstra  <wdijkstr@arm.com>
> 
> 	* gcc/config/aarch64/aarch64.c
> 	(TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS): New define.
> 	(aarch64_ira_change_pseudo_allocno_class): New function.
> 	* gcc/testsuite/gcc.target/aarch64/cvtf_1.c: Build with -O2.
> 	* gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> 	(test_corners_sisd_di): Improve force to SIMD register.
> 	(test_corners_sisd_si): Likewise.
> 	* gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c: Build with -O2.
> 	* gcc/testsuite/gcc.target/aarch64/vect-ld1r-compile-fp.c:
> 	Remove scan-assembler check for ldr.
> 
> --
>  gcc/config/aarch64/aarch64.c                       | 22 ++++++++++++++++++++++
>  gcc/testsuite/gcc.target/aarch64/cvtf_1.c          |  2 +-
>  gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c  |  4 ++--
>  gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c     |  2 +-
>  .../gcc.target/aarch64/vect-ld1r-compile-fp.c      |  1 -
>  5 files changed, 26 insertions(+), 5 deletions(-)
> 
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 6da7245..9b60666 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -597,6 +597,24 @@ aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
>      error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
>  }
> 
> +/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
> +   The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
> +   the same cost even if ALL_REGS has a much larger cost.  This results in bad
> +   allocations and spilling.  To avoid this we force the class to GENERAL_REGS
> +   if the mode is integer.  */
> +
> +static reg_class_t
> +aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class)
> +{
> +  enum machine_mode mode;
> +
> +  if (allocno_class != ALL_REGS)
> +    return allocno_class;
> +
> +  mode = PSEUDO_REGNO_MODE (regno);
> +  return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
> +}
> +
>  static unsigned int
>  aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
>  {
> @@ -13113,6 +13131,10 @@ aarch64_promoted_type (const_tree t)
>  #undef  TARGET_INIT_BUILTINS
>  #define TARGET_INIT_BUILTINS  aarch64_init_builtins
> 
> +#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
> +#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
> +  aarch64_ira_change_pseudo_allocno_class
> +
>  #undef TARGET_LEGITIMATE_ADDRESS_P
>  #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
> 
> diff --git a/gcc/testsuite/gcc.target/aarch64/cvtf_1.c b/gcc/testsuite/gcc.target/aarch64/cvtf_1.c
> index 5f2ff81..96501db 100644
> --- a/gcc/testsuite/gcc.target/aarch64/cvtf_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/cvtf_1.c
> @@ -1,5 +1,5 @@
>  /* { dg-do run } */
> -/* { dg-options "-save-temps -fno-inline -O1" } */
> +/* { dg-options "-save-temps -fno-inline -O2" } */
> 
>  #define FCVTDEF(ftype,itype) \
>  void \
> diff --git a/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c b/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> index 363f554..8465c89 100644
> --- a/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> +++ b/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> @@ -186,9 +186,9 @@ test_corners_sisd_di (Int64x1 b)
>  {
>    force_simd_di (b);
>    b = b >> 63;
> +  force_simd_di (b);
>    b = b >> 0;
>    b += b >> 65; /* { dg-warning "right shift count >= width of type" } */
> -  force_simd_di (b);
> 
>    return b;
>  }
> @@ -199,9 +199,9 @@ test_corners_sisd_si (Int32x1 b)
>  {
>    force_simd_si (b);
>    b = b >> 31;
> +  force_simd_si (b);
>    b = b >> 0;
>    b += b >> 33; /* { dg-warning "right shift count >= width of type" } */
> -  force_simd_si (b);
> 
>    return b;
>  }
> diff --git a/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c b/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
> index a49db3e..c5a9c52 100644
> --- a/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
> +++ b/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
> @@ -1,6 +1,6 @@
>  /* Test vdup_lane intrinsics work correctly.  */
>  /* { dg-do run } */
> -/* { dg-options "-O1 --save-temps" } */
> +/* { dg-options "-O2 --save-temps" } */
> 
>  #include <arm_neon.h>
> 
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-ld1r-compile-fp.c b/gcc/testsuite/gcc.target/aarch64/vect-ld1r-compile-fp.c
> index 66e0168..4711c61 100644
> --- a/gcc/testsuite/gcc.target/aarch64/vect-ld1r-compile-fp.c
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-ld1r-compile-fp.c
> @@ -8,6 +8,5 @@ DEF (float)
>  DEF (double)
> 
>  /* { dg-final { scan-assembler "ld1r\\t\{v\[0-9\]+\.4s"} } */
> -/* { dg-final { scan-assembler "ldr\\t\x\[0-9\]+"} } */
>  /* { dg-final { scan-assembler "ld1r\\t\{v\[0-9\]+\.2d"} } */
> 
> --
> 1.8.3


* Re: [PATCH][AArch64] Add TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
  2015-12-15 10:54 [PATCH][AArch64] Add TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS Wilco Dijkstra
@ 2015-12-16  9:54 ` James Greenhalgh
  2015-12-16 13:05   ` Wilco Dijkstra
  0 siblings, 1 reply; 8+ messages in thread
From: James Greenhalgh @ 2015-12-16  9:54 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: gcc-patches, nd

On Tue, Dec 15, 2015 at 10:54:49AM +0000, Wilco Dijkstra wrote:
> ping
> 
> > -----Original Message-----
> > From: Wilco Dijkstra [mailto:Wilco.Dijkstra@arm.com]
> > Sent: 06 November 2015 20:06
> > To: 'gcc-patches@gcc.gnu.org'
> > Subject: [PATCH][AArch64] Add TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
> > 
> > This patch adds support for the TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
> > hook. When the cost of GENERAL_REGS and FP_REGS is identical, the register
> > allocator always uses ALL_REGS even when it has a much higher cost. The
> > hook changes the class to either FP_REGS or GENERAL_REGS depending on the
> > mode of the register. This results in better register allocation overall,
> > fewer spills and reduced codesize - particularly in SPEC2006 gamess.
> > 
> > GCC regression passes with several minor fixes.
> > 
> > OK for commit?
> > 
> > ChangeLog:
> > 2015-11-06  Wilco Dijkstra  <wdijkstr@arm.com>
> > 
> > 	* gcc/config/aarch64/aarch64.c
> > 	(TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS): New define.
> > 	(aarch64_ira_change_pseudo_allocno_class): New function.
> > 	* gcc/testsuite/gcc.target/aarch64/cvtf_1.c: Build with -O2.
> > 	* gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> > 	(test_corners_sisd_di): Improve force to SIMD register.
> > 	(test_corners_sisd_si): Likewise.
> > 	* gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c: Build with -O2.
> > 	* gcc/testsuite/gcc.target/aarch64/vect-ld1r-compile-fp.c:
> > 	Remove scan-assembler check for ldr.

Drop the gcc/ from the ChangeLog.

> > --
> >  gcc/config/aarch64/aarch64.c                       | 22 ++++++++++++++++++++++
> >  gcc/testsuite/gcc.target/aarch64/cvtf_1.c          |  2 +-
> >  gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c  |  4 ++--
> >  gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c     |  2 +-
> >  .../gcc.target/aarch64/vect-ld1r-compile-fp.c      |  1 -

These testsuite changes concern me a bit, and you don't mention them beyond
saying they are minor fixes...

> > diff --git a/gcc/testsuite/gcc.target/aarch64/cvtf_1.c b/gcc/testsuite/gcc.target/aarch64/cvtf_1.c
> > index 5f2ff81..96501db 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/cvtf_1.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/cvtf_1.c
> > @@ -1,5 +1,5 @@
> >  /* { dg-do run } */
> > -/* { dg-options "-save-temps -fno-inline -O1" } */
> > +/* { dg-options "-save-temps -fno-inline -O2" } */

This one says we have a code-gen regression at -O1 ?

> > 
> >  #define FCVTDEF(ftype,itype) \
> >  void \
> > diff --git a/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c b/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> > index 363f554..8465c89 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> > @@ -186,9 +186,9 @@ test_corners_sisd_di (Int64x1 b)
> >  {
> >    force_simd_di (b);
> >    b = b >> 63;
> > +  force_simd_di (b);
> >    b = b >> 0;
> >    b += b >> 65; /* { dg-warning "right shift count >= width of type" } */
> > -  force_simd_di (b);

This one I don't understand, but seems to say that we've decided to move
b out of FP_REGS after getting it in there for b = b << 63; ? So this is
another register allocator regression?

> > diff --git a/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c b/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
> > index a49db3e..c5a9c52 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
> > @@ -1,6 +1,6 @@
> >  /* Test vdup_lane intrinsics work correctly.  */
> >  /* { dg-do run } */
> > -/* { dg-options "-O1 --save-temps" } */
> > +/* { dg-options "-O2 --save-temps" } */

Another -O1 regression ?

> > 
> >  #include <arm_neon.h>
> > 
> > diff --git a/gcc/testsuite/gcc.target/aarch64/vect-ld1r-compile-fp.c b/gcc/testsuite/gcc.target/aarch64/vect-ld1r-compile-fp.c
> > index 66e0168..4711c61 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/vect-ld1r-compile-fp.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/vect-ld1r-compile-fp.c
> > @@ -8,6 +8,5 @@ DEF (float)
> >  DEF (double)
> > 
> >  /* { dg-final { scan-assembler "ld1r\\t\{v\[0-9\]+\.4s"} } */
> > -/* { dg-final { scan-assembler "ldr\\t\x\[0-9\]+"} } */
> >  /* { dg-final { scan-assembler "ld1r\\t\{v\[0-9\]+\.2d"} } */

This one is fine, I don't really understand what it was hoping to catch
in the first place!

Could you go in to some detail about why your testsuite changes are correct?

Thanks,
James


* RE: [PATCH][AArch64] Add TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
  2015-12-16  9:54 ` James Greenhalgh
@ 2015-12-16 13:05   ` Wilco Dijkstra
  2015-12-16 14:27     ` James Greenhalgh
  0 siblings, 1 reply; 8+ messages in thread
From: Wilco Dijkstra @ 2015-12-16 13:05 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: gcc-patches, nd

James Greenhalgh wrote:
> On Tue, Dec 15, 2015 at 10:54:49AM +0000, Wilco Dijkstra wrote:
> > ping
> >
> > > -----Original Message-----
> > > From: Wilco Dijkstra [mailto:Wilco.Dijkstra@arm.com]
> > > Sent: 06 November 2015 20:06
> > > To: 'gcc-patches@gcc.gnu.org'
> > > Subject: [PATCH][AArch64] Add TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
> > >
> > > This patch adds support for the TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
> > > hook. When the cost of GENERAL_REGS and FP_REGS is identical, the register
> > > allocator always uses ALL_REGS even when it has a much higher cost. The
> > > hook changes the class to either FP_REGS or GENERAL_REGS depending on the
> > > mode of the register. This results in better register allocation overall,
> > > fewer spills and reduced codesize - particularly in SPEC2006 gamess.
> > >
> > > GCC regression passes with several minor fixes.
> > >
> > > OK for commit?
> > >
> > > ChangeLog:
> > > 2015-11-06  Wilco Dijkstra  <wdijkstr@arm.com>
> > >
> > > 	* gcc/config/aarch64/aarch64.c
> > > 	(TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS): New define.
> > > 	(aarch64_ira_change_pseudo_allocno_class): New function.
> > > 	* gcc/testsuite/gcc.target/aarch64/cvtf_1.c: Build with -O2.
> > > 	* gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> > > 	(test_corners_sisd_di): Improve force to SIMD register.
> > > 	(test_corners_sisd_si): Likewise.
> > > 	* gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c: Build with -O2.
> > > 	* gcc/testsuite/gcc.target/aarch64/vect-ld1r-compile-fp.c:
> > > 	Remove scan-assembler check for ldr.
> 
> Drop the gcc/ from the ChangeLog.
> 
> > > --
> > >  gcc/config/aarch64/aarch64.c                       | 22 ++++++++++++++++++++++
> > >  gcc/testsuite/gcc.target/aarch64/cvtf_1.c          |  2 +-
> > >  gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c  |  4 ++--
> > >  gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c     |  2 +-
> > >  .../gcc.target/aarch64/vect-ld1r-compile-fp.c      |  1 -
> 
> These testsuite changes concern me a bit, and you don't mention them beyond
> saying they are minor fixes...

Well any changes to register allocator preferencing would cause fallout in tests that
are assuming which register is allocated, especially if they use nasty inline assembler
hacks to do so...

> > > diff --git a/gcc/testsuite/gcc.target/aarch64/cvtf_1.c b/gcc/testsuite/gcc.target/aarch64/cvtf_1.c
> > > index 5f2ff81..96501db 100644
> > > --- a/gcc/testsuite/gcc.target/aarch64/cvtf_1.c
> > > +++ b/gcc/testsuite/gcc.target/aarch64/cvtf_1.c
> > > @@ -1,5 +1,5 @@
> > >  /* { dg-do run } */
> > > -/* { dg-options "-save-temps -fno-inline -O1" } */
> > > +/* { dg-options "-save-temps -fno-inline -O2" } */
> 
> This one says we have a code-gen regression at -O1 ?

It avoids a regalloc bug - see below.

> > >  #define FCVTDEF(ftype,itype) \
> > >  void \
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c b/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> > > index 363f554..8465c89 100644
> > > --- a/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> > > +++ b/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> > > @@ -186,9 +186,9 @@ test_corners_sisd_di (Int64x1 b)
> > >  {
> > >    force_simd_di (b);
> > >    b = b >> 63;
> > > +  force_simd_di (b);
> > >    b = b >> 0;
> > >    b += b >> 65; /* { dg-warning "right shift count >= width of type" } */
> > > -  force_simd_di (b);
> 
> This one I don't understand, but seems to say that we've decided to move
> b out of FP_REGS after getting it in there for b = b << 63; ? So this is
> another register allocator regression?

No, basically the register allocator is now making better decisions as to where to
allocate integer variables. It will only allocate them to FP registers if they are primarily
used by other FP operations. The force_simd_di inline assembler tries to mimic FP uses,
and if there are enough of them at the right places then everything works as expected.
If however you do 3 consecutive integer operations then the allocator will now correctly
prefer to allocate them to the integer registers (while previously it wouldn't, which is
inefficient).

> > > diff --git a/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c b/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
> > > index a49db3e..c5a9c52 100644
> > > --- a/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
> > > +++ b/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
> > > @@ -1,6 +1,6 @@
> > >  /* Test vdup_lane intrinsics work correctly.  */
> > >  /* { dg-do run } */
> > > -/* { dg-options "-O1 --save-temps" } */
> > > +/* { dg-options "-O2 --save-temps" } */
> 
> Another -O1 regression ?

No, it's triggering a bug in the -O1 register preferencing that causes incorrect preferences to be
selected despite the costs being right. The cost calculation with -O1 for eg. 
wrap_vdupb_lane_s8_0() in vdup_lane_2.c:

Pass 0 for finding pseudo/allocno costs

    r79: preferred FP_REGS, alternative GENERAL_REGS, allocno GENERAL_REGS
    a1 (r79,l0) best GENERAL_REGS, allocno GENERAL_REGS
    r78: preferred GENERAL_REGS, alternative NO_REGS, allocno GENERAL_REGS
    a0 (r78,l0) best GENERAL_REGS, allocno GENERAL_REGS

  a0(r78,l0) costs: CALLER_SAVE_REGS:5000,5000 GENERAL_REGS:5000,5000 FP_LO_REGS:5000,5000 FP_REGS:5000,5000 ALL_REGS:10000,10000 MEM:9000,9000
  a1(r79,l0) costs: CALLER_SAVE_REGS:5000,5000 GENERAL_REGS:5000,5000 FP_LO_REGS:0,0 FP_REGS:0,0 ALL_REGS:10000,10000 MEM:9000,9000

So it correctly prefers FP_REGS for r79 as it has the lowest cost, but then forces the allocno and
best register to GENERAL_REGS... We could work around it by not having the "r" variant first in
the aarch64_get_lane patterns and further discouraging its use via "?r", but that's a different patch.
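
For reference, the function being costed above has roughly this shape (a
reconstruction for illustration only, not the exact vdup_lane_2.c source):
a lane extract whose scalar result can sit in either register file, which is
exactly where the FP_REGS vs GENERAL_REGS preference and the ordering of the
"w"/"r" alternatives matter:

#include <arm_neon.h>

/* Sketch of a wrap_vdupb_lane_s8_0-style wrapper: the extracted lane can
   be produced into an FP register (dup) or a general register (umov).  */
int8_t
wrap_vdupb_lane_s8_0 (int8x8_t a)
{
  return vdupb_lane_s8 (a, 0);
}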

Wilco


* Re: [PATCH][AArch64] Add TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
  2015-12-16 13:05   ` Wilco Dijkstra
@ 2015-12-16 14:27     ` James Greenhalgh
  2015-12-17 13:38       ` Wilco Dijkstra
  0 siblings, 1 reply; 8+ messages in thread
From: James Greenhalgh @ 2015-12-16 14:27 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: gcc-patches, nd

On Wed, Dec 16, 2015 at 01:05:21PM +0000, Wilco Dijkstra wrote:
> James Greenhalgh wrote:
> > On Tue, Dec 15, 2015 at 10:54:49AM +0000, Wilco Dijkstra wrote:
> > > ping
> > >
> > > > -----Original Message-----
> > > > From: Wilco Dijkstra [mailto:Wilco.Dijkstra@arm.com]
> > > > Sent: 06 November 2015 20:06
> > > > To: 'gcc-patches@gcc.gnu.org'
> > > > Subject: [PATCH][AArch64] Add TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
> > > >
> > > > This patch adds support for the TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
> > > > hook. When the cost of GENERAL_REGS and FP_REGS is identical, the register
> > > > allocator always uses ALL_REGS even when it has a much higher cost. The
> > > > hook changes the class to either FP_REGS or GENERAL_REGS depending on the
> > > > mode of the register. This results in better register allocation overall,
> > > > fewer spills and reduced codesize - particularly in SPEC2006 gamess.
> > > >
> > > > GCC regression passes with several minor fixes.
> > > >
> > > > OK for commit?
> > > >
> > > > ChangeLog:
> > > > 2015-11-06  Wilco Dijkstra  <wdijkstr@arm.com>
> > > >
> > > > 	* gcc/config/aarch64/aarch64.c
> > > > 	(TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS): New define.
> > > > 	(aarch64_ira_change_pseudo_allocno_class): New function.
> > > > 	* gcc/testsuite/gcc.target/aarch64/cvtf_1.c: Build with -O2.
> > > > 	* gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> > > > 	(test_corners_sisd_di): Improve force to SIMD register.
> > > > 	(test_corners_sisd_si): Likewise.
> > > > 	* gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c: Build with -O2.
> > > > 	* gcc/testsuite/gcc.target/aarch64/vect-ld1r-compile-fp.c:
> > > > 	Remove scan-assembler check for ldr.
> > 
> > Drop the gcc/ from the ChangeLog.
> > 
> > > > --
> > > >  gcc/config/aarch64/aarch64.c                       | 22 ++++++++++++++++++++++
> > > >  gcc/testsuite/gcc.target/aarch64/cvtf_1.c          |  2 +-
> > > >  gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c  |  4 ++--
> > > >  gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c     |  2 +-
> > > >  .../gcc.target/aarch64/vect-ld1r-compile-fp.c      |  1 -
> > 
> > These testsuite changes concern me a bit, and you don't mention them beyond
> > saying they are minor fixes...
> 
> Well any changes to register allocator preferencing would cause fallout in
> tests that are assuming which register is allocated, especially if they use
> nasty inline assembler hacks to do so...

Sure, but the testcases here each operate on data that should live in
FP_REGS given the initial conditions that the nasty hacks try to mimic -
that's what makes the regressions notable.

>
> > > >  #define FCVTDEF(ftype,itype) \
> > > >  void \
> > > > diff --git a/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c b/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> > > > index 363f554..8465c89 100644
> > > > --- a/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> > > > +++ b/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> > > > @@ -186,9 +186,9 @@ test_corners_sisd_di (Int64x1 b)
> > > >  {
> > > >    force_simd_di (b);
> > > >    b = b >> 63;
> > > > +  force_simd_di (b);
> > > >    b = b >> 0;
> > > >    b += b >> 65; /* { dg-warning "right shift count >= width of type" } */
> > > > -  force_simd_di (b);
> > 
> > This one I don't understand, but seems to say that we've decided to move
> > b out of FP_REGS after getting it in there for b = b << 63; ? So this is
> > another register allocator regression?
> 
> No, basically the register allocator is now making better decisions as to
> where to allocate integer variables. It will only allocate them to FP
> registers if they are primarily used by other FP operations. The
> force_simd_di inline assembler tries to mimic FP uses, and if there are
> enough of them at the right places then everything works as expected.  If
> however you do 3 consecutive integer operations then the allocator will now
> correctly prefer to allocate them to the integer registers (while previously
> it wouldn't, which is inefficient).

I'm not sure I understand this argument in the abstract (though I believe
it for some of the supported cores for the AArch64 target). At an abstract
level, given a set of operations which can execute in either FP_REGS or
GENERAL_REGS and initial and post conditions that allocate all input and
output registers from those operations to FP_REGS, I would expect those
operations to take place using FP_REGS? Your patch seems to break this
expectation?

> > > > diff --git a/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c b/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
> > > > index a49db3e..c5a9c52 100644
> > > > --- a/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
> > > > +++ b/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
> > > > @@ -1,6 +1,6 @@
> > > >  /* Test vdup_lane intrinsics work correctly.  */
> > > >  /* { dg-do run } */
> > > > -/* { dg-options "-O1 --save-temps" } */
> > > > +/* { dg-options "-O2 --save-temps" } */
> > 
> > Another -O1 regression ?
> 
> No, it's triggering a bug in the -O1 register preferencing that causes incorrect preferences to be
> selected despite the costs being right. The cost calculation with -O1 for eg. 
> wrap_vdupb_lane_s8_0() in vdup_lane_2.c:
> 
> Pass 0 for finding pseudo/allocno costs
> 
>     r79: preferred FP_REGS, alternative GENERAL_REGS, allocno GENERAL_REGS
>     a1 (r79,l0) best GENERAL_REGS, allocno GENERAL_REGS
>     r78: preferred GENERAL_REGS, alternative NO_REGS, allocno GENERAL_REGS
>     a0 (r78,l0) best GENERAL_REGS, allocno GENERAL_REGS
> 
>   a0(r78,l0) costs: CALLER_SAVE_REGS:5000,5000 GENERAL_REGS:5000,5000 FP_LO_REGS:5000,5000 FP_REGS:5000,5000 ALL_REGS:10000,10000 MEM:9000,9000
>   a1(r79,l0) costs: CALLER_SAVE_REGS:5000,5000 GENERAL_REGS:5000,5000 FP_LO_REGS:0,0 FP_REGS:0,0 ALL_REGS:10000,10000 MEM:9000,9000
> 
> So it correctly prefers FP_REGS for r79 as it has the lowest cost, but then
> forces the allocno and best register to GENERAL_REGS... We could work around
> it by not having the "r" variant first in the aarch64_get_lane patterns and
> further discouraging its use via "?r", but that's a different patch.

Well, that patch (moving "r" alternative away from first) does seem to
better fit with what we've done elsewhere in aarch64-simd.md (e.g.
aarch64_combinez below). Does making this change obviate the need to
change these testcases to -O1? If so, I'd rather break them with your patch
and fix it in a follow-up than paper over the cracks.

Thanks,
James


* RE: [PATCH][AArch64] Add TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
  2015-12-16 14:27     ` James Greenhalgh
@ 2015-12-17 13:38       ` Wilco Dijkstra
  2016-01-26 17:39         ` Wilco Dijkstra
  0 siblings, 1 reply; 8+ messages in thread
From: Wilco Dijkstra @ 2015-12-17 13:38 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: gcc-patches, nd

James Greenhalgh wrote:
> On Wed, Dec 16, 2015 at 01:05:21PM +0000, Wilco Dijkstra wrote:
> > James Greenhalgh wrote:
> > > On Tue, Dec 15, 2015 at 10:54:49AM +0000, Wilco Dijkstra wrote:
> > > > ping
> > > >
> > > > > -----Original Message-----
> > > > > From: Wilco Dijkstra [mailto:Wilco.Dijkstra@arm.com]
> > > > > Sent: 06 November 2015 20:06
> > > > > To: 'gcc-patches@gcc.gnu.org'
> > > > > Subject: [PATCH][AArch64] Add TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
> > > > >
> > > > > This patch adds support for the TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
> > > > > hook. When the cost of GENERAL_REGS and FP_REGS is identical, the register
> > > > > allocator always uses ALL_REGS even when it has a much higher cost. The
> > > > > hook changes the class to either FP_REGS or GENERAL_REGS depending on the
> > > > > mode of the register. This results in better register allocation overall,
> > > > > fewer spills and reduced codesize - particularly in SPEC2006 gamess.
> > > > >
> > > > > GCC regression passes with several minor fixes.
> > > > >
> > > > > OK for commit?
> > > > >
> > > > > ChangeLog:
> > > > > 2015-11-06  Wilco Dijkstra  <wdijkstr@arm.com>
> > > > >
> > > > > 	* gcc/config/aarch64/aarch64.c
> > > > > 	(TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS): New define.
> > > > > 	(aarch64_ira_change_pseudo_allocno_class): New function.
> > > > > 	* gcc/testsuite/gcc.target/aarch64/cvtf_1.c: Build with -O2.
> > > > > 	* gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> > > > > 	(test_corners_sisd_di): Improve force to SIMD register.
> > > > > 	(test_corners_sisd_si): Likewise.
> > > > > 	* gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c: Build with -O2.
> > > > > 	* gcc/testsuite/gcc.target/aarch64/vect-ld1r-compile-fp.c:
> > > > > 	Remove scan-assembler check for ldr.
> > >
> > > Drop the gcc/ from the ChangeLog.
> > >
> > > > > --
> > > > >  gcc/config/aarch64/aarch64.c                       | 22 ++++++++++++++++++++++
> > > > >  gcc/testsuite/gcc.target/aarch64/cvtf_1.c          |  2 +-
> > > > >  gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c  |  4 ++--
> > > > >  gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c     |  2 +-
> > > > >  .../gcc.target/aarch64/vect-ld1r-compile-fp.c      |  1 -
> > >
> > > These testsuite changes concern me a bit, and you don't mention them beyond
> > > saying they are minor fixes...
> >
> > Well any changes to register allocator preferencing would cause fallout in
> > tests that are assuming which register is allocated, especially if they use
> > nasty inline assembler hacks to do so...
> 
> Sure, but the testcases here each operate on data that should live in
> FP_REGS given the initial conditions that the nasty hacks try to mimic -
> that's what makes the regressions notable.
> 
> >
> > > > >  #define FCVTDEF(ftype,itype) \
> > > > >  void \
> > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c b/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> > > > > index 363f554..8465c89 100644
> > > > > --- a/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> > > > > +++ b/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> > > > > @@ -186,9 +186,9 @@ test_corners_sisd_di (Int64x1 b)
> > > > >  {
> > > > >    force_simd_di (b);
> > > > >    b = b >> 63;
> > > > > +  force_simd_di (b);
> > > > >    b = b >> 0;
> > > > >    b += b >> 65; /* { dg-warning "right shift count >= width of type" } */
> > > > > -  force_simd_di (b);
> > >
> > > This one I don't understand, but seems to say that we've decided to move
> > > b out of FP_REGS after getting it in there for b = b << 63; ? So this is
> > > another register allocator regression?
> >
> > No, basically the register allocator is now making better decisions as to
> > where to allocate integer variables. It will only allocate them to FP
> > registers if they are primarily used by other FP operations. The
> > force_simd_di inline assembler tries to mimic FP uses, and if there are
> > enough of them at the right places then everything works as expected.  If
> > however you do 3 consecutive integer operations then the allocator will now
> > correctly prefer to allocate them to the integer registers (while previously
> > it wouldn't, which is inefficient).
> 
> I'm not sure I understand this argument in the abstract (though I believe
> it for some of the supported cores for the AArch64 target). At an abstract
> level, given a set of operations which can execute in either FP_REGS or
> GENERAL_REGS and initial and post conditions that allocate all input and
> output registers from those operations to FP_REGS, I would expect those
> operations to take place using FP_REGS? Your patch seems to break this
> expectation?

No, my patch doesn't break that expectation. The goal is that if the cost of
allocating to either integer or FP registers is the same, we prefer the most
natural register file based on the type. We'll continue to allocate integer
operations to FP_REGS if that has the lowest cost.

Like I mentioned in the explanation, the issue is that the register allocator simply
ignores the much higher cost of ALL_REGS and uses it even though it results in
very suboptimal allocations and a large number of redundant int<->fp moves.
This patch fixes this by forcing the preference to FP_REGS or GENERAL_REGS if it
is ALL_REGS.

> > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c b/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
> > > > > index a49db3e..c5a9c52 100644
> > > > > --- a/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
> > > > > +++ b/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
> > > > > @@ -1,6 +1,6 @@
> > > > >  /* Test vdup_lane intrinsics work correctly.  */
> > > > >  /* { dg-do run } */
> > > > > -/* { dg-options "-O1 --save-temps" } */
> > > > > +/* { dg-options "-O2 --save-temps" } */
> > >
> > > Another -O1 regression ?
> >
> > No, it's triggering a bug in the -O1 register preferencing that causes incorrect preferences to be
> > selected despite the costs being right. The cost calculation with -O1 for eg.
> > wrap_vdupb_lane_s8_0() in vdup_lane_2.c:
> >
> > Pass 0 for finding pseudo/allocno costs
> >
> >     r79: preferred FP_REGS, alternative GENERAL_REGS, allocno GENERAL_REGS
> >     a1 (r79,l0) best GENERAL_REGS, allocno GENERAL_REGS
> >     r78: preferred GENERAL_REGS, alternative NO_REGS, allocno GENERAL_REGS
> >     a0 (r78,l0) best GENERAL_REGS, allocno GENERAL_REGS
> >
> >   a0(r78,l0) costs: CALLER_SAVE_REGS:5000,5000 GENERAL_REGS:5000,5000 FP_LO_REGS:5000,5000 FP_REGS:5000,5000
> ALL_REGS:10000,10000 MEM:9000,9000
> >   a1(r79,l0) costs: CALLER_SAVE_REGS:5000,5000 GENERAL_REGS:5000,5000 FP_LO_REGS:0,0 FP_REGS:0,0 ALL_REGS:10000,10000
> MEM:9000,9000
> >
> > So it correctly prefers FP_REGS for r79 as it has the lowest cost, but then
> > forces the allocno and best register to GENERAL_REGS... We could work around
> > it by not having the "r" variant first in the aarch64_get_lane patterns and
> > further discouraging its use via "?r", but that's a different patch.
> 
> Well, that patch (moving "r" alternative away from first) does seem to
> better fit with what we've done elsewhere in aarch64-simd.md (e.g.
> aarch64_combinez below). Does making this change obviate the need to
> change these testcases to -O1? If so, I'd rather break them with your patch
> and fix it in a follow-up than paper over the cracks.

Yes, using "?r" works. I can easily add this to my combinez patch - the issue is that there are a
lot more patterns that have the same problem, so we also need a fix in the register allocator
(we need to do both as reload also has bugs where it completely ignores all the costs and 
preferences, so the order really matters a lot...).

So I looked a bit further, and the bug is that the preferencing also forces ALL_REGS if the 
GENERAL_REGS and FP_REGS costs are not equal but both are lower than the memory cost
(again even if ALL_REGS cost is higher than the memory cost!). 

In that case TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS will force the preference
irrespective of the best preference. To fix this we need to extend it with the best register
class (and possibly alternate class) so we can avoid forcing the wrong preference if there already
is a good preference (i.e. not ALL_REGS). I'll write a patch for that - it's trivial but presumably too
late for GCC6 as it affects a target callback...
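
To make that proposal concrete, here is a rough sketch of how the AArch64
hook could use such an extended interface. The extra best_class parameter is
hypothetical at this point - it is not part of the posted patch or of the
current target hook - so treat this as an outline rather than a working
implementation:

/* Sketch only: assumes TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS is extended
   to also pass IRA's best class for the pseudo.  */
static reg_class_t
aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
					 reg_class_t best_class)
{
  enum machine_mode mode;

  /* Only intervene when IRA ended up with the over-broad ALL_REGS.  */
  if (allocno_class != ALL_REGS)
    return allocno_class;

  /* If there already is a sensible preference, keep it instead of
     forcing a choice based purely on the mode.  */
  if (best_class != ALL_REGS)
    return best_class;

  mode = PSEUDO_REGNO_MODE (regno);
  return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
}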

Wilco


* Re: [PATCH][AArch64] Add TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
  2015-12-17 13:38       ` Wilco Dijkstra
@ 2016-01-26 17:39         ` Wilco Dijkstra
  2016-02-02 10:03           ` James Greenhalgh
  0 siblings, 1 reply; 8+ messages in thread
From: Wilco Dijkstra @ 2016-01-26 17:39 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: gcc-patches, nd

ping (note the regressions discussed below are addressed by https://gcc.gnu.org/ml/gcc-patches/2016-01/msg01761.html)

________________________________________
From: Wilco Dijkstra
Sent: 17 December 2015 13:37
To: James Greenhalgh
Cc: gcc-patches@gcc.gnu.org; nd
Subject: RE: [PATCH][AArch64] Add TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS

James Greenhalgh wrote:
> On Wed, Dec 16, 2015 at 01:05:21PM +0000, Wilco Dijkstra wrote:
> > James Greenhalgh wrote:
> > > On Tue, Dec 15, 2015 at 10:54:49AM +0000, Wilco Dijkstra wrote:
> > > > ping
> > > >
> > > > > -----Original Message-----
> > > > > From: Wilco Dijkstra [mailto:Wilco.Dijkstra@arm.com]
> > > > > Sent: 06 November 2015 20:06
> > > > > To: 'gcc-patches@gcc.gnu.org'
> > > > > Subject: [PATCH][AArch64] Add TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
> > > > >
> > > > > This patch adds support for the TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
> > > > > hook. When the cost of GENERAL_REGS and FP_REGS is identical, the register
> > > > > allocator always uses ALL_REGS even when it has a much higher cost. The
> > > > > hook changes the class to either FP_REGS or GENERAL_REGS depending on the
> > > > > mode of the register. This results in better register allocation overall,
> > > > > fewer spills and reduced codesize - particularly in SPEC2006 gamess.
> > > > >
> > > > > GCC regression passes with several minor fixes.
> > > > >
> > > > > OK for commit?
> > > > >
> > > > > ChangeLog:
> > > > > 2015-11-06  Wilco Dijkstra  <wdijkstr@arm.com>
> > > > >
> > > > >       * gcc/config/aarch64/aarch64.c
> > > > >       (TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS): New define.
> > > > >       (aarch64_ira_change_pseudo_allocno_class): New function.
> > > > >       * gcc/testsuite/gcc.target/aarch64/cvtf_1.c: Build with -O2.
> > > > >       * gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> > > > >       (test_corners_sisd_di): Improve force to SIMD register.
> > > > >       (test_corners_sisd_si): Likewise.
> > > > >       * gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c: Build with -O2.
> > > > >       * gcc/testsuite/gcc.target/aarch64/vect-ld1r-compile-fp.c:
> > > > >       Remove scan-assembler check for ldr.
> > >
> > > Drop the gcc/ from the ChangeLog.
> > >
> > > > > --
> > > > >  gcc/config/aarch64/aarch64.c                       | 22 ++++++++++++++++++++++
> > > > >  gcc/testsuite/gcc.target/aarch64/cvtf_1.c          |  2 +-
> > > > >  gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c  |  4 ++--
> > > > >  gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c     |  2 +-
> > > > >  .../gcc.target/aarch64/vect-ld1r-compile-fp.c      |  1 -
> > >
> > > These testsuite changes concern me a bit, and you don't mention them beyond
> > > saying they are minor fixes...
> >
> > Well any changes to register allocator preferencing would cause fallout in
> > tests that are assuming which register is allocated, especially if they use
> > nasty inline assembler hacks to do so...
>
> Sure, but the testcases here each operate on data that should live in
> FP_REGS given the initial conditions that the nasty hacks try to mimic -
> that's what makes the regressions notable.
>
> >
> > > > >  #define FCVTDEF(ftype,itype) \
> > > > >  void \
> > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c b/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> > > > > index 363f554..8465c89 100644
> > > > > --- a/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> > > > > +++ b/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> > > > > @@ -186,9 +186,9 @@ test_corners_sisd_di (Int64x1 b)
> > > > >  {
> > > > >    force_simd_di (b);
> > > > >    b = b >> 63;
> > > > > +  force_simd_di (b);
> > > > >    b = b >> 0;
> > > > >    b += b >> 65; /* { dg-warning "right shift count >= width of type" } */
> > > > > -  force_simd_di (b);
> > >
> > > This one I don't understand, but seems to say that we've decided to move
> > > b out of FP_REGS after getting it in there for b = b << 63; ? So this is
> > > another register allocator regression?
> >
> > No, basically the register allocator is now making better decisions as to
> > where to allocate integer variables. It will only allocate them to FP
> > registers if they are primarily used by other FP operations. The
> > force_simd_di inline assembler tries to mimic FP uses, and if there are
> > enough of them at the right places then everything works as expected.  If
> > however you do 3 consecutive integer operations then the allocator will now
> > correctly prefer to allocate them to the integer registers (while previously
> > it wouldn't, which is inefficient).
>
> I'm not sure I understand this argument in the abstract (though I believe
> it for some of the supported cores for the AArch64 target). At an abstract
> level, given a set of operations which can execute in either FP_REGS or
> GENERAL_REGS and initial and post conditions that allocate all input and
> output registers from those operations to FP_REGS, I would expect those
> operations to take place using FP_REGS? Your patch seems to break this
> expectation?

No, my patch doesn't break that expectation. The goal is that if the cost of
allocating to either integer or FP registers is the same, we prefer the most
natural register file based on the type. We'll continue to allocate integer
operations to FP_REGS if that has the lowest cost.

Like I mentioned in the explanation, the issue is that the register allocator simply
ignores the much higher cost of ALL_REGS and uses it even though it results in
very suboptimal allocations and a large number of redundant int<->fp moves.
This patch fixes this by forcing the preference to FP_REGS or GENERAL_REGS if it
is ALL_REGS.

> > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c b/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
> > > > > index a49db3e..c5a9c52 100644
> > > > > --- a/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
> > > > > +++ b/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
> > > > > @@ -1,6 +1,6 @@
> > > > >  /* Test vdup_lane intrinsics work correctly.  */
> > > > >  /* { dg-do run } */
> > > > > -/* { dg-options "-O1 --save-temps" } */
> > > > > +/* { dg-options "-O2 --save-temps" } */
> > >
> > > Another -O1 regression ?
> >
> > No, it's triggering a bug in the -O1 register preferencing that causes incorrect preferences to be
> > selected despite the costs being right. The cost calculation with -O1 for eg.
> > wrap_vdupb_lane_s8_0() in vdup_lane_2.c:
> >
> > Pass 0 for finding pseudo/allocno costs
> >
> >     r79: preferred FP_REGS, alternative GENERAL_REGS, allocno GENERAL_REGS
> >     a1 (r79,l0) best GENERAL_REGS, allocno GENERAL_REGS
> >     r78: preferred GENERAL_REGS, alternative NO_REGS, allocno GENERAL_REGS
> >     a0 (r78,l0) best GENERAL_REGS, allocno GENERAL_REGS
> >
> >   a0(r78,l0) costs: CALLER_SAVE_REGS:5000,5000 GENERAL_REGS:5000,5000 FP_LO_REGS:5000,5000 FP_REGS:5000,5000
> ALL_REGS:10000,10000 MEM:9000,9000
> >   a1(r79,l0) costs: CALLER_SAVE_REGS:5000,5000 GENERAL_REGS:5000,5000 FP_LO_REGS:0,0 FP_REGS:0,0 ALL_REGS:10000,10000
> MEM:9000,9000
> >
> > So it correctly prefers FP_REGS for r79 as it has the lowest cost, but then
> > forces the allocno and best register to GENERAL_REGS... We could work around
> > it by not having the "r" variant first in the aarch64_get_lane patterns and
> > further discouraging its use via "?r", but that's a different patch.
>
> Well, that patch (moving "r" alternative away from first) does seem to
> better fit with what we've done elsewhere in aarch64-simd.md (e.g.
> aarch64_combinez below). Does making this change obviate the need to
> change these testcases to -O1? If so, I'd rather break them with your patch
> and fix it in a follow-up than paper over the cracks.

Yes, using "?r" works. I can easily add this to my combinez patch - the issue is that there are a
lot more patterns that have the same problem, so we also need a fix in the register allocator
(we need to do both as reload also has bugs where it completely ignores all the costs and
preferences, so the order really matters a lot...).

So I looked a bit further, and the bug is that the preferencing also forces ALL_REGS if the
GENERAL_REGS and FP_REGS costs are not equal but both are lower than the memory cost
(again even if ALL_REGS cost is higher than the memory cost!).

In that case TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS will force the preference
irrespective of the best preference. To fix this we need to extend it with the best register
class (and possibly alternate class) so we can avoid forcing the wrong preference if there already
is a good preference (i.e. not ALL_REGS). I'll write a patch for that - it's trivial but presumably too
late for GCC6 as it affects a target callback...

Wilco


* Re: [PATCH][AArch64] Add TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
  2016-01-26 17:39         ` Wilco Dijkstra
@ 2016-02-02 10:03           ` James Greenhalgh
  0 siblings, 0 replies; 8+ messages in thread
From: James Greenhalgh @ 2016-02-02 10:03 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: gcc-patches, nd

On Tue, Jan 26, 2016 at 05:39:24PM +0000, Wilco Dijkstra wrote:
> ping (note the regressions discussed below are addressed by https://gcc.gnu.org/ml/gcc-patches/2016-01/msg01761.html)

OK, but please be extra vigilant for any fallout on AArch64 after this
and the follow-up linked above are applied.

Thanks,
James

> James Greenhalgh wrote:
> > On Wed, Dec 16, 2015 at 01:05:21PM +0000, Wilco Dijkstra wrote:
> > > James Greenhalgh wrote:
> > > > On Tue, Dec 15, 2015 at 10:54:49AM +0000, Wilco Dijkstra wrote:
> > > > > ping
> > > > >
> > > > > > -----Original Message-----
> > > > > > From: Wilco Dijkstra [mailto:Wilco.Dijkstra@arm.com]
> > > > > > Sent: 06 November 2015 20:06
> > > > > > To: 'gcc-patches@gcc.gnu.org'
> > > > > > Subject: [PATCH][AArch64] Add TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
> > > > > >
> > > > > > This patch adds support for the TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
> > > > > > hook. When the cost of GENERAL_REGS and FP_REGS is identical, the register
> > > > > > allocator always uses ALL_REGS even when it has a much higher cost. The
> > > > > > hook changes the class to either FP_REGS or GENERAL_REGS depending on the
> > > > > > mode of the register. This results in better register allocation overall,
> > > > > > fewer spills and reduced codesize - particularly in SPEC2006 gamess.
> > > > > >
> > > > > > GCC regression passes with several minor fixes.
> > > > > >
> > > > > > OK for commit?
> > > > > >
> > > > > > ChangeLog:
> > > > > > 2015-11-06  Wilco Dijkstra  <wdijkstr@arm.com>
> > > > > >
> > > > > >       * gcc/config/aarch64/aarch64.c
> > > > > >       (TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS): New define.
> > > > > >       (aarch64_ira_change_pseudo_allocno_class): New function.
> > > > > >       * gcc/testsuite/gcc.target/aarch64/cvtf_1.c: Build with -O2.
> > > > > >       * gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> > > > > >       (test_corners_sisd_di): Improve force to SIMD register.
> > > > > >       (test_corners_sisd_si): Likewise.
> > > > > >       * gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c: Build with -O2.
> > > > > >       * gcc/testsuite/gcc.target/aarch64/vect-ld1r-compile-fp.c:
> > > > > >       Remove scan-assembler check for ldr.
> > > >
> > > > Drop the gcc/ from the ChangeLog.
> > > >
> > > > > > --
> > > > > >  gcc/config/aarch64/aarch64.c                       | 22 ++++++++++++++++++++++
> > > > > >  gcc/testsuite/gcc.target/aarch64/cvtf_1.c          |  2 +-
> > > > > >  gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c  |  4 ++--
> > > > > >  gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c     |  2 +-
> > > > > >  .../gcc.target/aarch64/vect-ld1r-compile-fp.c      |  1 -
> > > >
> > > > These testsuite changes concern me a bit, and you don't mention them beyond
> > > > saying they are minor fixes...
> > >
> > > Well any changes to register allocator preferencing would cause fallout in
> > > tests that are assuming which register is allocated, especially if they use
> > > nasty inline assembler hacks to do so...
> >
> > Sure, but the testcases here each operate on data that should live in
> > FP_REGS given the initial conditions that the nasty hacks try to mimic -
> > that's what makes the regressions notable.
> >
> > >
> > > > > >  #define FCVTDEF(ftype,itype) \
> > > > > >  void \
> > > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c b/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> > > > > > index 363f554..8465c89 100644
> > > > > > --- a/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> > > > > > +++ b/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
> > > > > > @@ -186,9 +186,9 @@ test_corners_sisd_di (Int64x1 b)
> > > > > >  {
> > > > > >    force_simd_di (b);
> > > > > >    b = b >> 63;
> > > > > > +  force_simd_di (b);
> > > > > >    b = b >> 0;
> > > > > >    b += b >> 65; /* { dg-warning "right shift count >= width of type" } */
> > > > > > -  force_simd_di (b);
> > > >
> > > > This one I don't understand, but seems to say that we've decided to move
> > > > b out of FP_REGS after getting it in there for b = b << 63; ? So this is
> > > > another register allocator regression?
> > >
> > > No, basically the register allocator is now making better decisions as to
> > > where to allocate integer variables. It will only allocate them to FP
> > > registers if they are primarily used by other FP operations. The
> > > force_simd_di inline assembler tries to mimic FP uses, and if there are
> > > enough of them at the right places then everything works as expected.  If
> > > however you do 3 consecutive integer operations then the allocator will now
> > > correctly prefer to allocate them to the integer registers (while previously
> > > it wouldn't, which is inefficient).
> >
> > I'm not sure I understand this argument in the abstract (though I believe
> > it for some of the supported cores for the AArch64 target). At an abstract
> > level, given a set of operations which can execute in either FP_REGS or
> > GENERAL_REGS and initial and post conditions that allocate all input and
> > output registers from those operations to FP_REGS, I would expect those
> > operations to take place using FP_REGS? Your patch seems to break this
> > expectation?
> 
> No, my patch doesn't break that expectation. The goal is that if the cost of
> allocating to either integer or FP registers is the same, we prefer the most
> natural register file based on the type. We'll continue to allocate integer
> operations to FP_REGS if that has the lowest cost.
> 
> Like I mentioned in the explanation, the issue is that the register allocator simply
> ignores the much higher cost of ALL_REGS and uses it even though it results in
> very suboptimal allocations and a large number of redundant int<->fp moves.
> This patch fixes this by forcing the preference to FP_REGS or GENERAL_REGS if it
> is ALL_REGS.
> 
> > > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c b/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
> > > > > > index a49db3e..c5a9c52 100644
> > > > > > --- a/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
> > > > > > +++ b/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
> > > > > > @@ -1,6 +1,6 @@
> > > > > >  /* Test vdup_lane intrinsics work correctly.  */
> > > > > >  /* { dg-do run } */
> > > > > > -/* { dg-options "-O1 --save-temps" } */
> > > > > > +/* { dg-options "-O2 --save-temps" } */
> > > >
> > > > Another -O1 regression ?
> > >
> > > No, it's triggering a bug in the -O1 register preferencing that causes incorrect preferences to be
> > > selected despite the costs being right. The cost calculation with -O1 for eg.
> > > wrap_vdupb_lane_s8_0() in vdup_lane_2.c:
> > >
> > > Pass 0 for finding pseudo/allocno costs
> > >
> > >     r79: preferred FP_REGS, alternative GENERAL_REGS, allocno GENERAL_REGS
> > >     a1 (r79,l0) best GENERAL_REGS, allocno GENERAL_REGS
> > >     r78: preferred GENERAL_REGS, alternative NO_REGS, allocno GENERAL_REGS
> > >     a0 (r78,l0) best GENERAL_REGS, allocno GENERAL_REGS
> > >
> > >   a0(r78,l0) costs: CALLER_SAVE_REGS:5000,5000 GENERAL_REGS:5000,5000 FP_LO_REGS:5000,5000 FP_REGS:5000,5000
> > ALL_REGS:10000,10000 MEM:9000,9000
> > >   a1(r79,l0) costs: CALLER_SAVE_REGS:5000,5000 GENERAL_REGS:5000,5000 FP_LO_REGS:0,0 FP_REGS:0,0 ALL_REGS:10000,10000
> > MEM:9000,9000
> > >
> > > So it correctly prefers FP_REGS for r79 as it has the lowest cost, but then
> > > forces the allocno and best register to GENERAL_REGS... We could work around
> > > it by not having the "r" variant first in the aarch64_get_lane patterns and
> > > further discouraging its use via "?r", but that's a different patch.
> >
> > Well, that patch (moving "r" alternative away from first) does seem to
> > better fit with what we've done elsewhere in aarch64-simd.md (e.g.
> > aarch64_combinez below). Does making this change obviate the need to
> > change these testcases to -O1? If so, I'd rather break them with your patch
> > and fix it in a follow-up than paper over the cracks.
> 
> Yes, using "?r" works. I can easily add this to my combinez patch - the issue is that there are a
> lot more patterns that have the same problem, so we also need a fix in the register allocator
> (we need to do both as reload also has bugs where it completely ignores all the costs and
> preferences, so the order really matters a lot...).
> 
> So I looked a bit further, and the bug is that the preferencing also forces ALL_REGS if the
> GENERAL_REGS and FP_REGS costs are not equal but both are lower than the memory cost
> (again even if ALL_REGS cost is higher than the memory cost!).
> 
> In that case TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS will force the preference
> irrespective of the best preference. To fix this we need to extend it with the best register
> class (and possibly alternate class) so we can avoid forcing the wrong preference if there already
> is a good preference (i.e. not ALL_REGS). I'll write a patch for that - it's trivial but presumably too
> late for GCC6 as it affects a target callback...
> 
> Wilco


* [PATCH][AArch64] Add TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
@ 2015-11-06 20:06 Wilco Dijkstra
  0 siblings, 0 replies; 8+ messages in thread
From: Wilco Dijkstra @ 2015-11-06 20:06 UTC (permalink / raw)
  To: gcc-patches

This patch adds support for the TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS hook.
When the cost of GENERAL_REGS and FP_REGS is identical, the register
allocator always uses ALL_REGS even when it has a much higher cost. The hook
changes the class to either FP_REGS or GENERAL_REGS depending on the mode of
the register. This results in better register allocation overall, fewer
spills and reduced codesize - particularly in SPEC2006 gamess.

GCC regression passes with several minor fixes.

OK for commit?

ChangeLog:
2015-11-06  Wilco Dijkstra  <wdijkstr@arm.com>

	* gcc/config/aarch64/aarch64.c
	(TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS): New define.
	(aarch64_ira_change_pseudo_allocno_class): New function.
	* gcc/testsuite/gcc.target/aarch64/cvtf_1.c: Build with -O2.        
	* gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
	(test_corners_sisd_di): Improve force to SIMD register.
	(test_corners_sisd_si): Likewise.
	* gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c: Build with -O2.

	* gcc/testsuite/gcc.target/aarch64/vect-ld1r-compile-fp.c:
	Remove scan-assembler check for ldr.

--
 gcc/config/aarch64/aarch64.c                       | 22 ++++++++++++++++++++++
 gcc/testsuite/gcc.target/aarch64/cvtf_1.c          |  2 +-
 gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c  |  4 ++--
 gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c     |  2 +-
 .../gcc.target/aarch64/vect-ld1r-compile-fp.c      |  1 -
 5 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 6da7245..9b60666 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -597,6 +597,24 @@ aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
     error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
 }
 
+/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
+   The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
+   the same cost even if ALL_REGS has a much larger cost.  This results in bad
+   allocations and spilling.  To avoid this we force the class to GENERAL_REGS
+   if the mode is integer.  */
+
+static reg_class_t
+aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class)
+{
+  enum machine_mode mode;
+
+  if (allocno_class != ALL_REGS)
+    return allocno_class;
+
+  mode = PSEUDO_REGNO_MODE (regno);
+  return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
+}
+
 static unsigned int
 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
 {
@@ -13113,6 +13131,10 @@ aarch64_promoted_type (const_tree t)
 #undef  TARGET_INIT_BUILTINS
 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
 
+#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
+#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
+  aarch64_ira_change_pseudo_allocno_class
+
 #undef TARGET_LEGITIMATE_ADDRESS_P
 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
 
diff --git a/gcc/testsuite/gcc.target/aarch64/cvtf_1.c b/gcc/testsuite/gcc.target/aarch64/cvtf_1.c
index 5f2ff81..96501db 100644
--- a/gcc/testsuite/gcc.target/aarch64/cvtf_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/cvtf_1.c
@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-options "-save-temps -fno-inline -O1" } */
+/* { dg-options "-save-temps -fno-inline -O2" } */
 
 #define FCVTDEF(ftype,itype) \
 void \
diff --git a/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c b/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
index 363f554..8465c89 100644
--- a/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/scalar_shift_1.c
@@ -186,9 +186,9 @@ test_corners_sisd_di (Int64x1 b)
 {
   force_simd_di (b);
   b = b >> 63;
+  force_simd_di (b);
   b = b >> 0;
   b += b >> 65; /* { dg-warning "right shift count >= width of type" } */
-  force_simd_di (b);
 
   return b;
 }
@@ -199,9 +199,9 @@ test_corners_sisd_si (Int32x1 b)
 {
   force_simd_si (b);
   b = b >> 31;
+  force_simd_si (b);
   b = b >> 0;
   b += b >> 33; /* { dg-warning "right shift count >= width of type" } */
-  force_simd_si (b);
 
   return b;
 }
diff --git a/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c b/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
index a49db3e..c5a9c52 100644
--- a/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/vdup_lane_2.c
@@ -1,6 +1,6 @@
 /* Test vdup_lane intrinsics work correctly.  */
 /* { dg-do run } */
-/* { dg-options "-O1 --save-temps" } */
+/* { dg-options "-O2 --save-temps" } */
 
 #include <arm_neon.h>
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vect-ld1r-compile-fp.c b/gcc/testsuite/gcc.target/aarch64/vect-ld1r-compile-fp.c
index 66e0168..4711c61 100644
--- a/gcc/testsuite/gcc.target/aarch64/vect-ld1r-compile-fp.c
+++ b/gcc/testsuite/gcc.target/aarch64/vect-ld1r-compile-fp.c
@@ -8,6 +8,5 @@ DEF (float)
 DEF (double)
 
 /* { dg-final { scan-assembler "ld1r\\t\{v\[0-9\]+\.4s"} } */
-/* { dg-final { scan-assembler "ldr\\t\x\[0-9\]+"} } */
 /* { dg-final { scan-assembler "ld1r\\t\{v\[0-9\]+\.2d"} } */
 
-- 
1.8.3



