public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH, rs6000] Fix PR target/72804: Poor code gen with -mvsx-timode
@ 2017-08-15  0:17 Peter Bergner
  2017-08-16 22:57 ` Segher Boessenkool
  0 siblings, 1 reply; 5+ messages in thread
From: Peter Bergner @ 2017-08-15  0:17 UTC (permalink / raw)
  To: GCC Patches; +Cc: Segher Boessenkool, Bill Schmidt, Michael Meissner

The following patch fixes a performance issue when loading/storing/moving
TImode values when using -mvsx-timode -mcpu=power7 with LRA.  The problem is
that the vsx_le_permute_<mode> and vsx_le_perm_{load,store}_<mode> patterns
do no support TImode values in GPRs, and LRA is using these patterns to
fixup constraints, which ends up leading to really bad code gen as seen by
the test cases in the bug report.

This patch fixes the bug by adding GPR support to the above patterns, as
well as a couple of peepholes that improve the code for loads and stores
to/from GPRs.

This passed bootstrapping and regtesting with no regressions and Mike
ran this on SPEC2006 and found no performance regressions with it.

Ok for trunk?  Do we want this on the GCC 7 branch where LRA is on by default?

Peter


gcc/
	* config/rs6000/vsx.md (*vsx_le_permute_<mode>): Add support for
	operands residing in integer registers.
	(*vsx_le_perm_load_<mode>): Likewise.
	(*vsx_le_perm_store_<mode>): Likewise.
	(define_peephole2): Add peepholes to optimize the above.

gcc/
	* gcc.target/powerpc/pr72804.c: New test.

Index: gcc/config/rs6000/vsx.md
===================================================================
--- gcc/config/rs6000/vsx.md	(revision 250918)
+++ gcc/config/rs6000/vsx.md	(working copy)
@@ -759,17 +759,20 @@ (define_split
 ;; special V1TI container class, which it is not appropriate to use vec_select
 ;; for the type.
 (define_insn "*vsx_le_permute_<mode>"
-  [(set (match_operand:VSX_TI 0 "nonimmediate_operand" "=<VSa>,<VSa>,Z")
+  [(set (match_operand:VSX_TI 0 "nonimmediate_operand" "=<VSa>,<VSa>,Z,&r,&r,Q")
 	(rotate:VSX_TI
-	 (match_operand:VSX_TI 1 "input_operand" "<VSa>,Z,<VSa>")
+	 (match_operand:VSX_TI 1 "input_operand" "<VSa>,Z,<VSa>,r,Q,r")
 	 (const_int 64)))]
   "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
   "@
    xxpermdi %x0,%x1,%x1,2
    lxvd2x %x0,%y1
-   stxvd2x %x1,%y0"
-  [(set_attr "length" "4")
-   (set_attr "type" "vecperm,vecload,vecstore")])
+   stxvd2x %x1,%y0
+   mr %0,%L1; mr %L0,%1
+   ld%U1%X1 %0,%L1; ld%U1%X1 %L0,%1
+   std%U0%X0 %L1,%0; std%U0%X0 %1,%L0"
+  [(set_attr "length" "4,4,4,8,8,8")
+   (set_attr "type" "vecperm,vecload,vecstore,*,load,store")])
 
 (define_insn_and_split "*vsx_le_undo_permute_<mode>"
   [(set (match_operand:VSX_TI 0 "vsx_register_operand" "=<VSa>,<VSa>")
@@ -795,10 +798,12 @@ (define_insn_and_split "*vsx_le_undo_per
    (set_attr "type" "veclogical")])
 
 (define_insn_and_split "*vsx_le_perm_load_<mode>"
-  [(set (match_operand:VSX_LE_128 0 "vsx_register_operand" "=<VSa>")
-        (match_operand:VSX_LE_128 1 "memory_operand" "Z"))]
+  [(set (match_operand:VSX_LE_128 0 "vsx_register_operand" "=<VSa>,r")
+        (match_operand:VSX_LE_128 1 "memory_operand" "Z,Q"))]
   "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
-  "#"
+  "@
+   #
+   #"
   "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
   [(const_int 0)]
   "
@@ -811,16 +816,18 @@ (define_insn_and_split "*vsx_le_perm_loa
   DONE;
 }
   "
-  [(set_attr "type" "vecload")
-   (set_attr "length" "8")])
+  [(set_attr "type" "vecload,load")
+   (set_attr "length" "8,8")])
 
 (define_insn "*vsx_le_perm_store_<mode>"
-  [(set (match_operand:VSX_LE_128 0 "memory_operand" "=Z")
-        (match_operand:VSX_LE_128 1 "vsx_register_operand" "+<VSa>"))]
+  [(set (match_operand:VSX_LE_128 0 "memory_operand" "=Z,Q")
+        (match_operand:VSX_LE_128 1 "vsx_register_operand" "+<VSa>,r"))]
   "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
-  "#"
-  [(set_attr "type" "vecstore")
-   (set_attr "length" "12")])
+  "@
+   #
+   #"
+  [(set_attr "type" "vecstore,store")
+   (set_attr "length" "12,8")])
 
 (define_split
   [(set (match_operand:VSX_LE_128 0 "memory_operand" "")
@@ -836,6 +843,31 @@ (define_split
   DONE;
 })
 
+;; Peepholes to catch loads and stores for TImode if TImode landed in
+;; GPR registers on a little endian system.
+(define_peephole2
+  [(set (match_operand:VSX_TI 0 "int_reg_operand" "")
+	(rotate:VSX_TI (match_operand:VSX_TI 1 "memory_operand" "")
+		       (const_int 64)))
+   (set (match_operand:VSX_TI 2 "int_reg_operand" "")
+	(rotate:VSX_TI (match_dup 0)
+		       (const_int 64)))]
+  "!BYTES_BIG_ENDIAN && TARGET_VSX && TARGET_VSX_TIMODE && !TARGET_P9_VECTOR
+   && (rtx_equal_p (operands[0], operands[2])
+       || peep2_reg_dead_p (2, operands[0]))"
+   [(set (match_dup 2) (match_dup 1))])
+
+(define_peephole2
+  [(set (match_operand:VSX_TI 0 "int_reg_operand" "")
+	(rotate:VSX_TI (match_operand:VSX_TI 1 "int_reg_operand" "")
+		       (const_int 64)))
+   (set (match_operand:VSX_TI 2 "memory_operand" "")
+	(rotate:VSX_TI (match_dup 0)
+		       (const_int 64)))]
+  "!BYTES_BIG_ENDIAN && TARGET_VSX && TARGET_VSX_TIMODE && !TARGET_P9_VECTOR
+   && peep2_reg_dead_p (2, operands[0])"
+   [(set (match_dup 2) (match_dup 1))])
+
 ;; Peephole to catch memory to memory transfers for TImode if TImode landed in
 ;; VSX registers on a little endian system.  The vector types and IEEE 128-bit
 ;; floating point are handled by the more generic swap elimination pass.
Index: gcc/testsuite/gcc.target/powerpc/pr72804.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/pr72804.c	(nonexistent)
+++ gcc/testsuite/gcc.target/powerpc/pr72804.c	(working copy)
@@ -0,0 +1,25 @@
+/* { dg-do compile { target { powerpc64*-*-* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O2 -mvsx" } */
+
+__int128_t
+foo (__int128_t *src)
+{
+  return ~*src;
+}
+
+void
+bar (__int128_t *dst, __int128_t src)
+{
+  *dst =  ~src;
+}
+
+/* { dg-final { scan-assembler-times "not " 4 } } */
+/* { dg-final { scan-assembler-times "std " 2 } } */
+/* { dg-final { scan-assembler-times "ld " 2 } } */
+/* { dg-final { scan-assembler-not "lxvd2x" } } */
+/* { dg-final { scan-assembler-not "stxvd2x" } } */
+/* { dg-final { scan-assembler-not "xxpermdi" } } */
+/* { dg-final { scan-assembler-not "mfvsrd" } } */
+/* { dg-final { scan-assembler-not "mfvsrd" } } */

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH, rs6000] Fix PR target/72804: Poor code gen with -mvsx-timode
  2017-08-15  0:17 [PATCH, rs6000] Fix PR target/72804: Poor code gen with -mvsx-timode Peter Bergner
@ 2017-08-16 22:57 ` Segher Boessenkool
  2017-08-17  1:10   ` Peter Bergner
  0 siblings, 1 reply; 5+ messages in thread
From: Segher Boessenkool @ 2017-08-16 22:57 UTC (permalink / raw)
  To: Peter Bergner; +Cc: GCC Patches, Bill Schmidt, Michael Meissner

On Mon, Aug 14, 2017 at 04:28:25PM -0500, Peter Bergner wrote:
> The following patch fixes a performance issue when loading/storing/moving
> TImode values when using -mvsx-timode -mcpu=power7 with LRA.  The problem is
> that the vsx_le_permute_<mode> and vsx_le_perm_{load,store}_<mode> patterns
> do no support TImode values in GPRs, and LRA is using these patterns to
> fixup constraints, which ends up leading to really bad code gen as seen by
> the test cases in the bug report.
> 
> This patch fixes the bug by adding GPR support to the above patterns, as
> well as a couple of peepholes that improve the code for loads and stores
> to/from GPRs.
> 
> This passed bootstrapping and regtesting with no regressions and Mike
> ran this on SPEC2006 and found no performance regressions with it.
> 
> Ok for trunk?  Do we want this on the GCC 7 branch where LRA is on by default?

Okay for trunk (see nits below).  For 7, okay after waiting a week or so
for fallout, if you think it really helps there.

>    "@
>     xxpermdi %x0,%x1,%x1,2
>     lxvd2x %x0,%y1
> -   stxvd2x %x1,%y0"
> -  [(set_attr "length" "4")
> -   (set_attr "type" "vecperm,vecload,vecstore")])
> +   stxvd2x %x1,%y0
> +   mr %0,%L1; mr %L0,%1

   mr %0,%L1\;mr %L0,%1

etc.

> +;; Peepholes to catch loads and stores for TImode if TImode landed in
> +;; GPR registers on a little endian system.
> +(define_peephole2
> +  [(set (match_operand:VSX_TI 0 "int_reg_operand" "")

You can leave out the default ""?

> Index: gcc/testsuite/gcc.target/powerpc/pr72804.c
> ===================================================================
> --- gcc/testsuite/gcc.target/powerpc/pr72804.c	(nonexistent)
> +++ gcc/testsuite/gcc.target/powerpc/pr72804.c	(working copy)
> @@ -0,0 +1,25 @@
> +/* { dg-do compile { target { powerpc64*-*-* } } } */

powerpc64*-*-* is never correct, we are biarch.  { target lp64 } instead?

Thanks,


Segher

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH, rs6000] Fix PR target/72804: Poor code gen with -mvsx-timode
  2017-08-16 22:57 ` Segher Boessenkool
@ 2017-08-17  1:10   ` Peter Bergner
  2017-08-17  5:05     ` Segher Boessenkool
  2017-08-17 20:30     ` Peter Bergner
  0 siblings, 2 replies; 5+ messages in thread
From: Peter Bergner @ 2017-08-17  1:10 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: GCC Patches, Bill Schmidt, Michael Meissner

On 8/16/17 5:30 PM, Segher Boessenkool wrote:
> On Mon, Aug 14, 2017 at 04:28:25PM -0500, Peter Bergner wrote:
>> +   mr %0,%L1; mr %L0,%1
> 
>    mr %0,%L1\;mr %L0,%1


So you want the ';' escaped and the space removed?  Ok.



>> +  [(set (match_operand:VSX_TI 0 "int_reg_operand" "")
> 
> You can leave out the default ""?

Cut/paste of another peephole2.  I'll try without it, thanks.



>> +/* { dg-do compile { target { powerpc64*-*-* } } } */
> 
> powerpc64*-*-* is never correct, we are biarch.  { target lp64 } instead?

Ah yes, that is better.


I'll make the above changes and commit after another quick test cycle.
Thanks!

Peter

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH, rs6000] Fix PR target/72804: Poor code gen with -mvsx-timode
  2017-08-17  1:10   ` Peter Bergner
@ 2017-08-17  5:05     ` Segher Boessenkool
  2017-08-17 20:30     ` Peter Bergner
  1 sibling, 0 replies; 5+ messages in thread
From: Segher Boessenkool @ 2017-08-17  5:05 UTC (permalink / raw)
  To: Peter Bergner; +Cc: GCC Patches, Bill Schmidt, Michael Meissner

On Wed, Aug 16, 2017 at 05:56:09PM -0500, Peter Bergner wrote:
> On 8/16/17 5:30 PM, Segher Boessenkool wrote:
> > On Mon, Aug 14, 2017 at 04:28:25PM -0500, Peter Bergner wrote:
> >> +   mr %0,%L1; mr %L0,%1
> > 
> >    mr %0,%L1\;mr %L0,%1
> 
> So you want the ';' escaped and the space removed?  Ok.

From doc/md.texi:

"""
The template may generate multiple assembler instructions.  Write the text
for the instructions, with @samp{\;} between them.
"""

(It actually expands to "\n\t", see read-md.c).


Segher

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH, rs6000] Fix PR target/72804: Poor code gen with -mvsx-timode
  2017-08-17  1:10   ` Peter Bergner
  2017-08-17  5:05     ` Segher Boessenkool
@ 2017-08-17 20:30     ` Peter Bergner
  1 sibling, 0 replies; 5+ messages in thread
From: Peter Bergner @ 2017-08-17 20:30 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: GCC Patches, Bill Schmidt, Michael Meissner

On 8/16/17 5:56 PM, Peter Bergner wrote:
> On 8/16/17 5:30 PM, Segher Boessenkool wrote:
> I'll make the above changes and commit after another quick test cycle.

Testing the changes came up clean, so I committed it.  Thanks.

Peter


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2017-08-17 18:23 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-08-15  0:17 [PATCH, rs6000] Fix PR target/72804: Poor code gen with -mvsx-timode Peter Bergner
2017-08-16 22:57 ` Segher Boessenkool
2017-08-17  1:10   ` Peter Bergner
2017-08-17  5:05     ` Segher Boessenkool
2017-08-17 20:30     ` Peter Bergner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).