public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH, rs6000] Fix PR target/72804: Poor code gen with -mvsx-timode
@ 2017-08-15  0:17 Peter Bergner
  2017-08-16 22:57 ` Segher Boessenkool
  0 siblings, 1 reply; 5+ messages in thread
From: Peter Bergner @ 2017-08-15  0:17 UTC (permalink / raw)
  To: GCC Patches; +Cc: Segher Boessenkool, Bill Schmidt, Michael Meissner

The following patch fixes a performance issue when loading/storing/moving
TImode values when using -mvsx-timode -mcpu=power7 with LRA.  The problem is
that the vsx_le_permute_<mode> and vsx_le_perm_{load,store}_<mode> patterns
do no support TImode values in GPRs, and LRA is using these patterns to
fixup constraints, which ends up leading to really bad code gen as seen by
the test cases in the bug report.

This patch fixes the bug by adding GPR support to the above patterns, as
well as a couple of peepholes that improve the code for loads and stores
to/from GPRs.

This passed bootstrapping and regtesting with no regressions and Mike
ran this on SPEC2006 and found no performance regressions with it.

Ok for trunk?  Do we want this on the GCC 7 branch where LRA is on by default?

Peter


gcc/
	* config/rs6000/vsx.md (*vsx_le_permute_<mode>): Add support for
	operands residing in integer registers.
	(*vsx_le_perm_load_<mode>): Likewise.
	(*vsx_le_perm_store_<mode>): Likewise.
	(define_peephole2): Add peepholes to optimize the above.

gcc/
	* gcc.target/powerpc/pr72804.c: New test.

Index: gcc/config/rs6000/vsx.md
===================================================================
--- gcc/config/rs6000/vsx.md	(revision 250918)
+++ gcc/config/rs6000/vsx.md	(working copy)
@@ -759,17 +759,20 @@ (define_split
 ;; special V1TI container class, which it is not appropriate to use vec_select
 ;; for the type.
 (define_insn "*vsx_le_permute_<mode>"
-  [(set (match_operand:VSX_TI 0 "nonimmediate_operand" "=<VSa>,<VSa>,Z")
+  [(set (match_operand:VSX_TI 0 "nonimmediate_operand" "=<VSa>,<VSa>,Z,&r,&r,Q")
 	(rotate:VSX_TI
-	 (match_operand:VSX_TI 1 "input_operand" "<VSa>,Z,<VSa>")
+	 (match_operand:VSX_TI 1 "input_operand" "<VSa>,Z,<VSa>,r,Q,r")
 	 (const_int 64)))]
   "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
   "@
    xxpermdi %x0,%x1,%x1,2
    lxvd2x %x0,%y1
-   stxvd2x %x1,%y0"
-  [(set_attr "length" "4")
-   (set_attr "type" "vecperm,vecload,vecstore")])
+   stxvd2x %x1,%y0
+   mr %0,%L1; mr %L0,%1
+   ld%U1%X1 %0,%L1; ld%U1%X1 %L0,%1
+   std%U0%X0 %L1,%0; std%U0%X0 %1,%L0"
+  [(set_attr "length" "4,4,4,8,8,8")
+   (set_attr "type" "vecperm,vecload,vecstore,*,load,store")])
 
 (define_insn_and_split "*vsx_le_undo_permute_<mode>"
   [(set (match_operand:VSX_TI 0 "vsx_register_operand" "=<VSa>,<VSa>")
@@ -795,10 +798,12 @@ (define_insn_and_split "*vsx_le_undo_per
    (set_attr "type" "veclogical")])
 
 (define_insn_and_split "*vsx_le_perm_load_<mode>"
-  [(set (match_operand:VSX_LE_128 0 "vsx_register_operand" "=<VSa>")
-        (match_operand:VSX_LE_128 1 "memory_operand" "Z"))]
+  [(set (match_operand:VSX_LE_128 0 "vsx_register_operand" "=<VSa>,r")
+        (match_operand:VSX_LE_128 1 "memory_operand" "Z,Q"))]
   "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
-  "#"
+  "@
+   #
+   #"
   "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
   [(const_int 0)]
   "
@@ -811,16 +816,18 @@ (define_insn_and_split "*vsx_le_perm_loa
   DONE;
 }
   "
-  [(set_attr "type" "vecload")
-   (set_attr "length" "8")])
+  [(set_attr "type" "vecload,load")
+   (set_attr "length" "8,8")])
 
 (define_insn "*vsx_le_perm_store_<mode>"
-  [(set (match_operand:VSX_LE_128 0 "memory_operand" "=Z")
-        (match_operand:VSX_LE_128 1 "vsx_register_operand" "+<VSa>"))]
+  [(set (match_operand:VSX_LE_128 0 "memory_operand" "=Z,Q")
+        (match_operand:VSX_LE_128 1 "vsx_register_operand" "+<VSa>,r"))]
   "!BYTES_BIG_ENDIAN && TARGET_VSX && !TARGET_P9_VECTOR"
-  "#"
-  [(set_attr "type" "vecstore")
-   (set_attr "length" "12")])
+  "@
+   #
+   #"
+  [(set_attr "type" "vecstore,store")
+   (set_attr "length" "12,8")])
 
 (define_split
   [(set (match_operand:VSX_LE_128 0 "memory_operand" "")
@@ -836,6 +843,31 @@ (define_split
   DONE;
 })
 
+;; Peepholes to catch loads and stores for TImode if TImode landed in
+;; GPR registers on a little endian system.
+(define_peephole2
+  [(set (match_operand:VSX_TI 0 "int_reg_operand" "")
+	(rotate:VSX_TI (match_operand:VSX_TI 1 "memory_operand" "")
+		       (const_int 64)))
+   (set (match_operand:VSX_TI 2 "int_reg_operand" "")
+	(rotate:VSX_TI (match_dup 0)
+		       (const_int 64)))]
+  "!BYTES_BIG_ENDIAN && TARGET_VSX && TARGET_VSX_TIMODE && !TARGET_P9_VECTOR
+   && (rtx_equal_p (operands[0], operands[2])
+       || peep2_reg_dead_p (2, operands[0]))"
+   [(set (match_dup 2) (match_dup 1))])
+
+(define_peephole2
+  [(set (match_operand:VSX_TI 0 "int_reg_operand" "")
+	(rotate:VSX_TI (match_operand:VSX_TI 1 "int_reg_operand" "")
+		       (const_int 64)))
+   (set (match_operand:VSX_TI 2 "memory_operand" "")
+	(rotate:VSX_TI (match_dup 0)
+		       (const_int 64)))]
+  "!BYTES_BIG_ENDIAN && TARGET_VSX && TARGET_VSX_TIMODE && !TARGET_P9_VECTOR
+   && peep2_reg_dead_p (2, operands[0])"
+   [(set (match_dup 2) (match_dup 1))])
+
 ;; Peephole to catch memory to memory transfers for TImode if TImode landed in
 ;; VSX registers on a little endian system.  The vector types and IEEE 128-bit
 ;; floating point are handled by the more generic swap elimination pass.
Index: gcc/testsuite/gcc.target/powerpc/pr72804.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/pr72804.c	(nonexistent)
+++ gcc/testsuite/gcc.target/powerpc/pr72804.c	(working copy)
@@ -0,0 +1,25 @@
+/* { dg-do compile { target { powerpc64*-*-* } } } */
+/* { dg-skip-if "" { powerpc*-*-darwin* } } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+/* { dg-options "-O2 -mvsx" } */
+
+__int128_t
+foo (__int128_t *src)
+{
+  return ~*src;
+}
+
+void
+bar (__int128_t *dst, __int128_t src)
+{
+  *dst =  ~src;
+}
+
+/* { dg-final { scan-assembler-times "not " 4 } } */
+/* { dg-final { scan-assembler-times "std " 2 } } */
+/* { dg-final { scan-assembler-times "ld " 2 } } */
+/* { dg-final { scan-assembler-not "lxvd2x" } } */
+/* { dg-final { scan-assembler-not "stxvd2x" } } */
+/* { dg-final { scan-assembler-not "xxpermdi" } } */
+/* { dg-final { scan-assembler-not "mfvsrd" } } */
+/* { dg-final { scan-assembler-not "mfvsrd" } } */

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2017-08-17 18:23 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-08-15  0:17 [PATCH, rs6000] Fix PR target/72804: Poor code gen with -mvsx-timode Peter Bergner
2017-08-16 22:57 ` Segher Boessenkool
2017-08-17  1:10   ` Peter Bergner
2017-08-17  5:05     ` Segher Boessenkool
2017-08-17 20:30     ` Peter Bergner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).