public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH, rs6000][GCC6] Fix PR78543, ICE in push_reload on powerpc64le-linux-gnu
@ 2017-03-06 18:17 Peter Bergner
  2017-03-06 18:55 ` Segher Boessenkool
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Peter Bergner @ 2017-03-06 18:17 UTC (permalink / raw)
  To: GCC Patches
  Cc: Segher Boessenkool, David Edelsohn, Bill Schmidt,
	Michael Meissner, Matthias Klose

PR78543 has two related test cases that have similar insns that need
reloading (pseudo 185 in this case) due to spills:

(insn 142 144 146 20 (parallel [
            (set (reg:HI 4 4 [orig:260 p ] [260])
                (bswap:HI (subreg/s/v:HI (reg:DI 185 [ load_dst_59 ]) 0)))
            (clobber (scratch:SI))
        ]) pr78543.i:23 144 {bswaphi2_internal}
     (expr_list:REG_DEAD (reg:DI 185 [ load_dst_59 ])
        (nil)))

In simplify_subreg, we execute the following code:

  /* If we have a SUBREG of a register that we are replacing and we are
     replacing it with a MEM, make a new MEM and try replacing the
     SUBREG with it.  Don't do this if the MEM has a mode-dependent address
     or if we would be widening it.  */

  if (MEM_P (op)
      && ! mode_dependent_address_p (XEXP (op, 0), MEM_ADDR_SPACE (op))
      /* Allow splitting of volatile memory references in case we don't
         have instruction to move the whole thing.  */
      && (! MEM_VOLATILE_P (op)
          || ! have_insn_for (SET, innermode))
      && GET_MODE_SIZE (outermode) <= GET_MODE_SIZE (GET_MODE (op)))
    return adjust_address_nv (op, outermode, byte);

The address we pass to mode_dependent_address_p() is:

	(plus:DI (reg/f:DI 1 1)
		 (const_int 65648 [0x10070]))

...which rs6000_mode_dependent_address() says *is* mode dependent, so we
do not end up calling adjust_address_nv() to adjust the stack address.
This ends up leaving us with the followng insn to handle:

  (subreg/s/v:HI (mem/c:DI (plus:DI (plus:DI (reg/f:DI 1 1)
                  (const_int 65536 [0x10000]))
              (const_int 112 [0x70])) [5 %sfp+65648 S8 A64]) 0)

Which triggers the gcc_assert at reload.c:1348:

  /* Optional output reloads are always OK even if we have no register class,
     since the function of these reloads is only to have spill_reg_store etc.
     set, so that the storing insn can be deleted later.  */
  gcc_assert (rclass != NO_REGS
              || (optional != 0 && type == RELOAD_FOR_OUTPUT));

where: rclass = NO_REGS, optional = 0, type = RELOAD_FOR_INPUT

If we look at rs6000_mode_dependent_address(), it accepts some addresses
as not being mode dependent:

    case PLUS:
      /* Any offset from virtual_stack_vars_rtx and arg_pointer_rtx
         is considered a legitimate address before reload, so there
         are no offset restrictions in that case.  Note that this
         condition is safe in strict mode because any address involving
         virtual_stack_vars_rtx or arg_pointer_rtx would already have
         been rejected as illegitimate.  */
      if (XEXP (addr, 0) != virtual_stack_vars_rtx
          && XEXP (addr, 0) != arg_pointer_rtx
          && GET_CODE (XEXP (addr, 1)) == CONST_INT)
        {
          unsigned HOST_WIDE_INT val = INTVAL (XEXP (addr, 1));
          return val + 0x8000 >= 0x10000 - (TARGET_POWERPC64 ? 8 : 12);
        }
      break;

It seems wrong that we accept addresses based off virtual_stack_vars_rtx
and arg_pointer_rtx, but not stack_pointer_rtx (ie, r1) like we have in
these test cases.  Adding a change to accept stack_pointer_rtx, allows
us to call adjust_address_nv() which ends up transforming the:

  (mem:DI (plus (reg:1) (const_int 65648)))
into:
  (mem:HI (plus (reg:1) (const_int 65648)))

allowing us to eliminate the subreg insn below:

  (subreg/s/v:HI (mem/c:DI (plus:DI (plus:DI (reg/f:DI 1 1)
                  (const_int 65536 [0x10000]))
              (const_int 112 [0x70])) [5 %sfp+65648 S8 A64]) 0)

which solves our ICE.

The following patch passes bootstrap and regtesting on powerpc64le-linux.
Ok for the GCC 6 branch?

We don't hit this on trunk, because we're using LRA there, so I'm not
sure whether we want to add this there this late in the release cycle.

Peter

gcc/
	PR target/78543
	* config/rs6000/rs6000.c (rs6000_mode_dependent_address): Handle
	stack_pointer_rtx.

gcc/testsuite/
	PR target/78543
	* gcc.target/powerpc/pr78543.c: New test.
	* gcc.target/powerpc/pr78543-2.c: Likewise.

Index: gcc/config/rs6000/rs6000.c
===================================================================
--- gcc/config/rs6000/rs6000.c	(revision 245904)
+++ gcc/config/rs6000/rs6000.c	(working copy)
@@ -8988,6 +8988,7 @@
 	 been rejected as illegitimate.  */
       if (XEXP (addr, 0) != virtual_stack_vars_rtx
 	  && XEXP (addr, 0) != arg_pointer_rtx
+	  && XEXP (addr, 0) != stack_pointer_rtx
 	  && GET_CODE (XEXP (addr, 1)) == CONST_INT)
 	{
 	  unsigned HOST_WIDE_INT val = INTVAL (XEXP (addr, 1));
Index: gcc/testsuite/gcc.target/powerpc/pr78543-2.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/pr78543-2.c	(nonexistent)
+++ gcc/testsuite/gcc.target/powerpc/pr78543-2.c	(working copy)
@@ -0,0 +1,60 @@
+/* PR target/78543 */
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-O1 -mcpu=power8" } */
+
+typedef long a;
+enum c { e, f, g, h, i, ab } j();
+int l, n, o, p;
+a q, r;
+void *memcpy();
+void b();
+static int k(int *s) {
+  int m;
+  if (j(&m))
+    *s = m;
+  return !0;
+}
+void d(char s) {
+  int af[4];
+  int ag;
+  enum c ah;
+  char ai[24 << 11];
+  unsigned aj;
+  if (!k(&aj))
+    goto ak;
+  for (;;) {
+    if (!k(&ag))
+      goto ak;
+    switch (ah) {
+    case e:
+      b("");
+      b("bad length %d for GUID in fileinfo v%u for \"%s\"");
+    case i:
+      b("bad length %d for TTH in fileinfo v%u for \"%s\"", aj);
+    case ab:
+      if (ag % 24)
+	b("for \"%s\"", s);
+    case f:
+      if (20 == ag)
+      case h:
+	if (20 == ag)
+	  o = 0;
+      break;
+    case g:
+      memcpy(af, ai, sizeof af);
+      b();
+      if (p) {
+	a al, am;
+	r = al << 2 | am;
+	n = af[2];
+	al = n;
+	l = __builtin_bswap32(af[3]);
+	am = q = n | l;
+      }
+    default:
+      b("%s0 unhandled field ID %u 0", __func__);
+    }
+  }
+ak:;
+}
Index: gcc/testsuite/gcc.target/powerpc/pr78543.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/pr78543.c	(nonexistent)
+++ gcc/testsuite/gcc.target/powerpc/pr78543.c	(working copy)
@@ -0,0 +1,40 @@
+/* PR target/78543 */
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-O3 -mcpu=power8" } */
+
+int a, b, c, d, e, f = 0, g, i;
+int h[4];
+extern void fn2();
+extern int fn3();
+extern void fn4();
+extern void fn5();
+void
+fn1 (void)
+{
+  unsigned short j;
+  int k = c;
+  char l[65535];
+  if (g && h[e])
+    fn2();
+  for (; h[e]; e--)
+    if (a)
+      return;
+  for (short n = h[0]; n; n--) {
+    a = fn3(d, l, sizeof l) < 0;
+    if (a)
+      return;
+    char m;
+    short o = 0, p = ({
+		   j = i;
+		   j >> 8 | (j & 255) << 8;
+		 });
+    long q = b;
+    if (g && h[0]) {
+      fn2(k, "%7d", "%7", &o, "%7 ", f);
+      while (b < q)
+	fn4();
+      fn5(m, p, "");
+    }
+  }
+}

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH, rs6000][GCC6] Fix PR78543, ICE in push_reload on powerpc64le-linux-gnu
  2017-03-06 18:17 [PATCH, rs6000][GCC6] Fix PR78543, ICE in push_reload on powerpc64le-linux-gnu Peter Bergner
@ 2017-03-06 18:55 ` Segher Boessenkool
  2017-03-07 13:35 ` Ulrich Weigand
  2017-03-24 23:26 ` [PATCH #2, " Michael Meissner
  2 siblings, 0 replies; 8+ messages in thread
From: Segher Boessenkool @ 2017-03-06 18:55 UTC (permalink / raw)
  To: Peter Bergner
  Cc: GCC Patches, David Edelsohn, Bill Schmidt, Michael Meissner,
	Matthias Klose

On Mon, Mar 06, 2017 at 12:16:58PM -0600, Peter Bergner wrote:

[ big snip, thanks for the thorough explanation! ]

> The following patch passes bootstrap and regtesting on powerpc64le-linux.
> Ok for the GCC 6 branch?

Please also test on BE and 32-bit.

> We don't hit this on trunk, because we're using LRA there, so I'm not
> sure whether we want to add this there this late in the release cycle.

I prefer to have it on trunk as well.  We *do* still support non-LRA
on trunk, just not by default.

This is okay if all testing works out.  Thanks!


Segher

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH, rs6000][GCC6] Fix PR78543, ICE in push_reload on powerpc64le-linux-gnu
  2017-03-06 18:17 [PATCH, rs6000][GCC6] Fix PR78543, ICE in push_reload on powerpc64le-linux-gnu Peter Bergner
  2017-03-06 18:55 ` Segher Boessenkool
@ 2017-03-07 13:35 ` Ulrich Weigand
  2017-03-24 23:26 ` [PATCH #2, " Michael Meissner
  2 siblings, 0 replies; 8+ messages in thread
From: Ulrich Weigand @ 2017-03-07 13:35 UTC (permalink / raw)
  To: Peter Bergner
  Cc: GCC Patches, Segher Boessenkool, David Edelsohn, Bill Schmidt,
	Michael Meissner, Matthias Klose

Peter Bergner wrote:

> If we look at rs6000_mode_dependent_address(), it accepts some addresses
> as not being mode dependent:
> 
>     case PLUS:
>       /* Any offset from virtual_stack_vars_rtx and arg_pointer_rtx
>          is considered a legitimate address before reload, so there
>          are no offset restrictions in that case.  Note that this
>          condition is safe in strict mode because any address involving
>          virtual_stack_vars_rtx or arg_pointer_rtx would already have
>          been rejected as illegitimate.  */
>       if (XEXP (addr, 0) != virtual_stack_vars_rtx
>           && XEXP (addr, 0) != arg_pointer_rtx
>           && GET_CODE (XEXP (addr, 1)) == CONST_INT)
>         {
>           unsigned HOST_WIDE_INT val = INTVAL (XEXP (addr, 1));
>           return val + 0x8000 >= 0x10000 - (TARGET_POWERPC64 ? 8 : 12);
>         }
>       break;
> 
> It seems wrong that we accept addresses based off virtual_stack_vars_rtx
> and arg_pointer_rtx, but not stack_pointer_rtx (ie, r1) like we have in
> these test cases.

This check is usually done in legitimate_address_p, and there this
distinction is in fact correct: the problem is that when rewriting
an address based on a virtual register into one based on the stack
(or frame) pointer, the offset will change, and may change significantly.
(E.g. you may end up with "frame size - original offset".)

This means that checking the offset relative to the virtual register
is completely pointless:
(A) If the offset relative to the virtual register is in range, the
    resulting offset relative to the stack pointer may be out of range.
(B) If the offset relative to the virtual register is out of range,
    the resulting offset may still end up *in* range.

Because of (A), reload will re-check offsets after eliminating
registers, and fix up offsets that are now out of range.  But
because if (B), if we were to reject out-of-range offsets relative
to a virtual register already in legitimate_address_p, we'd already
commit to generatcwinge more complex code (e.g. precomputing the address)
and even if the final offset would be in range, nobody would then
clean up that unnecessary code any more.

That is why the legitimate_address_p deliberately and correctly
accepts *any* offset for virtual (and eliminable) registers,
thereby deferring the offset validity check until after the
register has been replaced by a real register.  But for any
actually real register (like the stack pointer), of course we
*do* have to perform the precise check -- nobody would do that
check otherwise, and we'd end up with invalid instructions.


Now, here we are in mode_dependent_address, which in general
should return true if legitimate_address_p on this address might
return different results depending on the mode of the address.

And in fact, for stack-pointer based addresses, legitimate_address_p
*can* return different results depending on the mode, as the comment
in front of mode_dependent_address explains:

/* Go to LABEL if ADDR (a legitimate address expression)
   has an effect that depends on the machine mode it is used for.

   On the RS/6000 this is true of all integral offsets (since AltiVec
   and VSX modes don't allow them) or is a pre-increment or decrement.


On the other hand, for addresses based on a virtual register,
legitimate_address_p does not depend on the mode since those
are special-cased to be always accepted (see the discussion above).

So I'm not sure that the proposed change is in fact correct ...

Bye,
Ulrich

-- 
  Dr. Ulrich Weigand
  GNU/Linux compilers and toolchain
  Ulrich.Weigand@de.ibm.com

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH #2, rs6000][GCC6] Fix PR78543, ICE in push_reload on powerpc64le-linux-gnu
  2017-03-06 18:17 [PATCH, rs6000][GCC6] Fix PR78543, ICE in push_reload on powerpc64le-linux-gnu Peter Bergner
  2017-03-06 18:55 ` Segher Boessenkool
  2017-03-07 13:35 ` Ulrich Weigand
@ 2017-03-24 23:26 ` Michael Meissner
  2017-03-24 23:37   ` Michael Meissner
  2017-03-25  1:12   ` Segher Boessenkool
  2 siblings, 2 replies; 8+ messages in thread
From: Michael Meissner @ 2017-03-24 23:26 UTC (permalink / raw)
  To: Peter Bergner, GCC Patches, Segher Boessenkool, David Edelsohn,
	Bill Schmidt, Michael Meissner, Matthias Klose

[-- Attachment #1: Type: text/plain, Size: 2116 bytes --]

This patch reworks the original patch I attached to the bug report, to try and
make it less hacky.  It separates the bswap insns where there is hardware
support into separate read, write, and register swap instructions. This is
because the register allocators will try to push the bswap value in a register
to the stack and do the load based swap with reverse bytes.

Reload fumbles in certain conditions.  LRA generates working code, but the
store and the load with byte reverse from the same location, can slow things
down compared to the operation on registers.

I only did this optimization where we had the hardware support (i.e. bswap for
HImode all of the time, bswap for SImode all of the time, and bswap for DImode
if we are executing 64-bit instructions and the machine has LDBRX/STDBRX --
power7 and newer/cell ppc).

I have done bootstrap builds on a little endian power8 system, on a big endian
power8 system, and a big endian power7 system (both 32/64-bit support on this
last system).  There were no regressions.

I applied the patches to GCC 6, and the compiler builds and has no regressions
on a little endian power8 system.

I built spec 2006 benchmarks with the GCC 7 compiler.  There are 12 benchmarks
that generate one or more load/store with byte swap instructions (perlbench,
gcc, gamess, milc, zeusmp, calculix, h264ref, tonto, omnetpp, wrf, sphinx3,
xalancbmk).

I compared the instructions generated.  10 of the benchmarks generated the same
instructions.

Milc generated 1 less load with byte swap instruction and 1 more store with
byte swap instruction.

Sphinx3 generated 6 less load with byte swap instructions and 6 more store with
byte swap instructions.

So I count this as the same level of byte swapping is being generated.

Can I apply the patch to the GCC 7 trunk?  Since the patch shows up as an abort
in GCC 6 and not in GCC 7 (unless you disable LRA), I would like to apply the
patch as soon as possible to the GCC 6 branch.

-- 
Michael Meissner, IBM
IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA
email: meissner@linux.vnet.ibm.com, phone: +1 (978) 899-4797

[-- Attachment #2: pr78543.patch03b --]
[-- Type: text/plain, Size: 9271 bytes --]

Index: gcc/config/rs6000/rs6000.md
===================================================================
--- gcc/config/rs6000/rs6000.md	(revision 246413)
+++ gcc/config/rs6000/rs6000.md	(working copy)
@@ -370,6 +370,11 @@ (define_mode_iterator P [(SI "TARGET_32B
 ; PTImode is GPR only)
 (define_mode_iterator TI2 [TI PTI])
 
+; Types that have hardware load/store with byte reverse
+(define_mode_iterator BSWAP [HI
+			     SI
+			     (DI "TARGET_POWERPC64 && TARGET_LDBRX")])
+  
 ; Any hardware-supported floating-point mode
 (define_mode_iterator FP [
   (SF "TARGET_HARD_FLOAT 
@@ -2350,12 +2355,12 @@ (define_insn "cmpb<mode>3"
 
 ;; Since the hardware zeros the upper part of the register, save generating the
 ;; AND immediate if we are converting to unsigned
-(define_insn "*bswaphi2_extenddi"
+(define_insn "*bswap<mode>2_extenddi"
   [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
 	(zero_extend:DI
-	 (bswap:HI (match_operand:HI 1 "memory_operand" "Z"))))]
+	 (bswap:HSI (match_operand:HSI 1 "memory_operand" "Z"))))]
   "TARGET_POWERPC64"
-  "lhbrx %0,%y1"
+  "l<wd>brx %0,%y1"
   [(set_attr "length" "4")
    (set_attr "type" "load")])
 
@@ -2368,34 +2373,55 @@ (define_insn "*bswaphi2_extendsi"
   [(set_attr "length" "4")
    (set_attr "type" "load")])
 
-(define_expand "bswaphi2"
-  [(parallel [(set (match_operand:HI 0 "reg_or_mem_operand" "")
-		   (bswap:HI
-		    (match_operand:HI 1 "reg_or_mem_operand" "")))
-	      (clobber (match_scratch:SI 2 ""))])]
+;; Separate the bswap patterns into load, store, and gpr<-gpr.  This prevents
+;; the register allocator from converting a gpr<-gpr swap into a store and then
+;; load with byte swap, which can be slower than doing it in the registers.  It
+;; also prevents certain failures with the RELOAD register allocator.
+
+(define_expand "bswap<mode>2"
+  [(use (match_operand:HSI 0 "reg_or_mem_operand"))
+   (use (match_operand:HSI 1 "reg_or_mem_operand"))]
   ""
 {
-  if (!REG_P (operands[0]) && !REG_P (operands[1]))
-    operands[1] = force_reg (HImode, operands[1]);
+  rtx dest = operands[0];
+  rtx src = operands[1];
+
+  if (!REG_P (dest) && !REG_P (src))
+    src = force_reg (<MODE>mode, src);
+
+  if (MEM_P (src))
+    emit_insn (gen_bswap<mode>2_load (dest, src));
+  else
+    {
+      if (MEM_P (dest))
+	emit_insn (gen_bswap<mode>2_store (dest, src));
+      else
+	emit_insn (gen_bswap<mode>2_reg (dest, src));
+    }
+  DONE;
 })
 
-(define_insn "bswaphi2_internal"
-  [(set (match_operand:HI 0 "reg_or_mem_operand" "=r,Z,&r")
-	(bswap:HI
-	 (match_operand:HI 1 "reg_or_mem_operand" "Z,r,r")))
-   (clobber (match_scratch:SI 2 "=X,X,&r"))]
+(define_insn "bswap<mode>2_load"
+  [(set (match_operand:BSWAP 0 "gpc_reg_operand" "=r")
+	(bswap:BSWAP (match_operand:BSWAP 1 "memory_operand" "Z")))]
+  ""
+  "l<wd>brx %0,%y1"
+  [(set_attr "type" "load")])
+
+(define_insn "bswap<mode>2_store"
+  [(set (match_operand:BSWAP 0 "memory_operand" "=Z")
+	(bswap:BSWAP (match_operand:BSWAP 1 "gpc_reg_operand" "r")))]
   ""
-  "@
-   lhbrx %0,%y1
-   sthbrx %1,%y0
-   #"
-  [(set_attr "length" "4,4,12")
-   (set_attr "type" "load,store,*")])
+  "st<wd>brx %1,%y0"
+  [(set_attr "type" "store")])
 
-(define_split
-  [(set (match_operand:HI 0 "gpc_reg_operand" "")
-	(bswap:HI (match_operand:HI 1 "gpc_reg_operand" "")))
-   (clobber (match_operand:SI 2 "gpc_reg_operand" ""))]
+(define_insn_and_split "bswaphi2_reg"
+  [(set (match_operand:HI 0 "gpc_reg_operand" "=&r")
+	(bswap:HI
+	 (match_operand:HI 1 "gpc_reg_operand" "r")))
+   (clobber (match_scratch:SI 2 "=&r"))]
+  ""
+  "#"
   "reload_completed"
   [(set (match_dup 3)
 	(and:SI (lshiftrt:SI (match_dup 4)
@@ -2408,48 +2434,21 @@ (define_split
    (set (match_dup 3)
 	(ior:SI (match_dup 3)
 		(match_dup 2)))]
-  "
 {
   operands[3] = simplify_gen_subreg (SImode, operands[0], HImode, 0);
   operands[4] = simplify_gen_subreg (SImode, operands[1], HImode, 0);
-}")
-
-(define_insn "*bswapsi2_extenddi"
-  [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
-	(zero_extend:DI
-	 (bswap:SI (match_operand:SI 1 "memory_operand" "Z"))))]
-  "TARGET_POWERPC64"
-  "lwbrx %0,%y1"
-  [(set_attr "length" "4")
-   (set_attr "type" "load")])
-
-(define_expand "bswapsi2"
-  [(set (match_operand:SI 0 "reg_or_mem_operand" "")
-	(bswap:SI
-	 (match_operand:SI 1 "reg_or_mem_operand" "")))]
-  ""
-{
-  if (!REG_P (operands[0]) && !REG_P (operands[1]))
-    operands[1] = force_reg (SImode, operands[1]);
-})
-
-(define_insn "*bswapsi2_internal"
-  [(set (match_operand:SI 0 "reg_or_mem_operand" "=r,Z,&r")
-	(bswap:SI
-	 (match_operand:SI 1 "reg_or_mem_operand" "Z,r,r")))]
-  ""
-  "@
-   lwbrx %0,%y1
-   stwbrx %1,%y0
-   #"
-  [(set_attr "length" "4,4,12")
-   (set_attr "type" "load,store,*")])
+}
+  [(set_attr "length" "12")
+   (set_attr "type" "*")])
 
 ;; We are always BITS_BIG_ENDIAN, so the bit positions below in
 ;; zero_extract insns do not change for -mlittle.
-(define_split
-  [(set (match_operand:SI 0 "gpc_reg_operand" "")
-	(bswap:SI (match_operand:SI 1 "gpc_reg_operand" "")))]
+(define_insn_and_split "bswapsi2_reg"
+  [(set (match_operand:SI 0 "gpc_reg_operand" "=&r")
+	(bswap:SI
+	 (match_operand:SI 1 "gpc_reg_operand" "r")))]
+  ""
+  "#"
   "reload_completed"
   [(set (match_dup 0)					; DABC
 	(rotate:SI (match_dup 1)
@@ -2465,11 +2464,23 @@ (define_split
 				     (const_int 24))
 			(const_int 255))
 		(and:SI (match_dup 0)
-			(const_int -256))))
-
-  ]
+			(const_int -256))))]
   "")
 
+(define_insn "bswapdi2_reg"
+  [(set (match_operand:DI 0 "gpc_reg_operand" "=&r")
+	(bswap:DI (match_operand:DI 1 "gpc_reg_operand" "r")))
+   (clobber (match_scratch:DI 2 "=&r"))
+   (clobber (match_scratch:DI 3 "=&r"))]
+  "TARGET_POWERPC64 && TARGET_LDBRX"
+  "#"
+  [(set_attr "length" "36")
+   (set_attr "type" "*")])
+
+;; On systems with LDBRX/STDBRX generate the loads/stores directly, just like
+;; we do for L{H,W}BRX and ST{H,W}BRX above.  If not, we have to generate more
+;; complex code.
+
 (define_expand "bswapdi2"
   [(parallel [(set (match_operand:DI 0 "reg_or_mem_operand" "")
 		   (bswap:DI
@@ -2478,34 +2489,36 @@ (define_expand "bswapdi2"
 	      (clobber (match_scratch:DI 3 ""))])]
   ""
 {
-  if (!REG_P (operands[0]) && !REG_P (operands[1]))
-    operands[1] = force_reg (DImode, operands[1]);
+  rtx dest = operands[0];
+  rtx src = operands[1];
+
+  if (!REG_P (dest) && !REG_P (src))
+    operands[1] = src = force_reg (DImode, src);
+
+  if (TARGET_POWERPC64 && TARGET_LDBRX)
+    {
+      if (MEM_P (src))
+	emit_insn (gen_bswapdi2_load (dest, src));
+      else
+	{
+	  if (MEM_P (dest))
+	    emit_insn (gen_bswapdi2_store (dest, src));
+	  else
+	    emit_insn (gen_bswapdi2_reg (dest, src));
+	}
+      DONE;
+    }
 
   if (!TARGET_POWERPC64)
     {
       /* 32-bit mode needs fewer scratch registers, but 32-bit addressing mode
 	 that uses 64-bit registers needs the same scratch registers as 64-bit
 	 mode.  */
-      emit_insn (gen_bswapdi2_32bit (operands[0], operands[1]));
+      emit_insn (gen_bswapdi2_32bit (dest, src));
       DONE;
     }
 })
 
-;; Power7/cell has ldbrx/stdbrx, so use it directly
-(define_insn "*bswapdi2_ldbrx"
-  [(set (match_operand:DI 0 "reg_or_mem_operand" "=r,Z,&r")
-	(bswap:DI (match_operand:DI 1 "reg_or_mem_operand" "Z,r,r")))
-   (clobber (match_scratch:DI 2 "=X,X,&r"))
-   (clobber (match_scratch:DI 3 "=X,X,&r"))]
-  "TARGET_POWERPC64 && TARGET_LDBRX
-   && (REG_P (operands[0]) || REG_P (operands[1]))"
-  "@
-   ldbrx %0,%y1
-   stdbrx %1,%y0
-   #"
-  [(set_attr "length" "4,4,36")
-   (set_attr "type" "load,store,*")])
-
 ;; Non-power7/cell, fall back to use lwbrx/stwbrx
 (define_insn "*bswapdi2_64bit"
   [(set (match_operand:DI 0 "reg_or_mem_operand" "=r,Z,&r")
Index: gcc/testsuite/gcc.target/powerpc/pr78543.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/pr78543.c	(revision 0)
+++ gcc/testsuite/gcc.target/powerpc/pr78543.c	(working copy)
@@ -0,0 +1,60 @@
+/* { dg-do compile { target { powerpc64*-*-* && lp64 } } } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O1 -mno-lra" } */
+
+typedef long a;
+enum c { e, f, g, h, i, ab } j();
+int l, n, o, p;
+a q, r;
+void *memcpy();
+void b();
+static int k(int *s) {
+  int m;
+  if (j(&m))
+    *s = m;
+  return !0;
+}
+void d(char s) {
+  int af[4];
+  int ag;
+  enum c ah;
+  char ai[24 << 11];
+  unsigned aj;
+  if (!k(&aj))
+    goto ak;
+  for (;;) {
+    if (!k(&ag))
+      goto ak;
+    switch (ah) {
+    case e:
+      b("");
+      b("bad length %d for GUID in fileinfo v%u for \"%s\"");
+    case i:
+      b("bad length %d for TTH in fileinfo v%u for \"%s\"", aj);
+    case ab:
+      if (ag % 24)
+        b("for \"%s\"", s);
+    case f:
+      if (20 == ag)
+      case h:
+        if (20 == ag)
+          o = 0;
+      break;
+    case g:
+      memcpy(af, ai, sizeof af);
+      b();
+      if (p) {
+        a al, am;
+        r = al << 2 | am;
+        n = af[2];
+        al = n;
+        l = __builtin_bswap32(af[3]);
+        am = q = n | l;
+      }
+    default:
+      b("%s0 unhandled field ID %u 0", __func__);
+    }
+  }
+ak:;
+}

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH #2, rs6000][GCC6] Fix PR78543, ICE in push_reload on powerpc64le-linux-gnu
  2017-03-24 23:26 ` [PATCH #2, " Michael Meissner
@ 2017-03-24 23:37   ` Michael Meissner
  2017-03-25  1:12   ` Segher Boessenkool
  1 sibling, 0 replies; 8+ messages in thread
From: Michael Meissner @ 2017-03-24 23:37 UTC (permalink / raw)
  To: Michael Meissner, Peter Bergner, GCC Patches, Segher Boessenkool,
	David Edelsohn, Bill Schmidt, Matthias Klose

Well instead of attaching the ChangeLog, I attached the patch without
ChangeLog.

Here is the ChangeLog entry for the patch:

[gcc]
2017-03-24  Michael Meissner  <meissner@linux.vnet.ibm.com>

	PR target/78543
	* config/rs6000/rs6000.md (BSWAP): New mode iterator for modes
	with hardware byte swap load/store intstructions.
	(bswaphi2_extenddi): Combine bswap HImode and SImode with zero
	extend to DImode to one insn.
	(bswap<mode>2_extenddi): Likewise.
	(bswapsi2_extenddi): Likewise.
	(bswaphi2_extendsi): Likewise.
	(bswaphi2): Combine bswap HImode and SImode into one insn.
	Separate memory insns from swapping register.
	(bswapsi2): Likewise.
	(bswap<mode>2): Likewise.
	(bswaphi2_internal): Delete, no longer used.
	(bswapsi2_internal): Likewise.
	(bswap<mode>2_load): Split bswap HImode/SImode into separate load,
	store, and gpr<-gpr swap insns.
	(bswap<mode>2_store): Likewise.
	(bswaphi2_reg): Register only splitter, combine with the splitter.
	(bswaphi2 splitter): Likewise.
	(bswapsi2_reg): Likewise.
	(bswapsi2 splitter): Likewise.
	(bswapdi2): If we have the LDBRX and STDBRX instructions, split
	the insns into load, store, and register/register insns.
	(bswapdi2_ldbrx): Likewise.
	(bswapdi2_load): Likewise.
	(bswapdi2_store): Likewise.
	(bswapdi2_reg): Likewise.

[gcc/testsuite]
2017-03-24  Michael Meissner  <meissner@linux.vnet.ibm.com>

	PR target/78543
	* gcc.target/powerpc/pr78543.c: New test.

-- 
Michael Meissner, IBM
IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA
email: meissner@linux.vnet.ibm.com, phone: +1 (978) 899-4797

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH #2, rs6000][GCC6] Fix PR78543, ICE in push_reload on powerpc64le-linux-gnu
  2017-03-24 23:26 ` [PATCH #2, " Michael Meissner
  2017-03-24 23:37   ` Michael Meissner
@ 2017-03-25  1:12   ` Segher Boessenkool
  2017-03-27 17:30     ` Michael Meissner
  2017-03-27 19:58     ` Michael Meissner
  1 sibling, 2 replies; 8+ messages in thread
From: Segher Boessenkool @ 2017-03-25  1:12 UTC (permalink / raw)
  To: Michael Meissner, Peter Bergner, GCC Patches, David Edelsohn,
	Bill Schmidt, Matthias Klose

Hi Mike,

On Fri, Mar 24, 2017 at 07:23:02PM -0400, Michael Meissner wrote:
> Reload fumbles in certain conditions.

Yeah.  And it does not need bswap to get totally lost with this, so this
patch is a workaround, not a fix.

It does make things nicer though :-)

> LRA generates working code, but the
> store and the load with byte reverse from the same location, can slow things
> down compared to the operation on registers.

How so?  You mean (say) lwbrx after doing a stw to that same address?
We have that problem in general :-/


Thanks for the extensive testing.

> Can I apply the patch to the GCC 7 trunk?  Since the patch shows up as an abort
> in GCC 6 and not in GCC 7 (unless you disable LRA), I would like to apply the
> patch as soon as possible to the GCC 6 branch.

A few remarks:

> +; Types that have hardware load/store with byte reverse
> +(define_mode_iterator BSWAP [HI
> +			     SI
> +			     (DI "TARGET_POWERPC64 && TARGET_LDBRX")])

The patch would be much easier to read if you had done this step (with HSI
as well) separately.  Oh well.

> +(define_expand "bswap<mode>2"

> +  if (MEM_P (src))
> +    emit_insn (gen_bswap<mode>2_load (dest, src));
> +  else
> +    {
> +      if (MEM_P (dest))
> +	emit_insn (gen_bswap<mode>2_store (dest, src));
> +      else
> +	emit_insn (gen_bswap<mode>2_reg (dest, src));
> +    }
> +  DONE;

Please avoid the extra nesting, i.e. do

+  if (MEM_P (src))
+    emit_insn (gen_bswap<mode>2_load (dest, src));
+  else if (MEM_P (dest))
+    emit_insn (gen_bswap<mode>2_store (dest, src));
+  else
+    emit_insn (gen_bswap<mode>2_reg (dest, src));
+  DONE;

(also for bswapdi2).

> +  [(set_attr "length" "12")
> +   (set_attr "type" "*")])

Lose that last line?  You don't need to explicitly set things to their
default value ;-)
OTOH, you might want to make it type "three" instead?

> +  [(set_attr "length" "36")
> +   (set_attr "type" "*")])

Same.

This patch is okay for trunk.  Also for 6, but what is the hurry there?

Thanks!


Segher

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH #2, rs6000][GCC6] Fix PR78543, ICE in push_reload on powerpc64le-linux-gnu
  2017-03-25  1:12   ` Segher Boessenkool
@ 2017-03-27 17:30     ` Michael Meissner
  2017-03-27 19:58     ` Michael Meissner
  1 sibling, 0 replies; 8+ messages in thread
From: Michael Meissner @ 2017-03-27 17:30 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Michael Meissner, Peter Bergner, GCC Patches, David Edelsohn,
	Bill Schmidt, Matthias Klose

On Fri, Mar 24, 2017 at 07:16:32PM -0500, Segher Boessenkool wrote:
> Hi Mike,
> 
> On Fri, Mar 24, 2017 at 07:23:02PM -0400, Michael Meissner wrote:
> > Reload fumbles in certain conditions.
> 
> Yeah.  And it does not need bswap to get totally lost with this, so this
> patch is a workaround, not a fix.
> 
> It does make things nicer though :-)
> 
> > LRA generates working code, but the
> > store and the load with byte reverse from the same location, can slow things
> > down compared to the operation on registers.
> 
> How so?  You mean (say) lwbrx after doing a stw to that same address?
> We have that problem in general :-/

With the second code sample that shows the problem,

with:

	-O1 -mlittle -mno-lra

the compiler aborts.  If you remove the -mno-lra (or use -mlra on GCC 6), the
compiler does not abort, however the code is sub-optimal.  LRA has the value in
a register it does:

        lwa 9,108(1)
        li 10,0
        ori 10,10,0xc070
        stdx 9,10,1
        bl b

	... other code

        li 9,0
        ori 9,9,0xc070
        lwbrx 9,9,1

With my patches, it does:

        lwa 14,108(1)
        bl b

	... other code

        rotlwi 9,14,24
        rlwimi 9,14,8,8,15
        rlwimi 9,14,8,24,31

With -O3, it generates the same code (more or less) with both register allocators.

Sure, it would be nice to fold the bswap with the original load.


> 
> Thanks for the extensive testing.
> 
> > Can I apply the patch to the GCC 7 trunk?  Since the patch shows up as an abort
> > in GCC 6 and not in GCC 7 (unless you disable LRA), I would like to apply the
> > patch as soon as possible to the GCC 6 branch.
> 
> A few remarks:
> 
> > +; Types that have hardware load/store with byte reverse
> > +(define_mode_iterator BSWAP [HI
> > +			     SI
> > +			     (DI "TARGET_POWERPC64 && TARGET_LDBRX")])
> 
> The patch would be much easier to read if you had done this step (with HSI
> as well) separately.  Oh well.

Actually, I developed it that way, and I can easily go back to that.  I thought
I would be dinged because I didn't combine them. :-)

> > +(define_expand "bswap<mode>2"
> 
> > +  if (MEM_P (src))
> > +    emit_insn (gen_bswap<mode>2_load (dest, src));
> > +  else
> > +    {
> > +      if (MEM_P (dest))
> > +	emit_insn (gen_bswap<mode>2_store (dest, src));
> > +      else
> > +	emit_insn (gen_bswap<mode>2_reg (dest, src));
> > +    }
> > +  DONE;
> 
> Please avoid the extra nesting, i.e. do
> 
> +  if (MEM_P (src))
> +    emit_insn (gen_bswap<mode>2_load (dest, src));
> +  else if (MEM_P (dest))
> +    emit_insn (gen_bswap<mode>2_store (dest, src));
> +  else
> +    emit_insn (gen_bswap<mode>2_reg (dest, src));
> +  DONE;

Ok.  The patch originally had a force_reg in there, and I removed it when it
didn't seem necessary any more.

> (also for bswapdi2).
> 
> > +  [(set_attr "length" "12")
> > +   (set_attr "type" "*")])
> 
> Lose that last line?  You don't need to explicitly set things to their
> default value ;-)
> OTOH, you might want to make it type "three" instead?
> 
> > +  [(set_attr "length" "36")
> > +   (set_attr "type" "*")])
> 
> Same.
> 
> This patch is okay for trunk.  Also for 6, but what is the hurry there?

-- 
Michael Meissner, IBM
IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA
email: meissner@linux.vnet.ibm.com, phone: +1 (978) 899-4797

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH #2, rs6000][GCC6] Fix PR78543, ICE in push_reload on powerpc64le-linux-gnu
  2017-03-25  1:12   ` Segher Boessenkool
  2017-03-27 17:30     ` Michael Meissner
@ 2017-03-27 19:58     ` Michael Meissner
  1 sibling, 0 replies; 8+ messages in thread
From: Michael Meissner @ 2017-03-27 19:58 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: Michael Meissner, Peter Bergner, GCC Patches, David Edelsohn,
	Bill Schmidt, Matthias Klose

[-- Attachment #1: Type: text/plain, Size: 1821 bytes --]

On Fri, Mar 24, 2017 at 07:16:32PM -0500, Segher Boessenkool wrote:
> Hi Mike,
> 
> This patch is okay for trunk.  Also for 6, but what is the hurry there?

I applied the patch (svn id 246508 for trunk, svn id 246509 for gcc 6.0).  The
hurry up for GCC 6 is due to the fact that the bug does not show by default in
trunk (due to failing with RELOAD), but the bug shows up in GCC 6 (since RELOAD
is default there), and it affects some of the Linux distributions.

[gcc]
2017-03-27  Michael Meissner  <meissner@linux.vnet.ibm.com>

	PR target/78543
	* config/rs6000/rs6000.md (bswaphi2_extenddi): Combine bswap
	HImode and SImode with zero extend to DImode to one insn.
	(bswap<mode>2_extenddi): Likewise.
	(bswapsi2_extenddi): Likewise.
	(bswaphi2_extendsi): Likewise.
	(bswaphi2): Combine bswap HImode and SImode into one insn.
	Separate memory insns from swapping register.
	(bswapsi2): Likewise.
	(bswap<mode>2): Likewise.
	(bswaphi2_internal): Delete, no longer used.
	(bswapsi2_internal): Likewise.
	(bswap<mode>2_load): Split bswap HImode/SImode into separate load,
	store, and gpr<-gpr swap insns.
	(bswap<mode>2_store): Likewise.
	(bswaphi2_reg): Register only splitter, combine with the splitter.
	(bswaphi2 splitter): Likewise.
	(bswapsi2_reg): Likewise.
	(bswapsi2 splitter): Likewise.
	(bswapdi2): If we have the LDBRX and STDBRX instructions, split
	the insns into load, store, and register/register insns.
	(bswapdi2_ldbrx): Likewise.
	(bswapdi2_load): Likewise.
	(bswapdi2_store): Likewise.
	(bswapdi2_reg): Likewise.

[gcc/testsuite]
2017-03-27  Michael Meissner  <meissner@linux.vnet.ibm.com>

	PR target/78543
	* gcc.target/powerpc/pr78543.c: New test.

-- 
Michael Meissner, IBM
IBM, M/S 2506R, 550 King Street, Littleton, MA 01460-6245, USA
email: meissner@linux.vnet.ibm.com, phone: +1 (978) 899-4797

[-- Attachment #2: pr78543.patch04b --]
[-- Type: text/plain, Size: 9202 bytes --]

Index: gcc/config/rs6000/rs6000.md
===================================================================
--- gcc/config/rs6000/rs6000.md	(revision 246507)
+++ gcc/config/rs6000/rs6000.md	(working copy)
@@ -2350,12 +2350,12 @@ (define_insn "cmpb<mode>3"
 
 ;; Since the hardware zeros the upper part of the register, save generating the
 ;; AND immediate if we are converting to unsigned
-(define_insn "*bswaphi2_extenddi"
+(define_insn "*bswap<mode>2_extenddi"
   [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
 	(zero_extend:DI
-	 (bswap:HI (match_operand:HI 1 "memory_operand" "Z"))))]
+	 (bswap:HSI (match_operand:HSI 1 "memory_operand" "Z"))))]
   "TARGET_POWERPC64"
-  "lhbrx %0,%y1"
+  "l<wd>brx %0,%y1"
   [(set_attr "length" "4")
    (set_attr "type" "load")])
 
@@ -2368,34 +2368,52 @@ (define_insn "*bswaphi2_extendsi"
   [(set_attr "length" "4")
    (set_attr "type" "load")])
 
-(define_expand "bswaphi2"
-  [(parallel [(set (match_operand:HI 0 "reg_or_mem_operand" "")
-		   (bswap:HI
-		    (match_operand:HI 1 "reg_or_mem_operand" "")))
-	      (clobber (match_scratch:SI 2 ""))])]
+;; Separate the bswap patterns into load, store, and gpr<-gpr.  This prevents
+;; the register allocator from converting a gpr<-gpr swap into a store and then
+;; load with byte swap, which can be slower than doing it in the registers.  It
+;; also prevents certain failures with the RELOAD register allocator.
+
+(define_expand "bswap<mode>2"
+  [(use (match_operand:HSI 0 "reg_or_mem_operand"))
+   (use (match_operand:HSI 1 "reg_or_mem_operand"))]
   ""
 {
-  if (!REG_P (operands[0]) && !REG_P (operands[1]))
-    operands[1] = force_reg (HImode, operands[1]);
+  rtx dest = operands[0];
+  rtx src = operands[1];
+
+  if (!REG_P (dest) && !REG_P (src))
+    src = force_reg (<MODE>mode, src);
+
+  if (MEM_P (src))
+    emit_insn (gen_bswap<mode>2_load (dest, src));
+  else if (MEM_P (dest))
+    emit_insn (gen_bswap<mode>2_store (dest, src));
+  else
+    emit_insn (gen_bswap<mode>2_reg (dest, src));
+  DONE;
 })
 
-(define_insn "bswaphi2_internal"
-  [(set (match_operand:HI 0 "reg_or_mem_operand" "=r,Z,&r")
+(define_insn "bswap<mode>2_load"
+  [(set (match_operand:HSI 0 "gpc_reg_operand" "=r")
+	(bswap:HSI (match_operand:HSI 1 "memory_operand" "Z")))]
+  ""
+  "l<wd>brx %0,%y1"
+  [(set_attr "type" "load")])
+
+(define_insn "bswap<mode>2_store"
+  [(set (match_operand:HSI 0 "memory_operand" "=Z")
+	(bswap:HSI (match_operand:HSI 1 "gpc_reg_operand" "r")))]
+  ""
+  "st<wd>brx %1,%y0"
+  [(set_attr "type" "store")])
+
+(define_insn_and_split "bswaphi2_reg"
+  [(set (match_operand:HI 0 "gpc_reg_operand" "=&r")
 	(bswap:HI
-	 (match_operand:HI 1 "reg_or_mem_operand" "Z,r,r")))
-   (clobber (match_scratch:SI 2 "=X,X,&r"))]
+	 (match_operand:HI 1 "gpc_reg_operand" "r")))
+   (clobber (match_scratch:SI 2 "=&r"))]
   ""
-  "@
-   lhbrx %0,%y1
-   sthbrx %1,%y0
-   #"
-  [(set_attr "length" "4,4,12")
-   (set_attr "type" "load,store,*")])
-
-(define_split
-  [(set (match_operand:HI 0 "gpc_reg_operand" "")
-	(bswap:HI (match_operand:HI 1 "gpc_reg_operand" "")))
-   (clobber (match_operand:SI 2 "gpc_reg_operand" ""))]
+  "#"
   "reload_completed"
   [(set (match_dup 3)
 	(and:SI (lshiftrt:SI (match_dup 4)
@@ -2408,48 +2426,21 @@ (define_split
    (set (match_dup 3)
 	(ior:SI (match_dup 3)
 		(match_dup 2)))]
-  "
 {
   operands[3] = simplify_gen_subreg (SImode, operands[0], HImode, 0);
   operands[4] = simplify_gen_subreg (SImode, operands[1], HImode, 0);
-}")
-
-(define_insn "*bswapsi2_extenddi"
-  [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
-	(zero_extend:DI
-	 (bswap:SI (match_operand:SI 1 "memory_operand" "Z"))))]
-  "TARGET_POWERPC64"
-  "lwbrx %0,%y1"
-  [(set_attr "length" "4")
-   (set_attr "type" "load")])
-
-(define_expand "bswapsi2"
-  [(set (match_operand:SI 0 "reg_or_mem_operand" "")
-	(bswap:SI
-	 (match_operand:SI 1 "reg_or_mem_operand" "")))]
-  ""
-{
-  if (!REG_P (operands[0]) && !REG_P (operands[1]))
-    operands[1] = force_reg (SImode, operands[1]);
-})
-
-(define_insn "*bswapsi2_internal"
-  [(set (match_operand:SI 0 "reg_or_mem_operand" "=r,Z,&r")
-	(bswap:SI
-	 (match_operand:SI 1 "reg_or_mem_operand" "Z,r,r")))]
-  ""
-  "@
-   lwbrx %0,%y1
-   stwbrx %1,%y0
-   #"
-  [(set_attr "length" "4,4,12")
-   (set_attr "type" "load,store,*")])
+}
+  [(set_attr "length" "12")
+   (set_attr "type" "*")])
 
 ;; We are always BITS_BIG_ENDIAN, so the bit positions below in
 ;; zero_extract insns do not change for -mlittle.
-(define_split
-  [(set (match_operand:SI 0 "gpc_reg_operand" "")
-	(bswap:SI (match_operand:SI 1 "gpc_reg_operand" "")))]
+(define_insn_and_split "bswapsi2_reg"
+  [(set (match_operand:SI 0 "gpc_reg_operand" "=&r")
+	(bswap:SI
+	 (match_operand:SI 1 "gpc_reg_operand" "r")))]
+  ""
+  "#"
   "reload_completed"
   [(set (match_dup 0)					; DABC
 	(rotate:SI (match_dup 1)
@@ -2465,11 +2456,13 @@ (define_split
 				     (const_int 24))
 			(const_int 255))
 		(and:SI (match_dup 0)
-			(const_int -256))))
-
-  ]
+			(const_int -256))))]
   "")
 
+;; On systems with LDBRX/STDBRX generate the loads/stores directly, just like
+;; we do for L{H,W}BRX and ST{H,W}BRX above.  If not, we have to generate more
+;; complex code.
+
 (define_expand "bswapdi2"
   [(parallel [(set (match_operand:DI 0 "reg_or_mem_operand" "")
 		   (bswap:DI
@@ -2478,33 +2471,56 @@ (define_expand "bswapdi2"
 	      (clobber (match_scratch:DI 3 ""))])]
   ""
 {
-  if (!REG_P (operands[0]) && !REG_P (operands[1]))
-    operands[1] = force_reg (DImode, operands[1]);
+  rtx dest = operands[0];
+  rtx src = operands[1];
+
+  if (!REG_P (dest) && !REG_P (src))
+    operands[1] = src = force_reg (DImode, src);
+
+  if (TARGET_POWERPC64 && TARGET_LDBRX)
+    {
+      if (MEM_P (src))
+	emit_insn (gen_bswapdi2_load (dest, src));
+      else if (MEM_P (dest))
+	emit_insn (gen_bswapdi2_store (dest, src));
+      else
+	emit_insn (gen_bswapdi2_reg (dest, src));
+      DONE;
+    }
 
   if (!TARGET_POWERPC64)
     {
       /* 32-bit mode needs fewer scratch registers, but 32-bit addressing mode
 	 that uses 64-bit registers needs the same scratch registers as 64-bit
 	 mode.  */
-      emit_insn (gen_bswapdi2_32bit (operands[0], operands[1]));
+      emit_insn (gen_bswapdi2_32bit (dest, src));
       DONE;
     }
 })
 
 ;; Power7/cell has ldbrx/stdbrx, so use it directly
-(define_insn "*bswapdi2_ldbrx"
-  [(set (match_operand:DI 0 "reg_or_mem_operand" "=r,Z,&r")
-	(bswap:DI (match_operand:DI 1 "reg_or_mem_operand" "Z,r,r")))
-   (clobber (match_scratch:DI 2 "=X,X,&r"))
-   (clobber (match_scratch:DI 3 "=X,X,&r"))]
-  "TARGET_POWERPC64 && TARGET_LDBRX
-   && (REG_P (operands[0]) || REG_P (operands[1]))"
-  "@
-   ldbrx %0,%y1
-   stdbrx %1,%y0
-   #"
-  [(set_attr "length" "4,4,36")
-   (set_attr "type" "load,store,*")])
+(define_insn "bswapdi2_load"
+  [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
+	(bswap:DI (match_operand:DI 1 "memory_operand" "Z")))]
+  "TARGET_POWERPC64 && TARGET_LDBRX"
+  "ldbrx %0,%y1"
+  [(set_attr "type" "load")])
+
+(define_insn "bswapdi2_store"
+  [(set (match_operand:DI 0 "memory_operand" "=Z")
+	(bswap:DI (match_operand:DI 1 "gpc_reg_operand" "r")))]
+  "TARGET_POWERPC64 && TARGET_LDBRX"
+  "stdbrx %1,%y0"
+  [(set_attr "type" "store")])
+
+(define_insn "bswapdi2_reg"
+  [(set (match_operand:DI 0 "gpc_reg_operand" "=&r")
+	(bswap:DI (match_operand:DI 1 "gpc_reg_operand" "r")))
+   (clobber (match_scratch:DI 2 "=&r"))
+   (clobber (match_scratch:DI 3 "=&r"))]
+  "TARGET_POWERPC64 && TARGET_LDBRX"
+  "#"
+  [(set_attr "length" "36")])
 
 ;; Non-power7/cell, fall back to use lwbrx/stwbrx
 (define_insn "*bswapdi2_64bit"
Index: gcc/testsuite/gcc.target/powerpc/pr78543.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/pr78543.c	(revision 0)
+++ gcc/testsuite/gcc.target/powerpc/pr78543.c	(working copy)
@@ -0,0 +1,60 @@
+/* { dg-do compile { target { powerpc64*-*-* && lp64 } } } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power8" } } */
+/* { dg-options "-mcpu=power8 -O1 -mno-lra" } */
+
+typedef long a;
+enum c { e, f, g, h, i, ab } j();
+int l, n, o, p;
+a q, r;
+void *memcpy();
+void b();
+static int k(int *s) {
+  int m;
+  if (j(&m))
+    *s = m;
+  return !0;
+}
+void d(char s) {
+  int af[4];
+  int ag;
+  enum c ah;
+  char ai[24 << 11];
+  unsigned aj;
+  if (!k(&aj))
+    goto ak;
+  for (;;) {
+    if (!k(&ag))
+      goto ak;
+    switch (ah) {
+    case e:
+      b("");
+      b("bad length %d for GUID in fileinfo v%u for \"%s\"");
+    case i:
+      b("bad length %d for TTH in fileinfo v%u for \"%s\"", aj);
+    case ab:
+      if (ag % 24)
+        b("for \"%s\"", s);
+    case f:
+      if (20 == ag)
+      case h:
+        if (20 == ag)
+          o = 0;
+      break;
+    case g:
+      memcpy(af, ai, sizeof af);
+      b();
+      if (p) {
+        a al, am;
+        r = al << 2 | am;
+        n = af[2];
+        al = n;
+        l = __builtin_bswap32(af[3]);
+        am = q = n | l;
+      }
+    default:
+      b("%s0 unhandled field ID %u 0", __func__);
+    }
+  }
+ak:;
+}

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2017-03-27 19:41 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-03-06 18:17 [PATCH, rs6000][GCC6] Fix PR78543, ICE in push_reload on powerpc64le-linux-gnu Peter Bergner
2017-03-06 18:55 ` Segher Boessenkool
2017-03-07 13:35 ` Ulrich Weigand
2017-03-24 23:26 ` [PATCH #2, " Michael Meissner
2017-03-24 23:37   ` Michael Meissner
2017-03-25  1:12   ` Segher Boessenkool
2017-03-27 17:30     ` Michael Meissner
2017-03-27 19:58     ` Michael Meissner

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).