public inbox for gcc-patches@gcc.gnu.org
* [PATCH][ARM] Improve 64-bit shifts (non-NEON)
@ 2012-01-27 16:07 Andrew Stubbs
  2012-01-30 15:26 ` Richard Earnshaw
  0 siblings, 1 reply; 18+ messages in thread
From: Andrew Stubbs @ 2012-01-27 16:07 UTC (permalink / raw)
  To: gcc-patches, patches

[-- Attachment #1: Type: text/plain, Size: 3355 bytes --]

Hi all,

This patch introduces a new, more efficient set of DImode shift 
sequences for values stored in core-registers (as opposed to VFP/NEON 
registers).

The new sequences take advantage of knowledge of what the ARM 
instructions do with out-of-range shift amounts.
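
As background: a register-specified shift on ARM uses only the bottom
byte of the amount register, and a 32-bit lsl or lsr by anything from
32 to 255 yields zero.  Here is a rough C model of the new variable
left-shift sequence, with hypothetical helpers modelling that rule
(this is just an illustration, not the patch's code):

/* Model ARM's register-shift rule: bottom byte only, and any
   amount of 32 or more shifts everything out.  */
static unsigned int
arm_lsl (unsigned int x, unsigned int n)
{
  n &= 0xff;
  return n < 32 ? x << n : 0;
}

static unsigned int
arm_lsr (unsigned int x, unsigned int n)
{
  n &= 0xff;
  return n < 32 ? x >> n : 0;
}

/* 64-bit left shift by 0 <= n < 64.  The terms whose effective
   amount is out of range contribute nothing to the ORs, so no
   compare or branch is needed.  */
static unsigned long long
shl64 (unsigned int lo, unsigned int hi, unsigned int n)
{
  unsigned int out_hi = arm_lsl (hi, n)
                        | arm_lsr (lo, 32 - n)   /* 0 when n > 32 */
                        | arm_lsl (lo, n - 32);  /* 0 when n < 32 */
  unsigned int out_lo = arm_lsl (lo, n);
  return ((unsigned long long) out_hi << 32) | out_lo;
}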

The following examples use a simple test case like this one:

long long
f (long long *a, int b)
{
   return *a << b;
}


In ARM mode, old left-shift vs. the new one:

     stmfd   sp!, {r4, r5}        | ldrd    r2, [r0]
     rsb     r4, r1, #32          | mov     ip, r1
     ldr     r5, [r0, #4]         | stmfd   sp!, {r4, r5}
     subs    ip, r1, #32          | sub     r5, ip, #32
     ldr     r0, [r0, #0]         | rsb     r4, ip, #32
     mov     r3, r5, asl r1       | mov     r1, r3, asl ip
     orr     r3, r3, r0, lsr r4   | mov     r0, r2, asl ip
     mov     r2, r0, asl r1       | orr     r1, r1, r2, asl r5
     movpl   r3, r0, asl ip       | orr     r1, r1, r2, lsr r4
     mov     r0, r2               | ldmfd   sp!, {r4, r5}
     mov     r1, r3               | bx      lr
     ldmfd   sp!, {r4, r5}        |
     bx      lr                   |

In Thumb mode, old left-shift vs. new:

     ldr     r2, [r0, #0]         | ldrd    r2, [r0]
     ldr     r3, [r0, #4]         | push    {r4, r5, r6}
     push    {r4, r5, r6}         | sub     r6, r1, #32
     rsb     r6, r1, #32          | mov     r4, r1
     sub     r4, r1, #32          | rsb     r5, r1, #32
     lsls    r3, r3, r1           | lsls    r6, r2, r6
     lsrs    r6, r2, r6           | lsls    r1, r3, r1
     lsls    r5, r2, r4           | lsrs    r5, r2, r5
     orrs    r3, r3, r6           | lsls    r0, r2, r4
     lsls    r0, r2, r1           | orrs    r1, r1, r6
     bics    r1, r5, r4, asr #32  | orrs    r1, r1, r5
     it      cs                   | pop     {r4, r5, r6}
     movcs   r1, r3               | bx      lr
     pop     {r4, r5, r6}         |
     bx      lr                   |

Logical right shift is essentially the same sequence as the left shift 
above. However, arithmetic right shift requires something slightly 
different. Here it is in ARM mode, old vs. new:

     stmfd   sp!, {r4, r5}        | ldrd    r2, [r0]
     rsb     r4, r1, #32          | mov     ip, r1
     ldr     r5, [r0, #0]         | stmfd   sp!, {r4, r5}
     subs    ip, r1, #32          | rsb     r5, ip, #32
     ldr     r0, [r0, #4]         | subs    r4, ip, #32
     mov     r2, r5, lsr r1       | mov     r0, r2, lsr ip
     orr     r2, r2, r0, asl r4   | mov     r1, r3, asr ip
     mov     r3, r0, asr r1       | orr     r0, r0, r3, asl r5
     movpl   r2, r0, asr ip       | orrge   r0, r0, r3, asr r4
     mov     r1, r3               | ldmfd   sp!, {r4, r5}
     mov     r0, r2               | bx      lr
     ldmfd   sp!, {r4, r5}        |
     bx      lr                   |

I won't bore you with the Thumb mode comparison.
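
To see why the arithmetic case needs the conditional step (the 'orrge'
above): asr by 32 or more yields copies of the sign bit rather than
zero, so the term covering amounts 32-63 can't just fall out of range
harmlessly the way the lsl/lsr terms do.  A rough C model of the low
word, reusing the arm_lsl/arm_lsr helpers sketched earlier (again just
an illustration, not the patch's code):

/* Model asr: bottom byte only; 32 or more leaves only sign bits.
   Assumes >> on int is arithmetic, as with GCC.  */
static int
arm_asr (int x, unsigned int n)
{
  n &= 0xff;
  return n < 32 ? x >> n : x >> 31;
}

static unsigned int
ashr64_low (unsigned int lo, int hi, unsigned int n)
{
  unsigned int r = arm_lsr (lo, n)   /* 0 when n >= 32 */
                   | arm_lsl ((unsigned int) hi, 32 - n);
  if (n >= 32)   /* the GE condition from 'subs r4, ip, #32' */
    r |= (unsigned int) arm_asr (hi, n - 32);   /* the 'orrge' */
  return r;
}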

The shift-by-constant cases have also been reimplemented, although the 
resultant sequences are much the same as before. (Doing this isn't 
strictly necessary just yet, but when I post my next patch to do 64-bit 
shifts in NEON, this feature will be required by the fall-back 
alternatives.)
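
For reference, the constant case just picks one of two fixed
decompositions; sketched in C for the left shift (right shifts mirror
this, with asr filling the upper word with sign bits):

/* Constant 64-bit left shift, 0 < n < 64 (illustration only).  */
static unsigned long long
shl64_const (unsigned int lo, unsigned int hi, unsigned int n)
{
  unsigned int out_hi, out_lo;
  if (n < 32)
    {
      out_hi = (hi << n) | (lo >> (32 - n));
      out_lo = lo << n;
    }
  else
    {
      out_hi = lo << (n - 32);   /* n == 32 gives plain lo */
      out_lo = 0;
    }
  return ((unsigned long long) out_hi << 32) | out_lo;
}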

I've run a regression test on a cross-compiler, and I should have native
test results, along with some benchmark results, sometime next week.

Is this OK for stage 1?

Andrew

[-- Attachment #2: core-shift64.patch --]
[-- Type: text/x-patch, Size: 12751 bytes --]

2012-01-27  Andrew Stubbs  <ams@codesourcery.com>

	* config/arm/arm-protos.h (arm_emit_coreregs_64bit_shift): New
	prototype.
	* config/arm/arm.c (arm_emit_coreregs_64bit_shift): New function.
	* config/arm/arm.md (ashldi3): Use arm_emit_coreregs_64bit_shift.
	(ashrdi3,lshrdi3): Likewise.

---
 gcc/config/arm/arm-protos.h |    3 +
 gcc/config/arm/arm.c        |  198 +++++++++++++++++++++++++++++++++++++++++++
 gcc/config/arm/arm.md       |   90 ++++++++++++++------
 3 files changed, 264 insertions(+), 27 deletions(-)

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 296550a..df8d7a9 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -242,6 +242,9 @@ struct tune_params
 
 extern const struct tune_params *current_tune;
 extern int vfp3_const_double_for_fract_bits (rtx);
+
+extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
+					   rtx);
 #endif /* RTX_CODE */
 
 #endif /* ! GCC_ARM_PROTOS_H */
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 0bded8d..eefc45c 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -25139,5 +25139,203 @@ vfp3_const_double_for_fract_bits (rtx operand)
   return 0;
 }
 
+/* The default expansion of general 64-bit shifts in core-regs is suboptimal
+   on ARM, since we know that shifts by negative amounts are no-ops.
+
+   It's safe for the input and output to be the same register, but
+   early-clobber rules apply for the shift amount and scratch registers.
+
+   Shift by register requires both scratch registers.  Shift by a constant
+   less than 32 in Thumb2 mode requires SCRATCH1 only.  In all other cases
+   the scratch registers may be NULL.
+   
+   Additionally, ashiftrt by a register also clobbers the CC register.  */
+void
+arm_emit_coreregs_64bit_shift (enum rtx_code code, rtx out, rtx in,
+			       rtx amount, rtx scratch1, rtx scratch2)
+{
+  rtx out_high = gen_highpart (SImode, out);
+  rtx out_low = gen_lowpart (SImode, out);
+  rtx in_high = gen_highpart (SImode, in);
+  rtx in_low = gen_lowpart (SImode, in);
+
+  /* Bits flow from up-stream to down-stream.  */
+  rtx out_up   = code == ASHIFT ? out_low : out_high;
+  rtx out_down = code == ASHIFT ? out_high : out_low;
+  rtx in_up   = code == ASHIFT ? in_low : in_high;
+  rtx in_down = code == ASHIFT ? in_high : in_low;
+
+  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
+  gcc_assert (out
+	      && (REG_P (out) || GET_CODE (out) == SUBREG)
+	      && GET_MODE (out) == DImode);
+  gcc_assert (in
+	      && (REG_P (in) || GET_CODE (in) == SUBREG)
+	      && GET_MODE (in) == DImode);
+  gcc_assert (amount
+	      && (((REG_P (amount) || GET_CODE (amount) == SUBREG)
+		   && GET_MODE (amount) == SImode)
+		  || CONST_INT_P (amount)));
+  gcc_assert (scratch1 == NULL
+	      || (GET_CODE (scratch1) == SCRATCH)
+	      || (GET_MODE (scratch1) == SImode
+		  && REG_P (scratch1)));
+  gcc_assert (scratch2 == NULL
+	      || (GET_CODE (scratch2) == SCRATCH)
+	      || (GET_MODE (scratch2) == SImode
+		  && REG_P (scratch2)));
+  gcc_assert (!REG_P (out) || !REG_P (amount)
+	      || !HARD_REGISTER_P (out)
+	      || (REGNO (out) != REGNO (amount)
+		  && REGNO (out) + 1 != REGNO (amount)));
+
+  /* Macros to make following code more readable.  */
+  #define SUB_32(DEST,SRC) \
+	    gen_addsi3 ((DEST), (SRC), gen_rtx_CONST_INT (VOIDmode, -32))
+  #define RSB_32(DEST,SRC) \
+	    gen_subsi3 ((DEST), gen_rtx_CONST_INT (VOIDmode, 32), (SRC))
+  #define SUB_S_32(DEST,SRC) \
+	    gen_addsi3_compare0 ((DEST), (SRC), \
+				 gen_rtx_CONST_INT (VOIDmode, -32))
+  #define SET(DEST,SRC) \
+	    gen_rtx_SET (SImode, (DEST), (SRC))
+  #define SHIFT(CODE,SRC,AMOUNT) \
+	    gen_rtx_fmt_ee ((CODE), SImode, (SRC), (AMOUNT))
+  #define LSHIFT(CODE,SRC,AMOUNT) \
+	    gen_rtx_fmt_ee ((CODE) == ASHIFT ? ASHIFT : LSHIFTRT, \
+			    SImode, (SRC), (AMOUNT))
+  #define REV_LSHIFT(CODE,SRC,AMOUNT) \
+	    gen_rtx_fmt_ee ((CODE) == ASHIFT ? LSHIFTRT : ASHIFT, \
+			    SImode, (SRC), (AMOUNT))
+  #define ORR(A,B) \
+	    gen_rtx_IOR (SImode, (A), (B))
+  #define IF(COND,RTX) \
+	    gen_rtx_COND_EXEC (VOIDmode, \
+			       gen_rtx_ ## COND (CCmode, cc_reg, \
+						 const0_rtx), \
+			       (RTX))
+
+  if (CONST_INT_P (amount))
+    {
+      /* Shifts by a constant amount.  */
+      if (INTVAL (amount) <= 0)
+	/* Match what shift-by-register would do.  */
+	emit_insn (gen_movdi (out, in));
+      else if (INTVAL (amount) >= 64)
+	{
+	  /* Match what shift-by-register would do.  */
+	  if (code == ASHIFTRT)
+	    {
+	      rtx const31_rtx = gen_rtx_CONST_INT (VOIDmode, 31);
+	      emit_insn (SET (out_down, SHIFT (code, in_up, const31_rtx)));
+	      emit_insn (SET (out_up, SHIFT (code, in_up, const31_rtx)));
+	    }
+	  else
+	    emit_insn (gen_movdi (out, const0_rtx));
+	}
+      else if (INTVAL (amount) < 32)
+	{
+	  /* Shifts by a constant less than 32.  */
+	  rtx reverse_amount = gen_rtx_CONST_INT (VOIDmode,
+						  32 - INTVAL (amount));
+
+	  emit_insn (SET (out_down, LSHIFT (code, in_down, amount)));
+	  emit_insn (SET (out_down,
+			  ORR (REV_LSHIFT (code, in_up, reverse_amount),
+			       out_down)));
+	  emit_insn (SET (out_up, SHIFT (code, in_up, amount)));
+	}
+      else
+	{
+	  /* Shifts by a constant greater than 31.  */
+	  rtx adj_amount = gen_rtx_CONST_INT (VOIDmode, INTVAL (amount) - 32);
+
+	  emit_insn (SET (out_down, SHIFT (code, in_up, adj_amount)));
+	  if (code == ASHIFTRT)
+	    emit_insn (gen_ashrsi3 (out_up, in_up,
+				    gen_rtx_CONST_INT (VOIDmode, 31)));
+	  else
+	    emit_insn (SET (out_up, const0_rtx));
+	}
+    }
+  else
+    {
+      /* Shifts by a variable amount.  */
+      rtx cc_reg = gen_rtx_REG (CC_NCVmode, CC_REGNUM);
+
+      gcc_assert (scratch1 && REG_P (scratch1));
+      gcc_assert (scratch2 && REG_P (scratch2));
+
+      switch (code)
+	{
+	case ASHIFT:
+	  emit_insn (SUB_32 (scratch1, amount));
+	  emit_insn (RSB_32 (scratch2, amount));
+	  break;
+	case ASHIFTRT:
+	  emit_insn (RSB_32 (scratch1, amount));
+	  emit_insn (SUB_S_32 (scratch2, amount));
+	  break;
+	case LSHIFTRT:
+	  emit_insn (RSB_32 (scratch1, amount));
+	  emit_insn (SUB_32 (scratch2, amount));
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+
+      emit_insn (SET (out_down, LSHIFT (code, in_down, amount)));
+
+      if (!TARGET_THUMB2)
+	{
+	  /* If this were only called during expand we could just use the else
+	     case and let combine deal with it, but this can also be called
+	     from post-reload splitters.  */
+	  emit_insn (SET (out_down,
+			  ORR (SHIFT (ASHIFT, in_up, scratch1), out_down)));
+	  if (code == ASHIFTRT)
+	    {
+	      emit_insn (IF (GE,
+			     SET (out_down,
+				  ORR (SHIFT (ASHIFTRT, in_up, scratch2),
+				       out_down))));
+	    }
+	  else
+	    emit_insn (SET (out_down, ORR (SHIFT (LSHIFTRT, in_up, scratch2),
+					   out_down)));
+	}
+      else
+	{
+	  /* Thumb2 can't do shift and or in one insn.  */
+	  emit_insn (SET (scratch1, SHIFT (ASHIFT, in_up, scratch1)));
+	  emit_insn (gen_iorsi3 (out_down, out_down, scratch1));
+
+	  if (code == ASHIFTRT)
+	    {
+	      emit_insn (IF (GE, SET (scratch2,
+				      SHIFT (ASHIFTRT, in_up, scratch2))));
+	      emit_insn (IF (GE, SET (out_down, ORR (out_down, scratch2))));
+	    }
+	  else
+	    {
+	      emit_insn (SET (scratch2, SHIFT (LSHIFTRT, in_up, scratch2)));
+	      emit_insn (gen_iorsi3 (out_down, out_down, scratch2));
+	    }
+	}
+
+      emit_insn (SET (out_up, SHIFT (code, in_up, amount)));
+    }
+
+  #undef SUB_32
+  #undef RSB_32
+  #undef SUB_S_32
+  #undef SET
+  #undef SHIFT
+  #undef LSHIFT
+  #undef REV_LSHIFT
+  #undef ORR
+  #undef IF
+}
+
 #include "gt-arm.h"
 
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 751997f..7cae822 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -3466,21 +3466,33 @@
                    (match_operand:SI 2 "reg_or_int_operand" "")))]
   "TARGET_32BIT"
   "
-  if (GET_CODE (operands[2]) == CONST_INT)
+  if (!CONST_INT_P (operands[2])
+      && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
+    ; /* No special preparation statements; expand pattern as above.  */
+  else
     {
-      if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
+      rtx scratch1, scratch2;
+
+      if (GET_CODE (operands[2]) == CONST_INT
+	  && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
         {
           emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1]));
           DONE;
         }
-        /* Ideally we shouldn't fail here if we could know that operands[1] 
-           ends up already living in an iwmmxt register. Otherwise it's
-           cheaper to have the alternate code being generated than moving
-           values to iwmmxt regs and back.  */
-        FAIL;
+
+      /* Ideally we should use iwmmxt here if we could know that operands[1] 
+         ends up already living in an iwmmxt register. Otherwise it's
+         cheaper to have the alternate code being generated than moving
+         values to iwmmxt regs and back.  */
+
+      /* Expand operation using core-registers.
+	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
+      scratch1 = gen_reg_rtx (SImode);
+      scratch2 = gen_reg_rtx (SImode);
+      arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
+				     operands[2], scratch1, scratch2);
+      DONE;
     }
-  else if (!TARGET_REALLY_IWMMXT && !(TARGET_HARD_FLOAT && TARGET_MAVERICK))
-    FAIL;
   "
 )
 
@@ -3525,21 +3537,33 @@
                      (match_operand:SI 2 "reg_or_int_operand" "")))]
   "TARGET_32BIT"
   "
-  if (GET_CODE (operands[2]) == CONST_INT)
+  if (!CONST_INT_P (operands[2])
+      && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
+    ; /* No special preparation statements; expand pattern as above.  */
+  else
     {
-      if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
+      rtx scratch1, scratch2;
+
+      if (GET_CODE (operands[2]) == CONST_INT
+	  && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
         {
           emit_insn (gen_arm_ashrdi3_1bit (operands[0], operands[1]));
           DONE;
         }
-        /* Ideally we shouldn't fail here if we could know that operands[1] 
-           ends up already living in an iwmmxt register. Otherwise it's
-           cheaper to have the alternate code being generated than moving
-           values to iwmmxt regs and back.  */
-        FAIL;
+
+      /* Ideally we should use iwmmxt here if we could know that operands[1] 
+         ends up already living in an iwmmxt register. Otherwise it's
+         cheaper to have the alternate code being generated than moving
+         values to iwmmxt regs and back.  */
+
+      /* Expand operation using core-registers.
+	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
+      scratch1 = gen_reg_rtx (SImode);
+      scratch2 = gen_reg_rtx (SImode);
+      arm_emit_coreregs_64bit_shift (ASHIFTRT, operands[0], operands[1],
+				     operands[2], scratch1, scratch2);
+      DONE;
     }
-  else if (!TARGET_REALLY_IWMMXT)
-    FAIL;
   "
 )
 
@@ -3582,21 +3606,33 @@
                      (match_operand:SI 2 "reg_or_int_operand" "")))]
   "TARGET_32BIT"
   "
-  if (GET_CODE (operands[2]) == CONST_INT)
+  if (!CONST_INT_P (operands[2])
+      && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
+    ; /* No special preparation statements; expand pattern as above.  */
+  else
     {
-      if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
+      rtx scratch1, scratch2;
+
+      if (GET_CODE (operands[2]) == CONST_INT
+	  && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
         {
           emit_insn (gen_arm_lshrdi3_1bit (operands[0], operands[1]));
           DONE;
         }
-        /* Ideally we shouldn't fail here if we could know that operands[1] 
-           ends up already living in an iwmmxt register. Otherwise it's
-           cheaper to have the alternate code being generated than moving
-           values to iwmmxt regs and back.  */
-        FAIL;
+
+      /* Ideally we should use iwmmxt here if we could know that operands[1] 
+         ends up already living in an iwmmxt register. Otherwise it's
+         cheaper to have the alternate code being generated than moving
+         values to iwmmxt regs and back.  */
+
+      /* Expand operation using core-registers.
+	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
+      scratch1 = gen_reg_rtx (SImode);
+      scratch2 = gen_reg_rtx (SImode);
+      arm_emit_coreregs_64bit_shift (LSHIFTRT, operands[0], operands[1],
+				     operands[2], scratch1, scratch2);
+      DONE;
     }
-  else if (!TARGET_REALLY_IWMMXT)
-    FAIL;
   "
 )
 


* Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
  2012-01-27 16:07 [PATCH][ARM] Improve 64-bit shifts (non-NEON) Andrew Stubbs
@ 2012-01-30 15:26 ` Richard Earnshaw
  2012-01-31 14:29   ` Andrew Stubbs
  0 siblings, 1 reply; 18+ messages in thread
From: Richard Earnshaw @ 2012-01-30 15:26 UTC (permalink / raw)
  To: Andrew Stubbs; +Cc: gcc-patches, patches

On 27/01/12 16:07, Andrew Stubbs wrote:
> Hi all,
> 
> This patch introduces a new, more efficient set of DImode shift 
> sequences for values stored in core-registers (as opposed to VFP/NEON 
> registers).
> 
> The new sequences take advantage of knowledge of what the ARM 
> instructions do with out-of-range shift amounts.
> 
> The following examples use a simple test case like this one:
> 
> long long
> f (long long *a, int b)
> {
>    return *a << b;
> }
> 
> 
> In ARM mode, old left-shift vs. the new one:
> 
>      stmfd   sp!, {r4, r5}        | ldrd    r2, [r0]
>      rsb     r4, r1, #32          | mov     ip, r1
>      ldr     r5, [r0, #4]         | stmfd   sp!, {r4, r5}
>      subs    ip, r1, #32          | sub     r5, ip, #32
>      ldr     r0, [r0, #0]         | rsb     r4, ip, #32
>      mov     r3, r5, asl r1       | mov     r1, r3, asl ip
>      orr     r3, r3, r0, lsr r4   | mov     r0, r2, asl ip
>      mov     r2, r0, asl r1       | orr     r1, r1, r2, asl r5
>      movpl   r3, r0, asl ip       | orr     r1, r1, r2, lsr r4
>      mov     r0, r2               | ldmfd   sp!, {r4, r5}
>      mov     r1, r3               | bx      lr
>      ldmfd   sp!, {r4, r5}        |
>      bx      lr                   |
> 
> In Thumb mode, old left-shift vs. new:
> 
>      ldr     r2, [r0, #0]         | ldrd    r2, [r0]
>      ldr     r3, [r0, #4]         | push    {r4, r5, r6}
>      push    {r4, r5, r6}         | sub     r6, r1, #32
>      rsb     r6, r1, #32          | mov     r4, r1
>      sub     r4, r1, #32          | rsb     r5, r1, #32
>      lsls    r3, r3, r1           | lsls    r6, r2, r6
>      lsrs    r6, r2, r6           | lsls    r1, r3, r1
>      lsls    r5, r2, r4           | lsrs    r5, r2, r5
>      orrs    r3, r3, r6           | lsls    r0, r2, r4
>      lsls    r0, r2, r1           | orrs    r1, r1, r6
>      bics    r1, r5, r4, asr #32  | orrs    r1, r1, r5
>      it      cs                   | pop     {r4, r5, r6}
>      movcs   r1, r3               | bx      lr
>      pop     {r4, r5, r6}         |
>      bx      lr                   |
> 
> Logical right shift is essentially the same sequence as the left shift 
> above. However, arithmetic right shift requires something slightly 
> different. Here it is in ARM mode, old vs. new:
> 
>      stmfd   sp!, {r4, r5}        | ldrd    r2, [r0]
>      rsb     r4, r1, #32          | mov     ip, r1
>      ldr     r5, [r0, #0]         | stmfd   sp!, {r4, r5}
>      subs    ip, r1, #32          | rsb     r5, ip, #32
>      ldr     r0, [r0, #4]         | subs    r4, ip, #32
>      mov     r2, r5, lsr r1       | mov     r0, r2, lsr ip
>      orr     r2, r2, r0, asl r4   | mov     r1, r3, asr ip
>      mov     r3, r0, asr r1       | orr     r0, r0, r3, asl r5
>      movpl   r2, r0, asr ip       | orrge   r0, r0, r3, asr r4
>      mov     r1, r3               | ldmfd   sp!, {r4, r5}
>      mov     r0, r2               | bx      lr
>      ldmfd   sp!, {r4, r5}        |
>      bx      lr                   |
> 
> I won't bore you with the Thumb mode comparison.
> 
> The shift-by-constant cases have also been reimplemented, although the 
> resultant sequences are much the same as before. (Doing this isn't 
> strictly necessary just yet, but when I post my next patch to do 64-bit 
> shifts in NEON, this feature will be required by the fall-back 
> alternatives.)
> 
> I've run a regression test on a cross-compiler, and I should have native
> test results, along with some benchmark results, sometime next week.
> 
> Is this OK for stage 1?
> 

What's the impact of this on -Os?  At present we fall back to the
libcalls, but I can't immediately see how the new code would do that.

Gut feeling is that shift by a constant is always worth inlining at -Os,
but shift by a register isn't.

R.




* Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
  2012-01-30 15:26 ` Richard Earnshaw
@ 2012-01-31 14:29   ` Andrew Stubbs
  2012-02-07 22:20     ` Ramana Radhakrishnan
  0 siblings, 1 reply; 18+ messages in thread
From: Andrew Stubbs @ 2012-01-31 14:29 UTC (permalink / raw)
  To: Richard Earnshaw; +Cc: gcc-patches, patches

[-- Attachment #1: Type: text/plain, Size: 467 bytes --]

On 30/01/12 15:25, Richard Earnshaw wrote:
> What's the impact of this on -Os?  At present we fall back to the
> libcalls, but I can't immediately see how the new code would do that.
>
> Gut feeling is that shift by a constant is always worth inlining at -Os,
> but shift by a register isn't.

Ah, I hadn't considered that. Good point!

This updated patch causes it to fall back to the old behaviour in 
optimize_size mode. This should do what you want.

OK?

Andrew

[-- Attachment #2: core-shift64.patch --]
[-- Type: text/x-patch, Size: 13077 bytes --]

2012-01-31  Andrew Stubbs  <ams@codesourcery.com>

	* config/arm/arm-protos.h (arm_emit_coreregs_64bit_shift): New
	prototype.
	* config/arm/arm.c (arm_emit_coreregs_64bit_shift): New function.
	* config/arm/arm.md (ashldi3): Use arm_emit_coreregs_64bit_shift.
	(ashrdi3,lshrdi3): Likewise.

---
 gcc/config/arm/arm-protos.h |    3 +
 gcc/config/arm/arm.c        |  198 +++++++++++++++++++++++++++++++++++++++++++
 gcc/config/arm/arm.md       |  102 ++++++++++++++++------
 3 files changed, 276 insertions(+), 27 deletions(-)

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 296550a..df8d7a9 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -242,6 +242,9 @@ struct tune_params
 
 extern const struct tune_params *current_tune;
 extern int vfp3_const_double_for_fract_bits (rtx);
+
+extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
+					   rtx);
 #endif /* RTX_CODE */
 
 #endif /* ! GCC_ARM_PROTOS_H */
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 0bded8d..eefc45c 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -25139,5 +25139,203 @@ vfp3_const_double_for_fract_bits (rtx operand)
   return 0;
 }
 
+/* The default expansion of general 64-bit shifts in core-regs is suboptimal
+   on ARM, since we know that shifts by negative amounts are no-ops.
+
+   It's safe for the input and output to be the same register, but
+   early-clobber rules apply for the shift amount and scratch registers.
+
+   Shift by register requires both scratch registers.  Shift by a constant
+   less than 32 in Thumb2 mode requires SCRATCH1 only.  In all other cases
+   the scratch registers may be NULL.
+   
+   Additionally, ashiftrt by a register also clobbers the CC register.  */
+void
+arm_emit_coreregs_64bit_shift (enum rtx_code code, rtx out, rtx in,
+			       rtx amount, rtx scratch1, rtx scratch2)
+{
+  rtx out_high = gen_highpart (SImode, out);
+  rtx out_low = gen_lowpart (SImode, out);
+  rtx in_high = gen_highpart (SImode, in);
+  rtx in_low = gen_lowpart (SImode, in);
+
+  /* Bits flow from up-stream to down-stream.  */
+  rtx out_up   = code == ASHIFT ? out_low : out_high;
+  rtx out_down = code == ASHIFT ? out_high : out_low;
+  rtx in_up   = code == ASHIFT ? in_low : in_high;
+  rtx in_down = code == ASHIFT ? in_high : in_low;
+
+  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
+  gcc_assert (out
+	      && (REG_P (out) || GET_CODE (out) == SUBREG)
+	      && GET_MODE (out) == DImode);
+  gcc_assert (in
+	      && (REG_P (in) || GET_CODE (in) == SUBREG)
+	      && GET_MODE (in) == DImode);
+  gcc_assert (amount
+	      && (((REG_P (amount) || GET_CODE (amount) == SUBREG)
+		   && GET_MODE (amount) == SImode)
+		  || CONST_INT_P (amount)));
+  gcc_assert (scratch1 == NULL
+	      || (GET_CODE (scratch1) == SCRATCH)
+	      || (GET_MODE (scratch1) == SImode
+		  && REG_P (scratch1)));
+  gcc_assert (scratch2 == NULL
+	      || (GET_CODE (scratch2) == SCRATCH)
+	      || (GET_MODE (scratch2) == SImode
+		  && REG_P (scratch2)));
+  gcc_assert (!REG_P (out) || !REG_P (amount)
+	      || !HARD_REGISTER_P (out)
+	      || (REGNO (out) != REGNO (amount)
+		  && REGNO (out) + 1 != REGNO (amount)));
+
+  /* Macros to make following code more readable.  */
+  #define SUB_32(DEST,SRC) \
+	    gen_addsi3 ((DEST), (SRC), gen_rtx_CONST_INT (VOIDmode, -32))
+  #define RSB_32(DEST,SRC) \
+	    gen_subsi3 ((DEST), gen_rtx_CONST_INT (VOIDmode, 32), (SRC))
+  #define SUB_S_32(DEST,SRC) \
+	    gen_addsi3_compare0 ((DEST), (SRC), \
+				 gen_rtx_CONST_INT (VOIDmode, -32))
+  #define SET(DEST,SRC) \
+	    gen_rtx_SET (SImode, (DEST), (SRC))
+  #define SHIFT(CODE,SRC,AMOUNT) \
+	    gen_rtx_fmt_ee ((CODE), SImode, (SRC), (AMOUNT))
+  #define LSHIFT(CODE,SRC,AMOUNT) \
+	    gen_rtx_fmt_ee ((CODE) == ASHIFT ? ASHIFT : LSHIFTRT, \
+			    SImode, (SRC), (AMOUNT))
+  #define REV_LSHIFT(CODE,SRC,AMOUNT) \
+	    gen_rtx_fmt_ee ((CODE) == ASHIFT ? LSHIFTRT : ASHIFT, \
+			    SImode, (SRC), (AMOUNT))
+  #define ORR(A,B) \
+	    gen_rtx_IOR (SImode, (A), (B))
+  #define IF(COND,RTX) \
+	    gen_rtx_COND_EXEC (VOIDmode, \
+			       gen_rtx_ ## COND (CCmode, cc_reg, \
+						 const0_rtx), \
+			       (RTX))
+
+  if (CONST_INT_P (amount))
+    {
+      /* Shifts by a constant amount.  */
+      if (INTVAL (amount) <= 0)
+	/* Match what shift-by-register would do.  */
+	emit_insn (gen_movdi (out, in));
+      else if (INTVAL (amount) >= 64)
+	{
+	  /* Match what shift-by-register would do.  */
+	  if (code == ASHIFTRT)
+	    {
+	      rtx const31_rtx = gen_rtx_CONST_INT (VOIDmode, 31);
+	      emit_insn (SET (out_down, SHIFT (code, in_up, const31_rtx)));
+	      emit_insn (SET (out_up, SHIFT (code, in_up, const31_rtx)));
+	    }
+	  else
+	    emit_insn (gen_movdi (out, const0_rtx));
+	}
+      else if (INTVAL (amount) < 32)
+	{
+	  /* Shifts by a constant less than 32.  */
+	  rtx reverse_amount = gen_rtx_CONST_INT (VOIDmode,
+						  32 - INTVAL (amount));
+
+	  emit_insn (SET (out_down, LSHIFT (code, in_down, amount)));
+	  emit_insn (SET (out_down,
+			  ORR (REV_LSHIFT (code, in_up, reverse_amount),
+			       out_down)));
+	  emit_insn (SET (out_up, SHIFT (code, in_up, amount)));
+	}
+      else
+	{
+	  /* Shifts by a constant greater than 31.  */
+	  rtx adj_amount = gen_rtx_CONST_INT (VOIDmode, INTVAL (amount) - 32);
+
+	  emit_insn (SET (out_down, SHIFT (code, in_up, adj_amount)));
+	  if (code == ASHIFTRT)
+	    emit_insn (gen_ashrsi3 (out_up, in_up,
+				    gen_rtx_CONST_INT (VOIDmode, 31)));
+	  else
+	    emit_insn (SET (out_up, const0_rtx));
+	}
+    }
+  else
+    {
+      /* Shifts by a variable amount.  */
+      rtx cc_reg = gen_rtx_REG (CC_NCVmode, CC_REGNUM);
+
+      gcc_assert (scratch1 && REG_P (scratch1));
+      gcc_assert (scratch2 && REG_P (scratch2));
+
+      switch (code)
+	{
+	case ASHIFT:
+	  emit_insn (SUB_32 (scratch1, amount));
+	  emit_insn (RSB_32 (scratch2, amount));
+	  break;
+	case ASHIFTRT:
+	  emit_insn (RSB_32 (scratch1, amount));
+	  emit_insn (SUB_S_32 (scratch2, amount));
+	  break;
+	case LSHIFTRT:
+	  emit_insn (RSB_32 (scratch1, amount));
+	  emit_insn (SUB_32 (scratch2, amount));
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+
+      emit_insn (SET (out_down, LSHIFT (code, in_down, amount)));
+
+      if (!TARGET_THUMB2)
+	{
+	  /* If this were only called during expand we could just use the else
+	     case and let combine deal with it, but this can also be called
+	     from post-reload splitters.  */
+	  emit_insn (SET (out_down,
+			  ORR (SHIFT (ASHIFT, in_up, scratch1), out_down)));
+	  if (code == ASHIFTRT)
+	    {
+	      emit_insn (IF (GE,
+			     SET (out_down,
+				  ORR (SHIFT (ASHIFTRT, in_up, scratch2),
+				       out_down))));
+	    }
+	  else
+	    emit_insn (SET (out_down, ORR (SHIFT (LSHIFTRT, in_up, scratch2),
+					   out_down)));
+	}
+      else
+	{
+	  /* Thumb2 can't do shift and or in one insn.  */
+	  emit_insn (SET (scratch1, SHIFT (ASHIFT, in_up, scratch1)));
+	  emit_insn (gen_iorsi3 (out_down, out_down, scratch1));
+
+	  if (code == ASHIFTRT)
+	    {
+	      emit_insn (IF (GE, SET (scratch2,
+				      SHIFT (ASHIFTRT, in_up, scratch2))));
+	      emit_insn (IF (GE, SET (out_down, ORR (out_down, scratch2))));
+	    }
+	  else
+	    {
+	      emit_insn (SET (scratch2, SHIFT (LSHIFTRT, in_up, scratch2)));
+	      emit_insn (gen_iorsi3 (out_down, out_down, scratch2));
+	    }
+	}
+
+      emit_insn (SET (out_up, SHIFT (code, in_up, amount)));
+    }
+
+  #undef SUB_32
+  #undef RSB_32
+  #undef SUB_S_32
+  #undef SET
+  #undef SHIFT
+  #undef LSHIFT
+  #undef REV_LSHIFT
+  #undef ORR
+  #undef IF
+}
+
 #include "gt-arm.h"
 
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 751997f..e814dc4 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -3466,21 +3466,37 @@
                    (match_operand:SI 2 "reg_or_int_operand" "")))]
   "TARGET_32BIT"
   "
-  if (GET_CODE (operands[2]) == CONST_INT)
+  if (!CONST_INT_P (operands[2])
+      && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
+    ; /* No special preparation statements; expand pattern as above.  */
+  else
     {
-      if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
+      rtx scratch1, scratch2;
+
+      if (GET_CODE (operands[2]) == CONST_INT
+	  && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
         {
           emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1]));
           DONE;
         }
-        /* Ideally we shouldn't fail here if we could know that operands[1] 
-           ends up already living in an iwmmxt register. Otherwise it's
-           cheaper to have the alternate code being generated than moving
-           values to iwmmxt regs and back.  */
-        FAIL;
+
+      /* Ideally we should use iwmmxt here if we could know that operands[1] 
+         ends up already living in an iwmmxt register. Otherwise it's
+         cheaper to have the alternate code being generated than moving
+         values to iwmmxt regs and back.  */
+
+      /* If we're optimizing for size, we prefer the libgcc calls.  */
+      if (optimize_size)
+	FAIL;
+
+      /* Expand operation using core-registers.
+	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
+      scratch1 = gen_reg_rtx (SImode);
+      scratch2 = gen_reg_rtx (SImode);
+      arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
+				     operands[2], scratch1, scratch2);
+      DONE;
     }
-  else if (!TARGET_REALLY_IWMMXT && !(TARGET_HARD_FLOAT && TARGET_MAVERICK))
-    FAIL;
   "
 )
 
@@ -3525,21 +3541,37 @@
                      (match_operand:SI 2 "reg_or_int_operand" "")))]
   "TARGET_32BIT"
   "
-  if (GET_CODE (operands[2]) == CONST_INT)
+  if (!CONST_INT_P (operands[2])
+      && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
+    ; /* No special preparation statements; expand pattern as above.  */
+  else
     {
-      if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
+      rtx scratch1, scratch2;
+
+      if (GET_CODE (operands[2]) == CONST_INT
+	  && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
         {
           emit_insn (gen_arm_ashrdi3_1bit (operands[0], operands[1]));
           DONE;
         }
-        /* Ideally we shouldn't fail here if we could know that operands[1] 
-           ends up already living in an iwmmxt register. Otherwise it's
-           cheaper to have the alternate code being generated than moving
-           values to iwmmxt regs and back.  */
-        FAIL;
+
+      /* Ideally we should use iwmmxt here if we could know that operands[1] 
+         ends up already living in an iwmmxt register. Otherwise it's
+         cheaper to have the alternate code being generated than moving
+         values to iwmmxt regs and back.  */
+
+      /* If we're optimizing for size, we prefer the libgcc calls.  */
+      if (optimize_size)
+	FAIL;
+
+      /* Expand operation using core-registers.
+	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
+      scratch1 = gen_reg_rtx (SImode);
+      scratch2 = gen_reg_rtx (SImode);
+      arm_emit_coreregs_64bit_shift (ASHIFTRT, operands[0], operands[1],
+				     operands[2], scratch1, scratch2);
+      DONE;
     }
-  else if (!TARGET_REALLY_IWMMXT)
-    FAIL;
   "
 )
 
@@ -3582,21 +3614,37 @@
                      (match_operand:SI 2 "reg_or_int_operand" "")))]
   "TARGET_32BIT"
   "
-  if (GET_CODE (operands[2]) == CONST_INT)
+  if (!CONST_INT_P (operands[2])
+      && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
+    ; /* No special preparation statements; expand pattern as above.  */
+  else
     {
-      if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
+      rtx scratch1, scratch2;
+
+      if (GET_CODE (operands[2]) == CONST_INT
+	  && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
         {
           emit_insn (gen_arm_lshrdi3_1bit (operands[0], operands[1]));
           DONE;
         }
-        /* Ideally we shouldn't fail here if we could know that operands[1] 
-           ends up already living in an iwmmxt register. Otherwise it's
-           cheaper to have the alternate code being generated than moving
-           values to iwmmxt regs and back.  */
-        FAIL;
+
+      /* Ideally we should use iwmmxt here if we could know that operands[1] 
+         ends up already living in an iwmmxt register. Otherwise it's
+         cheaper to have the alternate code being generated than moving
+         values to iwmmxt regs and back.  */
+
+      /* If we're optimizing for size, we prefer the libgcc calls.  */
+      if (optimize_size)
+	FAIL;
+
+      /* Expand operation using core-registers.
+	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
+      scratch1 = gen_reg_rtx (SImode);
+      scratch2 = gen_reg_rtx (SImode);
+      arm_emit_coreregs_64bit_shift (LSHIFTRT, operands[0], operands[1],
+				     operands[2], scratch1, scratch2);
+      DONE;
     }
-  else if (!TARGET_REALLY_IWMMXT)
-    FAIL;
   "
 )
 


* Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
  2012-01-31 14:29   ` Andrew Stubbs
@ 2012-02-07 22:20     ` Ramana Radhakrishnan
  2012-02-07 22:34       ` Steven Bosscher
  2012-02-08 11:34       ` Andrew Stubbs
  0 siblings, 2 replies; 18+ messages in thread
From: Ramana Radhakrishnan @ 2012-02-07 22:20 UTC (permalink / raw)
  To: Andrew Stubbs; +Cc: Richard Earnshaw, gcc-patches, patches

Hi Andrew

I find it interesting that cond_execs in this form survive all the
way till reload and "work".  AFAIK we could never have cond_execs
before reload.  The documentation doesn't appear to mention this,
so I would like to see whether the cond_execs can be recast as
if_then_else forms from expand rather than these forms.  Has this
survived bootstrap and testing?
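
Roughly, the contrast I mean, built the way the patch builds RTL (just
a sketch -- cc_reg, dest and src stand for the condition register and
SImode operands set up elsewhere):

  rtx cond = gen_rtx_GE (CCmode, cc_reg, const0_rtx);

  /* Predicated (cond_exec) form, as the patch's IF macro emits:
     the insn executes only when the condition holds.  */
  emit_insn (gen_rtx_COND_EXEC (VOIDmode, cond,
                                gen_rtx_SET (SImode, dest, src)));

  /* if_then_else (conditional select) form: the insn always
     executes, and dest is set to either src or its old value.  */
  emit_insn (gen_rtx_SET (SImode, dest,
                          gen_rtx_IF_THEN_ELSE (SImode, cond,
                                                src, dest)));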


> +      /* If we're optimizing for size, we prefer the libgcc calls.  */
> +      if (optimize_size)
> +	FAIL;

In addition you want to replace optimize_size with
optimize_function_for_size_p in these cases.
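
i.e. something like:

      /* If we're optimizing this function for size, prefer the
         libgcc calls.  */
      if (optimize_function_for_size_p (cfun))
	FAIL;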

cheers
Ramana


* Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
  2012-02-07 22:20     ` Ramana Radhakrishnan
@ 2012-02-07 22:34       ` Steven Bosscher
  2012-02-08 12:13         ` Bernd Schmidt
  2012-02-08 11:34       ` Andrew Stubbs
  1 sibling, 1 reply; 18+ messages in thread
From: Steven Bosscher @ 2012-02-07 22:34 UTC (permalink / raw)
  To: Ramana Radhakrishnan
  Cc: Andrew Stubbs, Richard Earnshaw, gcc-patches, patches

On Tue, Feb 7, 2012 at 11:19 PM, Ramana Radhakrishnan
<ramana.radhakrishnan@linaro.org> wrote:
> Hi Andrew
>
> I find it interesting that cond_execs in this form survive all the
> way till reload and "work".  AFAIK we could never have cond_execs
> before reload.

There is nothing wrong per se with cond_execs before reload, as long
as you don't have to reload a predicate pseudo-reg. For ia64, with its
(IIRC) 64 predicate registers, it was not practical, or even possible,
to assign the predicate registers to hard regs during expand.

AFAIU this isn't an issue on ARM because there is just one condition
code register.

Ciao!
Steven


* Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
  2012-02-07 22:20     ` Ramana Radhakrishnan
  2012-02-07 22:34       ` Steven Bosscher
@ 2012-02-08 11:34       ` Andrew Stubbs
  2012-02-08 16:43         ` Andrew Stubbs
  1 sibling, 1 reply; 18+ messages in thread
From: Andrew Stubbs @ 2012-02-08 11:34 UTC (permalink / raw)
  To: Ramana Radhakrishnan; +Cc: Richard Earnshaw, gcc-patches, patches

On 07/02/12 22:19, Ramana Radhakrishnan wrote:
> I find it interesting that cond_execs in this form survive all the
> way till reload and "work".  AFAIK we could never have cond_execs
> before reload.  The documentation doesn't appear to mention this,
> so I would like to see whether the cond_execs can be recast as
> if_then_else forms from expand rather than these forms. Has this
> survived bootstrap and testing?

I've tried to do this, but it can't be done by a straight translation 
because we don't have the insns available to do it. As I understand it, 
all "predicable" instructions automatically get a cond_exec variant, but 
the only if_then_else I could find (it's hard to grep because 
if_then_else occurs many times in attributes) is in the conditional move 
instruction. Have I missed something?

Anyway, I've tried to redo it using conditional move, and that works, 
but it's not as efficient.

Hopefully Steven Bosscher is correct, and there's no problem with 
cond_exec on ARM.

>> +      /* If we're optimizing for size, we prefer the libgcc calls.  */
>> +      if (optimize_size)
>> +	FAIL;
>
> In addition you want to replace optimize_size with
> optimize_function_for_size_p in these cases.

Ok, I've fixed this. I'll post an update soon.

Unfortunately, my testing has found a problem in the native bootstrap, 
so I'm going to need to track that down first.

Andrew


* Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
  2012-02-07 22:34       ` Steven Bosscher
@ 2012-02-08 12:13         ` Bernd Schmidt
  2012-02-08 12:32           ` Richard Guenther
  0 siblings, 1 reply; 18+ messages in thread
From: Bernd Schmidt @ 2012-02-08 12:13 UTC (permalink / raw)
  To: Steven Bosscher
  Cc: Ramana Radhakrishnan, Andrew Stubbs, Richard Earnshaw,
	gcc-patches, patches

On 02/07/2012 11:33 PM, Steven Bosscher wrote:
> On Tue, Feb 7, 2012 at 11:19 PM, Ramana Radhakrishnan
> <ramana.radhakrishnan@linaro.org> wrote:
>> Hi Andrew
>>
>> I find it interesting that cond_execs in this form survive all the
>> way till reload and "work".  AFAIK we could never have cond_execs
>> before reload.
> 
> There is nothing wrong per-se with cond_execs before reload, as long
> as you don't have to reload a predicate pseudo-reg.

I thought the problem was that we'd have to emit conditional reload
insns and inheritance wouldn't work.


Bernd


* Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
  2012-02-08 12:13         ` Bernd Schmidt
@ 2012-02-08 12:32           ` Richard Guenther
  2012-02-08 13:01             ` Bernd Schmidt
  0 siblings, 1 reply; 18+ messages in thread
From: Richard Guenther @ 2012-02-08 12:32 UTC (permalink / raw)
  To: Bernd Schmidt
  Cc: Steven Bosscher, Ramana Radhakrishnan, Andrew Stubbs,
	Richard Earnshaw, gcc-patches, patches

On Wed, Feb 8, 2012 at 1:02 PM, Bernd Schmidt <bernds@codesourcery.com> wrote:
> On 02/07/2012 11:33 PM, Steven Bosscher wrote:
>> On Tue, Feb 7, 2012 at 11:19 PM, Ramana Radhakrishnan
>> <ramana.radhakrishnan@linaro.org> wrote:
>>> Hi Andrew
>>>
>>> I find it interesting that cond_execs in this form survive all the
>>> way till reload and "work".  AFAIK we could never have cond_execs
>>> before reload.
>>
>> There is nothing wrong per se with cond_execs before reload, as long
>> as you don't have to reload a predicate pseudo-reg.
>
> I thought the problem was that we'd have to emit conditional reload
> insns and inheritance wouldn't work.

It probably depends on how DF sees conditional uses / defs.  If they
look like regular uses / defs then I suppose unconditional spills/reloads
are fine - otherwise of course you'd corrupt one of the two register set
states.  But that also means it's probably safe if the sequence of conditional
insns is of length 1.

Not sure we want to open that possible can of worms though ;)

Richard.

>
> Bernd


* Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
  2012-02-08 12:32           ` Richard Guenther
@ 2012-02-08 13:01             ` Bernd Schmidt
  2012-02-08 13:33               ` Richard Guenther
  0 siblings, 1 reply; 18+ messages in thread
From: Bernd Schmidt @ 2012-02-08 13:01 UTC (permalink / raw)
  To: Richard Guenther
  Cc: Steven Bosscher, Ramana Radhakrishnan, Andrew Stubbs,
	Richard Earnshaw, gcc-patches, patches

On 02/08/2012 01:12 PM, Richard Guenther wrote:
> On Wed, Feb 8, 2012 at 1:02 PM, Bernd Schmidt <bernds@codesourcery.com> wrote:
>> On 02/07/2012 11:33 PM, Steven Bosscher wrote:
>>> On Tue, Feb 7, 2012 at 11:19 PM, Ramana Radhakrishnan
>>> <ramana.radhakrishnan@linaro.org> wrote:
>>>> Hi Andrew
>>>>
>>>> I find it interesting that cond_execs in this form survive all the
>>>> way till reload and "work".  AFAIK we could never have cond_execs
>>>> before reload.
>>>
>>> There is nothing wrong per se with cond_execs before reload, as long
>>> as you don't have to reload a predicate pseudo-reg.
>>
>> I thought the problem was that we'd have to emit conditional reload
>> insns and inheritance wouldn't work.
> 
> It probably depends on how DF sees conditional uses / defs.  If they
> look like regular uses / defs then I suppose unconditional spills/reloads
> are fine - otherwise of course you'd corrupt one of the two register set
> states.

I'm pretty sure conditional defs are always RMW, but I'd have to go
look. Can't imagine it working otherwise though.


Bernd


* Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
  2012-02-08 13:01             ` Bernd Schmidt
@ 2012-02-08 13:33               ` Richard Guenther
  0 siblings, 0 replies; 18+ messages in thread
From: Richard Guenther @ 2012-02-08 13:33 UTC (permalink / raw)
  To: Bernd Schmidt
  Cc: Steven Bosscher, Ramana Radhakrishnan, Andrew Stubbs,
	Richard Earnshaw, gcc-patches, patches

On Wed, Feb 8, 2012 at 1:41 PM, Bernd Schmidt <bernds@codesourcery.com> wrote:
> On 02/08/2012 01:12 PM, Richard Guenther wrote:
>> On Wed, Feb 8, 2012 at 1:02 PM, Bernd Schmidt <bernds@codesourcery.com> wrote:
>>> On 02/07/2012 11:33 PM, Steven Bosscher wrote:
>>>> On Tue, Feb 7, 2012 at 11:19 PM, Ramana Radhakrishnan
>>>> <ramana.radhakrishnan@linaro.org> wrote:
>>>>> Hi Andrew
>>>>>
>>>>> I find it interesting that cond_execs in this form survive all the
>>>>> way till reload and "work".  AFAIK we could never have cond_execs
>>>>> before reload.
>>>>
>>>> There is nothing wrong per se with cond_execs before reload, as long
>>>> as you don't have to reload a predicate pseudo-reg.
>>>
>>> I thought the problem was that we'd have to emit conditional reload
>>> insns and inheritance wouldn't work.
>>
>> It probably depends on how DF sees conditional uses / defs.  If they
>> look like regular uses / defs then I suppose unconditional spills/reloads
>> are fine - otherwise of course you'd corrupt one of the two register set
>> states.
>
> I'm pretty sure conditional defs are always RMW, but I'd have to go
> look. Can't imagine it working otherwise though.

I was thinking about

  (cond_exec 1 (set (reg:SI 30) (...)))
  (cond_exec 0 (set (reg:SI 30) (...)))
  (cond_exec 1 (use (reg:SI 30) (...)))

where if we spill/reload 30 with non-cond_exec stmts then we might
clobber the shared register set of the conditional execution paths.
The same applies of course to shared stack space if we happen to
spill/reload in both paths.  If the defs/uses of the two paths conflict
and get different hard registers assigned, the register issue doesn't
exist, but the shared spill stack-space issue may remain.

Richard.


* Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
  2012-02-08 11:34       ` Andrew Stubbs
@ 2012-02-08 16:43         ` Andrew Stubbs
  2012-02-11  2:41           ` Richard Henderson
  0 siblings, 1 reply; 18+ messages in thread
From: Andrew Stubbs @ 2012-02-08 16:43 UTC (permalink / raw)
  To: gcc-patches; +Cc: Ramana Radhakrishnan, Richard Earnshaw, patches

On 08/02/12 11:18, Andrew Stubbs wrote:
> I've tried to do this, but it can't be done by a straight translation
> because we don't have the insns available to do it. As I understand it,
> all "predicable" instructions automatically get a cond_exec variant, but
> the only if_then_else I could find (it's hard to grep because
> if_then_else occurs many times in attributes) is in the conditional move
> instruction. Have I missed something?

Ok, I missed the various patterns like if_arith_move.

Unfortunately, these don't work in Thumb mode (no IT block), and I'd 
have to add arith-shift variants, I think, for ARM mode to work.

Hmmmm ... I'll try again.

Andrew


* Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
  2012-02-08 16:43         ` Andrew Stubbs
@ 2012-02-11  2:41           ` Richard Henderson
  2012-02-16 16:08             ` Andrew Stubbs
  0 siblings, 1 reply; 18+ messages in thread
From: Richard Henderson @ 2012-02-11  2:41 UTC (permalink / raw)
  To: Andrew Stubbs
  Cc: gcc-patches, Ramana Radhakrishnan, Richard Earnshaw, patches

On 02/08/2012 08:28 AM, Andrew Stubbs wrote:
> Unfortunately, these don't work in Thumb mode (no IT block), and I'd have to add arith-shift variants, I think, for ARM mode to work.
> 
> Hmmmm ... I'll try again.

Does it work to simply use branches initially, and rely on post-reload
ifcvt to transform to cond_exec when possible?


r~


* Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
  2012-02-11  2:41           ` Richard Henderson
@ 2012-02-16 16:08             ` Andrew Stubbs
  2012-02-17 17:42               ` Andrew Stubbs
  0 siblings, 1 reply; 18+ messages in thread
From: Andrew Stubbs @ 2012-02-16 16:08 UTC (permalink / raw)
  To: Richard Henderson
  Cc: gcc-patches, Ramana Radhakrishnan, Richard Earnshaw, patches

[-- Attachment #1: Type: text/plain, Size: 1169 bytes --]

On 11/02/12 01:11, Richard Henderson wrote:
> On 02/08/2012 08:28 AM, Andrew Stubbs wrote:
>> Unfortunately, these don't work in Thumb mode (no IT block), and I'd have to add arith-shift variants, I think, for ARM mode to work.
>>
>> Hmmmm ... I'll try again.
>
> Does it work to simply use branches initially, and rely on post-reload
> ifcvt to transform to cond_exec when possible?

It did work in ARM mode, but did not work in Thumb2 mode. My recent 
patch has fixed that:
http://gcc.gnu.org/ml/gcc-patches/2012-02/msg00734.html

With that patch applied, the attached patch has almost the right effect. 
The only problem now is that the scheduling gives multiple IT blocks, 
but that's a different problem.

Here's the new thumb code for ashiftrt:

         ldrd    r2, [r0]
         push    {r4, r5, r6}
         mov     r4, r1
         rsb     r5, r1, #32
         subs    r6, r4, #32
         lsr     r0, r2, r4
         lsl     r5, r3, r5
         it      ge
         asrge   r6, r3, r6
         asr     r1, r3, r4
         orr     r0, r0, r5
         it      ge
         orrge   r0, r0, r6
         pop     {r4, r5, r6}
         bx      lr

OK for 4.8?

Andrew

[-- Attachment #2: core-shift64.patch --]
[-- Type: text/x-patch, Size: 13617 bytes --]

2012-02-16  Andrew Stubbs  <ams@codesourcery.com>

	gcc/
	* config/arm/arm-protos.h (arm_emit_coreregs_64bit_shift): New
	prototype.
	* config/arm/arm.c (arm_emit_coreregs_64bit_shift): New function.
	* config/arm/arm.md (ashldi3): Use arm_emit_coreregs_64bit_shift.
	(ashrdi3,lshrdi3): Likewise.
	(arm_cond_branch): Remove '*' to enable gen_arm_cond_branch.

---
 gcc/config/arm/arm-protos.h |    3 +
 gcc/config/arm/arm.c        |  201 +++++++++++++++++++++++++++++++++++++++++++
 gcc/config/arm/arm.md       |  104 ++++++++++++++++------
 3 files changed, 280 insertions(+), 28 deletions(-)

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 296550a..df8d7a9 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -242,6 +242,9 @@ struct tune_params
 
 extern const struct tune_params *current_tune;
 extern int vfp3_const_double_for_fract_bits (rtx);
+
+extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
+					   rtx);
 #endif /* RTX_CODE */
 
 #endif /* ! GCC_ARM_PROTOS_H */
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 0bded8d..f3accc8 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -25139,5 +25139,206 @@ vfp3_const_double_for_fract_bits (rtx operand)
   return 0;
 }
 
+/* The default expansion of general 64-bit shifts in core-regs is suboptimal
+   on ARM, since we know that shifts by negative amounts are no-ops.
+
+   It's safe for the input and output to be the same register, but
+   early-clobber rules apply for the shift amount and scratch registers.
+
+   Shift by register requires both scratch registers.  Shift by a constant
+   less than 32 in Thumb2 mode requires SCRATCH1 only.  In all other cases
+   the scratch registers may be NULL.
+
+   Additionally, ashiftrt by a register also clobbers the CC register.  */
+void
+arm_emit_coreregs_64bit_shift (enum rtx_code code, rtx out, rtx in,
+			       rtx amount, rtx scratch1, rtx scratch2)
+{
+  rtx out_high = gen_highpart (SImode, out);
+  rtx out_low = gen_lowpart (SImode, out);
+  rtx in_high = gen_highpart (SImode, in);
+  rtx in_low = gen_lowpart (SImode, in);
+
+  /* Bits flow from up-stream to down-stream.  */
+  rtx out_up   = code == ASHIFT ? out_low : out_high;
+  rtx out_down = code == ASHIFT ? out_high : out_low;
+  rtx in_up   = code == ASHIFT ? in_low : in_high;
+  rtx in_down = code == ASHIFT ? in_high : in_low;
+
+  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
+  gcc_assert (out
+	      && (REG_P (out) || GET_CODE (out) == SUBREG)
+	      && GET_MODE (out) == DImode);
+  gcc_assert (in
+	      && (REG_P (in) || GET_CODE (in) == SUBREG)
+	      && GET_MODE (in) == DImode);
+  gcc_assert (amount
+	      && (((REG_P (amount) || GET_CODE (amount) == SUBREG)
+		   && GET_MODE (amount) == SImode)
+		  || CONST_INT_P (amount)));
+  gcc_assert (scratch1 == NULL
+	      || (GET_CODE (scratch1) == SCRATCH)
+	      || (GET_MODE (scratch1) == SImode
+		  && REG_P (scratch1)));
+  gcc_assert (scratch2 == NULL
+	      || (GET_CODE (scratch2) == SCRATCH)
+	      || (GET_MODE (scratch2) == SImode
+		  && REG_P (scratch2)));
+  gcc_assert (!REG_P (out) || !REG_P (amount)
+	      || !HARD_REGISTER_P (out)
+	      || (REGNO (out) != REGNO (amount)
+		  && REGNO (out) + 1 != REGNO (amount)));
+
+  /* Macros to make following code more readable.  */
+  #define SUB_32(DEST,SRC) \
+	    gen_addsi3 ((DEST), (SRC), gen_rtx_CONST_INT (VOIDmode, -32))
+  #define RSB_32(DEST,SRC) \
+	    gen_subsi3 ((DEST), gen_rtx_CONST_INT (VOIDmode, 32), (SRC))
+  #define SUB_S_32(DEST,SRC) \
+	    gen_addsi3_compare0 ((DEST), (SRC), \
+				 gen_rtx_CONST_INT (VOIDmode, -32))
+  #define SET(DEST,SRC) \
+	    gen_rtx_SET (SImode, (DEST), (SRC))
+  #define SHIFT(CODE,SRC,AMOUNT) \
+	    gen_rtx_fmt_ee ((CODE), SImode, (SRC), (AMOUNT))
+  #define LSHIFT(CODE,SRC,AMOUNT) \
+	    gen_rtx_fmt_ee ((CODE) == ASHIFT ? ASHIFT : LSHIFTRT, \
+			    SImode, (SRC), (AMOUNT))
+  #define REV_LSHIFT(CODE,SRC,AMOUNT) \
+	    gen_rtx_fmt_ee ((CODE) == ASHIFT ? LSHIFTRT : ASHIFT, \
+			    SImode, (SRC), (AMOUNT))
+  #define ORR(A,B) \
+	    gen_rtx_IOR (SImode, (A), (B))
+  #define BRANCH(COND,LABEL) \
+	    gen_arm_cond_branch ((LABEL), \
+				 gen_rtx_ ## COND (CCmode, cc_reg, \
+						   const0_rtx), \
+				 cc_reg)
+
+  if (CONST_INT_P (amount))
+    {
+      /* Shifts by a constant amount.  */
+      if (INTVAL (amount) <= 0)
+	/* Match what shift-by-register would do.  */
+	emit_insn (gen_movdi (out, in));
+      else if (INTVAL (amount) >= 64)
+	{
+	  /* Match what shift-by-register would do.  */
+	  if (code == ASHIFTRT)
+	    {
+	      rtx const31_rtx = gen_rtx_CONST_INT (VOIDmode, 31);
+	      emit_insn (SET (out_down, SHIFT (code, in_up, const31_rtx)));
+	      emit_insn (SET (out_up, SHIFT (code, in_up, const31_rtx)));
+	    }
+	  else
+	    emit_insn (gen_movdi (out, const0_rtx));
+	}
+      else if (INTVAL (amount) < 32)
+	{
+	  /* Shifts by a constant less than 32.  */
+	  rtx reverse_amount = gen_rtx_CONST_INT (VOIDmode,
+						  32 - INTVAL (amount));
+
+	  emit_insn (SET (out_down, LSHIFT (code, in_down, amount)));
+	  emit_insn (SET (out_down,
+			  ORR (REV_LSHIFT (code, in_up, reverse_amount),
+			       out_down)));
+	  emit_insn (SET (out_up, SHIFT (code, in_up, amount)));
+	}
+      else
+	{
+	  /* Shifts by a constant greater than 31.  */
+	  rtx adj_amount = gen_rtx_CONST_INT (VOIDmode, INTVAL (amount) - 32);
+
+	  emit_insn (SET (out_down, SHIFT (code, in_up, adj_amount)));
+	  if (code == ASHIFTRT)
+	    emit_insn (gen_ashrsi3 (out_up, in_up,
+				    gen_rtx_CONST_INT (VOIDmode, 31)));
+	  else
+	    emit_insn (SET (out_up, const0_rtx));
+	}
+    }
+  else
+    {
+      /* Shifts by a variable amount.  */
+      rtx cc_reg = gen_rtx_REG (CC_NCVmode, CC_REGNUM);
+
+      gcc_assert (scratch1 && REG_P (scratch1));
+      gcc_assert (scratch2 && REG_P (scratch2));
+
+      switch (code)
+	{
+	case ASHIFT:
+	  emit_insn (SUB_32 (scratch1, amount));
+	  emit_insn (RSB_32 (scratch2, amount));
+	  break;
+	case ASHIFTRT:
+	  emit_insn (RSB_32 (scratch1, amount));
+	  emit_insn (SUB_S_32 (scratch2, amount));
+	  break;
+	case LSHIFTRT:
+	  emit_insn (RSB_32 (scratch1, amount));
+	  emit_insn (SUB_32 (scratch2, amount));
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+
+      emit_insn (SET (out_down, LSHIFT (code, in_down, amount)));
+
+      if (!TARGET_THUMB2)
+	{
+	  /* If this were only called during expand we could just use the else
+	     case and let combine deal with it, but this can also be called
+	     from post-reload splitters.  */
+	  emit_insn (SET (out_down,
+			  ORR (SHIFT (ASHIFT, in_up, scratch1), out_down)));
+	  if (code == ASHIFTRT)
+	    {
+	      rtx done_label = gen_label_rtx ();
+	      emit_jump_insn (BRANCH (LT, done_label));
+	      emit_insn (SET (out_down, ORR (SHIFT (ASHIFTRT, in_up, scratch2),
+					     out_down)));
+	      emit_label (done_label);
+	    }
+	  else
+	    emit_insn (SET (out_down, ORR (SHIFT (LSHIFTRT, in_up, scratch2),
+					   out_down)));
+	}
+      else
+	{
+	  /* Thumb2 can't do shift and or in one insn.  */
+	  emit_insn (SET (scratch1, SHIFT (ASHIFT, in_up, scratch1)));
+	  emit_insn (gen_iorsi3 (out_down, out_down, scratch1));
+
+	  if (code == ASHIFTRT)
+	    {
+	      rtx done_label = gen_label_rtx ();
+	      emit_jump_insn (BRANCH (LT, done_label));
+	      emit_insn (SET (scratch2, SHIFT (ASHIFTRT, in_up, scratch2)));
+	      emit_insn (SET (out_down, ORR (out_down, scratch2)));
+	      emit_label (done_label);
+	    }
+	  else
+	    {
+	      emit_insn (SET (scratch2, SHIFT (LSHIFTRT, in_up, scratch2)));
+	      emit_insn (gen_iorsi3 (out_down, out_down, scratch2));
+	    }
+	}
+
+      emit_insn (SET (out_up, SHIFT (code, in_up, amount)));
+    }
+
+  #undef SUB_32
+  #undef RSB_32
+  #undef SUB_S_32
+  #undef SET
+  #undef SHIFT
+  #undef LSHIFT
+  #undef REV_LSHIFT
+  #undef ORR
+  #undef BRANCH
+}
+
 #include "gt-arm.h"
 
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 751997f..09a8754 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -3466,21 +3466,37 @@
                    (match_operand:SI 2 "reg_or_int_operand" "")))]
   "TARGET_32BIT"
   "
-  if (GET_CODE (operands[2]) == CONST_INT)
+  if (!CONST_INT_P (operands[2])
+      && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
+    ; /* No special preparation statements; expand pattern as above.  */
+  else
     {
-      if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
+      rtx scratch1, scratch2;
+
+      if (GET_CODE (operands[2]) == CONST_INT
+	  && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
         {
           emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1]));
           DONE;
         }
-        /* Ideally we shouldn't fail here if we could know that operands[1] 
-           ends up already living in an iwmmxt register. Otherwise it's
-           cheaper to have the alternate code being generated than moving
-           values to iwmmxt regs and back.  */
-        FAIL;
+
+      /* Ideally we should use iwmmxt here if we could know that operands[1]
+         ends up already living in an iwmmxt register. Otherwise it's
+         cheaper to have the alternate code being generated than moving
+         values to iwmmxt regs and back.  */
+
+      /* If we're optimizing for size, we prefer the libgcc calls.  */
+      if (optimize_size)
+	FAIL;
+
+      /* Expand operation using core-registers.
+	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
+      scratch1 = gen_reg_rtx (SImode);
+      scratch2 = gen_reg_rtx (SImode);
+      arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
+				     operands[2], scratch1, scratch2);
+      DONE;
     }
-  else if (!TARGET_REALLY_IWMMXT && !(TARGET_HARD_FLOAT && TARGET_MAVERICK))
-    FAIL;
   "
 )
 
@@ -3525,21 +3541,37 @@
                      (match_operand:SI 2 "reg_or_int_operand" "")))]
   "TARGET_32BIT"
   "
-  if (GET_CODE (operands[2]) == CONST_INT)
+  if (!CONST_INT_P (operands[2])
+      && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
+    ; /* No special preparation statements; expand pattern as above.  */
+  else
     {
-      if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
+      rtx scratch1, scratch2;
+
+      if (GET_CODE (operands[2]) == CONST_INT
+	  && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
         {
           emit_insn (gen_arm_ashrdi3_1bit (operands[0], operands[1]));
           DONE;
         }
-        /* Ideally we shouldn't fail here if we could know that operands[1] 
-           ends up already living in an iwmmxt register. Otherwise it's
-           cheaper to have the alternate code being generated than moving
-           values to iwmmxt regs and back.  */
-        FAIL;
+
+      /* Ideally we should use iwmmxt here if we could know that operands[1]
+         ends up already living in an iwmmxt register. Otherwise it's
+         cheaper to have the alternate code being generated than moving
+         values to iwmmxt regs and back.  */
+
+      /* If we're optimizing for size, we prefer the libgcc calls.  */
+      if (optimize_size)
+	FAIL;
+
+      /* Expand operation using core-registers.
+	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
+      scratch1 = gen_reg_rtx (SImode);
+      scratch2 = gen_reg_rtx (SImode);
+      arm_emit_coreregs_64bit_shift (ASHIFTRT, operands[0], operands[1],
+				     operands[2], scratch1, scratch2);
+      DONE;
     }
-  else if (!TARGET_REALLY_IWMMXT)
-    FAIL;
   "
 )
 
@@ -3582,21 +3614,37 @@
                      (match_operand:SI 2 "reg_or_int_operand" "")))]
   "TARGET_32BIT"
   "
-  if (GET_CODE (operands[2]) == CONST_INT)
+  if (!CONST_INT_P (operands[2])
+      && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
+    ; /* No special preparation statements; expand pattern as above.  */
+  else
     {
-      if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
+      rtx scratch1, scratch2;
+
+      if (GET_CODE (operands[2]) == CONST_INT
+	  && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
         {
           emit_insn (gen_arm_lshrdi3_1bit (operands[0], operands[1]));
           DONE;
         }
-        /* Ideally we shouldn't fail here if we could know that operands[1] 
-           ends up already living in an iwmmxt register. Otherwise it's
-           cheaper to have the alternate code being generated than moving
-           values to iwmmxt regs and back.  */
-        FAIL;
+
+      /* Ideally we should use iwmmxt here if we could know that operands[1]
+         ends up already living in an iwmmxt register. Otherwise it's
+         cheaper to have the alternate code being generated than moving
+         values to iwmmxt regs and back.  */
+
+      /* If we're optimizing for size, we prefer the libgcc calls.  */
+      if (optimize_size)
+	FAIL;
+
+      /* Expand operation using core-registers.
+	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
+      scratch1 = gen_reg_rtx (SImode);
+      scratch2 = gen_reg_rtx (SImode);
+      arm_emit_coreregs_64bit_shift (LSHIFTRT, operands[0], operands[1],
+				     operands[2], scratch1, scratch2);
+      DONE;
     }
-  else if (!TARGET_REALLY_IWMMXT)
-    FAIL;
   "
 )
 
@@ -7645,7 +7693,7 @@
 ;; Patterns to match conditional branch insns.
 ;;
 
-(define_insn "*arm_cond_branch"
+(define_insn "arm_cond_branch"
   [(set (pc)
 	(if_then_else (match_operator 1 "arm_comparison_operator"
 		       [(match_operand 2 "cc_register" "") (const_int 0)])

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
  2012-02-16 16:08             ` Andrew Stubbs
@ 2012-02-17 17:42               ` Andrew Stubbs
  2012-05-16 10:26                 ` Ramana Radhakrishnan
  0 siblings, 1 reply; 18+ messages in thread
From: Andrew Stubbs @ 2012-02-17 17:42 UTC (permalink / raw)
  Cc: Richard Henderson, gcc-patches, Ramana Radhakrishnan,
	Richard Earnshaw, patches

[-- Attachment #1: Type: text/plain, Size: 188 bytes --]

On 16/02/12 15:33, Andrew Stubbs wrote:
> OK for 4.8?

I forgot to address Ramana's comment about optimize_size.

This update fixes that and leaves everything else untouched.
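
Concretely, the only change is the size guard in each of the three
expanders; a minimal before/after sketch:

  /* Before: global flag.  */
  if (optimize_size)
    FAIL;

  /* After: per-function predicate, so hot/cold splitting and
     per-function attributes are respected.  */
  if (optimize_function_for_size_p (cfun))
    FAIL;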

OK?

Andrew

[-- Attachment #2: core-shift64.patch --]
[-- Type: text/x-patch, Size: 13683 bytes --]

2012-02-17  Andrew Stubbs  <ams@codesourcery.com>

	gcc/
	* config/arm/arm-protos.h (arm_emit_coreregs_64bit_shift): New
	prototype.
	* config/arm/arm.c (arm_emit_coreregs_64bit_shift): New function.
	* config/arm/arm.md (ashldi3): Use arm_emit_coreregs_64bit_shift.
	(ashrdi3,lshrdi3): Likewise.
	(arm_cond_branch): Remove '*' to enable gen_arm_cond_branch.

---
 gcc/config/arm/arm-protos.h |    3 +
 gcc/config/arm/arm.c        |  201 +++++++++++++++++++++++++++++++++++++++++++
 gcc/config/arm/arm.md       |  104 ++++++++++++++++------
 3 files changed, 280 insertions(+), 28 deletions(-)

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 296550a..df8d7a9 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -242,6 +242,9 @@ struct tune_params
 
 extern const struct tune_params *current_tune;
 extern int vfp3_const_double_for_fract_bits (rtx);
+
+extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
+					   rtx);
 #endif /* RTX_CODE */
 
 #endif /* ! GCC_ARM_PROTOS_H */
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index c3a19e4..02dc6ca 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -25213,5 +25213,206 @@ vfp3_const_double_for_fract_bits (rtx operand)
   return 0;
 }
 
+/* The default expansion of general 64-bit shifts in core-regs is suboptimal
+   on ARM, since we know that shifts by negative amounts are no-ops.
+
+   It's safe for the input and output to be the same register, but
+   early-clobber rules apply for the shift amount and scratch registers.
+
+   Shift by register requires both scratch registers.  Shift by a constant
+   less than 32 in Thumb2 mode requires SCRATCH1 only.  In all other cases
+   the scratch registers may be NULL.
+
+   Additionally, ashiftrt by a register also clobbers the CC register.  */
+void
+arm_emit_coreregs_64bit_shift (enum rtx_code code, rtx out, rtx in,
+			       rtx amount, rtx scratch1, rtx scratch2)
+{
+  rtx out_high = gen_highpart (SImode, out);
+  rtx out_low = gen_lowpart (SImode, out);
+  rtx in_high = gen_highpart (SImode, in);
+  rtx in_low = gen_lowpart (SImode, in);
+
+  /* Bits flow from up-stream to down-stream.  */
+  rtx out_up   = code == ASHIFT ? out_low : out_high;
+  rtx out_down = code == ASHIFT ? out_high : out_low;
+  rtx in_up   = code == ASHIFT ? in_low : in_high;
+  rtx in_down = code == ASHIFT ? in_high : in_low;
+
+  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
+  gcc_assert (out
+	      && (REG_P (out) || GET_CODE (out) == SUBREG)
+	      && GET_MODE (out) == DImode);
+  gcc_assert (in
+	      && (REG_P (in) || GET_CODE (in) == SUBREG)
+	      && GET_MODE (in) == DImode);
+  gcc_assert (amount
+	      && (((REG_P (amount) || GET_CODE (amount) == SUBREG)
+		   && GET_MODE (amount) == SImode)
+		  || CONST_INT_P (amount)));
+  gcc_assert (scratch1 == NULL
+	      || (GET_CODE (scratch1) == SCRATCH)
+	      || (GET_MODE (scratch1) == SImode
+		  && REG_P (scratch1)));
+  gcc_assert (scratch2 == NULL
+	      || (GET_CODE (scratch2) == SCRATCH)
+	      || (GET_MODE (scratch2) == SImode
+		  && REG_P (scratch2)));
+  gcc_assert (!REG_P (out) || !REG_P (amount)
+	      || !HARD_REGISTER_P (out)
+	      || (REGNO (out) != REGNO (amount)
+		  && REGNO (out) + 1 != REGNO (amount)));
+
+  /* Macros to make following code more readable.  */
+  #define SUB_32(DEST,SRC) \
+	    gen_addsi3 ((DEST), (SRC), gen_rtx_CONST_INT (VOIDmode, -32))
+  #define RSB_32(DEST,SRC) \
+	    gen_subsi3 ((DEST), gen_rtx_CONST_INT (VOIDmode, 32), (SRC))
+  #define SUB_S_32(DEST,SRC) \
+	    gen_addsi3_compare0 ((DEST), (SRC), \
+				 gen_rtx_CONST_INT (VOIDmode, -32))
+  #define SET(DEST,SRC) \
+	    gen_rtx_SET (SImode, (DEST), (SRC))
+  #define SHIFT(CODE,SRC,AMOUNT) \
+	    gen_rtx_fmt_ee ((CODE), SImode, (SRC), (AMOUNT))
+  #define LSHIFT(CODE,SRC,AMOUNT) \
+	    gen_rtx_fmt_ee ((CODE) == ASHIFT ? ASHIFT : LSHIFTRT, \
+			    SImode, (SRC), (AMOUNT))
+  #define REV_LSHIFT(CODE,SRC,AMOUNT) \
+	    gen_rtx_fmt_ee ((CODE) == ASHIFT ? LSHIFTRT : ASHIFT, \
+			    SImode, (SRC), (AMOUNT))
+  #define ORR(A,B) \
+	    gen_rtx_IOR (SImode, (A), (B))
+  #define BRANCH(COND,LABEL) \
+	    gen_arm_cond_branch ((LABEL), \
+				 gen_rtx_ ## COND (CCmode, cc_reg, \
+						   const0_rtx), \
+				 cc_reg)
+
+  if (CONST_INT_P (amount))
+    {
+      /* Shifts by a constant amount.  */
+      if (INTVAL (amount) <= 0)
+	/* Match what shift-by-register would do.  */
+	emit_insn (gen_movdi (out, in));
+      else if (INTVAL (amount) >= 64)
+	{
+	  /* Match what shift-by-register would do.  */
+	  if (code == ASHIFTRT)
+	    {
+	      rtx const31_rtx = gen_rtx_CONST_INT (VOIDmode, 31);
+	      emit_insn (SET (out_down, SHIFT (code, in_up, const31_rtx)));
+	      emit_insn (SET (out_up, SHIFT (code, in_up, const31_rtx)));
+	    }
+	  else
+	    emit_insn (gen_movdi (out, const0_rtx));
+	}
+      else if (INTVAL (amount) < 32)
+	{
+	  /* Shifts by a constant less than 32.  */
+	  rtx reverse_amount = gen_rtx_CONST_INT (VOIDmode,
+						  32 - INTVAL (amount));
+
+	  emit_insn (SET (out_down, LSHIFT (code, in_down, amount)));
+	  emit_insn (SET (out_down,
+			  ORR (REV_LSHIFT (code, in_up, reverse_amount),
+			       out_down)));
+	  emit_insn (SET (out_up, SHIFT (code, in_up, amount)));
+	}
+      else
+	{
+	  /* Shifts by a constant greater than 31.  */
+	  rtx adj_amount = gen_rtx_CONST_INT (VOIDmode, INTVAL (amount) - 32);
+
+	  emit_insn (SET (out_down, SHIFT (code, in_up, adj_amount)));
+	  if (code == ASHIFTRT)
+	    emit_insn (gen_ashrsi3 (out_up, in_up,
+				    gen_rtx_CONST_INT (VOIDmode, 31)));
+	  else
+	    emit_insn (SET (out_up, const0_rtx));
+	}
+    }
+  else
+    {
+      /* Shifts by a variable amount.  */
+      rtx cc_reg = gen_rtx_REG (CC_NCVmode, CC_REGNUM);
+
+      gcc_assert (scratch1 && REG_P (scratch1));
+      gcc_assert (scratch2 && REG_P (scratch2));
+
+      switch (code)
+	{
+	case ASHIFT:
+	  emit_insn (SUB_32 (scratch1, amount));
+	  emit_insn (RSB_32 (scratch2, amount));
+	  break;
+	case ASHIFTRT:
+	  emit_insn (RSB_32 (scratch1, amount));
+	  emit_insn (SUB_S_32 (scratch2, amount));
+	  break;
+	case LSHIFTRT:
+	  emit_insn (RSB_32 (scratch1, amount));
+	  emit_insn (SUB_32 (scratch2, amount));
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+
+      emit_insn (SET (out_down, LSHIFT (code, in_down, amount)));
+
+      if (!TARGET_THUMB2)
+	{
+	  /* If this were only called during expand we could just use the else
+	     case and let combine deal with it, but this can also be called
+	     from post-reload splitters.  */
+	  emit_insn (SET (out_down,
+			  ORR (SHIFT (ASHIFT, in_up, scratch1), out_down)));
+	  if (code == ASHIFTRT)
+	    {
+	      rtx done_label = gen_label_rtx ();
+	      emit_jump_insn (BRANCH (LT, done_label));
+	      emit_insn (SET (out_down, ORR (SHIFT (ASHIFTRT, in_up, scratch2),
+					     out_down)));
+	      emit_label (done_label);
+	    }
+	  else
+	    emit_insn (SET (out_down, ORR (SHIFT (LSHIFTRT, in_up, scratch2),
+					   out_down)));
+	}
+      else
+	{
+	  /* Thumb2 can't do shift and or in one insn.  */
+	  emit_insn (SET (scratch1, SHIFT (ASHIFT, in_up, scratch1)));
+	  emit_insn (gen_iorsi3 (out_down, out_down, scratch1));
+
+	  if (code == ASHIFTRT)
+	    {
+	      rtx done_label = gen_label_rtx ();
+	      emit_jump_insn (BRANCH (LT, done_label));
+	      emit_insn (SET (scratch2, SHIFT (ASHIFTRT, in_up, scratch2)));
+	      emit_insn (SET (out_down, ORR (out_down, scratch2)));
+	      emit_label (done_label);
+	    }
+	  else
+	    {
+	      emit_insn (SET (scratch2, SHIFT (LSHIFTRT, in_up, scratch2)));
+	      emit_insn (gen_iorsi3 (out_down, out_down, scratch2));
+	    }
+	}
+
+      emit_insn (SET (out_up, SHIFT (code, in_up, amount)));
+    }
+
+  #undef SUB_32
+  #undef RSB_32
+  #undef SUB_S_32
+  #undef SET
+  #undef SHIFT
+  #undef LSHIFT
+  #undef REV_LSHIFT
+  #undef ORR
+  #undef BRANCH
+}
+
 #include "gt-arm.h"
 
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index 751997f..7910bae 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -3466,21 +3466,37 @@
                    (match_operand:SI 2 "reg_or_int_operand" "")))]
   "TARGET_32BIT"
   "
-  if (GET_CODE (operands[2]) == CONST_INT)
+  if (!CONST_INT_P (operands[2])
+      && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
+    ; /* No special preparation statements; expand pattern as above.  */
+  else
     {
-      if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
+      rtx scratch1, scratch2;
+
+      if (GET_CODE (operands[2]) == CONST_INT
+	  && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
         {
           emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1]));
           DONE;
         }
-        /* Ideally we shouldn't fail here if we could know that operands[1] 
-           ends up already living in an iwmmxt register. Otherwise it's
-           cheaper to have the alternate code being generated than moving
-           values to iwmmxt regs and back.  */
-        FAIL;
+
+      /* Ideally we should use iwmmxt here if we could know that operands[1]
+         ends up already living in an iwmmxt register. Otherwise it's
+         cheaper to have the alternate code being generated than moving
+         values to iwmmxt regs and back.  */
+
+      /* If we're optimizing for size, we prefer the libgcc calls.  */
+      if (optimize_function_for_size_p (cfun))
+	FAIL;
+
+      /* Expand operation using core-registers.
+	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
+      scratch1 = gen_reg_rtx (SImode);
+      scratch2 = gen_reg_rtx (SImode);
+      arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
+				     operands[2], scratch1, scratch2);
+      DONE;
     }
-  else if (!TARGET_REALLY_IWMMXT && !(TARGET_HARD_FLOAT && TARGET_MAVERICK))
-    FAIL;
   "
 )
 
@@ -3525,21 +3541,37 @@
                      (match_operand:SI 2 "reg_or_int_operand" "")))]
   "TARGET_32BIT"
   "
-  if (GET_CODE (operands[2]) == CONST_INT)
+  if (!CONST_INT_P (operands[2])
+      && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
+    ; /* No special preparation statements; expand pattern as above.  */
+  else
     {
-      if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
+      rtx scratch1, scratch2;
+
+      if (GET_CODE (operands[2]) == CONST_INT
+	  && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
         {
           emit_insn (gen_arm_ashrdi3_1bit (operands[0], operands[1]));
           DONE;
         }
-        /* Ideally we shouldn't fail here if we could know that operands[1] 
-           ends up already living in an iwmmxt register. Otherwise it's
-           cheaper to have the alternate code being generated than moving
-           values to iwmmxt regs and back.  */
-        FAIL;
+
+      /* Ideally we should use iwmmxt here if we could know that operands[1]
+         ends up already living in an iwmmxt register. Otherwise it's
+         cheaper to have the alternate code being generated than moving
+         values to iwmmxt regs and back.  */
+
+      /* If we're optimizing for size, we prefer the libgcc calls.  */
+      if (optimize_function_for_size_p (cfun))
+	FAIL;
+
+      /* Expand operation using core-registers.
+	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
+      scratch1 = gen_reg_rtx (SImode);
+      scratch2 = gen_reg_rtx (SImode);
+      arm_emit_coreregs_64bit_shift (ASHIFTRT, operands[0], operands[1],
+				     operands[2], scratch1, scratch2);
+      DONE;
     }
-  else if (!TARGET_REALLY_IWMMXT)
-    FAIL;
   "
 )
 
@@ -3582,21 +3614,37 @@
                      (match_operand:SI 2 "reg_or_int_operand" "")))]
   "TARGET_32BIT"
   "
-  if (GET_CODE (operands[2]) == CONST_INT)
+  if (!CONST_INT_P (operands[2])
+      && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
+    ; /* No special preparation statements; expand pattern as above.  */
+  else
     {
-      if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
+      rtx scratch1, scratch2;
+
+      if (GET_CODE (operands[2]) == CONST_INT
+	  && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
         {
           emit_insn (gen_arm_lshrdi3_1bit (operands[0], operands[1]));
           DONE;
         }
-        /* Ideally we shouldn't fail here if we could know that operands[1] 
-           ends up already living in an iwmmxt register. Otherwise it's
-           cheaper to have the alternate code being generated than moving
-           values to iwmmxt regs and back.  */
-        FAIL;
+
+      /* Ideally we should use iwmmxt here if we could know that operands[1]
+         ends up already living in an iwmmxt register. Otherwise it's
+         cheaper to have the alternate code being generated than moving
+         values to iwmmxt regs and back.  */
+
+      /* If we're optimizing for size, we prefer the libgcc calls.  */
+      if (optimize_function_for_size_p (cfun))
+	FAIL;
+
+      /* Expand operation using core-registers.
+	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
+      scratch1 = gen_reg_rtx (SImode);
+      scratch2 = gen_reg_rtx (SImode);
+      arm_emit_coreregs_64bit_shift (LSHIFTRT, operands[0], operands[1],
+				     operands[2], scratch1, scratch2);
+      DONE;
     }
-  else if (!TARGET_REALLY_IWMMXT)
-    FAIL;
   "
 )
 
@@ -7645,7 +7693,7 @@
 ;; Patterns to match conditional branch insns.
 ;;
 
-(define_insn "*arm_cond_branch"
+(define_insn "arm_cond_branch"
   [(set (pc)
 	(if_then_else (match_operator 1 "arm_comparison_operator"
 		       [(match_operand 2 "cc_register" "") (const_int 0)])

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
  2012-02-17 17:42               ` Andrew Stubbs
@ 2012-05-16 10:26                 ` Ramana Radhakrishnan
  2012-05-16 16:09                   ` Andrew Stubbs
  0 siblings, 1 reply; 18+ messages in thread
From: Ramana Radhakrishnan @ 2012-05-16 10:26 UTC (permalink / raw)
  To: Andrew Stubbs; +Cc: Richard Henderson, gcc-patches, Richard Earnshaw, patches

>
>  extern const struct tune_params *current_tune;
>  extern int vfp3_const_double_for_fract_bits (rtx);
> +
> +extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
> +					   rtx);
>  #endif /* RTX_CODE */

>  #endif /* ! GCC_ARM_PROTOS_H */
> diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
> index c3a19e4..02dc6ca 100644
> --- a/gcc/config/arm/arm.c
> +++ b/gcc/config/arm/arm.c
> @@ -25213,5 +25213,206 @@ vfp3_const_double_for_fract_bits (rtx operand)
>    return 0;
>  }

> +/* The default expansion of general 64-bit shifts in core-regs is suboptimal
> +   on ARM, since we know that shifts by negative amounts are no-ops.
> +
> +   It's safe for the input and output to be the same register, but
> +   early-clobber rules apply for the shift amount and scratch registers.
> +
> +   Shift by register requires both scratch registers.  Shift by a constant
> +   less than 32 in Thumb2 mode requires SCRATCH1 only.  In all other cases
> +   the scratch registers may be NULL.
> +
> +   Additionally, ashiftrt by a register also clobbers the CC register.  */
> +void
> +arm_emit_coreregs_64bit_shift (enum rtx_code code, rtx out, rtx in,
> +			       rtx amount, rtx scratch1, rtx scratch2)
> +{
> +  rtx out_high = gen_highpart (SImode, out);
> +  rtx out_low = gen_lowpart (SImode, out);
> +  rtx in_high = gen_highpart (SImode, in);
> +  rtx in_low = gen_lowpart (SImode, in);
> +
> +  /* Bits flow from up-stream to down-stream.  */

Something more about "upstream" and "downstream" here would be nice :)

> +  rtx out_up   = code == ASHIFT ? out_low : out_high;
> +  rtx out_down = code == ASHIFT ? out_high : out_low;
> +  rtx in_up   = code == ASHIFT ? in_low : in_high;
> +  rtx in_down = code == ASHIFT ? in_high : in_low;
> +
> +  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
> +  gcc_assert (out
> +	      && (REG_P (out) || GET_CODE (out) == SUBREG)


> +	      && GET_MODE (out) == DImode);
> +  gcc_assert (in
> +	      && (REG_P (in) || GET_CODE (in) == SUBREG)
> +	      && GET_MODE (in) == DImode);
> +  gcc_assert (amount
> +	      && (((REG_P (amount) || GET_CODE (amount) == SUBREG)
> +		   && GET_MODE (amount) == SImode)
> +		  || CONST_INT_P (amount)));
> +  gcc_assert (scratch1 == NULL
> +	      || (GET_CODE (scratch1) == SCRATCH)
> +	      || (GET_MODE (scratch1) == SImode
> +		  && REG_P (scratch1)));
> +  gcc_assert (scratch2 == NULL
> +	      || (GET_CODE (scratch2) == SCRATCH)
> +	      || (GET_MODE (scratch2) == SImode
> +		  && REG_P (scratch2)));
> +  gcc_assert (!REG_P (out) || !REG_P (amount)
> +	      || !HARD_REGISTER_P (out)
> +	      || (REGNO (out) != REGNO (amount)
> +		  && REGNO (out) + 1 != REGNO (amount)));
> +
> +  /* Macros to make following code more readable.  */
> +  #define SUB_32(DEST,SRC) \
> +	    gen_addsi3 ((DEST), (SRC), gen_rtx_CONST_INT (VOIDmode, -32))
> +  #define RSB_32(DEST,SRC) \
> +	    gen_subsi3 ((DEST), gen_rtx_CONST_INT (VOIDmode, 32), (SRC))
> +  #define SUB_S_32(DEST,SRC) \
> +	    gen_addsi3_compare0 ((DEST), (SRC), \
> +				 gen_rtx_CONST_INT (VOIDmode, -32))
> +  #define SET(DEST,SRC) \
> +	    gen_rtx_SET (SImode, (DEST), (SRC))
> +  #define SHIFT(CODE,SRC,AMOUNT) \
> +	    gen_rtx_fmt_ee ((CODE), SImode, (SRC), (AMOUNT))
> +  #define LSHIFT(CODE,SRC,AMOUNT) \
> +	    gen_rtx_fmt_ee ((CODE) == ASHIFT ? ASHIFT : LSHIFTRT, \
> +			    SImode, (SRC), (AMOUNT))
> +  #define REV_LSHIFT(CODE,SRC,AMOUNT) \
> +	    gen_rtx_fmt_ee ((CODE) == ASHIFT ? LSHIFTRT : ASHIFT, \
> +			    SImode, (SRC), (AMOUNT))
> +  #define ORR(A,B) \
> +	    gen_rtx_IOR (SImode, (A), (B))
> +  #define BRANCH(COND,LABEL) \
> +	    gen_arm_cond_branch ((LABEL), \
> +				 gen_rtx_ ## COND (CCmode, cc_reg, \
> +						   const0_rtx), \
> +				 cc_reg)
> +
> +  if (CONST_INT_P (amount))
> +    {
> +      /* Shifts by a constant amount.  */
> +      if (INTVAL (amount) <= 0)
> +	/* Match what shift-by-register would do.  */
> +	emit_insn (gen_movdi (out, in));
> +      else if (INTVAL (amount) >= 64)
> +	{
> +	  /* Match what shift-by-register would do.  */
> +	  if (code == ASHIFTRT)
> +	    {
> +	      rtx const31_rtx = gen_rtx_CONST_INT (VOIDmode, 31);
> +	      emit_insn (SET (out_down, SHIFT (code, in_up, const31_rtx)));
> +	      emit_insn (SET (out_up, SHIFT (code, in_up, const31_rtx)));
> +	    }
> +	  else
> +	    emit_insn (gen_movdi (out, const0_rtx));
> +	}
> +      else if (INTVAL (amount) < 32)
> +	{
> +	  /* Shifts by a constant less than 32.  */
> +	  rtx reverse_amount = gen_rtx_CONST_INT (VOIDmode,
> +						  32 - INTVAL (amount));
> +
> +	  emit_insn (SET (out_down, LSHIFT (code, in_down, amount)));
> +	  emit_insn (SET (out_down,
> +			  ORR (REV_LSHIFT (code, in_up, reverse_amount),
> +			       out_down)));
> +	  emit_insn (SET (out_up, SHIFT (code, in_up, amount)));
> +	}
> +      else
> +	{
> +	  /* Shifts by a constant greater than 31.  */
> +	  rtx adj_amount = gen_rtx_CONST_INT (VOIDmode, INTVAL (amount) - 32);
> +
> +	  emit_insn (SET (out_down, SHIFT (code, in_up, adj_amount)));
> +	  if (code == ASHIFTRT)
> +	    emit_insn (gen_ashrsi3 (out_up, in_up,
> +				    gen_rtx_CONST_INT (VOIDmode, 31)));
> +	  else
> +	    emit_insn (SET (out_up, const0_rtx));
> +	}
> +    }
> +  else
> +    {
> +      /* Shifts by a variable amount.  */
> +      rtx cc_reg = gen_rtx_REG (CC_NCVmode, CC_REGNUM);

This isn't something I'm terribly confident about. I'd rather use
CC_NOOVmode or CCmode in this case, since here you only care that the
value resulting from subs r0, r1, #32 is positive or zero, so it's
possibly OK to do so. GE with CC_NCVmode really doesn't make sense:
that mode expects only the N, C and V flags to be valid, but GE would
require the Z bit as well if you went for it.
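
To illustrate what the branch is protecting against, here is a small C
model of ARM's register-specified ASR (a sketch for illustration only,
not part of the patch, assuming the usual arithmetic behaviour of >>
on signed int):

  /* "ASR rd, rn, rs": only the bottom byte of rs is used, and any
     amount of 32 or more fills the result with the sign bit.  */
  int
  arm_asr_by_reg (int rn, unsigned int rs)
  {
    rs &= 0xff;
    if (rs >= 32)
      return rn >> 31;	/* sign-fill: 0 or -1 */
    return rn >> rs;
  }

So for amount < 32, "in_up ASR (amount - 32)" would OR an all-ones
word into out_down whenever in_up is negative; the sequence therefore
tests the sign of amount - 32 and branches around that OR, which is
where the choice of condition mode matters.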

> +      gcc_assert (scratch1 && REG_P (scratch1));
> +      gcc_assert (scratch2 && REG_P (scratch2));
> +
> +      switch (code)
> +	{
> +	case ASHIFT:
> +	  emit_insn (SUB_32 (scratch1, amount));
> +	  emit_insn (RSB_32 (scratch2, amount));
> +	  break;
> +	case ASHIFTRT:
> +	  emit_insn (RSB_32 (scratch1, amount));
> +	  emit_insn (SUB_S_32 (scratch2, amount));
> +	  break;
> +	case LSHIFTRT:
> +	  emit_insn (RSB_32 (scratch1, amount));
> +	  emit_insn (SUB_32 (scratch2, amount));
> +	  break;
> +	default:
> +	  gcc_unreachable ();
> +	}
> +
> +      emit_insn (SET (out_down, LSHIFT (code, in_down, amount)));
> +
> +      if (!TARGET_THUMB2)
> +	{
> +	  /* If this were only called during expand we could just use the else
> +	     case and let combine deal with it, but this can also be called
> +	     from post-reload splitters.  */
> +	  emit_insn (SET (out_down,
> +			  ORR (SHIFT (ASHIFT, in_up, scratch1), out_down)));
> +	  if (code == ASHIFTRT)
> +	    {
> +	      rtx done_label = gen_label_rtx ();
> +	      emit_jump_insn (BRANCH (LT, done_label));
> +	      emit_insn (SET (out_down, ORR (SHIFT (ASHIFTRT, in_up, scratch2),
> +					     out_down)));
> +	      emit_label (done_label);
> +	    }
> +	  else
> +	    emit_insn (SET (out_down, ORR (SHIFT (LSHIFTRT, in_up, scratch2),
> +					   out_down)));
> +	}
> +      else
> +	{
> +	  /* Thumb2 can't do shift and or in one insn.  */
> +	  emit_insn (SET (scratch1, SHIFT (ASHIFT, in_up, scratch1)));
> +	  emit_insn (gen_iorsi3 (out_down, out_down, scratch1));
> +
> +	  if (code == ASHIFTRT)
> +	    {
> +	      rtx done_label = gen_label_rtx ();
> +	      emit_jump_insn (BRANCH (LT, done_label));
> +	      emit_insn (SET (scratch2, SHIFT (ASHIFTRT, in_up, scratch2)));
> +	      emit_insn (SET (out_down, ORR (out_down, scratch2)));
> +	      emit_label (done_label);
> +	    }
> +	  else
> +	    {
> +	      emit_insn (SET (scratch2, SHIFT (LSHIFTRT, in_up, scratch2)));
> +	      emit_insn (gen_iorsi3 (out_down, out_down, scratch2));
> +	    }
> +	}
> +
> +      emit_insn (SET (out_up, SHIFT (code, in_up, amount)));
> +    }
> +
> +  #undef SUB_32
> +  #undef RSB_32
> +  #undef SUB_S_32
> +  #undef SET
> +  #undef SHIFT
> +  #undef LSHIFT
> +  #undef REV_LSHIFT
> +  #undef ORR
> +  #undef BRANCH
> +}
> +
>  #include "gt-arm.h"

> diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
> index 751997f..7910bae 100644
> --- a/gcc/config/arm/arm.md
> +++ b/gcc/config/arm/arm.md
> @@ -3466,21 +3466,37 @@
>                     (match_operand:SI 2 "reg_or_int_operand" "")))]
>    "TARGET_32BIT"
>    "
> -  if (GET_CODE (operands[2]) == CONST_INT)
> +  if (!CONST_INT_P (operands[2])
> +      && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
> +    ; /* No special preparation statements; expand pattern as above.  */
> +  else
>      {
> -      if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
> +      rtx scratch1, scratch2;
> +
> +      if (GET_CODE (operands[2]) == CONST_INT
> +	  && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
>          {
>            emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1]));
>            DONE;
>          }
> -        /* Ideally we shouldn't fail here if we could know that operands[1]
> -           ends up already living in an iwmmxt register. Otherwise it's
> -           cheaper to have the alternate code being generated than moving
> -           values to iwmmxt regs and back.  */
> -        FAIL;
> +
> +      /* Ideally we should use iwmmxt here if we could know that operands[1]
> +         ends up already living in an iwmmxt register. Otherwise it's
> +         cheaper to have the alternate code being generated than moving
> +         values to iwmmxt regs and back.  */
> +
> +      /* If we're optimizing for size, we prefer the libgcc calls.  */
> +      if (optimize_function_for_size_p (cfun))
> +	FAIL;
> +
> +      /* Expand operation using core-registers.
> +	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
> +      scratch1 = gen_reg_rtx (SImode);
> +      scratch2 = gen_reg_rtx (SImode);
> +      arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
> +				     operands[2], scratch1, scratch2);
> +      DONE;
>      }
> -  else if (!TARGET_REALLY_IWMMXT && !(TARGET_HARD_FLOAT && TARGET_MAVERICK))
> -    FAIL;
>    "
>  )

> @@ -3525,21 +3541,37 @@
>                       (match_operand:SI 2 "reg_or_int_operand" "")))]
>    "TARGET_32BIT"
>    "
> -  if (GET_CODE (operands[2]) == CONST_INT)
> +  if (!CONST_INT_P (operands[2])
> +      && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
> +    ; /* No special preparation statements; expand pattern as above.  */
> +  else
>      {
> -      if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
> +      rtx scratch1, scratch2;
> +
> +      if (GET_CODE (operands[2]) == CONST_INT
> +	  && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
>          {
>            emit_insn (gen_arm_ashrdi3_1bit (operands[0], operands[1]));
>            DONE;
>          }
> -        /* Ideally we shouldn't fail here if we could know that operands[1]
> -           ends up already living in an iwmmxt register. Otherwise it's
> -           cheaper to have the alternate code being generated than moving
> -           values to iwmmxt regs and back.  */
> -        FAIL;
> +
> +      /* Ideally we should use iwmmxt here if we could know that operands[1]
> +         ends up already living in an iwmmxt register. Otherwise it's
> +         cheaper to have the alternate code being generated than moving
> +         values to iwmmxt regs and back.  */
> +
> +      /* If we're optimizing for size, we prefer the libgcc calls.  */
> +      if (optimize_function_for_size_p (cfun))
> +	FAIL;
> +
> +      /* Expand operation using core-registers.
> +	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
> +      scratch1 = gen_reg_rtx (SImode);
> +      scratch2 = gen_reg_rtx (SImode);
> +      arm_emit_coreregs_64bit_shift (ASHIFTRT, operands[0], operands[1],
> +				     operands[2], scratch1, scratch2);
> +      DONE;
>      }
> -  else if (!TARGET_REALLY_IWMMXT)
> -    FAIL;
>    "
>  )

> @@ -3582,21 +3614,37 @@
>                       (match_operand:SI 2 "reg_or_int_operand" "")))]
>    "TARGET_32BIT"
>    "
> -  if (GET_CODE (operands[2]) == CONST_INT)
> +  if (!CONST_INT_P (operands[2])
> +      && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
> +    ; /* No special preparation statements; expand pattern as above.  */
> +  else
>      {
> -      if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
> +      rtx scratch1, scratch2;
> +
> +      if (GET_CODE (operands[2]) == CONST_INT

Use CONST_INT_P (operands[2]) instead.
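
(For reference, CONST_INT_P is the rtl.h convenience macro,
essentially:

  #define CONST_INT_P(X) (GET_CODE (X) == CONST_INT)

so the two spellings are equivalent; the macro is simply the preferred
style.)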

Ok with those changes.

regards
Ramana

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
  2012-05-16 10:26                 ` Ramana Radhakrishnan
@ 2012-05-16 16:09                   ` Andrew Stubbs
  2012-05-16 16:44                     ` Ramana Radhakrishnan
  0 siblings, 1 reply; 18+ messages in thread
From: Andrew Stubbs @ 2012-05-16 16:09 UTC (permalink / raw)
  To: Ramana Radhakrishnan
  Cc: Richard Henderson, gcc-patches, Richard Earnshaw, patches

[-- Attachment #1: Type: text/plain, Size: 256 bytes --]

On 16/05/12 11:25, Ramana Radhakrishnan wrote:
> Ok with those changes.

Hi Ramana,

Here's an update rebased and modified as requested.

Can you please confirm that the comments explain what you wanted to 
know, and then I will commit it.

Thanks

Andrew

[-- Attachment #2: core-shift64.patch --]
[-- Type: text/x-patch, Size: 15653 bytes --]

2012-05-16  Andrew Stubbs  <ams@codesourcery.com>

	gcc/
	* config/arm/arm-protos.h (arm_emit_coreregs_64bit_shift): New
	prototype.
	* config/arm/arm.c (arm_emit_coreregs_64bit_shift): New function.
	* config/arm/arm.md (ashldi3): Use arm_emit_coreregs_64bit_shift.
	(ashrdi3,lshrdi3): Likewise.
	(arm_cond_branch): Remove '*' to enable gen_arm_cond_branch.

diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index cb74d70..b338470 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -245,6 +245,9 @@ struct tune_params
 
 extern const struct tune_params *current_tune;
 extern int vfp3_const_double_for_fract_bits (rtx);
+
+extern void arm_emit_coreregs_64bit_shift (enum rtx_code, rtx, rtx, rtx, rtx,
+					   rtx);
 #endif /* RTX_CODE */
 
 extern void arm_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel);
diff --git a/gcc/config/arm/arm.c b/gcc/config/arm/arm.c
index 2c62c51..3ad4c75 100644
--- a/gcc/config/arm/arm.c
+++ b/gcc/config/arm/arm.c
@@ -25933,4 +25933,256 @@ arm_autoinc_modes_ok_p (enum machine_mode mode, enum arm_auto_incmodes code)
   return false;
 }
 
+/* The default expansion of general 64-bit shifts in core-regs is suboptimal
+   on ARM, since we know that shifts by negative amounts are no-ops.
+   Additionally, the default expansion code is not available or suitable
+   for post-reload insn splits (this can occur when the register allocator
+   chooses not to do a shift in NEON).
+   
+   This function is used in both initial expand and post-reload splits, and
+   handles all kinds of 64-bit shifts.
+
+   Input requirements:
+    - It is safe for the input and output to be the same register, but
+      early-clobber rules apply for the shift amount and scratch registers.
+    - Shift by register requires both scratch registers.  Shift by a constant
+      less than 32 in Thumb2 mode requires SCRATCH1 only.  In all other cases
+      the scratch registers may be NULL.
+    - Ashiftrt by a register also clobbers the CC register.  */
+void
+arm_emit_coreregs_64bit_shift (enum rtx_code code, rtx out, rtx in,
+			       rtx amount, rtx scratch1, rtx scratch2)
+{
+  rtx out_high = gen_highpart (SImode, out);
+  rtx out_low = gen_lowpart (SImode, out);
+  rtx in_high = gen_highpart (SImode, in);
+  rtx in_low = gen_lowpart (SImode, in);
+
+  /* Terminology:
+	in = the register pair containing the input value.
+	out = the destination register pair.
+	up = the high- or low-part of each pair.
+	down = the opposite part to "up".
+     In a shift, we can consider bits to shift from "up"-stream to
+     "down"-stream, so in a left-shift "up" is the low-part and "down"
+     is the high-part of each register pair.  */
+
+  rtx out_up   = code == ASHIFT ? out_low : out_high;
+  rtx out_down = code == ASHIFT ? out_high : out_low;
+  rtx in_up   = code == ASHIFT ? in_low : in_high;
+  rtx in_down = code == ASHIFT ? in_high : in_low;
+
+  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
+  gcc_assert (out
+	      && (REG_P (out) || GET_CODE (out) == SUBREG)
+	      && GET_MODE (out) == DImode);
+  gcc_assert (in
+	      && (REG_P (in) || GET_CODE (in) == SUBREG)
+	      && GET_MODE (in) == DImode);
+  gcc_assert (amount
+	      && (((REG_P (amount) || GET_CODE (amount) == SUBREG)
+		   && GET_MODE (amount) == SImode)
+		  || CONST_INT_P (amount)));
+  gcc_assert (scratch1 == NULL
+	      || (GET_CODE (scratch1) == SCRATCH)
+	      || (GET_MODE (scratch1) == SImode
+		  && REG_P (scratch1)));
+  gcc_assert (scratch2 == NULL
+	      || (GET_CODE (scratch2) == SCRATCH)
+	      || (GET_MODE (scratch2) == SImode
+		  && REG_P (scratch2)));
+  gcc_assert (!REG_P (out) || !REG_P (amount)
+	      || !HARD_REGISTER_P (out)
+	      || (REGNO (out) != REGNO (amount)
+		  && REGNO (out) + 1 != REGNO (amount)));
+
+  /* Macros to make following code more readable.  */
+  #define SUB_32(DEST,SRC) \
+	    gen_addsi3 ((DEST), (SRC), gen_rtx_CONST_INT (VOIDmode, -32))
+  #define RSB_32(DEST,SRC) \
+	    gen_subsi3 ((DEST), gen_rtx_CONST_INT (VOIDmode, 32), (SRC))
+  #define SUB_S_32(DEST,SRC) \
+	    gen_addsi3_compare0 ((DEST), (SRC), \
+				 gen_rtx_CONST_INT (VOIDmode, -32))
+  #define SET(DEST,SRC) \
+	    gen_rtx_SET (SImode, (DEST), (SRC))
+  #define SHIFT(CODE,SRC,AMOUNT) \
+	    gen_rtx_fmt_ee ((CODE), SImode, (SRC), (AMOUNT))
+  #define LSHIFT(CODE,SRC,AMOUNT) \
+	    gen_rtx_fmt_ee ((CODE) == ASHIFT ? ASHIFT : LSHIFTRT, \
+			    SImode, (SRC), (AMOUNT))
+  #define REV_LSHIFT(CODE,SRC,AMOUNT) \
+	    gen_rtx_fmt_ee ((CODE) == ASHIFT ? LSHIFTRT : ASHIFT, \
+			    SImode, (SRC), (AMOUNT))
+  #define ORR(A,B) \
+	    gen_rtx_IOR (SImode, (A), (B))
+  #define BRANCH(COND,LABEL) \
+	    gen_arm_cond_branch ((LABEL), \
+				 gen_rtx_ ## COND (CCmode, cc_reg, \
+						   const0_rtx), \
+				 cc_reg)
+
+  /* Shifts by register and shifts by constant are handled separately.  */
+  if (CONST_INT_P (amount))
+    {
+      /* We have a shift-by-constant.  */
+
+      /* First, handle out-of-range shift amounts.
+	 In both cases we try to match the result that an ARM
+	 shift-by-register instruction would give.  This helps reduce
+	 execution differences between optimization levels, but it won't
+	 stop other parts of the compiler doing different things.  This
+	 is undefined behaviour, in any case.  */
+      if (INTVAL (amount) <= 0)
+	emit_insn (gen_movdi (out, in));
+      else if (INTVAL (amount) >= 64)
+	{
+	  if (code == ASHIFTRT)
+	    {
+	      rtx const31_rtx = gen_rtx_CONST_INT (VOIDmode, 31);
+	      emit_insn (SET (out_down, SHIFT (code, in_up, const31_rtx)));
+	      emit_insn (SET (out_up, SHIFT (code, in_up, const31_rtx)));
+	    }
+	  else
+	    emit_insn (gen_movdi (out, const0_rtx));
+	}
+
+      /* Now handle valid shifts.  */
+      else if (INTVAL (amount) < 32)
+	{
+	  /* Shifts by a constant less than 32.  */
+	  rtx reverse_amount = gen_rtx_CONST_INT (VOIDmode,
+						  32 - INTVAL (amount));
+
+	  emit_insn (SET (out_down, LSHIFT (code, in_down, amount)));
+	  emit_insn (SET (out_down,
+			  ORR (REV_LSHIFT (code, in_up, reverse_amount),
+			       out_down)));
+	  emit_insn (SET (out_up, SHIFT (code, in_up, amount)));
+	}
+      else
+	{
+	  /* Shifts by a constant greater than 31.  */
+	  rtx adj_amount = gen_rtx_CONST_INT (VOIDmode, INTVAL (amount) - 32);
+
+	  emit_insn (SET (out_down, SHIFT (code, in_up, adj_amount)));
+	  if (code == ASHIFTRT)
+	    emit_insn (gen_ashrsi3 (out_up, in_up,
+				    gen_rtx_CONST_INT (VOIDmode, 31)));
+	  else
+	    emit_insn (SET (out_up, const0_rtx));
+	}
+    }
+  else
+    {
+      /* We have a shift-by-register.  */
+      rtx cc_reg = gen_rtx_REG (CC_NOOVmode, CC_REGNUM);
+
+      /* This alternative requires the scratch registers.  */
+      gcc_assert (scratch1 && REG_P (scratch1));
+      gcc_assert (scratch2 && REG_P (scratch2));
+
+      /* We will need the values "amount-32" and "32-amount" later.
+         Swapping them around now allows the later code to be more general.  */
+      switch (code)
+	{
+	case ASHIFT:
+	  emit_insn (SUB_32 (scratch1, amount));
+	  emit_insn (RSB_32 (scratch2, amount));
+	  break;
+	case ASHIFTRT:
+	  emit_insn (RSB_32 (scratch1, amount));
+	  /* Also set CC = amount > 32.  */
+	  emit_insn (SUB_S_32 (scratch2, amount));
+	  break;
+	case LSHIFTRT:
+	  emit_insn (RSB_32 (scratch1, amount));
+	  emit_insn (SUB_32 (scratch2, amount));
+	  break;
+	default:
+	  gcc_unreachable ();
+	}
+
+      /* Emit code like this:
+
+	 arithmetic-left:
+	    out_down = in_down << amount;
+	    out_down = (in_up << (amount - 32)) | out_down;
+	    out_down = ((unsigned)in_up >> (32 - amount)) | out_down;
+	    out_up = in_up << amount;
+
+	 arithmetic-right:
+	    out_down = in_down >> amount;
+	    out_down = (in_up << (32 - amount)) | out_down;
+	    if (amount >= 32)
+	      out_down = ((signed)in_up >> (amount - 32)) | out_down;
+	    out_up = (signed)in_up >> amount;
+
+	 logical-right:
+	    out_down = in_down >> amount;
+	    out_down = (in_up << (32 - amount)) | out_down;
+	    out_down = ((unsigned)in_up >> (amount - 32)) | out_down;
+	    out_up = (unsigned)in_up >> amount;
+
+	  The terms shifted by out-of-range amounts rely on the behaviour
+	  described above: out-of-range LSL and LSR give zero, so those
+	  ORs are harmless no-ops, but an out-of-range ASR sign-fills,
+	  hence the conditional OR in the arithmetic-right case.
+
+	  The ARM and Thumb2 variants compute the same results but are
+	  implemented slightly differently.  If this were only called
+	  during expand we could just use the Thumb2 case and let combine
+	  do the right thing, but this can also be called from post-reload
+	  splitters.  */
+
+      emit_insn (SET (out_down, LSHIFT (code, in_down, amount)));
+
+      if (!TARGET_THUMB2)
+	{
+	  /* Emit code for ARM mode.  */
+	  emit_insn (SET (out_down,
+			  ORR (SHIFT (ASHIFT, in_up, scratch1), out_down)));
+	  if (code == ASHIFTRT)
+	    {
+	      rtx done_label = gen_label_rtx ();
+	      emit_jump_insn (BRANCH (LT, done_label));
+	      emit_insn (SET (out_down, ORR (SHIFT (ASHIFTRT, in_up, scratch2),
+					     out_down)));
+	      emit_label (done_label);
+	    }
+	  else
+	    emit_insn (SET (out_down, ORR (SHIFT (LSHIFTRT, in_up, scratch2),
+					   out_down)));
+	}
+      else
+	{
+	  /* Emit code for Thumb2 mode.
+	     Thumb2 can't do shift and or in one insn.  */
+	  emit_insn (SET (scratch1, SHIFT (ASHIFT, in_up, scratch1)));
+	  emit_insn (gen_iorsi3 (out_down, out_down, scratch1));
+
+	  if (code == ASHIFTRT)
+	    {
+	      rtx done_label = gen_label_rtx ();
+	      emit_jump_insn (BRANCH (LT, done_label));
+	      emit_insn (SET (scratch2, SHIFT (ASHIFTRT, in_up, scratch2)));
+	      emit_insn (SET (out_down, ORR (out_down, scratch2)));
+	      emit_label (done_label);
+	    }
+	  else
+	    {
+	      emit_insn (SET (scratch2, SHIFT (LSHIFTRT, in_up, scratch2)));
+	      emit_insn (gen_iorsi3 (out_down, out_down, scratch2));
+	    }
+	}
+
+      emit_insn (SET (out_up, SHIFT (code, in_up, amount)));
+    }
+
+  #undef SUB_32
+  #undef RSB_32
+  #undef SUB_S_32
+  #undef SET
+  #undef SHIFT
+  #undef LSHIFT
+  #undef REV_LSHIFT
+  #undef ORR
+  #undef BRANCH
+}
+
 #include "gt-arm.h"
diff --git a/gcc/config/arm/arm.md b/gcc/config/arm/arm.md
index b1ad3bf..bc97a4a 100644
--- a/gcc/config/arm/arm.md
+++ b/gcc/config/arm/arm.md
@@ -3520,21 +3520,37 @@
                    (match_operand:SI 2 "reg_or_int_operand" "")))]
   "TARGET_32BIT"
   "
-  if (GET_CODE (operands[2]) == CONST_INT)
+  if (!CONST_INT_P (operands[2])
+      && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
+    ; /* No special preparation statements; expand pattern as above.  */
+  else
     {
-      if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
+      rtx scratch1, scratch2;
+
+      if (CONST_INT_P (operands[2])
+	  && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
         {
           emit_insn (gen_arm_ashldi3_1bit (operands[0], operands[1]));
           DONE;
         }
-        /* Ideally we shouldn't fail here if we could know that operands[1] 
-           ends up already living in an iwmmxt register. Otherwise it's
-           cheaper to have the alternate code being generated than moving
-           values to iwmmxt regs and back.  */
-        FAIL;
+
+      /* Ideally we should use iwmmxt here if we could know that operands[1]
+         ends up already living in an iwmmxt register. Otherwise it's
+         cheaper to have the alternate code being generated than moving
+         values to iwmmxt regs and back.  */
+
+      /* If we're optimizing for size, we prefer the libgcc calls.  */
+      if (optimize_function_for_size_p (cfun))
+	FAIL;
+
+      /* Expand operation using core-registers.
+	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
+      scratch1 = gen_reg_rtx (SImode);
+      scratch2 = gen_reg_rtx (SImode);
+      arm_emit_coreregs_64bit_shift (ASHIFT, operands[0], operands[1],
+				     operands[2], scratch1, scratch2);
+      DONE;
     }
-  else if (!TARGET_REALLY_IWMMXT && !(TARGET_HARD_FLOAT && TARGET_MAVERICK))
-    FAIL;
   "
 )
 
@@ -3579,21 +3595,37 @@
                      (match_operand:SI 2 "reg_or_int_operand" "")))]
   "TARGET_32BIT"
   "
-  if (GET_CODE (operands[2]) == CONST_INT)
+  if (!CONST_INT_P (operands[2])
+      && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
+    ; /* No special preparation statements; expand pattern as above.  */
+  else
     {
-      if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
+      rtx scratch1, scratch2;
+
+      if (CONST_INT_P (operands[2])
+	  && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
         {
           emit_insn (gen_arm_ashrdi3_1bit (operands[0], operands[1]));
           DONE;
         }
-        /* Ideally we shouldn't fail here if we could know that operands[1] 
-           ends up already living in an iwmmxt register. Otherwise it's
-           cheaper to have the alternate code being generated than moving
-           values to iwmmxt regs and back.  */
-        FAIL;
+
+      /* Ideally we should use iwmmxt here if we could know that operands[1]
+         ends up already living in an iwmmxt register. Otherwise it's
+         cheaper to have the alternate code being generated than moving
+         values to iwmmxt regs and back.  */
+
+      /* If we're optimizing for size, we prefer the libgcc calls.  */
+      if (optimize_function_for_size_p (cfun))
+	FAIL;
+
+      /* Expand operation using core-registers.
+	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
+      scratch1 = gen_reg_rtx (SImode);
+      scratch2 = gen_reg_rtx (SImode);
+      arm_emit_coreregs_64bit_shift (ASHIFTRT, operands[0], operands[1],
+				     operands[2], scratch1, scratch2);
+      DONE;
     }
-  else if (!TARGET_REALLY_IWMMXT)
-    FAIL;
   "
 )
 
@@ -3636,21 +3668,37 @@
                      (match_operand:SI 2 "reg_or_int_operand" "")))]
   "TARGET_32BIT"
   "
-  if (GET_CODE (operands[2]) == CONST_INT)
+  if (!CONST_INT_P (operands[2])
+      && (TARGET_REALLY_IWMMXT || (TARGET_HARD_FLOAT && TARGET_MAVERICK)))
+    ; /* No special preparation statements; expand pattern as above.  */
+  else
     {
-      if ((HOST_WIDE_INT) INTVAL (operands[2]) == 1)
+      rtx scratch1, scratch2;
+
+      if (CONST_INT_P (operands[2])
+	  && (HOST_WIDE_INT) INTVAL (operands[2]) == 1)
         {
           emit_insn (gen_arm_lshrdi3_1bit (operands[0], operands[1]));
           DONE;
         }
-        /* Ideally we shouldn't fail here if we could know that operands[1] 
-           ends up already living in an iwmmxt register. Otherwise it's
-           cheaper to have the alternate code being generated than moving
-           values to iwmmxt regs and back.  */
-        FAIL;
+
+      /* Ideally we should use iwmmxt here if we could know that operands[1]
+         ends up already living in an iwmmxt register. Otherwise it's
+         cheaper to have the alternate code being generated than moving
+         values to iwmmxt regs and back.  */
+
+      /* If we're optimizing for size, we prefer the libgcc calls.  */
+      if (optimize_function_for_size_p (cfun))
+	FAIL;
+
+      /* Expand operation using core-registers.
+	 'FAIL' would achieve the same thing, but this is a bit smarter.  */
+      scratch1 = gen_reg_rtx (SImode);
+      scratch2 = gen_reg_rtx (SImode);
+      arm_emit_coreregs_64bit_shift (LSHIFTRT, operands[0], operands[1],
+				     operands[2], scratch1, scratch2);
+      DONE;
     }
-  else if (!TARGET_REALLY_IWMMXT)
-    FAIL;
   "
 )
 
@@ -7755,7 +7803,7 @@
 ;; Patterns to match conditional branch insns.
 ;;
 
-(define_insn "*arm_cond_branch"
+(define_insn "arm_cond_branch"
   [(set (pc)
 	(if_then_else (match_operator 1 "arm_comparison_operator"
 		       [(match_operand 2 "cc_register" "") (const_int 0)])

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
  2012-05-16 16:09                   ` Andrew Stubbs
@ 2012-05-16 16:44                     ` Ramana Radhakrishnan
  2012-05-18  9:12                       ` Andrew Stubbs
  0 siblings, 1 reply; 18+ messages in thread
From: Ramana Radhakrishnan @ 2012-05-16 16:44 UTC (permalink / raw)
  To: Andrew Stubbs; +Cc: Richard Henderson, gcc-patches, Richard Earnshaw, patches

On 16 May 2012 17:09, Andrew Stubbs <ams@codesourcery.com> wrote:
> On 16/05/12 11:25, Ramana Radhakrishnan wrote:
>>
>> Ok with those changes.
>
>
> Hi Ramana,
>
> Here's an update rebased and modified as requested.
>
> Can you please confirm that the comments explain what you wanted to know,
> and then I will commit it.

OK (if no regressions).

Ramana

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH][ARM] Improve 64-bit shifts (non-NEON)
  2012-05-16 16:44                     ` Ramana Radhakrishnan
@ 2012-05-18  9:12                       ` Andrew Stubbs
  0 siblings, 0 replies; 18+ messages in thread
From: Andrew Stubbs @ 2012-05-18  9:12 UTC (permalink / raw)
  To: Ramana Radhakrishnan
  Cc: Richard Henderson, gcc-patches, Richard Earnshaw, patches

On 16/05/12 17:43, Ramana Radhakrishnan wrote:
> OK (if no regressions).

Cross tested with no regressions, and committed.

Thanks

Andrew

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2012-05-18  9:12 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-01-27 16:07 [PATCH][ARM] Improve 64-bit shifts (non-NEON) Andrew Stubbs
2012-01-30 15:26 ` Richard Earnshaw
2012-01-31 14:29   ` Andrew Stubbs
2012-02-07 22:20     ` Ramana Radhakrishnan
2012-02-07 22:34       ` Steven Bosscher
2012-02-08 12:13         ` Bernd Schmidt
2012-02-08 12:32           ` Richard Guenther
2012-02-08 13:01             ` Bernd Schmidt
2012-02-08 13:33               ` Richard Guenther
2012-02-08 11:34       ` Andrew Stubbs
2012-02-08 16:43         ` Andrew Stubbs
2012-02-11  2:41           ` Richard Henderson
2012-02-16 16:08             ` Andrew Stubbs
2012-02-17 17:42               ` Andrew Stubbs
2012-05-16 10:26                 ` Ramana Radhakrishnan
2012-05-16 16:09                   ` Andrew Stubbs
2012-05-16 16:44                     ` Ramana Radhakrishnan
2012-05-18  9:12                       ` Andrew Stubbs
