public inbox for gcc-patches@gcc.gnu.org
* [PATCH] [Aarch64] Variable shift count truncation issues
@ 2017-05-18 23:17 Michael Collison
  2017-05-19  7:51 ` Richard Sandiford
  0 siblings, 1 reply; 15+ messages in thread
From: Michael Collison @ 2017-05-18 23:17 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd

[-- Attachment #1: Type: text/plain, Size: 1806 bytes --]

This patch improves code generation for shifts whose shift amount is masked
with an AND that can be omitted based on the size of the operation.  AArch64
variable shifts only use the low 5 bits (32-bit) or 6 bits (64-bit) of the
shift amount, so masking operations that merely clear the higher bits can be
removed.  When the shift instructions operate on all-register arguments they
truncate the shift/rotate amount by the size of the registers they operate on:
32 for W-regs, 64 for X-regs.  This allows us to optimize away such redundant
masking instructions.

The patch only applies to integer shift instructions; vector shifts are excluded.

unsigned int
f1 (unsigned int x, int y)
{
  return x << (y & 31);
}

unsigned long
f3 (unsigned long bit_addr)
{
  unsigned long bitnumb = bit_addr & 63;
  return (1L << bitnumb);
}

With trunk at -O2 we generate:

f1:
	and	w1, w1, 31
	lsl	w0, w0, w1
	ret
	.size	f1, .-f1
	.align	2
	.p2align 3,,7
	.global	f3
	.type	f3, %function
f3:
	and	x0, x0, 63
	mov	x1, 1
	lsl	x0, x1, x0
	ret

with the patch we generate:

f1:
	lsl	w0, w0, w1
	ret

f3:
	mov	x1, 1
	lsl	x0, x1, x0
	ret


Okay for trunk?

2017-05-17  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
	    Michael Collison <michael.collison@arm.com>

	PR target/70119
	* config/aarch64/aarch64.md (*aarch64_<optab>_reg_<mode>3_mask1):
	New pattern.
	(*aarch64_reg_<mode>3_neg_mask2): New pattern.
	(*aarch64_reg_<mode>3_minus_mask): New pattern.
	(*aarch64_<optab>_reg_di3_mask2): New pattern.
	* config/aarch64/aarch64.c (aarch64_rtx_costs): Account for cost
	of shift when the shift amount is masked with constant equal to
	the size of the mode.


2017-05-17  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
	    Michael Collison <michael.collison@arm.com>

	PR target/70119
	* gcc.target/aarch64/var_shift_mask_1.c: New test.

[-- Attachment #2: pr70119.patch --]
[-- Type: application/octet-stream, Size: 7119 bytes --]

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 714bb79..6ff2e22 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -7347,17 +7347,26 @@ cost_plus:
         }
       else
         {
-	  if (speed)
+	  if (VECTOR_MODE_P (mode))
 	    {
-	      if (VECTOR_MODE_P (mode))
-		{
-		  /* Vector shift (register).  */
-		  *cost += extra_cost->vect.alu;
-		}
-	      else
+	      if (speed)
+		/* Vector shift (register).  */
+		*cost += extra_cost->vect.alu;
+	    }
+	  else
+	    {
+	      if (speed)
+		/* LSLV.  */
+		*cost += extra_cost->alu.shift_reg;
+
+	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
+		  && CONST_INT_P (XEXP (op1, 1))
+		  && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
 		{
-		  /* LSLV.  */
-		  *cost += extra_cost->alu.shift_reg;
+		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
+		  /* We already demanded XEXP (op1, 0) to be REG_P, so
+		     don't recurse into it.  */
+		  return true;
 		}
 	    }
 	  return false;  /* All arguments need to be in registers.  */
@@ -7386,14 +7395,27 @@ cost_plus:
 	}
       else
 	{
-
-	  /* ASR (register) and friends.  */
-	  if (speed)
+	  if (VECTOR_MODE_P (mode))
 	    {
-	      if (VECTOR_MODE_P (mode))
+	      if (speed)
+		/* Vector shift (register).  */
 		*cost += extra_cost->vect.alu;
-	      else
+	    }
+	  else
+	    {
+	      if (speed)
+		/* ASR (register) and friends.  */
 		*cost += extra_cost->alu.shift_reg;
+
+	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
+		  && CONST_INT_P (XEXP (op1, 1))
+		  && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
+		{
+		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
+		  /* We already demanded XEXP (op1, 0) to be REG_P, so
+		     don't recurse into it.  */
+		  return true;
+		}
 	    }
 	  return false;  /* All arguments need to be in registers.  */
 	}
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 5adc5ed..c6ae670 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -3999,6 +3999,92 @@
   }
 )
 
+;; When the LSL, LSR, ASR, ROR instructions operate on all register arguments
+;; they truncate the shift/rotate amount by the size of the registers they
+;; operate on: 32 for W-regs, 63 for X-regs.  This allows us to optimise away
+;; such redundant masking instructions.  GCC can do that automatically when
+;; SHIFT_COUNT_TRUNCATED is true, but we can't enable it for TARGET_SIMD
+;; because some of the SISD shift alternatives don't perform this truncations.
+;; So this pattern exists to catch such cases.
+
+(define_insn "*aarch64_<optab>_reg_<mode>3_mask1"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(SHIFT:GPI
+	  (match_operand:GPI 1 "register_operand" "r")
+	  (subreg:QI (and:GPI
+		      (match_operand:GPI 2 "register_operand" "r")
+		      (match_operand 3 "const_int_operand" "n")) 0)))]
+  "(~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0"
+  "<shift>\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "shift_reg")]
+)
+
+
+(define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(SHIFT:GPI
+	  (match_operand:GPI 1 "register_operand" "r")
+	  (subreg:QI (neg:SI (and:SI
+			      (match_operand:SI 2 "register_operand" "r")
+			      (match_operand 3 "const_int_operand" "n"))) 0)))]
+  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)
+   && can_create_pseudo_p ()"
+  "#"
+  "&& true"
+  [(const_int 0)]
+  {
+    rtx tmp = gen_reg_rtx (SImode);
+
+    emit_insn (gen_negsi2 (tmp, operands[2]));
+    rtx tmp2 = simplify_gen_subreg (QImode, tmp, SImode, 0);
+    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], tmp2));
+    DONE;
+  }
+)
+
+(define_insn_and_split "*aarch64_reg_<mode>3_minus_mask"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(ashift:GPI
+	  (match_operand:GPI 1 "register_operand" "r")
+	  (minus:QI (match_operand 2 "const_int_operand" "n")
+		    (subreg:QI (and:SI
+				(match_operand:SI 3 "register_operand" "r")
+				(match_operand 4 "const_int_operand" "n")) 0))))]
+  "((~INTVAL (operands[4]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)
+   && can_create_pseudo_p ()
+   && INTVAL (operands[2]) == GET_MODE_BITSIZE (<MODE>mode)"
+  "#"
+  "&& true"
+  [(const_int 0)]
+  {
+    rtx tmp = gen_reg_rtx (SImode);
+
+    emit_insn (gen_negsi2 (tmp, operands[3]));
+    rtx tmp2 = simplify_gen_subreg (QImode, tmp, SImode, 0);
+    emit_insn (gen_ashl<mode>3 (operands[0], operands[1], tmp2));
+    DONE;
+  }
+)
+
+(define_insn_and_split "*aarch64_<optab>_reg_di3_mask2"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(SHIFT:DI
+	  (match_operand:DI 1 "register_operand" "r")
+	  (subreg:QI (and:SI
+		      (match_operand:SI 2 "register_operand" "r")
+		      (match_operand 3 "const_int_operand" "n")) 0)))]
+  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (DImode)-1)) == 0)
+   && can_create_pseudo_p ()"
+  "#"
+  "&& true"
+  [(const_int 0)]
+  {
+    rtx tmp = simplify_gen_subreg (QImode, operands[2], SImode, 0);
+    emit_insn (gen_<optab>di3 (operands[0], operands[1], tmp));
+    DONE;
+  }
+)
+
 ;; Logical left shift using SISD or Integer instruction
 (define_insn "*aarch64_ashl_sisd_or_int_<mode>3"
   [(set (match_operand:GPI 0 "register_operand" "=r,r,w,w")
diff --git a/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c
new file mode 100644
index 0000000..e2b020e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c
@@ -0,0 +1,61 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* The integer variable shift and rotate instructions truncate their
+   shift amounts by the datasize.  Make sure that we don't emit a redundant
+   masking operation.  */
+
+unsigned
+f1 (unsigned x, int y)
+{
+  return x << (y & 31);
+}
+
+unsigned long
+f2 (unsigned long x, int y)
+{
+  return x << (y & 63);
+}
+
+unsigned long
+f3 (unsigned long bit_addr, int y)
+{
+  unsigned long bitnumb = bit_addr & 63;
+  return (1L << bitnumb);
+}
+
+unsigned int
+f4 (unsigned int x, unsigned int y)
+{
+  y &= 31;
+  return x >> y | (x << (32 - y));
+}
+
+unsigned long
+f5 (unsigned long x, unsigned long y)
+{
+  y &= 63;
+  return x >> y | (x << (64 - y));
+}
+
+unsigned long
+f6 (unsigned long x, unsigned long y)
+{
+
+  return (x << (64 - (y & 63)));
+
+}
+
+unsigned long
+f7 (unsigned long x, unsigned long y)
+{
+  return (x << -(y & 63));
+}
+
+/* { dg-final { scan-assembler-times "lsl\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "lsl\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" 4 } } */
+/* { dg-final { scan-assembler-times "ror\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "ror\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-not "and\tw\[0-9\]+, w\[0-9\]+, 31" } } */
+/* { dg-final { scan-assembler-not "and\tx\[0-9\]+, x\[0-9\]+, 63" } } */
+/* { dg-final { scan-assembler-not "sub\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" } } */
-- 
1.9.1



* Re: [PATCH] [Aarch64] Variable shift count truncation issues
  2017-05-18 23:17 [PATCH] [Aarch64] Variable shift count truncation issues Michael Collison
@ 2017-05-19  7:51 ` Richard Sandiford
  2017-05-19 11:03   ` Christophe Lyon
  0 siblings, 1 reply; 15+ messages in thread
From: Richard Sandiford @ 2017-05-19  7:51 UTC (permalink / raw)
  To: Michael Collison; +Cc: gcc-patches, nd

Thanks for doing this.  Just a couple of comments about the .md stuff:

Michael Collison <Michael.Collison@arm.com> writes:
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index 5adc5ed..c6ae670 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -3999,6 +3999,92 @@
>    }
>  )
>  
> +;; When the LSL, LSR, ASR, ROR instructions operate on all register arguments
> +;; they truncate the shift/rotate amount by the size of the registers they
> +;; operate on: 32 for W-regs, 63 for X-regs.  This allows us to optimise away
> +;; such redundant masking instructions.  GCC can do that automatically when
> +;; SHIFT_COUNT_TRUNCATED is true, but we can't enable it for TARGET_SIMD
> +;; because some of the SISD shift alternatives don't perform this truncations.
> +;; So this pattern exists to catch such cases.
> +
> +(define_insn "*aarch64_<optab>_reg_<mode>3_mask1"
> +  [(set (match_operand:GPI 0 "register_operand" "=r")
> +	(SHIFT:GPI
> +	  (match_operand:GPI 1 "register_operand" "r")
> +	  (subreg:QI (and:GPI
> +		      (match_operand:GPI 2 "register_operand" "r")
> +		      (match_operand 3 "const_int_operand" "n")) 0)))]
> +  "(~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0"
> +  "<shift>\t%<w>0, %<w>1, %<w>2"
> +  [(set_attr "type" "shift_reg")]
> +)

(subreg:QI (...) 0) is only correct for little endian.  For big endian
it needs to be 3 for SI or 7 for DI.  You could probably handle that
using something like:

  (match_operator:QI 2 "lowpart_subreg"
    [(and:GPI ...)])

and defining a lowpart_subreg predicate that checks the SUBREG_BYTE.
Or just leave the subreg as-is and add !BYTES_BIG_ENDIAN to the insn
condition.
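
For illustration, a minimal special predicate along these lines would do
(this is essentially what a later revision of the patch adds to
config/aarch64/predicates.md, under the name subreg_lowpart_operator):

  ;; Accept only subregs that select the low part of their operand.
  (define_special_predicate "subreg_lowpart_operator"
    (and (match_code "subreg")
         (match_test "subreg_lowpart_p (op)")))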

> +(define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
> +  [(set (match_operand:GPI 0 "register_operand" "=r")
> +	(SHIFT:GPI
> +	  (match_operand:GPI 1 "register_operand" "r")
> +	  (subreg:QI (neg:SI (and:SI
> +			      (match_operand:SI 2 "register_operand" "r")
> +			      (match_operand 3 "const_int_operand" "n"))) 0)))]
> +  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)
> +   && can_create_pseudo_p ()"
> +  "#"
> +  "&& true"
> +  [(const_int 0)]
> +  {
> +    rtx tmp = gen_reg_rtx (SImode);
> +
> +    emit_insn (gen_negsi2 (tmp, operands[2]));
> +    rtx tmp2 = simplify_gen_subreg (QImode, tmp, SImode, 0);
> +    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], tmp2));
> +    DONE;
> +  }
> +)

Insn patterns shouldn't check can_create_pseudo_p, because there's no
guarantee that the associated split happens before RA.  In this case it
should be safe to reuse operand 0 after RA if you change it to:

  [(set (match_operand:GPI 0 "register_operand" "=&r")
        ...)]

and then:

  rtx tmp = (can_create_pseudo_p ()
             ? gen_reg_rtx (SImode)
             : gen_lowpart (SImode, operands[0]));
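
For concreteness, a sketch of how the split body could then read, keeping the
rest of the original expansion unchanged and leaving the endianness question
above aside:

    rtx tmp = (can_create_pseudo_p ()
	       ? gen_reg_rtx (SImode)
	       : gen_lowpart (SImode, operands[0]));
    /* Negate into the temporary: a fresh pseudo before RA, the
       earlyclobbered destination register after RA.  */
    emit_insn (gen_negsi2 (tmp, operands[2]));
    /* Feed the low QImode part of the negated amount to the plain
       shift pattern.  */
    rtx tmp2 = simplify_gen_subreg (QImode, tmp, SImode, 0);
    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], tmp2));
    DONE;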

Thanks,
Richard


* Re: [PATCH] [Aarch64] Variable shift count truncation issues
  2017-05-19  7:51 ` Richard Sandiford
@ 2017-05-19 11:03   ` Christophe Lyon
  0 siblings, 0 replies; 15+ messages in thread
From: Christophe Lyon @ 2017-05-19 11:03 UTC (permalink / raw)
  To: Michael Collison, gcc-patches, nd, richard.sandiford

Hi Michael,


On 19 May 2017 at 09:21, Richard Sandiford <richard.sandiford@linaro.org> wrote:
> Thanks for doing this.  Just a couple of comments about the .md stuff:
>
> Michael Collison <Michael.Collison@arm.com> writes:
>> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
>> index 5adc5ed..c6ae670 100644
>> --- a/gcc/config/aarch64/aarch64.md
>> +++ b/gcc/config/aarch64/aarch64.md
>> @@ -3999,6 +3999,92 @@
>>    }
>>  )
>>
>> +;; When the LSL, LSR, ASR, ROR instructions operate on all register arguments
>> +;; they truncate the shift/rotate amount by the size of the registers they
>> +;; operate on: 32 for W-regs, 63 for X-regs.  This allows us to optimise away
>> +;; such redundant masking instructions.  GCC can do that automatically when
>> +;; SHIFT_COUNT_TRUNCATED is true, but we can't enable it for TARGET_SIMD
>> +;; because some of the SISD shift alternatives don't perform this truncations.
>> +;; So this pattern exists to catch such cases.
>> +
>> +(define_insn "*aarch64_<optab>_reg_<mode>3_mask1"
>> +  [(set (match_operand:GPI 0 "register_operand" "=r")
>> +     (SHIFT:GPI
>> +       (match_operand:GPI 1 "register_operand" "r")
>> +       (subreg:QI (and:GPI
>> +                   (match_operand:GPI 2 "register_operand" "r")
>> +                   (match_operand 3 "const_int_operand" "n")) 0)))]
>> +  "(~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0"
>> +  "<shift>\t%<w>0, %<w>1, %<w>2"
>> +  [(set_attr "type" "shift_reg")]
>> +)
>
> (subreg:QI (...) 0) is only correct for little endian.  For big endian
> it needs to be 3 for SI or 7 for DI.  You could probably handle that
> using something like:
>
>   (match_operator:QI 2 "lowpart_subreg"
>     [(and:GPI ...)])
>
> and defining a lowpart_subreg predicate that checks the SUBREG_BYTE.
> Or just leave the subreg as-is and add !BYTES_BIG_ENDIAN to the insn
> condition.
>

I can confirm that the new tests pass on little-endian, but fail on big.

Thanks,

Christophe


>> +(define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
>> +  [(set (match_operand:GPI 0 "register_operand" "=r")
>> +     (SHIFT:GPI
>> +       (match_operand:GPI 1 "register_operand" "r")
>> +       (subreg:QI (neg:SI (and:SI
>> +                           (match_operand:SI 2 "register_operand" "r")
>> +                           (match_operand 3 "const_int_operand" "n"))) 0)))]
>> +  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)
>> +   && can_create_pseudo_p ()"
>> +  "#"
>> +  "&& true"
>> +  [(const_int 0)]
>> +  {
>> +    rtx tmp = gen_reg_rtx (SImode);
>> +
>> +    emit_insn (gen_negsi2 (tmp, operands[2]));
>> +    rtx tmp2 = simplify_gen_subreg (QImode, tmp, SImode, 0);
>> +    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], tmp2));
>> +    DONE;
>> +  }
>> +)
>
> Insn patterns shouldn't check can_create_pseudo_p, because there's no
> guarantee that the associated split happens before RA.  In this case it
> should be safe to reuse operand 0 after RA if you change it to:
>
>   [(set (match_operand:GPI 0 "register_operand" "=&r")
>         ...)]
>
> and then:
>
>   rtx tmp = (can_create_pseudo_p ()
>              ? gen_reg_rtx (SImode)
>              : gen_lowpart (SImode, operands[0]));
>
> Thanks,
> Richard


* Re: [PATCH] [Aarch64] Variable shift count truncation issues
  2017-06-30 18:04                 ` Andreas Schwab
@ 2017-06-30 18:54                   ` Michael Collison
  0 siblings, 0 replies; 15+ messages in thread
From: Michael Collison @ 2017-06-30 18:54 UTC (permalink / raw)
  To: Andreas Schwab
  Cc: James Greenhalgh, Wilco Dijkstra, Christophe Lyon, GCC Patches,
	richard.sandiford, nd

Okay I will take a look at this.

Michael Collison


> On Jun 30, 2017, at 11:04 AM, Andreas Schwab <schwab@linux-m68k.org> wrote:
> 
>> On Jun 23 2017, Michael Collison <Michael.Collison@arm.com> wrote:
>> 
>> diff --git a/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c
>> new file mode 100644
>> index 0000000..e2b020e
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c
>> @@ -0,0 +1,61 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2" } */
>> +
>> +/* The integer variable shift and rotate instructions truncate their
>> +   shift amounts by the datasize.  Make sure that we don't emit a redundant
>> +   masking operation.  */
>> +
>> +unsigned
>> +f1 (unsigned x, int y)
>> +{
>> +  return x << (y & 31);
>> +}
>> +
>> +unsigned long
>> +f2 (unsigned long x, int y)
>> +{
>> +  return x << (y & 63);
>> +}
>> +
>> +unsigned long
>> +f3 (unsigned long bit_addr, int y)
>> +{
>> +  unsigned long bitnumb = bit_addr & 63;
>> +  return (1L << bitnumb);
>> +}
>> +
>> +unsigned int
>> +f4 (unsigned int x, unsigned int y)
>> +{
>> +  y &= 31;
>> +  return x >> y | (x << (32 - y));
>> +}
>> +
>> +unsigned long
>> +f5 (unsigned long x, unsigned long y)
>> +{
>> +  y &= 63;
>> +  return x >> y | (x << (64 - y));
>> +}
>> +
>> +unsigned long
>> +f6 (unsigned long x, unsigned long y)
>> +{
>> +
>> +  return (x << (64 - (y & 63)));
>> +
>> +}
>> +
>> +unsigned long
>> +f7 (unsigned long x, unsigned long y)
>> +{
>> +  return (x << -(y & 63));
>> +}
>> +
>> +/* { dg-final { scan-assembler-times "lsl\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
>> +/* { dg-final { scan-assembler-times "lsl\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" 4 } } */
>> +/* { dg-final { scan-assembler-times "ror\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
>> +/* { dg-final { scan-assembler-times "ror\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" 1 } } */
>> +/* { dg-final { scan-assembler-not "and\tw\[0-9\]+, w\[0-9\]+, 31" } } */
>> +/* { dg-final { scan-assembler-not "and\tx\[0-9\]+, x\[0-9\]+, 63" } } */
>> +/* { dg-final { scan-assembler-not "sub\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" } } */
> 
> That fails with -mabi=ilp32.
> 
> Andreas.
> 
> -- 
> Andreas Schwab, schwab@linux-m68k.org
> GPG Key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
> "And now for something completely different."


* Re: [PATCH] [Aarch64] Variable shift count truncation issues
  2017-06-23  9:28               ` Michael Collison
  2017-06-23  9:30                 ` James Greenhalgh
@ 2017-06-30 18:04                 ` Andreas Schwab
  2017-06-30 18:54                   ` Michael Collison
  1 sibling, 1 reply; 15+ messages in thread
From: Andreas Schwab @ 2017-06-30 18:04 UTC (permalink / raw)
  To: Michael Collison
  Cc: James Greenhalgh, Wilco Dijkstra, Christophe Lyon, GCC Patches,
	richard.sandiford, nd

On Jun 23 2017, Michael Collison <Michael.Collison@arm.com> wrote:

> diff --git a/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c
> new file mode 100644
> index 0000000..e2b020e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c
> @@ -0,0 +1,61 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +/* The integer variable shift and rotate instructions truncate their
> +   shift amounts by the datasize.  Make sure that we don't emit a redundant
> +   masking operation.  */
> +
> +unsigned
> +f1 (unsigned x, int y)
> +{
> +  return x << (y & 31);
> +}
> +
> +unsigned long
> +f2 (unsigned long x, int y)
> +{
> +  return x << (y & 63);
> +}
> +
> +unsigned long
> +f3 (unsigned long bit_addr, int y)
> +{
> +  unsigned long bitnumb = bit_addr & 63;
> +  return (1L << bitnumb);
> +}
> +
> +unsigned int
> +f4 (unsigned int x, unsigned int y)
> +{
> +  y &= 31;
> +  return x >> y | (x << (32 - y));
> +}
> +
> +unsigned long
> +f5 (unsigned long x, unsigned long y)
> +{
> +  y &= 63;
> +  return x >> y | (x << (64 - y));
> +}
> +
> +unsigned long
> +f6 (unsigned long x, unsigned long y)
> +{
> +
> +  return (x << (64 - (y & 63)));
> +
> +}
> +
> +unsigned long
> +f7 (unsigned long x, unsigned long y)
> +{
> +  return (x << -(y & 63));
> +}
> +
> +/* { dg-final { scan-assembler-times "lsl\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "lsl\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" 4 } } */
> +/* { dg-final { scan-assembler-times "ror\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "ror\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-not "and\tw\[0-9\]+, w\[0-9\]+, 31" } } */
> +/* { dg-final { scan-assembler-not "and\tx\[0-9\]+, x\[0-9\]+, 63" } } */
> +/* { dg-final { scan-assembler-not "sub\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" } } */

That fails with -mabi=ilp32.

Andreas.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 58CA 54C7 6D53 942B 1756  01D3 44D5 214B 8276 4ED5
"And now for something completely different."


* Re: [PATCH] [Aarch64] Variable shift count truncation issues
  2017-06-23  9:28               ` Michael Collison
@ 2017-06-23  9:30                 ` James Greenhalgh
  2017-06-30 18:04                 ` Andreas Schwab
  1 sibling, 0 replies; 15+ messages in thread
From: James Greenhalgh @ 2017-06-23  9:30 UTC (permalink / raw)
  To: Michael Collison
  Cc: Wilco Dijkstra, Christophe Lyon, GCC Patches, richard.sandiford, nd

On Fri, Jun 23, 2017 at 10:27:55AM +0100, Michael Collison wrote:
> Fixed the "nitpick" issues pointed out by James. Okay for trunk?


> > I have a few comments below, which are closer to nitpicking than structural
> > issues with the patch.
> > 
> > With those fixed, this is OK to commit.

This is still OK for trunk.

Thanks,
James

> 2017-05-22  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> 	    Michael Collison <michael.collison@arm.com>
> 
> 	PR target/70119
> 	* config/aarch64/aarch64.md (*aarch64_<optab>_reg_<mode>3_mask1):
> 	New pattern.
> 	(*aarch64_reg_<mode>3_neg_mask2): New pattern.
> 	(*aarch64_reg_<mode>3_minus_mask): New pattern.
> 	(*aarch64_<optab>_reg_di3_mask2): New pattern.
> 	* config/aarch64/aarch64.c (aarch64_rtx_costs): Account for cost
> 	of shift when the shift amount is masked with constant equal to
> 	the size of the mode.
> 	* config/aarch64/predicates.md (subreg_lowpart_operator): New
> 	predicate.
> 
> 
> 2016-05-22  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> 	    Michael Collison <michael.collison@arm.com>
> 
> 	PR target/70119
> 	* gcc.target/aarch64/var_shift_mask_1.c: New test.
> 


* RE: [PATCH] [Aarch64] Variable shift count truncation issues
       [not found]             ` <20170622101638.GA30708@arm.com>
@ 2017-06-23  9:28               ` Michael Collison
  2017-06-23  9:30                 ` James Greenhalgh
  2017-06-30 18:04                 ` Andreas Schwab
  0 siblings, 2 replies; 15+ messages in thread
From: Michael Collison @ 2017-06-23  9:28 UTC (permalink / raw)
  To: James Greenhalgh, Wilco Dijkstra, Christophe Lyon, GCC Patches,
	richard.sandiford
  Cc: nd

[-- Attachment #1: Type: text/plain, Size: 5812 bytes --]

Fixed the "nitpick" issues pointed out by James. Okay for trunk?

2017-05-22  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
	    Michael Collison <michael.collison@arm.com>

	PR target/70119
	* config/aarch64/aarch64.md (*aarch64_<optab>_reg_<mode>3_mask1):
	New pattern.
	(*aarch64_reg_<mode>3_neg_mask2): New pattern.
	(*aarch64_reg_<mode>3_minus_mask): New pattern.
	(*aarch64_<optab>_reg_di3_mask2): New pattern.
	* config/aarch64/aarch64.c (aarch64_rtx_costs): Account for cost
	of shift when the shift amount is masked with constant equal to
	the size of the mode.
	* config/aarch64/predicates.md (subreg_lowpart_operator): New
	predicate.


2016-05-22  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
	    Michael Collison <michael.collison@arm.com>

	PR target/70119
	* gcc.target/aarch64/var_shift_mask_1.c: New test.

-----Original Message-----
From: James Greenhalgh [mailto:james.greenhalgh@arm.com] 
Sent: Thursday, June 22, 2017 3:17 AM
To: Michael Collison <Michael.Collison@arm.com>; Wilco Dijkstra <Wilco.Dijkstra@arm.com>; Christophe Lyon <christophe.lyon@linaro.org>; GCC Patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>; richard.sandiford@linaro.org
Subject: Re: [PATCH] [Aarch64] Variable shift count truncation issues

On Wed, Jun 21, 2017 at 04:42:00PM +0100, Richard Sandiford wrote:
> Michael Collison <Michael.Collison@arm.com> writes:
> > Updated the patch per Richard's suggestions to allow scheduling of 
> > instructions before reload.
> 
> Thanks, this looks good to me FWIW, but obviously I can't approve it.

Thanks for the review Richard, that gives me good confidence in this patch.

I have a few comments below, which are closer to nitpicking than structural issues with the patch.

With those fixed, this is OK to commit.

> > 2017-05-22  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> > 	    Michael Collison <michael.collison@arm.com>

With the work you've done, you can probably place yourself first on the ChangeLog now ;)

> >
> > 	PR target/70119
> > 	* config/aarch64/aarch64.md (*aarch64_<optab>_reg_<mode>3_mask1):
> > 	New pattern.
> > 	(*aarch64_reg_<mode>3_neg_mask2): New pattern.
> > 	(*aarch64_reg_<mode>3_minus_mask): New pattern.
> > 	(*aarch64_<optab>_reg_di3_mask2): New pattern.
> > 	* config/aarch64/aarch64.c (aarch64_rtx_costs): Account for cost
> > 	of shift when the shift amount is masked with constant equal to
> > 	the size of the mode.
> > 	* config/aarch64/predicates.md (subreg_lowpart_operator): New
> > 	predicate.
> >
> >
> > 2016-05-22  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> > 	    Michael Collison <michael.collison@arm.com>
> >
> > 	PR target/70119
> > 	* gcc.target/aarch64/var_shift_mask_1.c: New test.


> > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> > index d89df66..ff5720c 100644
> > --- a/gcc/config/aarch64/aarch64.md
> > +++ b/gcc/config/aarch64/aarch64.md
> > @@ -3942,6 +3942,97 @@
> >    }
> >  )
> >  
> > +;; When the LSL, LSR, ASR, ROR instructions operate on all register arguments
> > +;; they truncate the shift/rotate amount by the size of the registers they
> > +;; operate on: 32 for W-regs, 63 for X-regs.  This allows us to optimise away

Is this "63" a typo? Should it be 64?

> > +;; such redundant masking instructions.  GCC can do that automatically when
> > +;; SHIFT_COUNT_TRUNCATED is true, but we can't enable it for TARGET_SIMD
> > +;; because some of the SISD shift alternatives don't perform this truncations.
> > +;; So this pattern exists to catch such cases.
> > +
> > +(define_insn "*aarch64_<optab>_reg_<mode>3_mask1"
> > +  [(set (match_operand:GPI 0 "register_operand" "=r")
> > +	(SHIFT:GPI
> > +	  (match_operand:GPI 1 "register_operand" "r")
> > +	  (match_operator 4 "subreg_lowpart_operator"
> > +	   [(and:GPI (match_operand:GPI 2 "register_operand" "r")
> > +		     (match_operand 3 "const_int_operand" "n"))])))]
> > +  "(~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0"

Spaces around "-"

> > +  "<shift>\t%<w>0, %<w>1, %<w>2"
> > +  [(set_attr "type" "shift_reg")]
> > +)
> > +
> > +(define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
> > +  [(set (match_operand:GPI 0 "register_operand" "=&r")
> > +	(SHIFT:GPI
> > +	  (match_operand:GPI 1 "register_operand" "r")
> > +	  (match_operator 4 "subreg_lowpart_operator"
> > +	  [(neg:SI (and:SI (match_operand:SI 2 "register_operand" "r")
> > +			   (match_operand 3 "const_int_operand" "n")))])))]
> > +  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)"
> > +  "#"
> > +  "&& 1"

I'd prefer "true" to "1"

> > +  [(const_int 0)]
> > +  {
> > +    rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
> > +	       : operands[0]);
> > +    emit_insn (gen_negsi2 (tmp, operands[2]));
> > +
> > +    rtx and_op = gen_rtx_AND (SImode, tmp, operands[3]);
> > +    rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[4]), and_op,
> > +				     SUBREG_BYTE (operands[4]));
> > +    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp));
> > +    DONE;
> > +  }
> > +)
> > +
> > +(define_insn_and_split "*aarch64_reg_<mode>3_minus_mask"
> > +  [(set (match_operand:GPI 0 "register_operand" "=&r")
> > +	(ashift:GPI
> > +	  (match_operand:GPI 1 "register_operand" "r")
> > +	  (minus:QI (match_operand 2 "const_int_operand" "n")
> > +		    (match_operator 5 "subreg_lowpart_operator"
> > +		    [(and:SI (match_operand:SI 3 "register_operand" "r")
> > +			     (match_operand 4 "const_int_operand" "n"))]))))]
> > +  "((~INTVAL (operands[4]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)
> > +   && INTVAL (operands[2]) == GET_MODE_BITSIZE (<MODE>mode)"
> > +  "#"
> > +  "&& 1"

Likewise.

Thanks,
James


[-- Attachment #2: pr5546v6.patch --]
[-- Type: application/octet-stream, Size: 8072 bytes --]

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 5707e53..45377a2 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -7407,17 +7407,26 @@ cost_plus:
         }
       else
         {
-	  if (speed)
+	  if (VECTOR_MODE_P (mode))
 	    {
-	      if (VECTOR_MODE_P (mode))
-		{
-		  /* Vector shift (register).  */
-		  *cost += extra_cost->vect.alu;
-		}
-	      else
+	      if (speed)
+		/* Vector shift (register).  */
+		*cost += extra_cost->vect.alu;
+	    }
+	  else
+	    {
+	      if (speed)
+		/* LSLV.  */
+		*cost += extra_cost->alu.shift_reg;
+
+	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
+		  && CONST_INT_P (XEXP (op1, 1))
+		  && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
 		{
-		  /* LSLV.  */
-		  *cost += extra_cost->alu.shift_reg;
+		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
+		  /* We already demanded XEXP (op1, 0) to be REG_P, so
+		     don't recurse into it.  */
+		  return true;
 		}
 	    }
 	  return false;  /* All arguments need to be in registers.  */
@@ -7446,14 +7455,27 @@ cost_plus:
 	}
       else
 	{
-
-	  /* ASR (register) and friends.  */
-	  if (speed)
+	  if (VECTOR_MODE_P (mode))
 	    {
-	      if (VECTOR_MODE_P (mode))
+	      if (speed)
+		/* Vector shift (register).  */
 		*cost += extra_cost->vect.alu;
-	      else
+	    }
+	  else
+	    {
+	      if (speed)
+		/* ASR (register) and friends.  */
 		*cost += extra_cost->alu.shift_reg;
+
+	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
+		  && CONST_INT_P (XEXP (op1, 1))
+		  && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
+		{
+		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
+		  /* We already demanded XEXP (op1, 0) to be REG_P, so
+		     don't recurse into it.  */
+		  return true;
+		}
 	    }
 	  return false;  /* All arguments need to be in registers.  */
 	}
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index d89df66..32a1923 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -3942,6 +3942,97 @@
   }
 )
 
+;; When the LSL, LSR, ASR, ROR instructions operate on all register arguments
+;; they truncate the shift/rotate amount by the size of the registers they
+;; operate on: 32 for W-regs, 64 for X-regs.  This allows us to optimise away
+;; such redundant masking instructions.  GCC can do that automatically when
+;; SHIFT_COUNT_TRUNCATED is true, but we can't enable it for TARGET_SIMD
+;; because some of the SISD shift alternatives don't perform this truncations.
+;; So this pattern exists to catch such cases.
+
+(define_insn "*aarch64_<optab>_reg_<mode>3_mask1"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(SHIFT:GPI
+	  (match_operand:GPI 1 "register_operand" "r")
+	  (match_operator 4 "subreg_lowpart_operator"
+	   [(and:GPI (match_operand:GPI 2 "register_operand" "r")
+		     (match_operand 3 "const_int_operand" "n"))])))]
+  "(~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode) - 1)) == 0"
+  "<shift>\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "shift_reg")]
+)
+
+(define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
+  [(set (match_operand:GPI 0 "register_operand" "=&r")
+	(SHIFT:GPI
+	  (match_operand:GPI 1 "register_operand" "r")
+	  (match_operator 4 "subreg_lowpart_operator"
+	  [(neg:SI (and:SI (match_operand:SI 2 "register_operand" "r")
+			   (match_operand 3 "const_int_operand" "n")))])))]
+  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode) - 1)) == 0)"
+  "#"
+  "&& true"
+  [(const_int 0)]
+  {
+    rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
+	       : operands[0]);
+    emit_insn (gen_negsi2 (tmp, operands[2]));
+
+    rtx and_op = gen_rtx_AND (SImode, tmp, operands[3]);
+    rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[4]), and_op,
+				     SUBREG_BYTE (operands[4]));
+    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp));
+    DONE;
+  }
+)
+
+(define_insn_and_split "*aarch64_reg_<mode>3_minus_mask"
+  [(set (match_operand:GPI 0 "register_operand" "=&r")
+	(ashift:GPI
+	  (match_operand:GPI 1 "register_operand" "r")
+	  (minus:QI (match_operand 2 "const_int_operand" "n")
+		    (match_operator 5 "subreg_lowpart_operator"
+		    [(and:SI (match_operand:SI 3 "register_operand" "r")
+			     (match_operand 4 "const_int_operand" "n"))]))))]
+  "((~INTVAL (operands[4]) & (GET_MODE_BITSIZE (<MODE>mode) - 1)) == 0)
+   && INTVAL (operands[2]) == GET_MODE_BITSIZE (<MODE>mode)"
+  "#"
+  "&& true"
+  [(const_int 0)]
+  {
+    rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
+	       : operands[0]);
+
+    emit_insn (gen_negsi2 (tmp, operands[3]));
+
+    rtx and_op = gen_rtx_AND (SImode, tmp, operands[4]);
+    rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[5]), and_op,
+				     SUBREG_BYTE (operands[5]));
+
+    emit_insn (gen_ashl<mode>3 (operands[0], operands[1], subreg_tmp));
+    DONE;
+  }
+)
+
+(define_insn "*aarch64_<optab>_reg_di3_mask2"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(SHIFT:DI
+	  (match_operand:DI 1 "register_operand" "r")
+	  (match_operator 4 "subreg_lowpart_operator"
+	   [(and:SI (match_operand:SI 2 "register_operand" "r")
+		     (match_operand 3 "aarch64_shift_imm_di" "Usd"))])))]
+  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (DImode)-1)) == 0)"
+{
+  rtx xop[3];
+  xop[0] = operands[0];
+  xop[1] = operands[1];
+  xop[2] = gen_lowpart (GET_MODE (operands[4]), operands[2]);
+  output_asm_insn ("<shift>\t%x0, %x1, %x2", xop);
+  return "";
+}
+  [(set_attr "type" "shift_reg")]
+)
+
 ;; Logical left shift using SISD or Integer instruction
 (define_insn "*aarch64_ashl_sisd_or_int_<mode>3"
   [(set (match_operand:GPI 0 "register_operand" "=r,r,w,w")
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index cd7ded9..ad8a43c 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -35,6 +35,10 @@
   (and (match_code "const_int")
        (match_test "op == CONST0_RTX (mode)")))
 
+(define_special_predicate "subreg_lowpart_operator"
+  (and (match_code "subreg")
+       (match_test "subreg_lowpart_p (op)")))
+
 (define_predicate "aarch64_ccmp_immediate"
   (and (match_code "const_int")
        (match_test "IN_RANGE (INTVAL (op), -31, 31)")))
diff --git a/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c
new file mode 100644
index 0000000..e2b020e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c
@@ -0,0 +1,61 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* The integer variable shift and rotate instructions truncate their
+   shift amounts by the datasize.  Make sure that we don't emit a redundant
+   masking operation.  */
+
+unsigned
+f1 (unsigned x, int y)
+{
+  return x << (y & 31);
+}
+
+unsigned long
+f2 (unsigned long x, int y)
+{
+  return x << (y & 63);
+}
+
+unsigned long
+f3 (unsigned long bit_addr, int y)
+{
+  unsigned long bitnumb = bit_addr & 63;
+  return (1L << bitnumb);
+}
+
+unsigned int
+f4 (unsigned int x, unsigned int y)
+{
+  y &= 31;
+  return x >> y | (x << (32 - y));
+}
+
+unsigned long
+f5 (unsigned long x, unsigned long y)
+{
+  y &= 63;
+  return x >> y | (x << (64 - y));
+}
+
+unsigned long
+f6 (unsigned long x, unsigned long y)
+{
+
+  return (x << (64 - (y & 63)));
+
+}
+
+unsigned long
+f7 (unsigned long x, unsigned long y)
+{
+  return (x << -(y & 63));
+}
+
+/* { dg-final { scan-assembler-times "lsl\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "lsl\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" 4 } } */
+/* { dg-final { scan-assembler-times "ror\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "ror\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-not "and\tw\[0-9\]+, w\[0-9\]+, 31" } } */
+/* { dg-final { scan-assembler-not "and\tx\[0-9\]+, x\[0-9\]+, 63" } } */
+/* { dg-final { scan-assembler-not "sub\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" } } */
-- 
1.9.1



* Re: [PATCH] [Aarch64] Variable shift count truncation issues
@ 2017-06-22 10:22 James Greenhalgh
  0 siblings, 0 replies; 15+ messages in thread
From: James Greenhalgh @ 2017-06-22 10:22 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd

Resending for the list, as the last copy got bounced.

Thanks,
James

----- Forwarded message from James Greenhalgh <james.greenhalgh@arm.com> -----

Date: Thu, 22 Jun 2017 11:16:38 +0100
From: James Greenhalgh <james.greenhalgh@arm.com>
To: Michael Collison <Michael.Collison@arm.com>, Wilco Dijkstra <Wilco.Dijkstra@arm.com>, Christophe Lyon <christophe.lyon@linaro.org>, GCC Patches <gcc-patches@gcc.gnu.org>, nd <nd@arm.com>, richard.sandiford@linaro.org
Subject: Re: [PATCH] [Aarch64] Variable shift count truncation issues
User-Agent: Mutt/1.5.21 (2010-09-15)

On Wed, Jun 21, 2017 at 04:42:00PM +0100, Richard Sandiford wrote:
> Michael Collison <Michael.Collison@arm.com> writes:
> > Updated the patch per Richard's suggestions to allow scheduling of
> > instructions before reload.
> 
> Thanks, this looks good to me FWIW, but obviously I can't approve it.

Thanks for the review Richard, that gives me good confidence in this patch.

I have a few comments below, which are closer to nitpicking than structural
issues with the patch.

With those fixed, this is OK to commit.

> > 2017-05-22  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> > 	    Michael Collison <michael.collison@arm.com>

With the work you've done, you can probably place yourself first on the
ChangeLog now ;)

> >
> > 	PR target/70119
> > 	* config/aarch64/aarch64.md (*aarch64_<optab>_reg_<mode>3_mask1):
> > 	New pattern.
> > 	(*aarch64_reg_<mode>3_neg_mask2): New pattern.
> > 	(*aarch64_reg_<mode>3_minus_mask): New pattern.
> > 	(*aarch64_<optab>_reg_di3_mask2): New pattern.
> > 	* config/aarch64/aarch64.c (aarch64_rtx_costs): Account for cost
> > 	of shift when the shift amount is masked with constant equal to
> > 	the size of the mode.
> > 	* config/aarch64/predicates.md (subreg_lowpart_operator): New
> > 	predicate.
> >
> >
> > 2016-05-22  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> > 	    Michael Collison <michael.collison@arm.com>
> >
> > 	PR target/70119
> > 	* gcc.target/aarch64/var_shift_mask_1.c: New test.


> > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> > index d89df66..ff5720c 100644
> > --- a/gcc/config/aarch64/aarch64.md
> > +++ b/gcc/config/aarch64/aarch64.md
> > @@ -3942,6 +3942,97 @@
> >    }
> >  )
> >  
> > +;; When the LSL, LSR, ASR, ROR instructions operate on all register arguments
> > +;; they truncate the shift/rotate amount by the size of the registers they
> > +;; operate on: 32 for W-regs, 63 for X-regs.  This allows us to optimise away

Is this "63" a typo? Should it be 64?

> > +;; such redundant masking instructions.  GCC can do that automatically when
> > +;; SHIFT_COUNT_TRUNCATED is true, but we can't enable it for TARGET_SIMD
> > +;; because some of the SISD shift alternatives don't perform this truncations.
> > +;; So this pattern exists to catch such cases.
> > +
> > +(define_insn "*aarch64_<optab>_reg_<mode>3_mask1"
> > +  [(set (match_operand:GPI 0 "register_operand" "=r")
> > +	(SHIFT:GPI
> > +	  (match_operand:GPI 1 "register_operand" "r")
> > +	  (match_operator 4 "subreg_lowpart_operator"
> > +	   [(and:GPI (match_operand:GPI 2 "register_operand" "r")
> > +		     (match_operand 3 "const_int_operand" "n"))])))]
> > +  "(~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0"

Spaces around "-"

> > +  "<shift>\t%<w>0, %<w>1, %<w>2"
> > +  [(set_attr "type" "shift_reg")]
> > +)
> > +
> > +(define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
> > +  [(set (match_operand:GPI 0 "register_operand" "=&r")
> > +	(SHIFT:GPI
> > +	  (match_operand:GPI 1 "register_operand" "r")
> > +	  (match_operator 4 "subreg_lowpart_operator"
> > +	  [(neg:SI (and:SI (match_operand:SI 2 "register_operand" "r")
> > +			   (match_operand 3 "const_int_operand" "n")))])))]
> > +  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)"
> > +  "#"
> > +  "&& 1"

I'd prefer "true" to "1"

> > +  [(const_int 0)]
> > +  {
> > +    rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
> > +	       : operands[0]);
> > +    emit_insn (gen_negsi2 (tmp, operands[2]));
> > +
> > +    rtx and_op = gen_rtx_AND (SImode, tmp, operands[3]);
> > +    rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[4]), and_op,
> > +				     SUBREG_BYTE (operands[4]));
> > +    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp));
> > +    DONE;
> > +  }
> > +)
> > +
> > +(define_insn_and_split "*aarch64_reg_<mode>3_minus_mask"
> > +  [(set (match_operand:GPI 0 "register_operand" "=&r")
> > +	(ashift:GPI
> > +	  (match_operand:GPI 1 "register_operand" "r")
> > +	  (minus:QI (match_operand 2 "const_int_operand" "n")
> > +		    (match_operator 5 "subreg_lowpart_operator"
> > +		    [(and:SI (match_operand:SI 3 "register_operand" "r")
> > +			     (match_operand 4 "const_int_operand" "n"))]))))]
> > +  "((~INTVAL (operands[4]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)
> > +   && INTVAL (operands[2]) == GET_MODE_BITSIZE (<MODE>mode)"
> > +  "#"
> > +  "&& 1"

Likewise.

Thanks,
James


----- End forwarded message -----


* Re: [PATCH] [Aarch64] Variable shift count truncation issues
  2017-06-21  8:32         ` Michael Collison
@ 2017-06-21 15:42           ` Richard Sandiford
       [not found]             ` <20170622101638.GA30708@arm.com>
  0 siblings, 1 reply; 15+ messages in thread
From: Richard Sandiford @ 2017-06-21 15:42 UTC (permalink / raw)
  To: Michael Collison; +Cc: Wilco Dijkstra, Christophe Lyon, GCC Patches, nd

Michael Collison <Michael.Collison@arm.com> writes:
> Updated the patch per Richard's suggestions to allow scheduling of
> instructions before reload.

Thanks, this looks good to me FWIW, but obviously I can't approve it.

Richard

> Bootstrapped and tested on aarch64-linux-gnu. Okay for trunk?
>
> 2017-05-22  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> 	    Michael Collison <michael.collison@arm.com>
>
> 	PR target/70119
> 	* config/aarch64/aarch64.md (*aarch64_<optab>_reg_<mode>3_mask1):
> 	New pattern.
> 	(*aarch64_reg_<mode>3_neg_mask2): New pattern.
> 	(*aarch64_reg_<mode>3_minus_mask): New pattern.
> 	(*aarch64_<optab>_reg_di3_mask2): New pattern.
> 	* config/aarch64/aarch64.c (aarch64_rtx_costs): Account for cost
> 	of shift when the shift amount is masked with constant equal to
> 	the size of the mode.
> 	* config/aarch64/predicates.md (subreg_lowpart_operator): New
> 	predicate.
>
>
> 2016-05-22  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> 	    Michael Collison <michael.collison@arm.com>
>
> 	PR target/70119
> 	* gcc.target/aarch64/var_shift_mask_1.c: New test.
>
> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@linaro.org] 
> Sent: Thursday, June 15, 2017 6:40 AM
> To: Michael Collison <Michael.Collison@arm.com>
> Cc: Wilco Dijkstra <Wilco.Dijkstra@arm.com>; Christophe Lyon <christophe.lyon@linaro.org>; GCC Patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>
> Subject: Re: [PATCH] [Aarch64] Variable shift count truncation issues
>
> Michael Collison <Michael.Collison@arm.com> writes:
>> +(define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
>> +  [(set (match_operand:GPI 0 "register_operand" "=r")
>> +	(SHIFT:GPI
>> +	  (match_operand:GPI 1 "register_operand" "r")
>> +	  (match_operator 4 "subreg_lowpart_operator"
>> +	  [(neg:SI (and:SI (match_operand:SI 2 "register_operand" "r")
>> +			   (match_operand 3 "const_int_operand" "n")))])))
>> +   (clobber (match_scratch:SI 5 "=&r"))]
>> +  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)"
>> +  "#"
>> +  "&& reload_completed"
>> +  [(const_int 0)]
>> +  {
>> +    emit_insn (gen_negsi2 (operands[5], operands[2]));
>> +
>> +    rtx and_op = gen_rtx_AND (SImode, operands[5], operands[3]);
>> +    rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[4]), and_op,
>> +    		     		     SUBREG_BYTE (operands[4]));
>> +    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp));
>> +    DONE;
>> +  }
>> +)
>
> Thanks, I agree this looks correct from the split/reload_completed POV.
> I think we can go one better though, either:
>
> (a) Still allow the split when !reload_completed, and use:
>
>      if (GET_MODE (operands[5]) == SCRATCH)
>        operands[5] = gen_reg_rtx (SImode);
>
>     This will allow the individual instructions to be scheduled by sched1.
>
> (b) Continue to restrict the split to reload_completed, change operand 0
>     to =&r so that it can be used as a temporary, and drop operand 5 entirely.
>
> Or perhaps do both:
>
> (define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
>   [(set (match_operand:GPI 0 "register_operand" "=&r")
> 	(SHIFT:GPI
> 	  (match_operand:GPI 1 "register_operand" "r")
> 	  (match_operator 4 "subreg_lowpart_operator"
> 	  [(neg:SI (and:SI (match_operand:SI 2 "register_operand" "r")
> 			   (match_operand 3 "const_int_operand" "n")))])))]
>   "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)"
>   "#"
>   "&& 1"
>   [(const_int 0)]
>   {
>     rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (<GPI:MODE>mode)
>                : operands[0]);
>     emit_insn (gen_negsi2 (tmp, operands[2]));
>
>     rtx and_op = gen_rtx_AND (SImode, tmp, operands[3]);
>     rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[4]), and_op,
>     		     		     SUBREG_BYTE (operands[4]));
>     emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp));
>     DONE;
>   }
> )
>
> Sorry for the run-around.  I should have realised earlier that these patterns didn't really need a distinct register after RA.
>
> Thanks,
> Richard
>
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 5707e53..45377a2 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -7407,17 +7407,26 @@ cost_plus:
>          }
>        else
>          {
> -	  if (speed)
> +	  if (VECTOR_MODE_P (mode))
>  	    {
> -	      if (VECTOR_MODE_P (mode))
> -		{
> -		  /* Vector shift (register).  */
> -		  *cost += extra_cost->vect.alu;
> -		}
> -	      else
> +	      if (speed)
> +		/* Vector shift (register).  */
> +		*cost += extra_cost->vect.alu;
> +	    }
> +	  else
> +	    {
> +	      if (speed)
> +		/* LSLV.  */
> +		*cost += extra_cost->alu.shift_reg;
> +
> +	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
> +		  && CONST_INT_P (XEXP (op1, 1))
> +		  && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
>  		{
> -		  /* LSLV.  */
> -		  *cost += extra_cost->alu.shift_reg;
> +		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
> +		  /* We already demanded XEXP (op1, 0) to be REG_P, so
> +		     don't recurse into it.  */
> +		  return true;
>  		}
>  	    }
>  	  return false;  /* All arguments need to be in registers.  */
> @@ -7446,14 +7455,27 @@ cost_plus:
>  	}
>        else
>  	{
> -
> -	  /* ASR (register) and friends.  */
> -	  if (speed)
> +	  if (VECTOR_MODE_P (mode))
>  	    {
> -	      if (VECTOR_MODE_P (mode))
> +	      if (speed)
> +		/* Vector shift (register).  */
>  		*cost += extra_cost->vect.alu;
> -	      else
> +	    }
> +	  else
> +	    {
> +	      if (speed)
> +		/* ASR (register) and friends.  */
>  		*cost += extra_cost->alu.shift_reg;
> +
> +	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
> +		  && CONST_INT_P (XEXP (op1, 1))
> +		  && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
> +		{
> +		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
> +		  /* We already demanded XEXP (op1, 0) to be REG_P, so
> +		     don't recurse into it.  */
> +		  return true;
> +		}
>  	    }
>  	  return false;  /* All arguments need to be in registers.  */
>  	}
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index d89df66..ff5720c 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -3942,6 +3942,97 @@
>    }
>  )
>  
> +;; When the LSL, LSR, ASR, ROR instructions operate on all register arguments
> +;; they truncate the shift/rotate amount by the size of the registers they
> +;; operate on: 32 for W-regs, 63 for X-regs.  This allows us to optimise away
> +;; such redundant masking instructions.  GCC can do that automatically when
> +;; SHIFT_COUNT_TRUNCATED is true, but we can't enable it for TARGET_SIMD
> +;; because some of the SISD shift alternatives don't perform this truncations.
> +;; So this pattern exists to catch such cases.
> +
> +(define_insn "*aarch64_<optab>_reg_<mode>3_mask1"
> +  [(set (match_operand:GPI 0 "register_operand" "=r")
> +	(SHIFT:GPI
> +	  (match_operand:GPI 1 "register_operand" "r")
> +	  (match_operator 4 "subreg_lowpart_operator"
> +	   [(and:GPI (match_operand:GPI 2 "register_operand" "r")
> +		     (match_operand 3 "const_int_operand" "n"))])))]
> +  "(~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0"
> +  "<shift>\t%<w>0, %<w>1, %<w>2"
> +  [(set_attr "type" "shift_reg")]
> +)
> +
> +(define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
> +  [(set (match_operand:GPI 0 "register_operand" "=&r")
> +	(SHIFT:GPI
> +	  (match_operand:GPI 1 "register_operand" "r")
> +	  (match_operator 4 "subreg_lowpart_operator"
> +	  [(neg:SI (and:SI (match_operand:SI 2 "register_operand" "r")
> +			   (match_operand 3 "const_int_operand" "n")))])))]
> +  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)"
> +  "#"
> +  "&& 1"
> +  [(const_int 0)]
> +  {
> +    rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
> +	       : operands[0]);
> +    emit_insn (gen_negsi2 (tmp, operands[2]));
> +
> +    rtx and_op = gen_rtx_AND (SImode, tmp, operands[3]);
> +    rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[4]), and_op,
> +				     SUBREG_BYTE (operands[4]));
> +    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp));
> +    DONE;
> +  }
> +)
> +
> +(define_insn_and_split "*aarch64_reg_<mode>3_minus_mask"
> +  [(set (match_operand:GPI 0 "register_operand" "=&r")
> +	(ashift:GPI
> +	  (match_operand:GPI 1 "register_operand" "r")
> +	  (minus:QI (match_operand 2 "const_int_operand" "n")
> +		    (match_operator 5 "subreg_lowpart_operator"
> +		    [(and:SI (match_operand:SI 3 "register_operand" "r")
> +			     (match_operand 4 "const_int_operand" "n"))]))))]
> +  "((~INTVAL (operands[4]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)
> +   && INTVAL (operands[2]) == GET_MODE_BITSIZE (<MODE>mode)"
> +  "#"
> +  "&& 1"
> +  [(const_int 0)]
> +  {
> +    rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
> +	       : operands[0]);
> +
> +    emit_insn (gen_negsi2 (tmp, operands[3]));
> +
> +    rtx and_op = gen_rtx_AND (SImode, tmp, operands[4]);
> +    rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[5]), and_op,
> +				     SUBREG_BYTE (operands[5]));
> +
> +    emit_insn (gen_ashl<mode>3 (operands[0], operands[1], subreg_tmp));
> +    DONE;
> +  }
> +)
> +
> +(define_insn "*aarch64_<optab>_reg_di3_mask2"
> +  [(set (match_operand:DI 0 "register_operand" "=r")
> +	(SHIFT:DI
> +	  (match_operand:DI 1 "register_operand" "r")
> +	  (match_operator 4 "subreg_lowpart_operator"
> +	   [(and:SI (match_operand:SI 2 "register_operand" "r")
> +		     (match_operand 3 "aarch64_shift_imm_di" "Usd"))])))]
> +  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (DImode)-1)) == 0)"
> +{
> +  rtx xop[3];
> +  xop[0] = operands[0];
> +  xop[1] = operands[1];
> +  xop[2] = gen_lowpart (GET_MODE (operands[4]), operands[2]);
> +  output_asm_insn ("<shift>\t%x0, %x1, %x2", xop);
> +  return "";
> +}
> +  [(set_attr "type" "shift_reg")]
> +)
> +
>  ;; Logical left shift using SISD or Integer instruction
>  (define_insn "*aarch64_ashl_sisd_or_int_<mode>3"
>    [(set (match_operand:GPI 0 "register_operand" "=r,r,w,w")
> diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
> index cd7ded9..ad8a43c 100644
> --- a/gcc/config/aarch64/predicates.md
> +++ b/gcc/config/aarch64/predicates.md
> @@ -35,6 +35,10 @@
>    (and (match_code "const_int")
>         (match_test "op == CONST0_RTX (mode)")))
>  
> +(define_special_predicate "subreg_lowpart_operator"
> +  (and (match_code "subreg")
> +       (match_test "subreg_lowpart_p (op)")))
> +
>  (define_predicate "aarch64_ccmp_immediate"
>    (and (match_code "const_int")
>         (match_test "IN_RANGE (INTVAL (op), -31, 31)")))
> diff --git a/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c
> new file mode 100644
> index 0000000..e2b020e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c
> @@ -0,0 +1,61 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +/* The integer variable shift and rotate instructions truncate their
> +   shift amounts by the datasize.  Make sure that we don't emit a redundant
> +   masking operation.  */
> +
> +unsigned
> +f1 (unsigned x, int y)
> +{
> +  return x << (y & 31);
> +}
> +
> +unsigned long
> +f2 (unsigned long x, int y)
> +{
> +  return x << (y & 63);
> +}
> +
> +unsigned long
> +f3 (unsigned long bit_addr, int y)
> +{
> +  unsigned long bitnumb = bit_addr & 63;
> +  return (1L << bitnumb);
> +}
> +
> +unsigned int
> +f4 (unsigned int x, unsigned int y)
> +{
> +  y &= 31;
> +  return x >> y | (x << (32 - y));
> +}
> +
> +unsigned long
> +f5 (unsigned long x, unsigned long y)
> +{
> +  y &= 63;
> +  return x >> y | (x << (64 - y));
> +}
> +
> +unsigned long
> +f6 (unsigned long x, unsigned long y)
> +{
> +
> +  return (x << (64 - (y & 63)));
> +
> +}
> +
> +unsigned long
> +f7 (unsigned long x, unsigned long y)
> +{
> +  return (x << -(y & 63));
> +}
> +
> +/* { dg-final { scan-assembler-times "lsl\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "lsl\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" 4 } } */
> +/* { dg-final { scan-assembler-times "ror\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-times "ror\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" 1 } } */
> +/* { dg-final { scan-assembler-not "and\tw\[0-9\]+, w\[0-9\]+, 31" } } */
> +/* { dg-final { scan-assembler-not "and\tx\[0-9\]+, x\[0-9\]+, 63" } } */
> +/* { dg-final { scan-assembler-not "sub\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" } } */


* RE: [PATCH] [Aarch64] Variable shift count truncation issues
  2017-06-15 13:39       ` Richard Sandiford
@ 2017-06-21  8:32         ` Michael Collison
  2017-06-21 15:42           ` Richard Sandiford
  0 siblings, 1 reply; 15+ messages in thread
From: Michael Collison @ 2017-06-21  8:32 UTC (permalink / raw)
  To: Richard Sandiford; +Cc: Wilco Dijkstra, Christophe Lyon, GCC Patches, nd

[-- Attachment #1: Type: text/plain, Size: 3864 bytes --]

Updated the patch per Richard's suggestions to allow scheduling of instructions before reload.

Bootstrapped and tested on aarch64-linux-gnu. Okay for trunk?

2017-05-22  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
	    Michael Collison <michael.collison@arm.com>

	PR target/70119
	* config/aarch64/aarch64.md (*aarch64_<optab>_reg_<mode>3_mask1):
	New pattern.
	(*aarch64_reg_<mode>3_neg_mask2): New pattern.
	(*aarch64_reg_<mode>3_minus_mask): New pattern.
	(*aarch64_<optab>_reg_di3_mask2): New pattern.
	* config/aarch64/aarch64.c (aarch64_rtx_costs): Account for cost
	of shift when the shift amount is masked with constant equal to
	the size of the mode.
	* config/aarch64/predicates.md (subreg_lowpart_operator): New
	predicate.


2017-05-22  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
	    Michael Collison <michael.collison@arm.com>

	PR target/70119
	* gcc.target/aarch64/var_shift_mask_1.c: New test.

-----Original Message-----
From: Richard Sandiford [mailto:richard.sandiford@linaro.org] 
Sent: Thursday, June 15, 2017 6:40 AM
To: Michael Collison <Michael.Collison@arm.com>
Cc: Wilco Dijkstra <Wilco.Dijkstra@arm.com>; Christophe Lyon <christophe.lyon@linaro.org>; GCC Patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>
Subject: Re: [PATCH] [Aarch64] Variable shift count truncation issues

Michael Collison <Michael.Collison@arm.com> writes:
> +(define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
> +  [(set (match_operand:GPI 0 "register_operand" "=r")
> +	(SHIFT:GPI
> +	  (match_operand:GPI 1 "register_operand" "r")
> +	  (match_operator 4 "subreg_lowpart_operator"
> +	  [(neg:SI (and:SI (match_operand:SI 2 "register_operand" "r")
> +			   (match_operand 3 "const_int_operand" "n")))])))
> +   (clobber (match_scratch:SI 5 "=&r"))]
> +  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)"
> +  "#"
> +  "&& reload_completed"
> +  [(const_int 0)]
> +  {
> +    emit_insn (gen_negsi2 (operands[5], operands[2]));
> +
> +    rtx and_op = gen_rtx_AND (SImode, operands[5], operands[3]);
> +    rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[4]), and_op,
> +    		     		     SUBREG_BYTE (operands[4]));
> +    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp));
> +    DONE;
> +  }
> +)

Thanks, I agree this looks correct from the split/reload_completed POV.
I think we can go one better though, either:

(a) Still allow the split when !reload_completed, and use:

     if (GET_MODE (operands[5]) == SCRATCH)
       operands[5] = gen_reg_rtx (SImode);

    This will allow the individual instructions to be scheduled by sched1.

(b) Continue to restrict the split to reload_completed, change operand 0
    to =&r so that it can be used as a temporary, and drop operand 5 entirely.

Or perhaps do both:

(define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
  [(set (match_operand:GPI 0 "register_operand" "=&r")
	(SHIFT:GPI
	  (match_operand:GPI 1 "register_operand" "r")
	  (match_operator 4 "subreg_lowpart_operator"
	  [(neg:SI (and:SI (match_operand:SI 2 "register_operand" "r")
			   (match_operand 3 "const_int_operand" "n")))])))]
  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)"
  "#"
  "&& 1"
  [(const_int 0)]
  {
    rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (<GPI:MODE>mode)
               : operands[0]);
    emit_insn (gen_negsi2 (tmp, operands[2]));

    rtx and_op = gen_rtx_AND (SImode, tmp, operands[3]);
    rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[4]), and_op,
    		     		     SUBREG_BYTE (operands[4]));
    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp));
    DONE;
  }
)

Sorry for the run-around.  I should have realised earlier that these patterns didn't really need a distinct register after RA.

Thanks,
Richard

[-- Attachment #2: pr5546v5.patch --]
[-- Type: application/octet-stream, Size: 8060 bytes --]

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 5707e53..45377a2 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -7407,17 +7407,26 @@ cost_plus:
         }
       else
         {
-	  if (speed)
+	  if (VECTOR_MODE_P (mode))
 	    {
-	      if (VECTOR_MODE_P (mode))
-		{
-		  /* Vector shift (register).  */
-		  *cost += extra_cost->vect.alu;
-		}
-	      else
+	      if (speed)
+		/* Vector shift (register).  */
+		*cost += extra_cost->vect.alu;
+	    }
+	  else
+	    {
+	      if (speed)
+		/* LSLV.  */
+		*cost += extra_cost->alu.shift_reg;
+
+	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
+		  && CONST_INT_P (XEXP (op1, 1))
+		  && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
 		{
-		  /* LSLV.  */
-		  *cost += extra_cost->alu.shift_reg;
+		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
+		  /* We already demanded XEXP (op1, 0) to be REG_P, so
+		     don't recurse into it.  */
+		  return true;
 		}
 	    }
 	  return false;  /* All arguments need to be in registers.  */
@@ -7446,14 +7455,27 @@ cost_plus:
 	}
       else
 	{
-
-	  /* ASR (register) and friends.  */
-	  if (speed)
+	  if (VECTOR_MODE_P (mode))
 	    {
-	      if (VECTOR_MODE_P (mode))
+	      if (speed)
+		/* Vector shift (register).  */
 		*cost += extra_cost->vect.alu;
-	      else
+	    }
+	  else
+	    {
+	      if (speed)
+		/* ASR (register) and friends.  */
 		*cost += extra_cost->alu.shift_reg;
+
+	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
+		  && CONST_INT_P (XEXP (op1, 1))
+		  && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
+		{
+		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
+		  /* We already demanded XEXP (op1, 0) to be REG_P, so
+		     don't recurse into it.  */
+		  return true;
+		}
 	    }
 	  return false;  /* All arguments need to be in registers.  */
 	}
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index d89df66..ff5720c 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -3942,6 +3942,97 @@
   }
 )
 
+;; When the LSL, LSR, ASR, ROR instructions operate on all register arguments
+;; they truncate the shift/rotate amount by the size of the registers they
+;; operate on: 32 for W-regs, 64 for X-regs.  This allows us to optimise away
+;; such redundant masking instructions.  GCC can do that automatically when
+;; SHIFT_COUNT_TRUNCATED is true, but we can't enable it for TARGET_SIMD
+;; because some of the SISD shift alternatives don't perform this truncation.
+;; So this pattern exists to catch such cases.
+
+(define_insn "*aarch64_<optab>_reg_<mode>3_mask1"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(SHIFT:GPI
+	  (match_operand:GPI 1 "register_operand" "r")
+	  (match_operator 4 "subreg_lowpart_operator"
+	   [(and:GPI (match_operand:GPI 2 "register_operand" "r")
+		     (match_operand 3 "const_int_operand" "n"))])))]
+  "(~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0"
+  "<shift>\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "shift_reg")]
+)
+
+(define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
+  [(set (match_operand:GPI 0 "register_operand" "=&r")
+	(SHIFT:GPI
+	  (match_operand:GPI 1 "register_operand" "r")
+	  (match_operator 4 "subreg_lowpart_operator"
+	  [(neg:SI (and:SI (match_operand:SI 2 "register_operand" "r")
+			   (match_operand 3 "const_int_operand" "n")))])))]
+  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+  {
+    rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
+	       : operands[0]);
+    emit_insn (gen_negsi2 (tmp, operands[2]));
+
+    rtx and_op = gen_rtx_AND (SImode, tmp, operands[3]);
+    rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[4]), and_op,
+				     SUBREG_BYTE (operands[4]));
+    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp));
+    DONE;
+  }
+)
+
+(define_insn_and_split "*aarch64_reg_<mode>3_minus_mask"
+  [(set (match_operand:GPI 0 "register_operand" "=&r")
+	(ashift:GPI
+	  (match_operand:GPI 1 "register_operand" "r")
+	  (minus:QI (match_operand 2 "const_int_operand" "n")
+		    (match_operator 5 "subreg_lowpart_operator"
+		    [(and:SI (match_operand:SI 3 "register_operand" "r")
+			     (match_operand 4 "const_int_operand" "n"))]))))]
+  "((~INTVAL (operands[4]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)
+   && INTVAL (operands[2]) == GET_MODE_BITSIZE (<MODE>mode)"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+  {
+    rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
+	       : operands[0]);
+
+    emit_insn (gen_negsi2 (tmp, operands[3]));
+
+    rtx and_op = gen_rtx_AND (SImode, tmp, operands[4]);
+    rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[5]), and_op,
+				     SUBREG_BYTE (operands[5]));
+
+    emit_insn (gen_ashl<mode>3 (operands[0], operands[1], subreg_tmp));
+    DONE;
+  }
+)
+
+(define_insn "*aarch64_<optab>_reg_di3_mask2"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(SHIFT:DI
+	  (match_operand:DI 1 "register_operand" "r")
+	  (match_operator 4 "subreg_lowpart_operator"
+	   [(and:SI (match_operand:SI 2 "register_operand" "r")
+		     (match_operand 3 "aarch64_shift_imm_di" "Usd"))])))]
+  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (DImode)-1)) == 0)"
+{
+  rtx xop[3];
+  xop[0] = operands[0];
+  xop[1] = operands[1];
+  xop[2] = gen_lowpart (GET_MODE (operands[4]), operands[2]);
+  output_asm_insn ("<shift>\t%x0, %x1, %x2", xop);
+  return "";
+}
+  [(set_attr "type" "shift_reg")]
+)
+
 ;; Logical left shift using SISD or Integer instruction
 (define_insn "*aarch64_ashl_sisd_or_int_<mode>3"
   [(set (match_operand:GPI 0 "register_operand" "=r,r,w,w")
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index cd7ded9..ad8a43c 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -35,6 +35,10 @@
   (and (match_code "const_int")
        (match_test "op == CONST0_RTX (mode)")))
 
+(define_special_predicate "subreg_lowpart_operator"
+  (and (match_code "subreg")
+       (match_test "subreg_lowpart_p (op)")))
+
 (define_predicate "aarch64_ccmp_immediate"
   (and (match_code "const_int")
        (match_test "IN_RANGE (INTVAL (op), -31, 31)")))
diff --git a/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c
new file mode 100644
index 0000000..e2b020e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c
@@ -0,0 +1,61 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* The integer variable shift and rotate instructions truncate their
+   shift amounts by the datasize.  Make sure that we don't emit a redundant
+   masking operation.  */
+
+unsigned
+f1 (unsigned x, int y)
+{
+  return x << (y & 31);
+}
+
+unsigned long
+f2 (unsigned long x, int y)
+{
+  return x << (y & 63);
+}
+
+unsigned long
+f3 (unsigned long bit_addr, int y)
+{
+  unsigned long bitnumb = bit_addr & 63;
+  return (1L << bitnumb);
+}
+
+unsigned int
+f4 (unsigned int x, unsigned int y)
+{
+  y &= 31;
+  return x >> y | (x << (32 - y));
+}
+
+unsigned long
+f5 (unsigned long x, unsigned long y)
+{
+  y &= 63;
+  return x >> y | (x << (64 - y));
+}
+
+unsigned long
+f6 (unsigned long x, unsigned long y)
+{
+
+  return (x << (64 - (y & 63)));
+
+}
+
+unsigned long
+f7 (unsigned long x, unsigned long y)
+{
+  return (x << -(y & 63));
+}
+
+/* { dg-final { scan-assembler-times "lsl\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "lsl\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" 4 } } */
+/* { dg-final { scan-assembler-times "ror\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "ror\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-not "and\tw\[0-9\]+, w\[0-9\]+, 31" } } */
+/* { dg-final { scan-assembler-not "and\tx\[0-9\]+, x\[0-9\]+, 63" } } */
+/* { dg-final { scan-assembler-not "sub\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" } } */
-- 
1.9.1


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] [Aarch64] Variable shift count truncation issues
  2017-06-15  8:06     ` Michael Collison
@ 2017-06-15 13:39       ` Richard Sandiford
  2017-06-21  8:32         ` Michael Collison
  0 siblings, 1 reply; 15+ messages in thread
From: Richard Sandiford @ 2017-06-15 13:39 UTC (permalink / raw)
  To: Michael Collison; +Cc: Wilco Dijkstra, Christophe Lyon, GCC Patches, nd

Michael Collison <Michael.Collison@arm.com> writes:
> +(define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
> +  [(set (match_operand:GPI 0 "register_operand" "=r")
> +	(SHIFT:GPI
> +	  (match_operand:GPI 1 "register_operand" "r")
> +	  (match_operator 4 "subreg_lowpart_operator"
> +	  [(neg:SI (and:SI (match_operand:SI 2 "register_operand" "r")
> +			   (match_operand 3 "const_int_operand" "n")))])))
> +   (clobber (match_scratch:SI 5 "=&r"))]
> +  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)"
> +  "#"
> +  "&& reload_completed"
> +  [(const_int 0)]
> +  {
> +    emit_insn (gen_negsi2 (operands[5], operands[2]));
> +
> +    rtx and_op = gen_rtx_AND (SImode, operands[5], operands[3]);
> +    rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[4]), and_op,
> +    		     		     SUBREG_BYTE (operands[4]));
> +    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp));
> +    DONE;
> +  }
> +)

Thanks, I agree this looks correct from the split/reload_completed POV.
I think we can go one better though, either:

(a) Still allow the split when !reload_completed, and use:

     if (GET_MODE (operands[5]) == SCRATCH)
       operands[5] = gen_reg_rtx (SImode);

    This will allow the individual instructions to be scheduled by sched1.

(b) Continue to restrict the split to reload_completed, change operand 0
    to =&r so that it can be used as a temporary, and drop operand 5 entirely.

Or perhaps do both:

(define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
  [(set (match_operand:GPI 0 "register_operand" "=&r")
	(SHIFT:GPI
	  (match_operand:GPI 1 "register_operand" "r")
	  (match_operator 4 "subreg_lowpart_operator"
	  [(neg:SI (and:SI (match_operand:SI 2 "register_operand" "r")
			   (match_operand 3 "const_int_operand" "n")))])))]
  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)"
  "#"
  "&& 1"
  [(const_int 0)]
  {
    rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (<GPI:MODE>mode)
               : operands[0]);
    emit_insn (gen_negsi2 (tmp, operands[2]));

    rtx and_op = gen_rtx_AND (SImode, tmp, operands[3]);
    rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[4]), and_op,
    		     		     SUBREG_BYTE (operands[4]));
    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp));
    DONE;
  }
)

Sorry for the run-around.  I should have realised earlier that these
patterns didn't really need a distinct register after RA.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [PATCH] [Aarch64] Variable shift count truncation issues
  2017-05-23 23:16   ` Michael Collison
@ 2017-06-15  8:06     ` Michael Collison
  2017-06-15 13:39       ` Richard Sandiford
  0 siblings, 1 reply; 15+ messages in thread
From: Michael Collison @ 2017-06-15  8:06 UTC (permalink / raw)
  To: Richard Sandiford, Wilco Dijkstra, Christophe Lyon; +Cc: GCC Patches, nd

[-- Attachment #1: Type: text/plain, Size: 3635 bytes --]

Updated the patch to more fully address the reload_completed issues raised by Richard, ensure that the SIMD patterns are not used for shifts, and fix a code generation bug.

Bootstrapped and tested on aarch64-linux-gnu. Okay for trunk?

2017-05-22  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
	    Michael Collison <michael.collison@arm.com>

	PR target/70119
	* config/aarch64/aarch64.md (*aarch64_<optab>_reg_<mode>3_mask1):
	New pattern.
	(*aarch64_reg_<mode>3_neg_mask2): New pattern.
	(*aarch64_reg_<mode>3_minus_mask): New pattern.
	(*aarch64_<optab>_reg_di3_mask2): New pattern.
	* config/aarch64/aarch64.c (aarch64_rtx_costs): Account for cost
	of shift when the shift amount is masked with constant equal to
	the size of the mode.
	* config/aarch64/predicates.md (subreg_lowpart_operator): New
	predicate.


2017-05-22  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
	    Michael Collison <michael.collison@arm.com>

	PR target/70119
	* gcc.target/aarch64/var_shift_mask_1.c: New test.

-----Original Message-----
From: Michael Collison 
Sent: Tuesday, May 23, 2017 4:01 PM
To: 'Richard Sandiford' <richard.sandiford@linaro.org>; Wilco Dijkstra <Wilco.Dijkstra@arm.com>; 'Christophe Lyon' <christophe.lyon@linaro.org>
Cc: GCC Patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>
Subject: RE: [PATCH] [Aarch64] Variable shift count truncation issues

I updated the patch to resolve the big endian issues as recommended by Richard. I also changed all uses of 'can_create_pseudo_p' to '!reload_completed' to make the intent clearer.

Bootstrapped and tested on aarch64-linux-gnu. Okay for trunk?

2017-05-22  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
	    Michael Collison <michael.collison@arm.com>

	PR target/70119
	* config/aarch64/aarch64.md (*aarch64_<optab>_reg_<mode>3_mask1):
	New pattern.
	(*aarch64_reg_<mode>3_neg_mask2): New pattern.
	(*aarch64_reg_<mode>3_minus_mask): New pattern.
	(*aarch64_<optab>_reg_di3_mask2): New pattern.
	* config/aarch64/aarch64.c (aarch64_rtx_costs): Account for cost
	of shift when the shift amount is masked with constant equal to
	the size of the mode.
	* config/aarch64/predicates.md (subreg_lowpart_operator): New
	predicate.


2017-05-22  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
	    Michael Collison <michael.collison@arm.com>

	PR target/70119
	* gcc.target/aarch64/var_shift_mask_1.c: New test.
-----Original Message-----
From: Richard Sandiford [mailto:richard.sandiford@linaro.org]
Sent: Tuesday, May 23, 2017 7:25 AM
To: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
Cc: Michael Collison <Michael.Collison@arm.com>; GCC Patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>
Subject: Re: [PATCH] [Aarch64] Variable shift count truncation issues

Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Richard Sandiford wrote:
>
>> Insn patterns shouldn't check can_create_pseudo_p, because there's no 
>> guarantee that the associated split happens before RA.  In this case 
>> it should be safe to reuse operand 0 after RA if you change it to:
>
> The goal is to only create and split this pattern before register allocation.
> It's a transient pattern, combine creates it, and it is split immediately after.
>
> Maybe !reload_completed is clearer as that is what several other 
> define_insn_and_split patterns do?

But the concept of a transient pattern doesn't really exist.  We shouldn't rely for correctness on a split being applied before RA.

If all we want to do is match and split something that combine generates, it would be better to do it as a pure define_split.

Thanks,
Richard

[-- Attachment #2: pr5546v4.patch --]
[-- Type: application/octet-stream, Size: 8044 bytes --]

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 5707e53..45377a2 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -7407,17 +7407,26 @@ cost_plus:
         }
       else
         {
-	  if (speed)
+	  if (VECTOR_MODE_P (mode))
 	    {
-	      if (VECTOR_MODE_P (mode))
-		{
-		  /* Vector shift (register).  */
-		  *cost += extra_cost->vect.alu;
-		}
-	      else
+	      if (speed)
+		/* Vector shift (register).  */
+		*cost += extra_cost->vect.alu;
+	    }
+	  else
+	    {
+	      if (speed)
+		/* LSLV.  */
+		*cost += extra_cost->alu.shift_reg;
+
+	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
+		  && CONST_INT_P (XEXP (op1, 1))
+		  && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
 		{
-		  /* LSLV.  */
-		  *cost += extra_cost->alu.shift_reg;
+		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
+		  /* We already demanded XEXP (op1, 0) to be REG_P, so
+		     don't recurse into it.  */
+		  return true;
 		}
 	    }
 	  return false;  /* All arguments need to be in registers.  */
@@ -7446,14 +7455,27 @@ cost_plus:
 	}
       else
 	{
-
-	  /* ASR (register) and friends.  */
-	  if (speed)
+	  if (VECTOR_MODE_P (mode))
 	    {
-	      if (VECTOR_MODE_P (mode))
+	      if (speed)
+		/* Vector shift (register).  */
 		*cost += extra_cost->vect.alu;
-	      else
+	    }
+	  else
+	    {
+	      if (speed)
+		/* ASR (register) and friends.  */
 		*cost += extra_cost->alu.shift_reg;
+
+	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
+		  && CONST_INT_P (XEXP (op1, 1))
+		  && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
+		{
+		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
+		  /* We already demanded XEXP (op1, 0) to be REG_P, so
+		     don't recurse into it.  */
+		  return true;
+		}
 	    }
 	  return false;  /* All arguments need to be in registers.  */
 	}
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index d89df66..c843123 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -3942,6 +3942,94 @@
   }
 )
 
+;; When the LSL, LSR, ASR, ROR instructions operate on all register arguments
+;; they truncate the shift/rotate amount by the size of the registers they
+;; operate on: 32 for W-regs, 64 for X-regs.  This allows us to optimise away
+;; such redundant masking instructions.  GCC can do that automatically when
+;; SHIFT_COUNT_TRUNCATED is true, but we can't enable it for TARGET_SIMD
+;; because some of the SISD shift alternatives don't perform this truncation.
+;; So this pattern exists to catch such cases.
+
+(define_insn "*aarch64_<optab>_reg_<mode>3_mask1"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(SHIFT:GPI
+	  (match_operand:GPI 1 "register_operand" "r")
+	  (match_operator 4 "subreg_lowpart_operator"
+	   [(and:GPI (match_operand:GPI 2 "register_operand" "r")
+		     (match_operand 3 "const_int_operand" "n"))])))]
+  "(~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0"
+  "<shift>\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "shift_reg")]
+)
+
+(define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(SHIFT:GPI
+	  (match_operand:GPI 1 "register_operand" "r")
+	  (match_operator 4 "subreg_lowpart_operator"
+	  [(neg:SI (and:SI (match_operand:SI 2 "register_operand" "r")
+			   (match_operand 3 "const_int_operand" "n")))])))
+   (clobber (match_scratch:SI 5 "=&r"))]
+  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+  {
+    emit_insn (gen_negsi2 (operands[5], operands[2]));
+
+    rtx and_op = gen_rtx_AND (SImode, operands[5], operands[3]);
+    rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[4]), and_op,
+    		     		     SUBREG_BYTE (operands[4]));
+    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp));
+    DONE;
+  }
+)
+
+(define_insn_and_split "*aarch64_reg_<mode>3_minus_mask"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(ashift:GPI
+	  (match_operand:GPI 1 "register_operand" "r")
+	  (minus:QI (match_operand 2 "const_int_operand" "n")
+		    (match_operator 5 "subreg_lowpart_operator"
+		    [(and:SI (match_operand:SI 3 "register_operand" "r")
+			     (match_operand 4 "const_int_operand" "n"))]))))
+   (clobber (match_scratch:SI 6 "=&r"))]
+  "((~INTVAL (operands[4]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)
+   && INTVAL (operands[2]) == GET_MODE_BITSIZE (<MODE>mode)"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+  {
+    emit_insn (gen_negsi2 (operands[6], operands[3]));
+
+    rtx and_op = gen_rtx_AND (SImode, operands[6], operands[4]);
+    rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[5]), and_op,
+    		     		     SUBREG_BYTE (operands[5]));
+
+    emit_insn (gen_ashl<mode>3 (operands[0], operands[1], subreg_tmp));
+    DONE;
+  }
+)
+
+(define_insn "*aarch64_<optab>_reg_di3_mask2"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(SHIFT:DI
+	  (match_operand:DI 1 "register_operand" "r")
+	  (match_operator 4 "subreg_lowpart_operator"
+	   [(and:SI (match_operand:SI 2 "register_operand" "r")
+		     (match_operand 3 "aarch64_shift_imm_di" "Usd"))])))]
+  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (DImode)-1)) == 0)"
+{
+  rtx xop[3];
+  xop[0] = operands[0];
+  xop[1] = operands[1];
+  xop[2] = gen_lowpart (GET_MODE (operands[4]), operands[2]);
+  output_asm_insn ("<shift>\t%x0, %x1, %x2", xop);
+  return "";
+}
+  [(set_attr "type" "shift_reg")]
+)
+
 ;; Logical left shift using SISD or Integer instruction
 (define_insn "*aarch64_ashl_sisd_or_int_<mode>3"
   [(set (match_operand:GPI 0 "register_operand" "=r,r,w,w")
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index cd7ded9..ad8a43c 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -35,6 +35,10 @@
   (and (match_code "const_int")
        (match_test "op == CONST0_RTX (mode)")))
 
+(define_special_predicate "subreg_lowpart_operator"
+  (and (match_code "subreg")
+       (match_test "subreg_lowpart_p (op)")))
+
 (define_predicate "aarch64_ccmp_immediate"
   (and (match_code "const_int")
        (match_test "IN_RANGE (INTVAL (op), -31, 31)")))
diff --git a/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c
new file mode 100644
index 0000000..e2b020e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c
@@ -0,0 +1,61 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* The integer variable shift and rotate instructions truncate their
+   shift amounts by the datasize.  Make sure that we don't emit a redundant
+   masking operation.  */
+
+unsigned
+f1 (unsigned x, int y)
+{
+  return x << (y & 31);
+}
+
+unsigned long
+f2 (unsigned long x, int y)
+{
+  return x << (y & 63);
+}
+
+unsigned long
+f3 (unsigned long bit_addr, int y)
+{
+  unsigned long bitnumb = bit_addr & 63;
+  return (1L << bitnumb);
+}
+
+unsigned int
+f4 (unsigned int x, unsigned int y)
+{
+  y &= 31;
+  return x >> y | (x << (32 - y));
+}
+
+unsigned long
+f5 (unsigned long x, unsigned long y)
+{
+  y &= 63;
+  return x >> y | (x << (64 - y));
+}
+
+unsigned long
+f6 (unsigned long x, unsigned long y)
+{
+
+  return (x << (64 - (y & 63)));
+
+}
+
+unsigned long
+f7 (unsigned long x, unsigned long y)
+{
+  return (x << -(y & 63));
+}
+
+/* { dg-final { scan-assembler-times "lsl\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "lsl\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" 4 } } */
+/* { dg-final { scan-assembler-times "ror\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "ror\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-not "and\tw\[0-9\]+, w\[0-9\]+, 31" } } */
+/* { dg-final { scan-assembler-not "and\tx\[0-9\]+, x\[0-9\]+, 63" } } */
+/* { dg-final { scan-assembler-not "sub\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" } } */
-- 
1.9.1


^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [PATCH] [Aarch64] Variable shift count truncation issues
  2017-05-23 14:30 ` Richard Sandiford
@ 2017-05-23 23:16   ` Michael Collison
  2017-06-15  8:06     ` Michael Collison
  0 siblings, 1 reply; 15+ messages in thread
From: Michael Collison @ 2017-05-23 23:16 UTC (permalink / raw)
  To: Richard Sandiford, Wilco Dijkstra, Christophe Lyon; +Cc: GCC Patches, nd

[-- Attachment #1: Type: text/plain, Size: 2232 bytes --]

I updated the patch to resolve the big endian issues as recommended by Richard. I also changed all uses of 'can_create_pseudo_p' to '!reload_completed' to make the intent clearer.
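
The big-endian subtlety here seems to be that the shift amount is matched as a subreg of a wider value, and on big-endian the low part of that value does not sit at byte offset 0; the subreg_lowpart_operator predicate in the attached patch therefore tests for a lowpart subreg rather than assuming a particular offset. A small sketch of the distinction (hypothetical names, not taken from the patch):

/* The low QImode part of an SImode pseudo.  subreg_lowpart_p is true for
   it on both endiannesses, while its SUBREG_BYTE is 0 on little-endian
   but nonzero on big-endian, which is why assuming a byte offset of 0
   would be wrong for big-endian.  */
rtx si_tmp = gen_reg_rtx (SImode);
rtx qi_low = gen_lowpart (QImode, si_tmp);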

Bootstrapped and tested on aarch64-linux-gnu. Okay for trunk?

2017-05-22  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
	    Michael Collison <michael.collison@arm.com>

	PR target/70119
	* config/aarch64/aarch64.md (*aarch64_<optab>_reg_<mode>3_mask1):
	New pattern.
	(*aarch64_reg_<mode>3_neg_mask2): New pattern.
	(*aarch64_reg_<mode>3_minus_mask): New pattern.
	(*aarch64_<optab>_reg_di3_mask2): New pattern.
	* config/aarch64/aarch64.c (aarch64_rtx_costs): Account for cost
	of shift when the shift amount is masked with constant equal to
	the size of the mode.
	* config/aarch64/predicates.md (subreg_lowpart_operator): New
	predicate.


2017-05-22  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
	    Michael Collison <michael.collison@arm.com>

	PR target/70119
	* gcc.target/aarch64/var_shift_mask_1.c: New test.
-----Original Message-----
From: Richard Sandiford [mailto:richard.sandiford@linaro.org] 
Sent: Tuesday, May 23, 2017 7:25 AM
To: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
Cc: Michael Collison <Michael.Collison@arm.com>; GCC Patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>
Subject: Re: [PATCH] [Aarch64] Variable shift count truncation issues

Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Richard Sandiford wrote:
>
>> Insn patterns shouldn't check can_create_pseudo_p, because there's no 
>> guarantee that the associated split happens before RA.  In this case 
>> it should be safe to reuse operand 0 after RA if you change it to:
>
> The goal is to only create and split this pattern before register allocation.
> It's a transient pattern, combine creates it, and it is split immediately after.
>
> Maybe !reload_completed is clearer as that is what several other 
> define_insn_and_split patterns do?

But the concept of a transient pattern doesn't really exist.  We shouldn't rely for correctness on a split being applied before RA.

If all we want to do is match and split something that combine generates, it would be better to do it as a pure define_split.

Thanks,
Richard

[-- Attachment #2: pr5546v3.patch --]
[-- Type: application/octet-stream, Size: 7776 bytes --]

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 714bb79..6ff2e22 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -7347,17 +7347,26 @@ cost_plus:
         }
       else
         {
-	  if (speed)
+	  if (VECTOR_MODE_P (mode))
 	    {
-	      if (VECTOR_MODE_P (mode))
-		{
-		  /* Vector shift (register).  */
-		  *cost += extra_cost->vect.alu;
-		}
-	      else
+	      if (speed)
+		/* Vector shift (register).  */
+		*cost += extra_cost->vect.alu;
+	    }
+	  else
+	    {
+	      if (speed)
+		/* LSLV.  */
+		*cost += extra_cost->alu.shift_reg;
+
+	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
+		  && CONST_INT_P (XEXP (op1, 1))
+		  && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
 		{
-		  /* LSLV.  */
-		  *cost += extra_cost->alu.shift_reg;
+		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
+		  /* We already demanded XEXP (op1, 0) to be REG_P, so
+		     don't recurse into it.  */
+		  return true;
 		}
 	    }
 	  return false;  /* All arguments need to be in registers.  */
@@ -7386,14 +7395,27 @@ cost_plus:
 	}
       else
 	{
-
-	  /* ASR (register) and friends.  */
-	  if (speed)
+	  if (VECTOR_MODE_P (mode))
 	    {
-	      if (VECTOR_MODE_P (mode))
+	      if (speed)
+		/* Vector shift (register).  */
 		*cost += extra_cost->vect.alu;
-	      else
+	    }
+	  else
+	    {
+	      if (speed)
+		/* ASR (register) and friends.  */
 		*cost += extra_cost->alu.shift_reg;
+
+	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
+		  && CONST_INT_P (XEXP (op1, 1))
+		  && INTVAL (XEXP (op1, 1)) == GET_MODE_BITSIZE (mode) - 1)
+		{
+		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
+		  /* We already demanded XEXP (op1, 0) to be REG_P, so
+		     don't recurse into it.  */
+		  return true;
+		}
 	    }
 	  return false;  /* All arguments need to be in registers.  */
 	}
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 5adc5ed..a814022 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -3999,6 +3999,91 @@
   }
 )
 
+;; When the LSL, LSR, ASR, ROR instructions operate on all register arguments
+;; they truncate the shift/rotate amount by the size of the registers they
+;; operate on: 32 for W-regs, 64 for X-regs.  This allows us to optimise away
+;; such redundant masking instructions.  GCC can do that automatically when
+;; SHIFT_COUNT_TRUNCATED is true, but we can't enable it for TARGET_SIMD
+;; because some of the SISD shift alternatives don't perform this truncation.
+;; So this pattern exists to catch such cases.
+
+(define_insn "*aarch64_<optab>_reg_<mode>3_mask1"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(SHIFT:GPI
+	  (match_operand:GPI 1 "register_operand" "r")
+	  (match_operator 4 "subreg_lowpart_operator"
+	   [(and:GPI (match_operand:GPI 2 "register_operand" "r")
+		     (match_operand 3 "const_int_operand" "n"))])))]
+  "(~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0"
+  "<shift>\t%<w>0, %<w>1, %<w>2"
+  [(set_attr "type" "shift_reg")]
+)
+
+(define_insn_and_split "*aarch64_reg_<mode>3_neg_mask2"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(SHIFT:GPI
+	  (match_operand:GPI 1 "register_operand" "r")
+	  (match_operator 4 "subreg_lowpart_operator"
+	  [(neg:SI (and:SI (match_operand:SI 2 "register_operand" "r")
+			   (match_operand 3 "const_int_operand" "n")))])))]
+  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)
+   && !reload_completed"
+  "#"
+  "&& true"
+  [(const_int 0)]
+  {
+    rtx tmp = gen_reg_rtx (SImode);
+
+    emit_insn (gen_negsi2 (tmp, operands[2]));
+    rtx tmp2 = simplify_gen_subreg (QImode, tmp, SImode, 0);
+    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], tmp2));
+    DONE;
+  }
+)
+
+(define_insn_and_split "*aarch64_reg_<mode>3_minus_mask"
+  [(set (match_operand:GPI 0 "register_operand" "=r")
+	(ashift:GPI
+	  (match_operand:GPI 1 "register_operand" "r")
+	  (minus:QI (match_operand 2 "const_int_operand" "n")
+		    (match_operator 5 "subreg_lowpart_operator"
+		    [(and:SI (match_operand:SI 3 "register_operand" "r")
+			     (match_operand 4 "const_int_operand" "n"))]))))]
+  "((~INTVAL (operands[4]) & (GET_MODE_BITSIZE (<MODE>mode)-1)) == 0)
+   && !reload_completed
+   && INTVAL (operands[2]) == GET_MODE_BITSIZE (<MODE>mode)"
+  "#"
+  "&& true"
+  [(const_int 0)]
+  {
+    rtx tmp = gen_reg_rtx (SImode);
+
+    emit_insn (gen_negsi2 (tmp, operands[3]));
+    rtx tmp2 = simplify_gen_subreg (QImode, tmp, SImode, 0);
+    emit_insn (gen_ashl<mode>3 (operands[0], operands[1], tmp2));
+    DONE;
+  }
+)
+
+(define_insn_and_split "*aarch64_<optab>_reg_di3_mask2"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(SHIFT:DI
+	  (match_operand:DI 1 "register_operand" "r")
+	  (match_operator 4 "subreg_lowpart_operator"
+	  [(and:SI (match_operand:SI 2 "register_operand" "r")
+		   (match_operand 3 "const_int_operand" "n"))])))]
+  "((~INTVAL (operands[3]) & (GET_MODE_BITSIZE (DImode)-1)) == 0)
+   && !reload_completed"
+  "#"
+  "&& true"
+  [(const_int 0)]
+  {
+    rtx tmp = simplify_gen_subreg (QImode, operands[2], SImode, 0);
+    emit_insn (gen_<optab>di3 (operands[0], operands[1], tmp));
+    DONE;
+  }
+)
+
 ;; Logical left shift using SISD or Integer instruction
 (define_insn "*aarch64_ashl_sisd_or_int_<mode>3"
   [(set (match_operand:GPI 0 "register_operand" "=r,r,w,w")
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index e83d45b..5dfb098 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -35,6 +35,10 @@
   (and (match_code "const_int")
        (match_test "op == CONST0_RTX (mode)")))
 
+(define_special_predicate "subreg_lowpart_operator"
+  (and (match_code "subreg")
+       (match_test "subreg_lowpart_p (op)")))
+
 (define_predicate "aarch64_ccmp_immediate"
   (and (match_code "const_int")
        (match_test "IN_RANGE (INTVAL (op), -31, 31)")))
diff --git a/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c
new file mode 100644
index 0000000..e2b020e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_1.c
@@ -0,0 +1,61 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* The integer variable shift and rotate instructions truncate their
+   shift amounts by the datasize.  Make sure that we don't emit a redundant
+   masking operation.  */
+
+unsigned
+f1 (unsigned x, int y)
+{
+  return x << (y & 31);
+}
+
+unsigned long
+f2 (unsigned long x, int y)
+{
+  return x << (y & 63);
+}
+
+unsigned long
+f3 (unsigned long bit_addr, int y)
+{
+  unsigned long bitnumb = bit_addr & 63;
+  return (1L << bitnumb);
+}
+
+unsigned int
+f4 (unsigned int x, unsigned int y)
+{
+  y &= 31;
+  return x >> y | (x << (32 - y));
+}
+
+unsigned long
+f5 (unsigned long x, unsigned long y)
+{
+  y &= 63;
+  return x >> y | (x << (64 - y));
+}
+
+unsigned long
+f6 (unsigned long x, unsigned long y)
+{
+
+  return (x << (64 - (y & 63)));
+
+}
+
+unsigned long
+f7 (unsigned long x, unsigned long y)
+{
+  return (x << -(y & 63));
+}
+
+/* { dg-final { scan-assembler-times "lsl\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "lsl\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" 4 } } */
+/* { dg-final { scan-assembler-times "ror\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-times "ror\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" 1 } } */
+/* { dg-final { scan-assembler-not "and\tw\[0-9\]+, w\[0-9\]+, 31" } } */
+/* { dg-final { scan-assembler-not "and\tx\[0-9\]+, x\[0-9\]+, 63" } } */
+/* { dg-final { scan-assembler-not "sub\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" } } */
-- 
1.9.1


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] [Aarch64] Variable shift count truncation issues
  2017-05-19 12:25 Wilco Dijkstra
@ 2017-05-23 14:30 ` Richard Sandiford
  2017-05-23 23:16   ` Michael Collison
  0 siblings, 1 reply; 15+ messages in thread
From: Richard Sandiford @ 2017-05-23 14:30 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: Michael Collison, GCC Patches, nd

Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Richard Sandiford wrote:
>
>> Insn patterns shouldn't check can_create_pseudo_p, because there's no
>> guarantee that the associated split happens before RA.  In this case it
>> should be safe to reuse operand 0 after RA if you change it to:
>
> The goal is to only create and split this pattern before register allocation.
> It's a transient pattern, combine creates it, and it is split immediately after.
>
> Maybe !reload_completed is clearer as that is what several other
> define_insn_and_split patterns do?

But the concept of a transient pattern doesn't really exist.  We shouldn't
rely for correctness on a split being applied before RA.

If all we want to do is match and split something that combine generates,
it would be better to do it as a pure define_split.

Thanks,
Richard
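
For reference, a pure define_split along these lines might look roughly as follows. This is a sketch only, reusing the operands and condition of the *aarch64_reg_<mode>3_neg_mask2 pattern posted earlier in the thread, and is not the form that was eventually applied:

(define_split
  [(set (match_operand:GPI 0 "register_operand")
	(SHIFT:GPI
	  (match_operand:GPI 1 "register_operand")
	  (match_operator 4 "subreg_lowpart_operator"
	   [(neg:SI (and:SI (match_operand:SI 2 "register_operand")
			    (match_operand 3 "const_int_operand")))])))]
  "(~INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode) - 1)) == 0
   && can_create_pseudo_p ()"
  [(const_int 0)]
{
  /* The split is guarded by can_create_pseudo_p, so it only fires before
     RA and a fresh pseudo can hold the negated shift amount.  */
  rtx tmp = gen_reg_rtx (SImode);
  emit_insn (gen_negsi2 (tmp, operands[2]));
  rtx and_op = gen_rtx_AND (SImode, tmp, operands[3]);
  rtx subreg_tmp = gen_rtx_SUBREG (GET_MODE (operands[4]), and_op,
				   SUBREG_BYTE (operands[4]));
  emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp));
  DONE;
})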

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] [Aarch64] Variable shift count truncation issues
@ 2017-05-19 12:25 Wilco Dijkstra
  2017-05-23 14:30 ` Richard Sandiford
  0 siblings, 1 reply; 15+ messages in thread
From: Wilco Dijkstra @ 2017-05-19 12:25 UTC (permalink / raw)
  To: richard.sandiford, Michael Collison; +Cc: GCC Patches, nd

Richard Sandiford wrote:

> Insn patterns shouldn't check can_create_pseudo_p, because there's no
> guarantee that the associated split happens before RA.  In this case it
> should be safe to reuse operand 0 after RA if you change it to:

The goal is to only create and split this pattern before register allocation.
It's a transient pattern, combine creates it, and it is split immediately after.

Maybe !reload_completed is clearer as that is what several other
define_insn_and_split patterns do?

Wilco
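
For reference, can_create_pseudo_p () is a thin macro over the reload state flags (roughly the definition below, from gcc/rtl.h of this era), so the two guards differ only in whether reload_in_progress is also tested:

/* Pseudo registers may only be created before the register allocator
   has started and before reload has completed.  */
#define can_create_pseudo_p() (!reload_in_progress && !reload_completed)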

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2017-06-30 18:54 UTC | newest]

Thread overview: 15+ messages
2017-05-18 23:17 [PATCH] [Aarch64] Variable shift count truncation issues Michael Collison
2017-05-19  7:51 ` Richard Sandiford
2017-05-19 11:03   ` Christophe Lyon
2017-05-19 12:25 Wilco Dijkstra
2017-05-23 14:30 ` Richard Sandiford
2017-05-23 23:16   ` Michael Collison
2017-06-15  8:06     ` Michael Collison
2017-06-15 13:39       ` Richard Sandiford
2017-06-21  8:32         ` Michael Collison
2017-06-21 15:42           ` Richard Sandiford
     [not found]             ` <20170622101638.GA30708@arm.com>
2017-06-23  9:28               ` Michael Collison
2017-06-23  9:30                 ` James Greenhalgh
2017-06-30 18:04                 ` Andreas Schwab
2017-06-30 18:54                   ` Michael Collison
2017-06-22 10:22 James Greenhalgh
