public inbox for gcc-patches@gcc.gnu.org
* [PATCH] [Aarch64] Optimize subtract in shift counts
@ 2017-08-07 20:36 Michael Collison
  2017-08-07 20:44 ` Andrew Pinski
  2017-08-07 20:58 ` Richard Kenner
  0 siblings, 2 replies; 32+ messages in thread
From: Michael Collison @ 2017-08-07 20:36 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd

[-- Attachment #1: Type: text/plain, Size: 898 bytes --]

This patch improves code generation for shifts with subtract instructions where the first operand of the subtract is equal to the bit size of the operation.


long f1(long x, int i)
{
  return x >> (64 - i);
}

int f2(int x, int i)
{
  return x << (32 - i);
}


With trunk at -O2 we generate:

f1:
	mov	w2, 64
	sub	w1, w2, w1
	asr	x0, x0, x1
	ret

f2:
	mov	w2, 32
	sub	w1, w2, w1
	lsl	w0, w0, w1
	ret

With the patch we generate:

f1:
	neg	w2, w1
	asr	x0, x0, x2
	ret

f2:
	neg	w2, w1
	lsl	w0, w0, w2
	ret

Okay for trunk?

2017-08-07  Michael Collison  <michael.collison@arm.com>

	* config/aarch64/aarch64.md (*aarch64_reg_<optab>_minus<mode>3):
	New pattern.

2017-08-07  Michael Collison  <michael.collison@arm.com>

	* gcc.target/aarch64/var_shift_mask_2.c: New test.

[-- Attachment #2: pr7313v6.patch --]
[-- Type: application/octet-stream, Size: 3089 bytes --]

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index f876a2b..9d4284c 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4044,6 +4044,45 @@
   [(set_attr "type" "shift_reg")]
 )
 
+(define_insn_and_split "*aarch64_reg_<optab>_minus<mode>3"
+  [(set (match_operand:GPI 0 "register_operand" "=&r")
+	(ASHIFT:GPI
+	  (match_operand:GPI 1 "register_operand" "r")
+	  (minus:QI (match_operand 2 "const_int_operand" "n")
+		    (match_operand:QI 3 "register_operand" "r"))))]
+  "INTVAL (operands[2]) == GET_MODE_BITSIZE (<MODE>mode)"
+  "#"
+  "&& true"
+  [(const_int 0)]
+  {
+    /* Handle cases where operand 3 is a plain QI register, or
+       a subreg with either a SImode or DImode register.  */
+
+    rtx subreg_tmp = (REG_P (operands[3])
+		      ? gen_lowpart_SUBREG (SImode, operands[3])
+		      : SUBREG_REG (operands[3]));
+
+    if (REG_P (subreg_tmp) && GET_MODE (subreg_tmp) == DImode)
+      subreg_tmp = gen_lowpart_SUBREG (SImode, subreg_tmp);
+
+    rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
+	       : operands[0]);
+
+    if (<MODE>mode == DImode && !can_create_pseudo_p ())
+      tmp = gen_lowpart_SUBREG (SImode, operands[0]);
+
+    emit_insn (gen_negsi2 (tmp, subreg_tmp));
+
+    rtx and_op = gen_rtx_AND (SImode, tmp,
+			      GEN_INT (GET_MODE_BITSIZE (<MODE>mode) - 1));
+
+    rtx subreg_tmp2 = gen_lowpart_SUBREG (QImode, and_op);
+
+    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp2));
+    DONE;
+  }
+)
+
 ;; Logical left shift using SISD or Integer instruction
 (define_insn "*aarch64_ashl_sisd_or_int_<mode>3"
   [(set (match_operand:GPI 0 "register_operand" "=r,r,w,w")
diff --git a/gcc/testsuite/gcc.target/aarch64/var_shift_mask_2.c b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_2.c
new file mode 100644
index 0000000..c1fe691
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_2.c
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+long long
+f1 (long long x, int i)
+{
+
+  return x >> (64 - i);
+}
+
+unsigned long long
+f2 (unsigned long long x, unsigned int i)
+{
+
+  return x >> (64 - i);
+}
+
+int
+f3 (int x, int i)
+{
+
+  return x >> (32 - i);
+}
+
+unsigned int
+f4 (unsigned int x, unsigned int i)
+{
+
+  return x >> (32 - i);
+}
+
+int
+f5 (int x, int i)
+{
+  return x << (32 - i);
+}
+
+long long
+f6 (long long x, int i)
+{
+  return x << (64 - i);
+}
+
+/* { dg-final { scan-assembler "lsl\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" } } */
+/* { dg-final { scan-assembler "lsl\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" } } */
+/* { dg-final { scan-assembler "lsr\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" } } */
+/* { dg-final { scan-assembler "lsr\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" } } */
+/* { dg-final { scan-assembler "asr\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" } } */
+/* { dg-final { scan-assembler "asr\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" } } */
+/* { dg-final { scan-assembler-times "neg\tw\[0-9\]+, w\[0-9\]+" 6 } } */
+/* { dg-final { scan-assembler-not "sub\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" } } */
-- 
1.9.1


* Re: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-08-07 20:36 [PATCH] [Aarch64] Optimize subtract in shift counts Michael Collison
@ 2017-08-07 20:44 ` Andrew Pinski
  2017-08-07 21:02   ` Richard Kenner
  2017-08-07 20:58 ` Richard Kenner
  1 sibling, 1 reply; 32+ messages in thread
From: Andrew Pinski @ 2017-08-07 20:44 UTC (permalink / raw)
  To: Michael Collison; +Cc: gcc-patches, nd

On Mon, Aug 7, 2017 at 1:36 PM, Michael Collison
<Michael.Collison@arm.com> wrote:

Shouldn't this be handled in simplify-rtx instead of an aarch64-specific
pattern?

That is simplify:
(SHIFT A (32 - B)) -> (SHIFT A (AND (NEG B) 31))
etc.

or maybe not.  I don't mind either way after thinking about it more.

Thanks,
Andrew
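
As a quick sanity check of the identity behind that proposed transform
(a standalone C check of the arithmetic, not part of the patch):

#include <assert.h>

int
main (void)
{
  /* (32 - b) and -b are congruent modulo 32, so a shifter that only
     looks at the low 5 bits of its count sees the same count for
     both, for every b.  */
  for (int b = -64; b <= 64; b++)
    assert (((32u - b) & 31) == ((0u - b) & 31));
  return 0;
}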

>
> 2017-08-07  Michael Collison <michael.collison@arm.com>
>
>         * config/aarch64/aarch64.md (*aarch64_reg_<optab>_minus<mode>3):
>         New pattern.
>
> 2016-08-07  Michael Collison <michael.collison@arm.com>
>
>         * gcc.target/aarch64/var_shift_mask_2.c: New test.

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-08-07 20:36 [PATCH] [Aarch64] Optimize subtract in shift counts Michael Collison
  2017-08-07 20:44 ` Andrew Pinski
@ 2017-08-07 20:58 ` Richard Kenner
  1 sibling, 0 replies; 32+ messages in thread
From: Richard Kenner @ 2017-08-07 20:58 UTC (permalink / raw)
  To: Michael.Collison; +Cc: gcc-patches, nd

> This patch improves code generation for shifts with subtract
> instructions where the first operand to the subtract is equal to the
> bit-size of the operation.

I would suspect that this will work on lots of targets.  Is doing it
in combine an option?

* Re: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-08-07 20:44 ` Andrew Pinski
@ 2017-08-07 21:02   ` Richard Kenner
  2017-08-07 23:44     ` Michael Collison
  0 siblings, 1 reply; 32+ messages in thread
From: Richard Kenner @ 2017-08-07 21:02 UTC (permalink / raw)
  To: pinskia; +Cc: gcc-patches, Michael.Collison, nd

> That is simplify:
> (SHIFT A (32 - B)) -> (SHIFT A (AND (NEG B) 31))
> etc.

I think you need SHIFT_COUNT_TRUNCATED to be true for this to be
valid, but this is exactly what I was getting at in my last message.

* RE: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-08-07 21:02   ` Richard Kenner
@ 2017-08-07 23:44     ` Michael Collison
  2017-08-08  1:56       ` Richard Kenner
  0 siblings, 1 reply; 32+ messages in thread
From: Michael Collison @ 2017-08-07 23:44 UTC (permalink / raw)
  To: Richard Kenner, pinskia; +Cc: gcc-patches, nd

On AArch64 SHIFT_COUNT_TRUNCATED is only true if SIMD code generation is disabled. This is because the SIMD instructions can be used for shifting but they do not truncate the shift count.

* RE: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-08-07 23:44     ` Michael Collison
@ 2017-08-08  1:56       ` Richard Kenner
  2017-08-08  4:06         ` Michael Collison
  0 siblings, 1 reply; 32+ messages in thread
From: Richard Kenner @ 2017-08-08  1:56 UTC (permalink / raw)
  To: Michael.Collison; +Cc: gcc-patches, nd, pinskia

> On AArch64 SHIFT_COUNT_TRUNCATED is only true if SIMD code generation
> is disabled. This is because the SIMD instructions can be used for
> shifting but they do not truncate the shift count.

In that case, the change isn't safe!  Consider if the value was
negative, for example.  Yes, it's technically undefined, but I'm not
sure I'd want to rely on that.

* RE: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-08-08  1:56       ` Richard Kenner
@ 2017-08-08  4:06         ` Michael Collison
  2017-08-08 12:13           ` Richard Kenner
  0 siblings, 1 reply; 32+ messages in thread
From: Michael Collison @ 2017-08-08  4:06 UTC (permalink / raw)
  To: Richard Kenner; +Cc: gcc-patches, nd, pinskia

Richard,

The pattern will only be matched if the value is positive. More specifically, if the constant value is 32 (SImode) or 64 (DImode).

* RE: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-08-08  4:06         ` Michael Collison
@ 2017-08-08 12:13           ` Richard Kenner
  2017-08-08 19:46             ` Michael Collison
  0 siblings, 1 reply; 32+ messages in thread
From: Richard Kenner @ 2017-08-08 12:13 UTC (permalink / raw)
  To: Michael.Collison; +Cc: gcc-patches, nd, pinskia

> The pattern will only be matched if the value is positive. More
> specifically if the constant value is 32 (SImode) or 64 (DImode).

I don't mean the constant, but the value subtracted from it.
If that's negative, then we have a shift count larger than the
wordsize.

* RE: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-08-08 12:13           ` Richard Kenner
@ 2017-08-08 19:46             ` Michael Collison
  2017-08-08 19:52               ` Richard Kenner
  0 siblings, 1 reply; 32+ messages in thread
From: Michael Collison @ 2017-08-08 19:46 UTC (permalink / raw)
  To: Richard Kenner; +Cc: gcc-patches, nd, pinskia

This case is covered by Wilco's previous reply:

https://gcc.gnu.org/ml/gcc-patches/2017-08/msg00575.html

* RE: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-08-08 19:46             ` Michael Collison
@ 2017-08-08 19:52               ` Richard Kenner
  2017-08-08 19:59                 ` Michael Collison
  0 siblings, 1 reply; 32+ messages in thread
From: Richard Kenner @ 2017-08-08 19:52 UTC (permalink / raw)
  To: Michael.Collison; +Cc: gcc-patches, nd, pinskia

> This case is covered by Wilco's previous reply:
> 
> https://gcc.gnu.org/ml/gcc-patches/2017-08/msg00575.html

Which I don't understand:

> No it's perfectly safe - it becomes an integer-only shift after the
> split since it keeps the masking as part of the pattern.

Let's say we have your first example:

long f1(long x, int i)
{
  return x >> (64 - i);
}

If "i" is -2, this should be a shift of 66 (which is indeed, technically
undefined), but becomes a shift of 62.  What am I missing?
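
For reference, under count-truncating semantics the two counts agree
even here (a standalone C check, not part of the patch):

#include <assert.h>

int
main (void)
{
  /* For i == -2 the source count is 64 - (-2) == 66; a shifter that
     truncates to the low 6 bits sees 66 & 63 == 2, and the patched
     sequence computes (-i) & 63 == 2 as well.  The same holds for
     every i, because 64 - i and -i are congruent modulo 64.  */
  for (int i = -64; i <= 64; i++)
    assert (((64u - i) & 63) == ((0u - i) & 63));
  return 0;
}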

* RE: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-08-08 19:52               ` Richard Kenner
@ 2017-08-08 19:59                 ` Michael Collison
  2017-08-08 20:04                   ` Richard Kenner
  0 siblings, 1 reply; 32+ messages in thread
From: Michael Collison @ 2017-08-08 19:59 UTC (permalink / raw)
  To: Richard Kenner; +Cc: gcc-patches, nd, pinskia

Because for integer shift instructions the shift count is truncated. We ensure that we only use integer shift instructions by emitting a shift with a mask; that pattern only matches the integer shift instructions in the md file.
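
Stated in C, the rewrite the split performs looks like this (an
illustration of the idea, not code from the patch):

long long
shift_orig (long long x, int i)
{
  return x >> (64 - i);
}

long long
shift_split (long long x, int i)
{
  /* neg, then shift; the AND with 63 models the truncation that the
     integer (non-SIMD) shift instruction applies to its count.  */
  return x >> ((0u - i) & 63);
}

For 1 <= i <= 63 the two functions agree, since (0u - i) & 63 == 64 - i
in that range, and the explicit mask in the emitted pattern is what
restricts matching to the truncating integer shift instructions.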

* RE: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-08-08 19:59                 ` Michael Collison
@ 2017-08-08 20:04                   ` Richard Kenner
  2017-08-08 20:18                     ` Michael Collison
  0 siblings, 1 reply; 32+ messages in thread
From: Richard Kenner @ 2017-08-08 20:04 UTC (permalink / raw)
  To: Michael.Collison; +Cc: gcc-patches, nd, pinskia

> Because for integer shift instructions the shift count is
> truncated. We ensure that we only use integer shift instructions by
> emitting a shift with a mask. This only matches integer shift
> instructions in the md file.

That's why I asked about SHIFT_COUNT_TRUNCATED.  So it's truncated for
some shifts, but not all?

* RE: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-08-08 20:04                   ` Richard Kenner
@ 2017-08-08 20:18                     ` Michael Collison
  2017-08-08 20:20                       ` Richard Kenner
  0 siblings, 1 reply; 32+ messages in thread
From: Michael Collison @ 2017-08-08 20:18 UTC (permalink / raw)
  To: Richard Kenner; +Cc: gcc-patches, nd, pinskia

Correct. It is truncated for integer shift instructions, but not for SIMD shift instructions. We generate a pattern in the split that only generates the integer shift instructions.

* RE: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-08-08 20:18                     ` Michael Collison
@ 2017-08-08 20:20                       ` Richard Kenner
  2017-08-08 20:33                         ` Michael Collison
  2017-08-14  8:58                         ` Richard Biener
  0 siblings, 2 replies; 32+ messages in thread
From: Richard Kenner @ 2017-08-08 20:20 UTC (permalink / raw)
  To: Michael.Collison; +Cc: gcc-patches, nd, pinskia

> Correct. It is truncated for integer shift, but not simd shift
> instructions. We generate a pattern in the split that only generates
> the integer shift instructions.

That's unfortunate, because it would be nice to do this in simplify_rtx,
since it's machine-independent, but that has to be conditioned on
SHIFT_COUNT_TRUNCATED, so you wouldn't get the benefit of it.

* RE: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-08-08 20:20                       ` Richard Kenner
@ 2017-08-08 20:33                         ` Michael Collison
  2017-08-14  8:58                         ` Richard Biener
  1 sibling, 0 replies; 32+ messages in thread
From: Michael Collison @ 2017-08-08 20:33 UTC (permalink / raw)
  To: Richard Kenner; +Cc: gcc-patches, nd, pinskia

Correct.

* Re: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-08-08 20:20                       ` Richard Kenner
  2017-08-08 20:33                         ` Michael Collison
@ 2017-08-14  8:58                         ` Richard Biener
  2017-08-15  7:28                           ` Michael Collison
  2017-08-21 19:11                           ` Richard Sandiford
  1 sibling, 2 replies; 32+ messages in thread
From: Richard Biener @ 2017-08-14  8:58 UTC (permalink / raw)
  To: Richard Kenner; +Cc: Michael.Collison, GCC Patches, nd, Andrew Pinski

On Tue, Aug 8, 2017 at 10:20 PM, Richard Kenner
<kenner@vlsi1.ultra.nyu.edu> wrote:
>> Correct. It is truncated for integer shift, but not simd shift
>> instructions. We generate a pattern in the split that only generates
>> the integer shift instructions.
>
> That's unfortunate, because it would be nice to do this in simplify_rtx,
> since it's machine-independent, but that has to be conditioned on
> SHIFT_COUNT_TRUNCATED, so you wouldn't get the benefit of it.

SHIFT_COUNT_TRUNCATED should go ... you should express this in
the patterns, like for example with

(define_insn ashlSI3
  [(set (match_operand 0 "")
         (ashl:SI (match_operand ... )
                     (subreg:QI (match_operand:SI ...)))]

or an explicit and:SI and combine / simplify_rtx should apply the magic
optimization we expect.

Richard.

* RE: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-08-14  8:58                         ` Richard Biener
@ 2017-08-15  7:28                           ` Michael Collison
  2017-08-21 19:11                           ` Richard Sandiford
  1 sibling, 0 replies; 32+ messages in thread
From: Michael Collison @ 2017-08-15  7:28 UTC (permalink / raw)
  To: Richard Biener, Richard Kenner; +Cc: GCC Patches, nd, Andrew Pinski

This is exactly the approach that was taken with this patch. An earlier patch actually contains the patterns that match the truncation:

https://gcc.gnu.org/ml/gcc-patches/2017-06/msg01095.html

* Re: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-08-14  8:58                         ` Richard Biener
  2017-08-15  7:28                           ` Michael Collison
@ 2017-08-21 19:11                           ` Richard Sandiford
  2017-08-21 19:14                             ` Richard Biener
  1 sibling, 1 reply; 32+ messages in thread
From: Richard Sandiford @ 2017-08-21 19:11 UTC (permalink / raw)
  To: Richard Biener
  Cc: Richard Kenner, Michael.Collison, GCC Patches, nd, Andrew Pinski

Richard Biener <richard.guenther@gmail.com> writes:
> On Tue, Aug 8, 2017 at 10:20 PM, Richard Kenner
> <kenner@vlsi1.ultra.nyu.edu> wrote:
>>> Correct. It is truncated for integer shift, but not simd shift
>>> instructions. We generate a pattern in the split that only generates
>>> the integer shift instructions.
>>
>> That's unfortunate, because it would be nice to do this in simplify_rtx,
>> since it's machine-independent, but that has to be conditioned on
>> SHIFT_COUNT_TRUNCATED, so you wouldn't get the benefit of it.
>
> SHIFT_COUNT_TRUNCATED should go ... you should express this in
> the patterns, like for example with
>
> (define_insn ashlSI3
>   [(set (match_operand 0 "")
>          (ashl:SI (match_operand ... )
>                      (subreg:QI (match_operand:SI ...)))]
>
> or an explicit and:SI and combine / simplify_rtx should apply the magic
> optimization we expect.

The problem with the explicit AND is that you'd end up with either
an AND of two constants for constant shifts, or with two separate patterns,
one for constant shifts and one for variable shifts.  (And the problem in
theory with two patterns is that it reduces the RA's freedom, although in
practice I guess we'd always want a constant shift where possible for
cost reasons, and so the RA would never need to replace pseudos with
constants itself.)

I think all useful instances of this optimisation will be exposed by
the gimple optimisers, so maybe expand could do it based on
TARGET_SHIFT_TRUNCATION_MASK?  That describes the optab rather than
the rtx code and it does take the mode into account.

Thanks,
Richard

* Re: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-08-21 19:11                           ` Richard Sandiford
@ 2017-08-21 19:14                             ` Richard Biener
  2017-08-22  8:17                               ` Richard Sandiford
  0 siblings, 1 reply; 32+ messages in thread
From: Richard Biener @ 2017-08-21 19:14 UTC (permalink / raw)
  To: Richard Sandiford
  Cc: Richard Kenner, Michael.Collison, GCC Patches, nd, Andrew Pinski

On August 21, 2017 7:46:09 PM GMT+02:00, Richard Sandiford <richard.sandiford@linaro.org> wrote:
>Richard Biener <richard.guenther@gmail.com> writes:
>> On Tue, Aug 8, 2017 at 10:20 PM, Richard Kenner
>> <kenner@vlsi1.ultra.nyu.edu> wrote:
>>>> Correct. It is truncated for integer shift, but not simd shift
>>>> instructions. We generate a pattern in the split that only
>generates
>>>> the integer shift instructions.
>>>
>>> That's unfortunate, because it would be nice to do this in
>simplify_rtx,
>>> since it's machine-independent, but that has to be conditioned on
>>> SHIFT_COUNT_TRUNCATED, so you wouldn't get the benefit of it.
>>
>> SHIFT_COUNT_TRUNCATED should go ... you should express this in
>> the patterns, like for example with
>>
>> (define_insn ashlSI3
>>   [(set (match_operand 0 "")
>>          (ashl:SI (match_operand ... )
>>                      (subreg:QI (match_operand:SI ...)))]
>>
>> or an explicit and:SI and combine / simplify_rtx should apply the
>magic
>> optimization we expect.
>
>The problem with the explicit AND is that you'd end up with either
>an AND of two constants for constant shifts, or with two separate
>patterns,
>one for constant shifts and one for variable shifts.  (And the problem
>in
>theory with two patterns is that it reduces the RA's freedom, although
>in
>practice I guess we'd always want a constant shift where possible for
>cost reasons, and so the RA would never need to replace pseudos with
>constants itself.)
>
>I think all useful instances of this optimisation will be exposed by
>the gimple optimisers, so maybe expand could do it based on
>TARGET_SHIFT_TRUNCATION_MASK?  That describes the optab rather than
>the rtx code and it does take the mode into account.

Sure, that could work as well and also take into account range info. But we'd then need named expanders and the result would still have the explicit AND, or need to be an unspec or a different RTL operation.

Richard. 

>Thanks,
>Richard

* Re: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-08-21 19:14                             ` Richard Biener
@ 2017-08-22  8:17                               ` Richard Sandiford
  2017-08-22  8:46                                 ` Richard Biener
  0 siblings, 1 reply; 32+ messages in thread
From: Richard Sandiford @ 2017-08-22  8:17 UTC (permalink / raw)
  To: Richard Biener
  Cc: Richard Kenner, Michael.Collison, GCC Patches, nd, Andrew Pinski

Richard Biener <richard.guenther@gmail.com> writes:
> On August 21, 2017 7:46:09 PM GMT+02:00, Richard Sandiford
> <richard.sandiford@linaro.org> wrote:
>>Richard Biener <richard.guenther@gmail.com> writes:
>>> On Tue, Aug 8, 2017 at 10:20 PM, Richard Kenner
>>> <kenner@vlsi1.ultra.nyu.edu> wrote:
>>>>> Correct. It is truncated for integer shift, but not simd shift
>>>>> instructions. We generate a pattern in the split that only
>>generates
>>>>> the integer shift instructions.
>>>>
>>>> That's unfortunate, because it would be nice to do this in
>>simplify_rtx,
>>>> since it's machine-independent, but that has to be conditioned on
>>>> SHIFT_COUNT_TRUNCATED, so you wouldn't get the benefit of it.
>>>
>>> SHIFT_COUNT_TRUNCATED should go ... you should express this in
>>> the patterns, like for example with
>>>
>>> (define_insn ashlSI3
>>>   [(set (match_operand 0 "")
>>>          (ashl:SI (match_operand ... )
>>>                      (subreg:QI (match_operand:SI ...)))]
>>>
>>> or an explicit and:SI and combine / simplify_rtx should apply the
>>magic
>>> optimization we expect.
>>
>>The problem with the explicit AND is that you'd end up with either
>>an AND of two constants for constant shifts, or with two separate
>>patterns,
>>one for constant shifts and one for variable shifts.  (And the problem
>>in
>>theory with two patterns is that it reduces the RA's freedom, although
>>in
>>practice I guess we'd always want a constant shift where possible for
>>cost reasons, and so the RA would never need to replace pseudos with
>>constants itself.)
>>
>>I think all useful instances of this optimisation will be exposed by
>>the gimple optimisers, so maybe expand could do it based on
>>TARGET_SHIFT_TRUNCATION_MASK?  That describes the optab rather than
>>the rtx code and it does take the mode into account.
>
> Sure, that could work as well and also take into account range info. But
> we'd then need named expanders and the result would still have the
>> explicit AND, or need to be an unspec or a different RTL operation.

Without SHIFT_COUNT_TRUNCATED, out-of-range rtl shifts have
target-dependent rather than undefined behaviour, so it's OK
for a target to use shift codes with out-of-range values.  And
TARGET_SHIFT_TRUNCATION_MASK is a guarantee from the target about
how the normal shift optabs behave, so I don't think we'd need new
optabs or new unspecs.

E.g. it already works this way when expanding double-word shifts,
which IIRC is why TARGET_SHIFT_TRUNCATION_MASK was added.  There it's
possible to use a shorter sequence if you know that the shift optab
truncates the count, so we can do that even if SHIFT_COUNT_TRUNCATED
isn't defined.
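
For instance (assuming 32-bit words and a word shifter known to
truncate its count mod 32), the superword half of a double-word shift
needs no explicit subtract; a standalone C check of the identity:

#include <assert.h>

int
main (void)
{
  /* For 32 <= n <= 63 the superword half wants hi >> (n - 32), but a
     truncating 32-bit shifter computes the same thing from n itself,
     since (n - 32) & 31 == n & 31, saving the subtract.  */
  for (unsigned n = 32; n < 64; n++)
    assert (((n - 32) & 31) == (n & 31));
  return 0;
}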

Thanks,
Richard

>
> Richard. 
>
>>Thanks,
>>Richard

* Re: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-08-22  8:17                               ` Richard Sandiford
@ 2017-08-22  8:46                                 ` Richard Biener
  2017-08-22  8:50                                   ` Richard Sandiford
  0 siblings, 1 reply; 32+ messages in thread
From: Richard Biener @ 2017-08-22  8:46 UTC (permalink / raw)
  To: Richard Biener, Richard Kenner, Michael.Collison, GCC Patches,
	nd, Andrew Pinski, Richard Sandiford

On Tue, Aug 22, 2017 at 9:29 AM, Richard Sandiford
<richard.sandiford@linaro.org> wrote:
> Richard Biener <richard.guenther@gmail.com> writes:
>> On August 21, 2017 7:46:09 PM GMT+02:00, Richard Sandiford
>> <richard.sandiford@linaro.org> wrote:
>>>Richard Biener <richard.guenther@gmail.com> writes:
>>>> On Tue, Aug 8, 2017 at 10:20 PM, Richard Kenner
>>>> <kenner@vlsi1.ultra.nyu.edu> wrote:
>>>>>> Correct. It is truncated for integer shift, but not simd shift
>>>>>> instructions. We generate a pattern in the split that only
>>>generates
>>>>>> the integer shift instructions.
>>>>>
>>>>> That's unfortunate, because it would be nice to do this in
>>>simplify_rtx,
>>>>> since it's machine-independent, but that has to be conditioned on
>>>>> SHIFT_COUNT_TRUNCATED, so you wouldn't get the benefit of it.
>>>>
>>>> SHIFT_COUNT_TRUNCATED should go ... you should express this in
>>>> the patterns, like for example with
>>>>
>>>> (define_insn ashlSI3
>>>>   [(set (match_operand 0 "")
>>>>          (ashl:SI (match_operand ... )
>>>>                      (subreg:QI (match_operand:SI ...)))]
>>>>
>>>> or an explicit and:SI and combine / simplify_rtx should apply the
>>>magic
>>>> optimization we expect.
>>>
>>>The problem with the explicit AND is that you'd end up with either
>>>an AND of two constants for constant shifts, or with two separate
>>>patterns,
>>>one for constant shifts and one for variable shifts.  (And the problem
>>>in
>>>theory with two patterns is that it reduces the RA's freedom, although
>>>in
>>>practice I guess we'd always want a constant shift where possible for
>>>cost reasons, and so the RA would never need to replace pseudos with
>>>constants itself.)
>>>
>>>I think all useful instances of this optimisation will be exposed by
>>>the gimple optimisers, so maybe expand could do it based on
>>>TARGET_SHIFT_TRUNCATION_MASK?  That describes the optab rather than
>>>the rtx code and it does take the mode into account.
>>
>> Sure, that could work as well and also take into account range info. But
>> we'd then need named expanders and the result would still have the
>>> explicit AND, or need to be an unspec or a different RTL operation.
>
> Without SHIFT_COUNT_TRUNCATED, out-of-range rtl shifts have
> target-dependent rather than undefined behaviour, so it's OK
> for a target to use shift codes with out-of-range values.

Hmm, but that means simplify-rtx can't do anything with them because
we need to preserve target dependent behavior.  I think the RTL IL should
be always well-defined and its semantics shouldn't have any target
dependences (ideally, and if, then they should be well specified via
extra target hooks/macros).

>  And
> TARGET_SHIFT_TRUNCATION_MASK is a guarantee from the target about
> how the normal shift optabs behave, so I don't think we'd need new
> optabs or new unspecs.
>
> E.g. it already works this way when expanding double-word shifts,
> which IIRC is why TARGET_SHIFT_TRUNCATION_MASK was added.  There it's
> possible to use a shorter sequence if you know that the shift optab
> truncates the count, so we can do that even if SHIFT_COUNT_TRUNCATED
> isn't defined.

I'm somewhat confused by docs saying TARGET_SHIFT_TRUNCATION_MASK
applies to the instructions generated by the named shift patterns but _not_
general shift RTXen.  But the generated pattern contains shift RTXen and how
can we figure whether they were generated by the named expanders or
by other means?  Don't define_expand also serve as define_insn for things
like combine?

That said, from looking at the docs and your description above it seems that
semantics are not fully reflected in the RTL instruction stream?

Richard.

> Thanks,
> Richard
>
>>
>> Richard.
>>
>>>Thanks,
>>>Richard

* Re: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-08-22  8:46                                 ` Richard Biener
@ 2017-08-22  8:50                                   ` Richard Sandiford
  2017-09-06  9:11                                     ` Michael Collison
  0 siblings, 1 reply; 32+ messages in thread
From: Richard Sandiford @ 2017-08-22  8:50 UTC (permalink / raw)
  To: Richard Biener
  Cc: Richard Kenner, Michael.Collison, GCC Patches, nd, Andrew Pinski

Richard Biener <richard.guenther@gmail.com> writes:
> On Tue, Aug 22, 2017 at 9:29 AM, Richard Sandiford
> <richard.sandiford@linaro.org> wrote:
>> Richard Biener <richard.guenther@gmail.com> writes:
>>> On August 21, 2017 7:46:09 PM GMT+02:00, Richard Sandiford
>>> <richard.sandiford@linaro.org> wrote:
>>>>Richard Biener <richard.guenther@gmail.com> writes:
>>>>> On Tue, Aug 8, 2017 at 10:20 PM, Richard Kenner
>>>>> <kenner@vlsi1.ultra.nyu.edu> wrote:
>>>>>>> Correct. It is truncated for integer shift, but not simd shift
>>>>>>> instructions. We generate a pattern in the split that only
>>>>generates
>>>>>>> the integer shift instructions.
>>>>>>
>>>>>> That's unfortunate, because it would be nice to do this in
>>>>simplify_rtx,
>>>>>> since it's machine-independent, but that has to be conditioned on
>>>>>> SHIFT_COUNT_TRUNCATED, so you wouldn't get the benefit of it.
>>>>>
>>>>> SHIFT_COUNT_TRUNCATED should go ... you should express this in
>>>>> the patterns, like for example with
>>>>>
>>>>> (define_insn ashlSI3
>>>>>   [(set (match_operand 0 "")
>>>>>          (ashl:SI (match_operand ... )
>>>>>                      (subreg:QI (match_operand:SI ...)))]
>>>>>
>>>>> or an explicit and:SI and combine / simplify_rtx should apply the
>>>>magic
>>>>> optimization we expect.
>>>>
>>>>The problem with the explicit AND is that you'd end up with either
>>>>an AND of two constants for constant shifts, or with two separate
>>>>patterns,
>>>>one for constant shifts and one for variable shifts.  (And the problem
>>>>in
>>>>theory with two patterns is that it reduces the RA's freedom, although
>>>>in
>>>>practice I guess we'd always want a constant shift where possible for
>>>>cost reasons, and so the RA would never need to replace pseudos with
>>>>constants itself.)
>>>>
>>>>I think all useful instances of this optimisation will be exposed by
>>>>the gimple optimisers, so maybe expand could do it based on
>>>>TARGET_SHIFT_TRUNCATION_MASK?  That describes the optab rather than
>>>>the rtx code and it does take the mode into account.
>>>
>>> Sure, that could work as well and also take into account range info. But
>>> we'd then need named expanders and the result would still have the
>>> explicit AND, or need to be an unspec or a different RTL operation.
>>
>> Without SHIFT_COUNT_TRUNCATED, out-of-range rtl shifts have
>> target-dependent rather than undefined behaviour, so it's OK
>> for a target to use shift codes with out-of-range values.
>
> Hmm, but that means simplify-rtx can't do anything with them because
> we need to preserve target dependent behavior.

Yeah, it needs to punt.  In practice that shouldn't matter much.

> I think the RTL IL should be always well-defined and its semantics
> shouldn't have any target dependences (ideally, and if, then they
> should be well specified via extra target hooks/macros).

That would be nice :-)  I think the problem has traditionally been
that shifts can be used in quite a few define_insn patterns besides
those for shift instructions.  So if your target defines shifts to have
256-bit precision (say) then you need to make sure that every define_insn
with a shift rtx will honour that.

It's more natural for target guarantees to apply to instructions than
to rtx codes.

>>  And
>> TARGET_SHIFT_TRUNCATION_MASK is a guarantee from the target about
>> how the normal shift optabs behave, so I don't think we'd need new
>> optabs or new unspecs.
>>
>> E.g. it already works this way when expanding double-word shifts,
>> which IIRC is why TARGET_SHIFT_TRUNCATION_MASK was added.  There it's
>> possible to use a shorter sequence if you know that the shift optab
>> truncates the count, so we can do that even if SHIFT_COUNT_TRUNCATED
>> isn't defined.
>
> I'm somewhat confused by docs saying TARGET_SHIFT_TRUNCATION_MASK
> applies to the instructions generated by the named shift patterns but _not_
> general shift RTXen.  But the generated pattern contains shift RTXen and how
> can we figure whether they were generated by the named expanders or
> by other means?  Don't define_expand also serve as define_insn for things
> like combine?

Yeah, you can't (and aren't supposed to try to) reverse-engineer the
expander from the generated instructions.  TARGET_SHIFT_TRUNCATION_MASK
should only be used if you're expanding a shift optab.

> That said, from looking at the docs and your description above it seems that
> semantics are not fully reflected in the RTL instruction stream?

Yeah, the semantics go from being well-defined in the optab interface
to being target-dependent in the rtl stream.

Thanks,
Richard

>
> Richard.
>
>> Thanks,
>> Richard
>>
>>>
>>> Richard.
>>>
>>>>Thanks,
>>>>Richard

* RE: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-08-22  8:50                                   ` Richard Sandiford
@ 2017-09-06  9:11                                     ` Michael Collison
  2017-09-06 10:32                                       ` Richard Sandiford
  0 siblings, 1 reply; 32+ messages in thread
From: Michael Collison @ 2017-09-06  9:11 UTC (permalink / raw)
  To: Richard Sandiford, Richard Biener
  Cc: Richard Kenner, GCC Patches, nd, Andrew Pinski

Richard Sandiford, do you have any objections to the patch as it stands? It doesn't appear as if anything is going to change in the mid-end anytime soon.

* Re: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-09-06  9:11                                     ` Michael Collison
@ 2017-09-06 10:32                                       ` Richard Sandiford
  2017-09-06 15:41                                         ` Michael Collison
  0 siblings, 1 reply; 32+ messages in thread
From: Richard Sandiford @ 2017-09-06 10:32 UTC (permalink / raw)
  To: Michael Collison
  Cc: Richard Biener, Richard Kenner, GCC Patches, nd, Andrew Pinski

Michael Collison <Michael.Collison@arm.com> writes:
> Richard Sandiford, do you have any objections to the patch as it stands?
> It doesn't appear as if anything is going to change in the mid-end
> anytime soon.

I think one of the suggestions was to do it in expand, taking advantage
of range info and TARGET_SHIFT_TRUNCATION_MASK.  This would be like
the current FMA_EXPR handling in expand_expr_real_2.

I know there was talk about cleaner approaches, but at least doing
the above seems cleaner than doing it in the backend.  It should also
be a nicely-contained piece of work.

Thanks,
Richard

* RE: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-09-06 10:32                                       ` Richard Sandiford
@ 2017-09-06 15:41                                         ` Michael Collison
  2017-09-06 16:53                                           ` Richard Sandiford
  0 siblings, 1 reply; 32+ messages in thread
From: Michael Collison @ 2017-09-06 15:41 UTC (permalink / raw)
  To: Richard Sandiford
  Cc: Richard Biener, Richard Kenner, GCC Patches, nd, Andrew Pinski

Richard,

The problem with this approach for AArch64 is that TARGET_SHIFT_TRUNCATION_MASK is based on SHIFT_COUNT_TRUNCATED, which is normally 0 as it is based on the TARGET_SIMD flag.

-----Original Message-----
From: Richard Sandiford [mailto:richard.sandiford@linaro.org] 
Sent: Wednesday, September 6, 2017 11:32 AM
To: Michael Collison <Michael.Collison@arm.com>
Cc: Richard Biener <richard.guenther@gmail.com>; Richard Kenner <kenner@vlsi1.ultra.nyu.edu>; GCC Patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>; Andrew Pinski <pinskia@gmail.com>
Subject: Re: [PATCH] [Aarch64] Optimize subtract in shift counts

Michael Collison <Michael.Collison@arm.com> writes:
> Richard Sandiford do you have any objections to the patch as it stands?
> It doesn't appear as if anything is going to change in the mid-end 
> anytime soon.

I think one of the suggestions was to do it in expand, taking advantage of range info and TARGET_SHIFT_TRUNCATION_MASK.  This would be like the current FMA_EXPR handling in expand_expr_real_2.

I know there was talk about cleaner approaches, but at least doing the above seems cleaner than doing in the backend.  It should also be a nicely-contained piece of work.

Thanks,
Richard

> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@linaro.org]
> Sent: Tuesday, August 22, 2017 9:11 AM
> To: Richard Biener <richard.guenther@gmail.com>
> Cc: Richard Kenner <kenner@vlsi1.ultra.nyu.edu>; Michael Collison 
> <Michael.Collison@arm.com>; GCC Patches <gcc-patches@gcc.gnu.org>; nd 
> <nd@arm.com>; Andrew Pinski <pinskia@gmail.com>
> Subject: Re: [PATCH] [Aarch64] Optimize subtract in shift counts
>
> Richard Biener <richard.guenther@gmail.com> writes:
>> On Tue, Aug 22, 2017 at 9:29 AM, Richard Sandiford 
>> <richard.sandiford@linaro.org> wrote:
>>> Richard Biener <richard.guenther@gmail.com> writes:
>>>> On August 21, 2017 7:46:09 PM GMT+02:00, Richard Sandiford 
>>>> <richard.sandiford@linaro.org> wrote:
>>>>>Richard Biener <richard.guenther@gmail.com> writes:
>>>>>> On Tue, Aug 8, 2017 at 10:20 PM, Richard Kenner 
>>>>>> <kenner@vlsi1.ultra.nyu.edu> wrote:
>>>>>>>> Correct. It is truncated for integer shift, but not simd shift 
>>>>>>>> instructions. We generate a pattern in the split that only
>>>>>generates
>>>>>>>> the integer shift instructions.
>>>>>>>
>>>>>>> That's unfortunate, because it would be nice to do this in
>>>>>simplify_rtx,
>>>>>>> since it's machine-independent, but that has to be conditioned 
>>>>>>> on SHIFT_COUNT_TRUNCATED, so you wouldn't get the benefit of it.
>>>>>>
>>>>>> SHIFT_COUNT_TRUNCATED should go ... you should express this in 
>>>>>> the patterns, like for example with
>>>>>>
>>>>>> (define_insn ashlSI3
>>>>>>   [(set (match_operand 0 "")
>>>>>>          (ashl:SI (match_operand ... )
>>>>>>                      (subreg:QI (match_operand:SI ...)))]
>>>>>>
>>>>>> or an explicit and:SI and combine / simplify_rtx should apply the
>>>>>magic
>>>>>> optimization we expect.
>>>>>
>>>>>The problem with the explicit AND is that you'd end up with either 
>>>>>an AND of two constants for constant shifts, or with two separate 
>>>>>patterns, one for constant shifts and one for variable shifts.  
>>>>>(And the problem in theory with two patterns is that it reduces the 
>>>>>RA's freedom, although in practice I guess we'd always want a 
>>>>>constant shift where possible for cost reasons, and so the RA would 
>>>>>never need to replace pseudos with constants itself.)
>>>>>
>>>>>I think all useful instances of this optimisation will be exposed 
>>>>>by the gimple optimisers, so maybe expand could to do it based on 
>>>>>TARGET_SHIFT_TRUNCATION_MASK?  That describes the optab rather than 
>>>>>the rtx code and it does take the mode into account.
>>>>
>>>> Sure, that could work as well and also take into account range info. 
>>>> But we'd then need named expanders and the result would still have 
>>>> the explicit and or need to be an unspec or a different RTL operation.
>>>
>>> Without SHIFT_COUNT_TRUNCATED, out-of-range rtl shifts have 
>>> target-dependent rather than undefined behaviour, so it's OK for a 
>>> target to use shift codes with out-of-range values.
>>
>> Hmm, but that means simplify-rtx can't do anything with them because 
>> we need to preserve target dependent behavior.
>
> Yeah, it needs to punt.  In practice that shouldn't matter much.
>
>> I think the RTL IL should be always well-defined and its semantics 
>> shouldn't have any target dependences (ideally, and if, then they 
>> should be well specified via extra target hooks/macros).
>
> That would be nice :-) I think the problem has traditionally been that
> shifts can be used in quite a few define_insn patterns besides those
> for shift instructions.  So if your target defines shifts to have
> 256-bit precision (say) then you need to make sure that every
> define_insn with a shift rtx will honour that.
>
> It's more natural for target guarantees to apply to instructions than 
> to rtx codes.
>
>>>  And
>>> TARGET_SHIFT_TRUNCATION_MASK is a guarantee from the target about 
>>> how the normal shift optabs behave, so I don't think we'd need new 
>>> optabs or new unspecs.
>>>
>>> E.g. it already works this way when expanding double-word shifts, 
>>> which IIRC is why TARGET_SHIFT_TRUNCATION_MASK was added.  There 
>>> it's possible to use a shorter sequence if you know that the shift 
>>> optab truncates the count, so we can do that even if 
>>> SHIFT_COUNT_TRUNCATED isn't defined.
>>
>> I'm somewhat confused by docs saying TARGET_SHIFT_TRUNCATION_MASK 
>> applies to the instructions generated by the named shift patterns but 
>> _not_ general shift RTXen.  But the generated pattern contains shift 
>> RTXen and how can we figure whether they were generated by the named 
>> expanders or by other means?  Don't define_expand also serve as 
>> define_insn for things like combine?
>
> Yeah, you can't (and aren't supposed to try to) reverse-engineer the
> expander from the generated instructions.
> TARGET_SHIFT_TRUNCATION_MASK should only be used if you're expanding
> a shift optab.
>
>> That said, from looking at the docs and your description above it 
>> seems that semantics are not fully reflected in the RTL instruction stream?
>
> Yeah, the semantics go from being well-defined in the optab interface to being target-dependent in the rtl stream.
>
> Thanks,
> Richard
>
>>
>> Richard.
>>
>>> Thanks,
>>> Richard
>>>
>>>>
>>>> Richard.
>>>>
>>>>>Thanks,
>>>>>Richard


* Re: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-09-06 15:41                                         ` Michael Collison
@ 2017-09-06 16:53                                           ` Richard Sandiford
  2017-09-06 16:56                                             ` Richard Sandiford
  2017-09-06 17:21                                             ` Richard Sandiford
  0 siblings, 2 replies; 32+ messages in thread
From: Richard Sandiford @ 2017-09-06 16:53 UTC (permalink / raw)
  To: Michael Collison
  Cc: Richard Biener, Richard Kenner, GCC Patches, nd, Andrew Pinski

Michael Collison <Michael.Collison@arm.com> writes:
> Richard,
>
> The problem with this approach for Aarch64 is that
> TARGET_SHIFT_TRUNCATION_MASK is based on SHIFT_COUNT_TRUNCATED which is
> normally 0, as it is based on the TARGET_SIMD flag.

Maybe I'm wrong, but that seems like a missed optimisation in itself.
Like you say, the definition is:

  static unsigned HOST_WIDE_INT
  aarch64_shift_truncation_mask (machine_mode mode)
  {
    return
      (!SHIFT_COUNT_TRUNCATED
       || aarch64_vector_mode_supported_p (mode)
       || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
  }

SHIFT_COUNT_TRUNCATED is:

  #define SHIFT_COUNT_TRUNCATED (!TARGET_SIMD)

and aarch64_vector_mode_supported_p always returns false for
!TARGET_SIMD:

  static bool
  aarch64_vector_mode_supported_p (machine_mode mode)
  {
    if (TARGET_SIMD
        && (mode == V4SImode  || mode == V8HImode
            || mode == V16QImode || mode == V2DImode
            || mode == V2SImode  || mode == V4HImode
            || mode == V8QImode || mode == V2SFmode
            || mode == V4SFmode || mode == V2DFmode
            || mode == V4HFmode || mode == V8HFmode
            || mode == V1DFmode))
      return true;

    return false;
  }

So when does the second || condition fire?

I'm surprised the aarch64_vect_struct_mode_p part is needed, since
this hook describes the shift optabs, and AArch64 doesn't provide any
shift optabs for OI, CI or XI.
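
If that's right, presumably the hook could be reduced to something like
the following (just a sketch to make the point, on the assumption that
the vector-mode checks really are dead):

  static unsigned HOST_WIDE_INT
  aarch64_shift_truncation_mask (machine_mode mode)
  {
    /* Nonzero only when the integer shift instructions are the only
       option, i.e. for !TARGET_SIMD.  */
    return SHIFT_COUNT_TRUNCATED ? GET_MODE_BITSIZE (mode) - 1 : 0;
  }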

Thanks,
Richard

> -----Original Message-----
> From: Richard Sandiford [mailto:richard.sandiford@linaro.org] 
> Sent: Wednesday, September 6, 2017 11:32 AM
> To: Michael Collison <Michael.Collison@arm.com>
> Cc: Richard Biener <richard.guenther@gmail.com>; Richard Kenner
> <kenner@vlsi1.ultra.nyu.edu>; GCC Patches <gcc-patches@gcc.gnu.org>; nd
> <nd@arm.com>; Andrew Pinski <pinskia@gmail.com>
> Subject: Re: [PATCH] [Aarch64] Optimize subtract in shift counts
>
> Michael Collison <Michael.Collison@arm.com> writes:
>> Richard Sandiford do you have any objections to the patch as it stands?
>> It doesn't appear as if anything is going to change in the mid-end 
>> anytime soon.
>
> I think one of the suggestions was to do it in expand, taking advantage of range info and TARGET_SHIFT_TRUNCATION_MASK.  This would be like the current FMA_EXPR handling in expand_expr_real_2.
>
> I know there was talk about cleaner approaches, but at least doing the above seems cleaner than doing it in the backend.  It should also be a nicely-contained piece of work.
>
> Thanks,
> Richard
>
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@linaro.org]
>> Sent: Tuesday, August 22, 2017 9:11 AM
>> To: Richard Biener <richard.guenther@gmail.com>
>> Cc: Richard Kenner <kenner@vlsi1.ultra.nyu.edu>; Michael Collison 
>> <Michael.Collison@arm.com>; GCC Patches <gcc-patches@gcc.gnu.org>; nd 
>> <nd@arm.com>; Andrew Pinski <pinskia@gmail.com>
>> Subject: Re: [PATCH] [Aarch64] Optimize subtract in shift counts
>>
>> Richard Biener <richard.guenther@gmail.com> writes:
>>> On Tue, Aug 22, 2017 at 9:29 AM, Richard Sandiford 
>>> <richard.sandiford@linaro.org> wrote:
>>>> Richard Biener <richard.guenther@gmail.com> writes:
>>>>> On August 21, 2017 7:46:09 PM GMT+02:00, Richard Sandiford 
>>>>> <richard.sandiford@linaro.org> wrote:
>>>>>>Richard Biener <richard.guenther@gmail.com> writes:
>>>>>>> On Tue, Aug 8, 2017 at 10:20 PM, Richard Kenner 
>>>>>>> <kenner@vlsi1.ultra.nyu.edu> wrote:
>>>>>>>>> Correct. It is truncated for integer shift, but not simd shift 
>>>>>>>>> instructions. We generate a pattern in the split that only
>>>>>>generates
>>>>>>>>> the integer shift instructions.
>>>>>>>>
>>>>>>>> That's unfortunate, because it would be nice to do this in
>>>>>>simplify_rtx,
>>>>>>>> since it's machine-independent, but that has to be conditioned 
>>>>>>>> on SHIFT_COUNT_TRUNCATED, so you wouldn't get the benefit of it.
>>>>>>>
>>>>>>> SHIFT_COUNT_TRUNCATED should go ... you should express this in 
>>>>>>> the patterns, like for example with
>>>>>>>
>>>>>>> (define_insn ashlSI3
>>>>>>>   [(set (match_operand 0 "")
>>>>>>>          (ashl:SI (match_operand ... )
>>>>>>>                      (subreg:QI (match_operand:SI ...)))]
>>>>>>>
>>>>>>> or an explicit and:SI and combine / simplify_rtx should apply the
>>>>>>magic
>>>>>>> optimization we expect.
>>>>>>
>>>>>>The problem with the explicit AND is that you'd end up with either 
>>>>>>an AND of two constants for constant shifts, or with two separate 
>>>>>>patterns, one for constant shifts and one for variable shifts.  
>>>>>>(And the problem in theory with two patterns is that it reduces the 
>>>>>>RA's freedom, although in practice I guess we'd always want a 
>>>>>>constant shift where possible for cost reasons, and so the RA would 
>>>>>>never need to replace pseudos with constants itself.)
>>>>>>
>>>>>>I think all useful instances of this optimisation will be exposed 
>>>>>>by the gimple optimisers, so maybe expand could do it based on
>>>>>>TARGET_SHIFT_TRUNCATION_MASK?  That describes the optab rather than 
>>>>>>the rtx code and it does take the mode into account.
>>>>>
>>>>> Sure, that could work as well and also take into account range info. 
>>>>> But we'd then need named expanders and the result would still have 
>>>>> the explicit and or need to be an unspec or a different RTL operation.
>>>>
>>>> Without SHIFT_COUNT_TRUNCATED, out-of-range rtl shifts have 
>>>> target-dependent rather than undefined behaviour, so it's OK for a 
>>>> target to use shift codes with out-of-range values.
>>>
>>> Hmm, but that means simplify-rtx can't do anything with them because 
>>> we need to preserve target dependent behavior.
>>
>> Yeah, it needs to punt.  In practice that shouldn't matter much.
>>
>>> I think the RTL IL should be always well-defined and its semantics 
>>> shouldn't have any target dependences (ideally, and if, then they 
>>> should be well specified via extra target hooks/macros).
>>
>> That would be nice :-) I think the problem has traditionally been that
>> shifts can be used in quite a few define_insn patterns besides those
>> for shift instructions.  So if your target defines shifts to have
>> 256-bit precision (say) then you need to make sure that every
>> define_insn with a shift rtx will honour that.
>>
>> It's more natural for target guarantees to apply to instructions than 
>> to rtx codes.
>>
>>>>  And
>>>> TARGET_SHIFT_TRUNCATION_MASK is a guarantee from the target about 
>>>> how the normal shift optabs behave, so I don't think we'd need new 
>>>> optabs or new unspecs.
>>>>
>>>> E.g. it already works this way when expanding double-word shifts, 
>>>> which IIRC is why TARGET_SHIFT_TRUNCATION_MASK was added.  There 
>>>> it's possible to use a shorter sequence if you know that the shift 
>>>> optab truncates the count, so we can do that even if 
>>>> SHIFT_COUNT_TRUNCATED isn't defined.
>>>
>>> I'm somewhat confused by docs saying TARGET_SHIFT_TRUNCATION_MASK 
>>> applies to the instructions generated by the named shift patterns but 
>>> _not_ general shift RTXen.  But the generated pattern contains shift 
>>> RTXen and how can we figure whether they were generated by the named 
>>> expanders or by other means?  Don't define_expand also serve as 
>>> define_insn for things like combine?
>>
>> Yeah, you can't (and aren't supposed to try to) reverse-engineer the
>> expander from the generated instructions.
>> TARGET_SHIFT_TRUNCATION_MASK should only be used if you're expanding
>> a shift optab.
>>
>>> That said, from looking at the docs and your description above it 
>>> seems that semantics are not fully reflected in the RTL instruction stream?
>>
>> Yeah, the semantics go from being well-defined in the optab interface
>> to being target-dependent in the rtl stream.
>>
>> Thanks,
>> Richard
>>
>>>
>>> Richard.
>>>
>>>> Thanks,
>>>> Richard
>>>>
>>>>>
>>>>> Richard.
>>>>>
>>>>>>Thanks,
>>>>>>Richard


* Re: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-09-06 16:53                                           ` Richard Sandiford
@ 2017-09-06 16:56                                             ` Richard Sandiford
  2017-09-06 17:21                                             ` Richard Sandiford
  1 sibling, 0 replies; 32+ messages in thread
From: Richard Sandiford @ 2017-09-06 16:56 UTC (permalink / raw)
  To: Michael Collison
  Cc: Richard Biener, Richard Kenner, GCC Patches, nd, Andrew Pinski

Richard Sandiford <richard.sandiford@linaro.org> writes:
> Michael Collison <Michael.Collison@arm.com> writes:
>> Richard,
>>
>> The problem with this approach for Aarch64 is that
>> TARGET_SHIFT_TRUNCATION_MASK is based on SHIFT_COUNT_TRUNCATED which is
>> normally 0, as it is based on the TARGET_SIMD flag.
>
> Maybe I'm wrong, but that seems like a missed optimisation in itself.
> Like you say, the definition is:
>
>   static unsigned HOST_WIDE_INT
>   aarch64_shift_truncation_mask (machine_mode mode)
>   {
>     return
>       (!SHIFT_COUNT_TRUNCATED
>        || aarch64_vector_mode_supported_p (mode)
>        || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE
>        || (mode) - 1);
>   }

er,

aarch64_shift_truncation_mask (machine_mode mode)
{
  return
    (!SHIFT_COUNT_TRUNCATED
     || aarch64_vector_mode_supported_p (mode)
     || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
}

> SHIFT_COUNT_TRUNCATED is:
>
>   #define SHIFT_COUNT_TRUNCATED (!TARGET_SIMD)
>
> and aarch64_vector_mode_supported_p always returns false for
> !TARGET_SIMD:
>
>   static bool
>   aarch64_vector_mode_supported_p (machine_mode mode)
>   {
>     if (TARGET_SIMD
>         && (mode == V4SImode  || mode == V8HImode
>             || mode == V16QImode || mode == V2DImode
>             || mode == V2SImode  || mode == V4HImode
>             || mode == V8QImode || mode == V2SFmode
>             || mode == V4SFmode || mode == V2DFmode
>             || mode == V4HFmode || mode == V8HFmode
>             || mode == V1DFmode))
>       return true;
>
>     return false;
>   }
>
> So when does the second || condition fire?
>
> I'm surprised the aarch64_vect_struct_mode_p part is needed, since
> this hook describes the shift optabs, and AArch64 doesn't provide any
> shift optabs for OI, CI or XI.
>
> Thanks,
> Richard
>
>> -----Original Message-----
>> From: Richard Sandiford [mailto:richard.sandiford@linaro.org] 
>> Sent: Wednesday, September 6, 2017 11:32 AM
>> To: Michael Collison <Michael.Collison@arm.com>
>> Cc: Richard Biener <richard.guenther@gmail.com>; Richard Kenner
>> <kenner@vlsi1.ultra.nyu.edu>; GCC Patches <gcc-patches@gcc.gnu.org>; nd
>> <nd@arm.com>; Andrew Pinski <pinskia@gmail.com>
>> Subject: Re: [PATCH] [Aarch64] Optimize subtract in shift counts
>>
>> Michael Collison <Michael.Collison@arm.com> writes:
>>> Richard Sandiford do you have any objections to the patch as it stands?
>>> It doesn't appear as if anything is going to change in the mid-end 
>>> anytime soon.
>>
>> I think one of the suggestions was to do it in expand, taking
>> advantage of range info and TARGET_SHIFT_TRUNCATION_MASK.  This would
>> be like the current FMA_EXPR handling in expand_expr_real_2.
>>
>> I know there was talk about cleaner approaches, but at least doing the
>> above seems cleaner than doing it in the backend.  It should also be a
>> nicely-contained piece of work.
>>
>> Thanks,
>> Richard
>>
>>> -----Original Message-----
>>> From: Richard Sandiford [mailto:richard.sandiford@linaro.org]
>>> Sent: Tuesday, August 22, 2017 9:11 AM
>>> To: Richard Biener <richard.guenther@gmail.com>
>>> Cc: Richard Kenner <kenner@vlsi1.ultra.nyu.edu>; Michael Collison 
>>> <Michael.Collison@arm.com>; GCC Patches <gcc-patches@gcc.gnu.org>; nd 
>>> <nd@arm.com>; Andrew Pinski <pinskia@gmail.com>
>>> Subject: Re: [PATCH] [Aarch64] Optimize subtract in shift counts
>>>
>>> Richard Biener <richard.guenther@gmail.com> writes:
>>>> On Tue, Aug 22, 2017 at 9:29 AM, Richard Sandiford 
>>>> <richard.sandiford@linaro.org> wrote:
>>>>> Richard Biener <richard.guenther@gmail.com> writes:
>>>>>> On August 21, 2017 7:46:09 PM GMT+02:00, Richard Sandiford 
>>>>>> <richard.sandiford@linaro.org> wrote:
>>>>>>>Richard Biener <richard.guenther@gmail.com> writes:
>>>>>>>> On Tue, Aug 8, 2017 at 10:20 PM, Richard Kenner 
>>>>>>>> <kenner@vlsi1.ultra.nyu.edu> wrote:
>>>>>>>>>> Correct. It is truncated for integer shift, but not simd shift 
>>>>>>>>>> instructions. We generate a pattern in the split that only
>>>>>>>generates
>>>>>>>>>> the integer shift instructions.
>>>>>>>>>
>>>>>>>>> That's unfortunate, because it would be nice to do this in
>>>>>>>simplify_rtx,
>>>>>>>>> since it's machine-independent, but that has to be conditioned 
>>>>>>>>> on SHIFT_COUNT_TRUNCATED, so you wouldn't get the benefit of it.
>>>>>>>>
>>>>>>>> SHIFT_COUNT_TRUNCATED should go ... you should express this in 
>>>>>>>> the patterns, like for example with
>>>>>>>>
>>>>>>>> (define_insn ashlSI3
>>>>>>>>   [(set (match_operand 0 "")
>>>>>>>>          (ashl:SI (match_operand ... )
>>>>>>>>                      (subreg:QI (match_operand:SI ...)))]
>>>>>>>>
>>>>>>>> or an explicit and:SI and combine / simplify_rtx should apply the
>>>>>>>magic
>>>>>>>> optimization we expect.
>>>>>>>
>>>>>>>The problem with the explicit AND is that you'd end up with either 
>>>>>>>an AND of two constants for constant shifts, or with two separate 
>>>>>>>patterns, one for constant shifts and one for variable shifts.  
>>>>>>>(And the problem in theory with two patterns is that it reduces the 
>>>>>>>RA's freedom, although in practice I guess we'd always want a 
>>>>>>>constant shift where possible for cost reasons, and so the RA would 
>>>>>>>never need to replace pseudos with constants itself.)
>>>>>>>
>>>>>>>I think all useful instances of this optimisation will be exposed 
>>>>>>>by the gimple optimisers, so maybe expand could do it based on
>>>>>>>TARGET_SHIFT_TRUNCATION_MASK?  That describes the optab rather than 
>>>>>>>the rtx code and it does take the mode into account.
>>>>>>
>>>>>> Sure, that could work as well and also take into account range info. 
>>>>>> But we'd then need named expanders and the result would still have 
>>>>>> the explicit and or need to be an unspec or a different RTL operation.
>>>>>
>>>>> Without SHIFT_COUNT_TRUNCATED, out-of-range rtl shifts have 
>>>>> target-dependent rather than undefined behaviour, so it's OK for a 
>>>>> target to use shift codes with out-of-range values.
>>>>
>>>> Hmm, but that means simplify-rtx can't do anything with them because 
>>>> we need to preserve target dependent behavior.
>>>
>>> Yeah, it needs to punt.  In practice that shouldn't matter much.
>>>
>>>> I think the RTL IL should be always well-defined and its semantics 
>>>> shouldn't have any target dependences (ideally, and if, then they 
>>>> should be well specified via extra target hooks/macros).
>>>
>>> That would be nice :-) I think the problem has traditionally been that
>>> shifts can be used in quite a few define_insn patterns besides those
>>> for shift instructions.  So if your target defines shifts to have
>>> 256-bit precision (say) then you need to make sure that every
>>> define_insn with a shift rtx will honour that.
>>>
>>> It's more natural for target guarantees to apply to instructions than 
>>> to rtx codes.
>>>
>>>>>  And
>>>>> TARGET_SHIFT_TRUNCATION_MASK is a guarantee from the target about 
>>>>> how the normal shift optabs behave, so I don't think we'd need new 
>>>>> optabs or new unspecs.
>>>>>
>>>>> E.g. it already works this way when expanding double-word shifts, 
>>>>> which IIRC is why TARGET_SHIFT_TRUNCATION_MASK was added.  There 
>>>>> it's possible to use a shorter sequence if you know that the shift 
>>>>> optab truncates the count, so we can do that even if 
>>>>> SHIFT_COUNT_TRUNCATED isn't defined.
>>>>
>>>> I'm somewhat confused by docs saying TARGET_SHIFT_TRUNCATION_MASK 
>>>> applies to the instructions generated by the named shift patterns but 
>>>> _not_ general shift RTXen.  But the generated pattern contains shift 
>>>> RTXen and how can we figure whether they were generated by the named 
>>>> expanders or by other means?  Don't define_expand also serve as 
>>>> define_insn for things like combine?
>>>
>>> Yeah, you can't (and aren't supposed to try to) reverse-engineer the
>>> expander from the generated instructions.
>>> TARGET_SHIFT_TRUNCATION_MASK should only be used if you're expanding
>>> a shift optab.
>>>
>>>> That said, from looking at the docs and your description above it 
>>>> seems that semantics are not fully reflected in the RTL instruction stream?
>>>
>>> Yeah, the semantics go from being well-defined in the optab interface
>>> to being target-dependent in the rtl stream.
>>>
>>> Thanks,
>>> Richard
>>>
>>>>
>>>> Richard.
>>>>
>>>>> Thanks,
>>>>> Richard
>>>>>
>>>>>>
>>>>>> Richard.
>>>>>>
>>>>>>>Thanks,
>>>>>>>Richard


* Re: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-09-06 16:53                                           ` Richard Sandiford
  2017-09-06 16:56                                             ` Richard Sandiford
@ 2017-09-06 17:21                                             ` Richard Sandiford
  2017-09-15  8:15                                               ` Michael Collison
  1 sibling, 1 reply; 32+ messages in thread
From: Richard Sandiford @ 2017-09-06 17:21 UTC (permalink / raw)
  To: Michael Collison
  Cc: Richard Biener, Richard Kenner, GCC Patches, nd, Andrew Pinski

Richard Sandiford <richard.sandiford@linaro.org> writes:
> Richard Sandiford <richard.sandiford@linaro.org> writes:
>> Michael Collison <Michael.Collison@arm.com> writes:
>>> Richard,
>>>
>>> The problem with this approach for Aarch64 is that
>>> TARGET_SHIFT_TRUNCATION_MASK is based on SHIFT_COUNT_TRUNCATED which is
>>> normally 0, as it is based on the TARGET_SIMD flag.
>>
>> Maybe I'm wrong, but that seems like a missed optimisation in itself.

Sorry to follow up on myself yet again, but I'd forgotten this was
because we allow the SIMD unit to do scalar shifts.  So I guess
we have no choice, even though it seems unfortunate.

> +(define_insn_and_split "*aarch64_reg_<optab>_minus<mode>3"
> +  [(set (match_operand:GPI 0 "register_operand" "=&r")
> +	(ASHIFT:GPI
> +	  (match_operand:GPI 1 "register_operand" "r")
> +	  (minus:QI (match_operand 2 "const_int_operand" "n")
> +		    (match_operand:QI 3 "register_operand" "r"))))]
> +  "INTVAL (operands[2]) == GET_MODE_BITSIZE (<MODE>mode)"
> +  "#"
> +  "&& true"
> +  [(const_int 0)]
> +  {
> +    /* Handle cases where operand 3 is a plain QI register, or
> +       a subreg with either a SImode or DImode register.  */
> +
> +    rtx subreg_tmp = (REG_P (operands[3])
> +		      ? gen_lowpart_SUBREG (SImode, operands[3])
> +		      : SUBREG_REG (operands[3]));
> +
> +    if (REG_P (subreg_tmp) && GET_MODE (subreg_tmp) == DImode)
> +      subreg_tmp = gen_lowpart_SUBREG (SImode, subreg_tmp);

I think this all simplifies to:

  rtx subreg_tmp = gen_lowpart (SImode, operands[3]);

(or it would be worth having a comment that explains why not).
As well as being shorter, it will properly simplify hard REGs
to new hard REGs.

> +    rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
> +	       : operands[0]);
> +
> +    if (<MODE>mode == DImode && !can_create_pseudo_p ())
> +      tmp = gen_lowpart_SUBREG (SImode, operands[0]);

I think this too would be simpler with gen_lowpart:

    rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
	       : gen_lowpart (SImode, operands[0]));

> +
> +    emit_insn (gen_negsi2 (tmp, subreg_tmp));
> +
> +    rtx and_op = gen_rtx_AND (SImode, tmp,
> +			      GEN_INT (GET_MODE_BITSIZE (<MODE>mode) - 1));
> +
> +    rtx subreg_tmp2 = gen_lowpart_SUBREG (QImode, and_op);
> +
> +    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp2));
> +    DONE;
> +  }
> +)

The pattern should probably set the "length" attribute to 8.
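
(In md syntax, that means adding something like

  [(set_attr "length" "8")]

at the end of the define_insn_and_split.)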

Looks good to me with those changes FWIW.

Thanks,
Richard


* RE: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-09-06 17:21                                             ` Richard Sandiford
@ 2017-09-15  8:15                                               ` Michael Collison
  2017-09-29 23:01                                                 ` James Greenhalgh
  0 siblings, 1 reply; 32+ messages in thread
From: Michael Collison @ 2017-09-15  8:15 UTC (permalink / raw)
  To: Richard Sandiford
  Cc: Richard Biener, Richard Kenner, GCC Patches, nd, Andrew Pinski

[-- Attachment #1: Type: text/plain, Size: 3379 bytes --]

Patch updated with comments from Richard Sandiford to simplify the pattern. Okay for trunk?

2017-09-14  Michael Collison <michael.collison@arm.com>

	* config/aarch64/aarch64.md (*aarch64_reg_<optab>_minus<mode>3):
	New pattern.

2017-09-14  Michael Collison <michael.collison@arm.com>

	* gcc.target/aarch64/var_shift_mask_2.c: New test.

-----Original Message-----
From: Richard Sandiford [mailto:richard.sandiford@linaro.org] 
Sent: Wednesday, September 6, 2017 10:22 AM
To: Michael Collison <Michael.Collison@arm.com>
Cc: Richard Biener <richard.guenther@gmail.com>; Richard Kenner <kenner@vlsi1.ultra.nyu.edu>; GCC Patches <gcc-patches@gcc.gnu.org>; nd <nd@arm.com>; Andrew Pinski <pinskia@gmail.com>
Subject: Re: [PATCH] [Aarch64] Optimize subtract in shift counts

Richard Sandiford <richard.sandiford@linaro.org> writes:
> Richard Sandiford <richard.sandiford@linaro.org> writes:
>> Michael Collison <Michael.Collison@arm.com> writes:
>>> Richard,
>>>
>>> The problem with this approach for Aarch64 is that 
>>> TARGET_SHIFT_TRUNCATION_MASK is based on SHIFT_COUNT_TRUNCATED which 
>>> is normally 0, as it is based on the TARGET_SIMD flag.
>>
>> Maybe I'm wrong, but that seems like a missed optimisation in itself.

Sorry to follow up on myself yet again, but I'd forgotten this was because we allow the SIMD unit to do scalar shifts.  So I guess we have no choice, even though it seems unfortunate.

> +(define_insn_and_split "*aarch64_reg_<optab>_minus<mode>3"
> +  [(set (match_operand:GPI 0 "register_operand" "=&r")
> +	(ASHIFT:GPI
> +	  (match_operand:GPI 1 "register_operand" "r")
> +	  (minus:QI (match_operand 2 "const_int_operand" "n")
> +		    (match_operand:QI 3 "register_operand" "r"))))]
> +  "INTVAL (operands[2]) == GET_MODE_BITSIZE (<MODE>mode)"
> +  "#"
> +  "&& true"
> +  [(const_int 0)]
> +  {
> +    /* Handle cases where operand 3 is a plain QI register, or
> +       a subreg with either a SImode or DImode register.  */
> +
> +    rtx subreg_tmp = (REG_P (operands[3])
> +		      ? gen_lowpart_SUBREG (SImode, operands[3])
> +		      : SUBREG_REG (operands[3]));
> +
> +    if (REG_P (subreg_tmp) && GET_MODE (subreg_tmp) == DImode)
> +      subreg_tmp = gen_lowpart_SUBREG (SImode, subreg_tmp);

I think this all simplifies to:

  rtx subreg_tmp = gen_lowpart (SImode, operands[3]);

(or it would be worth having a comment that explains why not).
As well as being shorter, it will properly simplify hard REGs to new hard REGs.

> +    rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
> +	       : operands[0]);
> +
> +    if (<MODE>mode == DImode && !can_create_pseudo_p ())
> +      tmp = gen_lowpart_SUBREG (SImode, operands[0]);

I think this too would be simpler with gen_lowpart:

    rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
	       : gen_lowpart (SImode, operands[0]));

> +
> +    emit_insn (gen_negsi2 (tmp, subreg_tmp));
> +
> +    rtx and_op = gen_rtx_AND (SImode, tmp,
> +			      GEN_INT (GET_MODE_BITSIZE (<MODE>mode) - 1));
> +
> +    rtx subreg_tmp2 = gen_lowpart_SUBREG (QImode, and_op);
> +
> +    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp2));
> +    DONE;
> +  }
> +)

The pattern should probably set the "length" attribute to 8.

Looks good to me with those changes FWIW.

Thanks,
Richard

[-- Attachment #2: pr7313v7.patch --]
[-- Type: application/octet-stream, Size: 2694 bytes --]

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index f876a2b..9c2ccf8 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4044,6 +4044,35 @@
   [(set_attr "type" "shift_reg")]
 )
 
+(define_insn_and_split "*aarch64_reg_<optab>_minus<mode>3"
+  [(set (match_operand:GPI 0 "register_operand" "=&r")
+	(ASHIFT:GPI
+	  (match_operand:GPI 1 "register_operand" "r")
+	  (minus:QI (match_operand 2 "const_int_operand" "n")
+		    (match_operand:QI 3 "register_operand" "r"))))]
+  "INTVAL (operands[2]) == GET_MODE_BITSIZE (<MODE>mode)"
+  "#"
+  "&& true"
+  [(const_int 0)]
+  {
+    rtx subreg_tmp = gen_lowpart (SImode, operands[3]);
+
+    rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
+	       : gen_lowpart (SImode, operands[0]));
+
+    emit_insn (gen_negsi2 (tmp, subreg_tmp));
+
+    rtx and_op = gen_rtx_AND (SImode, tmp,
+			      GEN_INT (GET_MODE_BITSIZE (<MODE>mode) - 1));
+
+    rtx subreg_tmp2 = gen_lowpart_SUBREG (QImode, and_op);
+
+    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp2));
+    DONE;
+  }
+  [(set_attr "length" "8")]
+)
+
 ;; Logical left shift using SISD or Integer instruction
 (define_insn "*aarch64_ashl_sisd_or_int_<mode>3"
   [(set (match_operand:GPI 0 "register_operand" "=r,r,w,w")
diff --git a/gcc/testsuite/gcc.target/aarch64/var_shift_mask_2.c b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_2.c
new file mode 100644
index 0000000..c1fe691
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/var_shift_mask_2.c
@@ -0,0 +1,51 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+long long
+f1 (long long x, int i)
+{
+
+  return x >> (64 - i);
+}
+
+unsigned long long
+f2 (unsigned long long x, unsigned int i)
+{
+
+  return x >> (64 - i);
+}
+
+int
+f3 (int x, int i)
+{
+
+  return x >> (32 - i);
+}
+
+unsigned int
+f4 (unsigned int x, unsigned int i)
+{
+
+  return x >> (32 - i);
+}
+
+int
+f5 (int x, int i)
+{
+  return x << (32 - i);
+}
+
+long long
+f6 (long long x, int i)
+{
+  return x << (64 - i);
+}
+
+/* { dg-final { scan-assembler "lsl\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" } } */
+/* { dg-final { scan-assembler "lsl\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" } } */
+/* { dg-final { scan-assembler "lsr\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" } } */
+/* { dg-final { scan-assembler "lsr\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" } } */
+/* { dg-final { scan-assembler "asr\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" } } */
+/* { dg-final { scan-assembler "asr\tx\[0-9\]+, x\[0-9\]+, x\[0-9\]+" } } */
+/* { dg-final { scan-assembler-times "neg\tw\[0-9\]+, w\[0-9\]+" 6 } } */
+/* { dg-final { scan-assembler-not "sub\tw\[0-9\]+, w\[0-9\]+, w\[0-9\]+" } } */
-- 
1.9.1



* Re: [PATCH] [Aarch64] Optimize subtract in shift counts
  2017-09-15  8:15                                               ` Michael Collison
@ 2017-09-29 23:01                                                 ` James Greenhalgh
  0 siblings, 0 replies; 32+ messages in thread
From: James Greenhalgh @ 2017-09-29 23:01 UTC (permalink / raw)
  To: Michael Collison
  Cc: Richard Sandiford, Richard Biener, Richard Kenner, GCC Patches,
	nd, Andrew Pinski

On Fri, Sep 15, 2017 at 09:15:17AM +0100, Michael Collison wrote:
> Patch updated with comments from Richard Sandiford to simplify the pattern. Okay for trunk?

OK.

Thanks,
James

> 
> 2017-09-14  Michael Collison <michael.collison@arm.com>
> 
> 	* config/aarch64/aarch64.md (*aarch64_reg_<optab>_minus<mode>3):
> 	New pattern.
> 
> 2017-09-14  Michael Collison <michael.collison@arm.com>
> 
> 	* gcc.target/aarch64/var_shift_mask_2.c: New test.
> 
> Richard Sandiford <richard.sandiford@linaro.org> writes:
> > Richard Sandiford <richard.sandiford@linaro.org> writes:
> >> Michael Collison <Michael.Collison@arm.com> writes:
> >>> Richard,
> >>>
> >>> The problem with this approach for Aarch64 is that 
> >>> TARGET_SHIFT_TRUNCATION_MASK is based on SHIFT_COUNT_TRUNCATED which 
> >>> is normally 0, as it is based on the TARGET_SIMD flag.
> >>
> >> Maybe I'm wrong, but that seems like a missed optimisation in itself.
> 
> Sorry to follow up on myself yet again, but I'd forgotten this was because we allow the SIMD unit to do scalar shifts.  So I guess we have no choice, even though it seems unfortunate.
> 
> > +(define_insn_and_split "*aarch64_reg_<optab>_minus<mode>3"
> > +  [(set (match_operand:GPI 0 "register_operand" "=&r")
> > +	(ASHIFT:GPI
> > +	  (match_operand:GPI 1 "register_operand" "r")
> > +	  (minus:QI (match_operand 2 "const_int_operand" "n")
> > +		    (match_operand:QI 3 "register_operand" "r"))))]
> > +  "INTVAL (operands[2]) == GET_MODE_BITSIZE (<MODE>mode)"
> > +  "#"
> > +  "&& true"
> > +  [(const_int 0)]
> > +  {
> > +    /* Handle cases where operand 3 is a plain QI register, or
> > +       a subreg with either a SImode or DImode register.  */
> > +
> > +    rtx subreg_tmp = (REG_P (operands[3])
> > +		      ? gen_lowpart_SUBREG (SImode, operands[3])
> > +		      : SUBREG_REG (operands[3]));
> > +
> > +    if (REG_P (subreg_tmp) && GET_MODE (subreg_tmp) == DImode)
> > +      subreg_tmp = gen_lowpart_SUBREG (SImode, subreg_tmp);
> 
> I think this all simplifies to:
> 
>   rtx subreg_tmp = gen_lowpart (SImode, operands[3]);
> 
> (or it would be worth having a comment that explains why not).
> As well as being shorter, it will properly simplify hard REGs to new hard REGs.
> 
> > +    rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
> > +	       : operands[0]);
> > +
> > +    if (<MODE>mode == DImode && !can_create_pseudo_p ())
> > +      tmp = gen_lowpart_SUBREG (SImode, operands[0]);
> 
> I think this too would be simpler with gen_lowpart:
> 
>     rtx tmp = (can_create_pseudo_p () ? gen_reg_rtx (SImode)
> 	       : gen_lowpart (SImode, operands[0]));
> 
> > +
> > +    emit_insn (gen_negsi2 (tmp, subreg_tmp));
> > +
> > +    rtx and_op = gen_rtx_AND (SImode, tmp,
> > +			      GEN_INT (GET_MODE_BITSIZE (<MODE>mode) - 1));
> > +
> > +    rtx subreg_tmp2 = gen_lowpart_SUBREG (QImode, and_op);
> > +
> > +    emit_insn (gen_<optab><mode>3 (operands[0], operands[1], subreg_tmp2));
> > +    DONE;
> > +  }
> > +)
> 
> The pattern should probably set the "length" attribute to 8.
> 
> Looks good to me with those changes FWIW.
> 
> Thanks,
> Richard



* Re: [PATCH] [Aarch64] Optimize subtract in shift counts
@ 2017-08-22 11:01 Wilco Dijkstra
  0 siblings, 0 replies; 32+ messages in thread
From: Wilco Dijkstra @ 2017-08-22 11:01 UTC (permalink / raw)
  To: Richard Biener, richard.sandiford
  Cc: kenner, Michael Collison, GCC Patches, nd

Hi,

The main reason we have this issue is that DImode can be treated as a
vector of size 1. As a result we do not know whether the shift is an integer
or SIMD instruction. One way around this is to never use the SIMD variant,
another is to introduce V1DImode for vectors of size 1.
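
For illustration (the operands here are made up), the same DImode shift
can come out as either of:

  lsl   x0, x1, x2    // integer shift: count taken modulo 64
  ushl  d0, d1, d2    // SIMD scalar shift: count is the signed low byte
                      // of d2, so negative counts shift right instead

and the shift RTX alone does not tell you which form you will get.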

Short term, I believe the first option is best - I don't think scalar operations executed
as SIMD instructions are useful in general, and these shifts are likely never used in
reality, so we're adding a lot of complexity and potential bugs.
Long term, we need to compute register classes and do instruction selection
much earlier, so this doesn't increase the complexity of register allocation and
machine descriptions.

Wilco


* RE: [PATCH] [Aarch64] Optimize subtract in shift counts
@ 2017-08-08 10:04 Wilco Dijkstra
  0 siblings, 0 replies; 32+ messages in thread
From: Wilco Dijkstra @ 2017-08-08 10:04 UTC (permalink / raw)
  To: GCC Patches, kenner; +Cc: nd, Michael Collison, pinskia

Richard Kenner wrote:
> Michael Collison wrote:
> > On Aarch64 SHIFT_COUNT_TRUNCATED is only true if SIMD code generation
> > is disabled. This is because the simd instructions can be used for
> > shifting but they do not truncate the shift count.
>
> In that case, the change isn't safe!  Consider if the value was
> negative, for example.  Yes, it's technically undefined, but I'm not
> sure I'd want to rely on that.

No, it's perfectly safe - it becomes an integer-only shift after the split, since it
keeps the masking as part of the pattern.
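
To spell out the arithmetic for the DImode case: the integer shift
instructions use only the low six bits of the count, and since
64 & 63 == 0 we have (64 - i) & 63 == (-i) & 63 for every i, negative
values included, so the neg-based sequence yields exactly the count
the masked subtract would have.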

But generally SHIFT_COUNT_TRUNCATED is a mess, and so are other ways
of doing this - note the extremely complex subregs in the patch; none of that should
be required, as there are no QI registers on AArch64! So it would be great if there
were a better way to describe the number of bits used by a particular shift alternative.

Wilco


end of thread

Thread overview: 32+ messages
2017-08-07 20:36 [PATCH] [Aarch64] Optimize subtract in shift counts Michael Collison
2017-08-07 20:44 ` Andrew Pinski
2017-08-07 21:02   ` Richard Kenner
2017-08-07 23:44     ` Michael Collison
2017-08-08  1:56       ` Richard Kenner
2017-08-08  4:06         ` Michael Collison
2017-08-08 12:13           ` Richard Kenner
2017-08-08 19:46             ` Michael Collison
2017-08-08 19:52               ` Richard Kenner
2017-08-08 19:59                 ` Michael Collison
2017-08-08 20:04                   ` Richard Kenner
2017-08-08 20:18                     ` Michael Collison
2017-08-08 20:20                       ` Richard Kenner
2017-08-08 20:33                         ` Michael Collison
2017-08-14  8:58                         ` Richard Biener
2017-08-15  7:28                           ` Michael Collison
2017-08-21 19:11                           ` Richard Sandiford
2017-08-21 19:14                             ` Richard Biener
2017-08-22  8:17                               ` Richard Sandiford
2017-08-22  8:46                                 ` Richard Biener
2017-08-22  8:50                                   ` Richard Sandiford
2017-09-06  9:11                                     ` Michael Collison
2017-09-06 10:32                                       ` Richard Sandiford
2017-09-06 15:41                                         ` Michael Collison
2017-09-06 16:53                                           ` Richard Sandiford
2017-09-06 16:56                                             ` Richard Sandiford
2017-09-06 17:21                                             ` Richard Sandiford
2017-09-15  8:15                                               ` Michael Collison
2017-09-29 23:01                                                 ` James Greenhalgh
2017-08-07 20:58 ` Richard Kenner
2017-08-08 10:04 Wilco Dijkstra
2017-08-22 11:01 Wilco Dijkstra
