public inbox for gcc-patches@gcc.gnu.org
* [Patch AArch64] Stop generating BSL for simple integer code
@ 2017-06-12 13:36 James Greenhalgh
  2017-06-12 13:45 ` James Greenhalgh
  0 siblings, 1 reply; 7+ messages in thread
From: James Greenhalgh @ 2017-06-12 13:36 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd, richard.earnshaw, marcus.shawcroft

[-- Attachment #1: Type: text/plain, Size: 1246 bytes --]


Hi,

In this testcase, all argument registers and the return register
will be general purpose registers:

  long long
  foo (long long a, long long b, long long c)
  {
    return ((a ^ b) & c) ^ b;
  }

However, due to the implementation of aarch64_simd_bsl<mode>_internal
we'll match that pattern and emit a BSL, necessitating moving all those
arguments and results to the Advanced SIMD registers:

	fmov	d2, x0
	fmov	d0, x2
	fmov	d1, x1
	bsl	v0.8b, v2.8b, v1.8b
	fmov	x0, d0

To fix this, we move the DImode case to a new insn_and_split,
aarch64_simd_bsldi_internal, which knows to split back to integer operations
if the register allocation falls that way.
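
With the split in place, the same function can stay entirely in the general
purpose registers; this is the sequence the new testcase expects:

	eor	x0, x0, x1
	and	x0, x0, x2
	eor	x0, x0, x1
	ret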

We could have used an unspec, but then we would lose some of the nice
simplifications that can be made by explicitly spelling out the semantics
of BSL.
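
For reference, the construct being matched is the standard bitwise-select
identity; as a minimal C illustration (not part of the patch), selecting the
bits of a where c is set and the bits of b elsewhere:

  /* Illustration only: per-bit select under mask c, which is what BSL
     computes and which equals ((a ^ b) & c) ^ b.  */
  unsigned long long
  bit_select (unsigned long long a, unsigned long long b, unsigned long long c)
  {
    return (a & c) | (b & ~c);
  }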

Bootstrapped on aarch64-none-linux-gnu.

OK?

Thanks,
James

---
gcc/

2017-06-12  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/aarch64-simd.md
	(aarch64_simd_bsl<mode>_internal): Remove DImode.
	(*aarch64_simd_bsl<mode>_alt): Likewise.
	(aarch64_simd_bsldi_internal): New.

gcc/testsuite/

2017-06-12  James Greenhalgh  <james.greenhalgh@arm.com>

	* gcc.target/aarch64/no-dimode-bsl.c: New.
	* gcc.target/aarch64/dimode-bsl.c: New.


[-- Attachment #2: 0001-Patch-AArch64-Stop-generating-BSL-for-simple-integer.patch --]
[-- Type: text/x-patch; name="0001-Patch-AArch64-Stop-generating-BSL-for-simple-integer.patch", Size: 4863 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c5a86ff..eea4d25 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2256,13 +2256,13 @@
 ;; in *aarch64_simd_bsl<mode>_alt.
 
 (define_insn "aarch64_simd_bsl<mode>_internal"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
-	(xor:VSDQ_I_DI
-	   (and:VSDQ_I_DI
-	     (xor:VSDQ_I_DI
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+	(xor:VDQ_I
+	   (and:VDQ_I
+	     (xor:VDQ_I
 	       (match_operand:<V_cmp_result> 3 "register_operand" "w,0,w")
-	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,w,0"))
-	     (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
+	       (match_operand:VDQ_I 2 "register_operand" "w,w,0"))
+	     (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
 	  (match_dup:<V_cmp_result> 3)
 	))]
   "TARGET_SIMD"
@@ -2280,14 +2280,14 @@
 ;; permutations of commutative operations, we have to have a separate pattern.
 
 (define_insn "*aarch64_simd_bsl<mode>_alt"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
-	(xor:VSDQ_I_DI
-	   (and:VSDQ_I_DI
-	     (xor:VSDQ_I_DI
-	       (match_operand:VSDQ_I_DI 3 "register_operand" "w,w,0")
-	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,0,w"))
-	      (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
-	  (match_dup:VSDQ_I_DI 2)))]
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+	(xor:VDQ_I
+	   (and:VDQ_I
+	     (xor:VDQ_I
+	       (match_operand:VDQ_I 3 "register_operand" "w,w,0")
+	       (match_operand:VDQ_I 2 "register_operand" "w,0,w"))
+	      (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
+	  (match_dup:VDQ_I 2)))]
   "TARGET_SIMD"
   "@
   bsl\\t%0.<Vbtype>, %3.<Vbtype>, %2.<Vbtype>
@@ -2296,6 +2296,44 @@
   [(set_attr "type" "neon_bsl<q>")]
 )
 
+;; DImode is special, we want to avoid computing operations which are
+;; more naturally computed in general purpose registers in the vector
+;; registers.  If we do that, we need to move all three operands from general
+;; purpose registers to vector registers, then back again.  However, we
+;; don't want to make this pattern an UNSPEC as we'd lose scope for
+;; optimizations based on the component operations of a BSL.
+;;
+;; That means we need a splitter back to the individual operations, if they
+;; would be better calculated on the integer side.
+
+(define_insn_and_split "aarch64_simd_bsldi_internal"
+  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
+	(xor:DI
+	   (and:DI
+	     (xor:DI
+	       (match_operand:DI 3 "register_operand" "w,0,w,r")
+	       (match_operand:DI 2 "register_operand" "w,w,0,r"))
+	     (match_operand:DI 1 "register_operand" "0,w,w,r"))
+	  (match_dup:DI 3)
+	))]
+  "TARGET_SIMD"
+  "@
+  bsl\\t%0.8b, %2.8b, %3.8b
+  bit\\t%0.8b, %2.8b, %1.8b
+  bif\\t%0.8b, %3.8b, %1.8b
+  #"
+  "&& GP_REGNUM_P (REGNO (operands[0]))"
+  [(match_dup 0) (match_dup 1) (match_dup 2) (match_dup 3)]
+{
+  /* Split back to individual operations.  */
+  emit_insn (gen_xordi3 (operands[0], operands[2], operands[3]));
+  emit_insn (gen_anddi3 (operands[0], operands[0], operands[1]));
+  emit_insn (gen_xordi3 (operands[0], operands[0], operands[3]));
+  DONE;
+}
+  [(set_attr "type" "neon_bsl")]
+)
+
 (define_expand "aarch64_simd_bsl<mode>"
   [(match_operand:VALLDIF 0 "register_operand")
    (match_operand:<V_cmp_result> 1 "register_operand")
diff --git a/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c b/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c
new file mode 100644
index 0000000..4e63511
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* Test that we can generate DImode BSL when we are using
+   copysign.  */
+
+double
+foo (double a, double b)
+{
+  return __builtin_copysign (a, b);
+}
+
+/* { dg-final { scan-assembler "bsl\tv\[0-9\]" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c b/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c
new file mode 100644
index 0000000..67dfda0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* Test that we don't combine to BSL when in DImode, avoiding register
+   moves in the general case.
+
+   We want:
+	eor	x0, x0, x1
+	and	x0, x0, x2
+	eor	x0, x0, x1
+	ret
+
+   Rather than:
+	fmov	d2, x0
+	fmov	d0, x2
+	fmov	d1, x1
+	bsl	v0.8b, v2.8b, v1.8b
+	fmov	x0, d0
+	ret  */
+
+long long
+foo (long long a, long long b, long long c)
+{
+  return ((a ^ b) & c) ^ b;
+}
+
+/* { dg-final { scan-assembler-not "bsl\tv\[0-9\]" } } */
+/* { dg-final { scan-assembler-not "bif\tv\[0-9\]" } } */
+/* { dg-final { scan-assembler-not "bit\tv\[0-9\]" } } */
+/* { dg-final { scan-assembler-not "fmov\td\[0-9\]" } } */


* [Patch AArch64] Stop generating BSL for simple integer code
  2017-06-12 13:36 [Patch AArch64] Stop generating BSL for simple integer code James Greenhalgh
@ 2017-06-12 13:45 ` James Greenhalgh
  2017-06-21 10:49   ` James Greenhalgh
  0 siblings, 1 reply; 7+ messages in thread
From: James Greenhalgh @ 2017-06-12 13:45 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd, richard.earnshaw, marcus.shawcroft

[-- Attachment #1: Type: text/plain, Size: 1509 bytes --]

[Sorry for the re-send. I spotted that the attributes were not right for the
 new pattern I was adding. The change between this and the first version was:

  +  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
  +   (set_attr "length" "4,4,4,12")]
]

---

Hi,

In this testcase, all argument registers and the return register
will be general purpose registers:

  long long
  foo (long long a, long long b, long long c)
  {
    return ((a ^ b) & c) ^ b;
  }

However, due to the implementation of aarch64_simd_bsl<mode>_internal
we'll match that pattern and emit a BSL, necessitating moving all those
arguments and results to the Advanced SIMD registers:

	fmov	d2, x0
	fmov	d0, x2
	fmov	d1, x1
	bsl	v0.8b, v2.8b, v1.8b
	fmov	x0, d0

To fix this, we move the DImode case to a new insn_and_split,
aarch64_simd_bsldi_internal, which knows to split back to integer operations
if the register allocation falls that way.

We could have used an unspec, but then we would lose some of the nice
simplifications that can be made by explicitly spelling out the semantics
of BSL.

Bootstrapped on aarch64-none-linux-gnu.

OK?

Thanks,
James

---
gcc/

2017-06-12  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/aarch64-simd.md
	(aarch64_simd_bsl<mode>_internal): Remove DImode.
	(*aarch64_simd_bsl<mode>_alt): Likewise.
	(aarch64_simd_bsldi_internal): New.

gcc/testsuite/

2017-06-12  James Greenhalgh  <james.greenhalgh@arm.com>

	* gcc.target/aarch64/no-dimode-bsl.c: New.
	* gcc.target/aarch64/dimode-bsl.c: New.


[-- Attachment #2: 0001-Patch-AArch64-Stop-generating-BSL-for-simple-integer.patch --]
[-- Type: text/x-patch; name="0001-Patch-AArch64-Stop-generating-BSL-for-simple-integer.patch", Size: 4925 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c5a86ff..7b6b12f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2256,13 +2256,13 @@
 ;; in *aarch64_simd_bsl<mode>_alt.
 
 (define_insn "aarch64_simd_bsl<mode>_internal"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
-	(xor:VSDQ_I_DI
-	   (and:VSDQ_I_DI
-	     (xor:VSDQ_I_DI
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+	(xor:VDQ_I
+	   (and:VDQ_I
+	     (xor:VDQ_I
 	       (match_operand:<V_cmp_result> 3 "register_operand" "w,0,w")
-	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,w,0"))
-	     (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
+	       (match_operand:VDQ_I 2 "register_operand" "w,w,0"))
+	     (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
 	  (match_dup:<V_cmp_result> 3)
 	))]
   "TARGET_SIMD"
@@ -2280,14 +2280,14 @@
 ;; permutations of commutative operations, we have to have a separate pattern.
 
 (define_insn "*aarch64_simd_bsl<mode>_alt"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
-	(xor:VSDQ_I_DI
-	   (and:VSDQ_I_DI
-	     (xor:VSDQ_I_DI
-	       (match_operand:VSDQ_I_DI 3 "register_operand" "w,w,0")
-	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,0,w"))
-	      (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
-	  (match_dup:VSDQ_I_DI 2)))]
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+	(xor:VDQ_I
+	   (and:VDQ_I
+	     (xor:VDQ_I
+	       (match_operand:VDQ_I 3 "register_operand" "w,w,0")
+	       (match_operand:VDQ_I 2 "register_operand" "w,0,w"))
+	      (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
+	  (match_dup:VDQ_I 2)))]
   "TARGET_SIMD"
   "@
   bsl\\t%0.<Vbtype>, %3.<Vbtype>, %2.<Vbtype>
@@ -2296,6 +2296,45 @@
   [(set_attr "type" "neon_bsl<q>")]
 )
 
+;; DImode is special, we want to avoid computing operations which are
+;; more naturally computed in general purpose registers in the vector
+;; registers.  If we do that, we need to move all three operands from general
+;; purpose registers to vector registers, then back again.  However, we
+;; don't want to make this pattern an UNSPEC as we'd lose scope for
+;; optimizations based on the component operations of a BSL.
+;;
+;; That means we need a splitter back to the individual operations, if they
+;; would be better calculated on the integer side.
+
+(define_insn_and_split "aarch64_simd_bsldi_internal"
+  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
+	(xor:DI
+	   (and:DI
+	     (xor:DI
+	       (match_operand:DI 3 "register_operand" "w,0,w,r")
+	       (match_operand:DI 2 "register_operand" "w,w,0,r"))
+	     (match_operand:DI 1 "register_operand" "0,w,w,r"))
+	  (match_dup:DI 3)
+	))]
+  "TARGET_SIMD"
+  "@
+  bsl\\t%0.8b, %2.8b, %3.8b
+  bit\\t%0.8b, %2.8b, %1.8b
+  bif\\t%0.8b, %3.8b, %1.8b
+  #"
+  "&& GP_REGNUM_P (REGNO (operands[0]))"
+  [(match_dup 0) (match_dup 1) (match_dup 2) (match_dup 3)]
+{
+  /* Split back to individual operations.  */
+  emit_insn (gen_xordi3 (operands[0], operands[2], operands[3]));
+  emit_insn (gen_anddi3 (operands[0], operands[0], operands[1]));
+  emit_insn (gen_xordi3 (operands[0], operands[0], operands[3]));
+  DONE;
+}
+  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
+   (set_attr "length" "4,4,4,12")]
+)
+
 (define_expand "aarch64_simd_bsl<mode>"
   [(match_operand:VALLDIF 0 "register_operand")
    (match_operand:<V_cmp_result> 1 "register_operand")
diff --git a/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c b/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c
new file mode 100644
index 0000000..4e63511
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* Test that we can generate DImode BSL when we are using
+   copysign.  */
+
+double
+foo (double a, double b)
+{
+  return __builtin_copysign (a, b);
+}
+
+/* { dg-final { scan-assembler "bsl\tv\[0-9\]" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c b/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c
new file mode 100644
index 0000000..67dfda0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* Test that we don't combine to BSL when in DImode, avoiding register
+   moves in the general case.
+
+   We want:
+	eor	x0, x0, x1
+	and	x0, x0, x2
+	eor	x0, x0, x1
+	ret
+
+   Rather than:
+	fmov	d2, x0
+	fmov	d0, x2
+	fmov	d1, x1
+	bsl	v0.8b, v2.8b, v1.8b
+	fmov	x0, d0
+	ret  */
+
+long long
+foo (long long a, long long b, long long c)
+{
+  return ((a ^ b) & c) ^ b;
+}
+
+/* { dg-final { scan-assembler-not "bsl\tv\[0-9\]" } } */
+/* { dg-final { scan-assembler-not "bif\tv\[0-9\]" } } */
+/* { dg-final { scan-assembler-not "bit\tv\[0-9\]" } } */
+/* { dg-final { scan-assembler-not "fmov\td\[0-9\]" } } */


* Re: [Patch AArch64] Stop generating BSL for simple integer code
  2017-06-12 13:45 ` James Greenhalgh
@ 2017-06-21 10:49   ` James Greenhalgh
  2017-07-03 10:48     ` James Greenhalgh
  0 siblings, 1 reply; 7+ messages in thread
From: James Greenhalgh @ 2017-06-21 10:49 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd, richard.earnshaw, marcus.shawcroft

*ping*

Thanks,
James

On Mon, Jun 12, 2017 at 02:44:40PM +0100, James Greenhalgh wrote:
> [Sorry for the re-send. I spotted that the attributes were not right for the
>  new pattern I was adding. The change between this and the first version was:
> 
>   +  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
>   +   (set_attr "length" "4,4,4,12")]
> ]
> 
> ---
> 
> Hi,
> 
> In this testcase, all argument registers and the return register
> will be general purpose registers:
> 
>   long long
>   foo (long long a, long long b, long long c)
>   {
>     return ((a ^ b) & c) ^ b;
>   }
> 
> However, due to the implementation of aarch64_simd_bsl<mode>_internal
> we'll match that pattern and emit a BSL, necessitating moving all those
> arguments and results to the Advanced SIMD registers:
> 
> 	fmov	d2, x0
> 	fmov	d0, x2
> 	fmov	d1, x1
> 	bsl	v0.8b, v2.8b, v1.8b
> 	fmov	x0, d0
> 
> To fix this, we turn aarch64_simd_bsldi_internal in to an insn_and_split that
> knows to split back to integer operations if the register allocation
> falls that way.
> 
> We could have used an unspec, but then we lose some of the nice
> simplifications that can be made from explicitly spelling out the semantics
> of BSL.
> 
> Bootstrapped on aarch64-none-linux-gnu.
> 
> OK?
> 
> Thanks,
> James
> 
> ---
> gcc/
> 
> 2017-06-12  James Greenhalgh  <james.greenhalgh@arm.com>
> 
> 	* config/aarch64/aarch64-simd.md
> 	(aarch64_simd_bsl<mode>_internal): Remove DImode.
> 	(*aarch64_simd_bsl<mode>_alt): Likewise.
> 	(aarch64_simd_bsldi_internal): New.
> 
> gcc/testsuite/
> 
> 2017-06-12  James Greenhalgh  <james.greenhalgh@arm.com>
> 
> 	* gcc.target/aarch64/no-dimode-bsl.c: New.
> 	* gcc.target/aarch64/dimode-bsl.c: New.
> 

> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index c5a86ff..7b6b12f 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -2256,13 +2256,13 @@
>  ;; in *aarch64_simd_bsl<mode>_alt.
>  
>  (define_insn "aarch64_simd_bsl<mode>_internal"
> -  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
> -	(xor:VSDQ_I_DI
> -	   (and:VSDQ_I_DI
> -	     (xor:VSDQ_I_DI
> +  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
> +	(xor:VDQ_I
> +	   (and:VDQ_I
> +	     (xor:VDQ_I
>  	       (match_operand:<V_cmp_result> 3 "register_operand" "w,0,w")
> -	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,w,0"))
> -	     (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
> +	       (match_operand:VDQ_I 2 "register_operand" "w,w,0"))
> +	     (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
>  	  (match_dup:<V_cmp_result> 3)
>  	))]
>    "TARGET_SIMD"
> @@ -2280,14 +2280,14 @@
>  ;; permutations of commutative operations, we have to have a separate pattern.
>  
>  (define_insn "*aarch64_simd_bsl<mode>_alt"
> -  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
> -	(xor:VSDQ_I_DI
> -	   (and:VSDQ_I_DI
> -	     (xor:VSDQ_I_DI
> -	       (match_operand:VSDQ_I_DI 3 "register_operand" "w,w,0")
> -	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,0,w"))
> -	      (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
> -	  (match_dup:VSDQ_I_DI 2)))]
> +  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
> +	(xor:VDQ_I
> +	   (and:VDQ_I
> +	     (xor:VDQ_I
> +	       (match_operand:VDQ_I 3 "register_operand" "w,w,0")
> +	       (match_operand:VDQ_I 2 "register_operand" "w,0,w"))
> +	      (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
> +	  (match_dup:VDQ_I 2)))]
>    "TARGET_SIMD"
>    "@
>    bsl\\t%0.<Vbtype>, %3.<Vbtype>, %2.<Vbtype>
> @@ -2296,6 +2296,45 @@
>    [(set_attr "type" "neon_bsl<q>")]
>  )
>  
> +;; DImode is special, we want to avoid computing operations which are
> +;; more naturally computed in general purpose registers in the vector
> +;; registers.  If we do that, we need to move all three operands from general
> +;; purpose registers to vector registers, then back again.  However, we
> +;; don't want to make this pattern an UNSPEC as we'd lose scope for
> +;; optimizations based on the component operations of a BSL.
> +;;
> +;; That means we need a splitter back to the individual operations, if they
> +;; would be better calculated on the integer side.
> +
> +(define_insn_and_split "aarch64_simd_bsldi_internal"
> +  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
> +	(xor:DI
> +	   (and:DI
> +	     (xor:DI
> +	       (match_operand:DI 3 "register_operand" "w,0,w,r")
> +	       (match_operand:DI 2 "register_operand" "w,w,0,r"))
> +	     (match_operand:DI 1 "register_operand" "0,w,w,r"))
> +	  (match_dup:DI 3)
> +	))]
> +  "TARGET_SIMD"
> +  "@
> +  bsl\\t%0.8b, %2.8b, %3.8b
> +  bit\\t%0.8b, %2.8b, %1.8b
> +  bif\\t%0.8b, %3.8b, %1.8b
> +  #"
> +  "&& GP_REGNUM_P (REGNO (operands[0]))"
> +  [(match_dup 0) (match_dup 1) (match_dup 2) (match_dup 3)]
> +{
> +  /* Split back to individual operations.  */
> +  emit_insn (gen_xordi3 (operands[0], operands[2], operands[3]));
> +  emit_insn (gen_anddi3 (operands[0], operands[0], operands[1]));
> +  emit_insn (gen_xordi3 (operands[0], operands[0], operands[3]));
> +  DONE;
> +}
> +  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
> +   (set_attr "length" "4,4,4,12")]
> +)
> +
>  (define_expand "aarch64_simd_bsl<mode>"
>    [(match_operand:VALLDIF 0 "register_operand")
>     (match_operand:<V_cmp_result> 1 "register_operand")
> diff --git a/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c b/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c
> new file mode 100644
> index 0000000..4e63511
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +/* Test that we can generate DImode BSL when we are using
> +   copysign.  */
> +
> +double
> +foo (double a, double b)
> +{
> +  return __builtin_copysign (a, b);
> +}
> +
> +/* { dg-final { scan-assembler "bsl\tv\[0-9\]" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c b/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c
> new file mode 100644
> index 0000000..67dfda0
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +/* Test that we don't combine to BSL when in DImode, avoiding register
> +   moves in the general case.
> +
> +   We want:
> +	eor	x0, x0, x1
> +	and	x0, x0, x2
> +	eor	x0, x0, x1
> +	ret
> +
> +   Rather than:
> +	fmov	d2, x0
> +	fmov	d0, x2
> +	fmov	d1, x1
> +	bsl	v0.8b, v2.8b, v1.8b
> +	fmov	x0, d0
> +	ret  */
> +
> +long long
> +foo (long long a, long long b, long long c)
> +{
> +  return ((a ^ b) & c) ^ b;
> +}
> +
> +/* { dg-final { scan-assembler-not "bsl\tv\[0-9\]" } } */
> +/* { dg-final { scan-assembler-not "bif\tv\[0-9\]" } } */
> +/* { dg-final { scan-assembler-not "bit\tv\[0-9\]" } } */
> +/* { dg-final { scan-assembler-not "fmov\td\[0-9\]" } } */


* Re: [Patch AArch64] Stop generating BSL for simple integer code
  2017-06-21 10:49   ` James Greenhalgh
@ 2017-07-03 10:48     ` James Greenhalgh
  2017-07-27 17:49       ` James Greenhalgh
  0 siblings, 1 reply; 7+ messages in thread
From: James Greenhalgh @ 2017-07-03 10:48 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd, richard.earnshaw, marcus.shawcroft

On Wed, Jun 21, 2017 at 11:49:07AM +0100, James Greenhalgh wrote:
> *ping*

*ping*x2

Thanks,
James

> On Mon, Jun 12, 2017 at 02:44:40PM +0100, James Greenhalgh wrote:
> > [Sorry for the re-send. I spotted that the attributes were not right for the
> >  new pattern I was adding. The change between this and the first version was:
> > 
> >   +  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
> >   +   (set_attr "length" "4,4,4,12")]
> > ]
> > 
> > ---
> > 
> > Hi,
> > 
> > In this testcase, all argument registers and the return register
> > will be general purpose registers:
> > 
> >   long long
> >   foo (long long a, long long b, long long c)
> >   {
> >     return ((a ^ b) & c) ^ b;
> >   }
> > 
> > However, due to the implementation of aarch64_simd_bsl<mode>_internal
> > we'll match that pattern and emit a BSL, necessitating moving all those
> > arguments and results to the Advanced SIMD registers:
> > 
> > 	fmov	d2, x0
> > 	fmov	d0, x2
> > 	fmov	d1, x1
> > 	bsl	v0.8b, v2.8b, v1.8b
> > 	fmov	x0, d0
> > 
> > To fix this, we turn aarch64_simd_bsldi_internal in to an insn_and_split that
> > knows to split back to integer operations if the register allocation
> > falls that way.
> > 
> > We could have used an unspec, but then we lose some of the nice
> > simplifications that can be made from explicitly spelling out the semantics
> > of BSL.
> > 
> > Bootstrapped on aarch64-none-linux-gnu.
> > 
> > OK?
> > 
> > Thanks,
> > James
> > 
> > ---
> > gcc/
> > 
> > 2017-06-12  James Greenhalgh  <james.greenhalgh@arm.com>
> > 
> > 	* config/aarch64/aarch64-simd.md
> > 	(aarch64_simd_bsl<mode>_internal): Remove DImode.
> > 	(*aarch64_simd_bsl<mode>_alt): Likewise.
> > 	(aarch64_simd_bsldi_internal): New.
> > 
> > gcc/testsuite/
> > 
> > 2017-06-12  James Greenhalgh  <james.greenhalgh@arm.com>
> > 
> > 	* gcc.target/aarch64/no-dimode-bsl.c: New.
> > 	* gcc.target/aarch64/dimode-bsl.c: New.
> > 
> 
> > diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> > index c5a86ff..7b6b12f 100644
> > --- a/gcc/config/aarch64/aarch64-simd.md
> > +++ b/gcc/config/aarch64/aarch64-simd.md
> > @@ -2256,13 +2256,13 @@
> >  ;; in *aarch64_simd_bsl<mode>_alt.
> >  
> >  (define_insn "aarch64_simd_bsl<mode>_internal"
> > -  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
> > -	(xor:VSDQ_I_DI
> > -	   (and:VSDQ_I_DI
> > -	     (xor:VSDQ_I_DI
> > +  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
> > +	(xor:VDQ_I
> > +	   (and:VDQ_I
> > +	     (xor:VDQ_I
> >  	       (match_operand:<V_cmp_result> 3 "register_operand" "w,0,w")
> > -	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,w,0"))
> > -	     (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
> > +	       (match_operand:VDQ_I 2 "register_operand" "w,w,0"))
> > +	     (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
> >  	  (match_dup:<V_cmp_result> 3)
> >  	))]
> >    "TARGET_SIMD"
> > @@ -2280,14 +2280,14 @@
> >  ;; permutations of commutative operations, we have to have a separate pattern.
> >  
> >  (define_insn "*aarch64_simd_bsl<mode>_alt"
> > -  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
> > -	(xor:VSDQ_I_DI
> > -	   (and:VSDQ_I_DI
> > -	     (xor:VSDQ_I_DI
> > -	       (match_operand:VSDQ_I_DI 3 "register_operand" "w,w,0")
> > -	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,0,w"))
> > -	      (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
> > -	  (match_dup:VSDQ_I_DI 2)))]
> > +  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
> > +	(xor:VDQ_I
> > +	   (and:VDQ_I
> > +	     (xor:VDQ_I
> > +	       (match_operand:VDQ_I 3 "register_operand" "w,w,0")
> > +	       (match_operand:VDQ_I 2 "register_operand" "w,0,w"))
> > +	      (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
> > +	  (match_dup:VDQ_I 2)))]
> >    "TARGET_SIMD"
> >    "@
> >    bsl\\t%0.<Vbtype>, %3.<Vbtype>, %2.<Vbtype>
> > @@ -2296,6 +2296,45 @@
> >    [(set_attr "type" "neon_bsl<q>")]
> >  )
> >  
> > +;; DImode is special, we want to avoid computing operations which are
> > +;; more naturally computed in general purpose registers in the vector
> > +;; registers.  If we do that, we need to move all three operands from general
> > +;; purpose registers to vector registers, then back again.  However, we
> > +;; don't want to make this pattern an UNSPEC as we'd lose scope for
> > +;; optimizations based on the component operations of a BSL.
> > +;;
> > +;; That means we need a splitter back to the individual operations, if they
> > +;; would be better calculated on the integer side.
> > +
> > +(define_insn_and_split "aarch64_simd_bsldi_internal"
> > +  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
> > +	(xor:DI
> > +	   (and:DI
> > +	     (xor:DI
> > +	       (match_operand:DI 3 "register_operand" "w,0,w,r")
> > +	       (match_operand:DI 2 "register_operand" "w,w,0,r"))
> > +	     (match_operand:DI 1 "register_operand" "0,w,w,r"))
> > +	  (match_dup:DI 3)
> > +	))]
> > +  "TARGET_SIMD"
> > +  "@
> > +  bsl\\t%0.8b, %2.8b, %3.8b
> > +  bit\\t%0.8b, %2.8b, %1.8b
> > +  bif\\t%0.8b, %3.8b, %1.8b
> > +  #"
> > +  "&& GP_REGNUM_P (REGNO (operands[0]))"
> > +  [(match_dup 0) (match_dup 1) (match_dup 2) (match_dup 3)]
> > +{
> > +  /* Split back to individual operations.  */
> > +  emit_insn (gen_xordi3 (operands[0], operands[2], operands[3]));
> > +  emit_insn (gen_anddi3 (operands[0], operands[0], operands[1]));
> > +  emit_insn (gen_xordi3 (operands[0], operands[0], operands[3]));
> > +  DONE;
> > +}
> > +  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
> > +   (set_attr "length" "4,4,4,12")]
> > +)
> > +
> >  (define_expand "aarch64_simd_bsl<mode>"
> >    [(match_operand:VALLDIF 0 "register_operand")
> >     (match_operand:<V_cmp_result> 1 "register_operand")
> > diff --git a/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c b/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c
> > new file mode 100644
> > index 0000000..4e63511
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/dimode-bsl.c
> > @@ -0,0 +1,13 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +
> > +/* Test that we can generate DImode BSL when we are using
> > +   copysign.  */
> > +
> > +double
> > +foo (double a, double b)
> > +{
> > +  return __builtin_copysign (a, b);
> > +}
> > +
> > +/* { dg-final { scan-assembler "bsl\tv\[0-9\]" } } */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c b/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c
> > new file mode 100644
> > index 0000000..67dfda0
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/no-dimode-bsl.c
> > @@ -0,0 +1,30 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +
> > +/* Test that we don't combine to BSL when in DImode, avoiding register
> > +   moves in the general case.
> > +
> > +   We want:
> > +	eor	x0, x0, x1
> > +	and	x0, x0, x2
> > +	eor	x0, x0, x1
> > +	ret
> > +
> > +   Rather than:
> > +	fmov	d2, x0
> > +	fmov	d0, x2
> > +	fmov	d1, x1
> > +	bsl	v0.8b, v2.8b, v1.8b
> > +	fmov	x0, d0
> > +	ret  */
> > +
> > +long long
> > +foo (long long a, long long b, long long c)
> > +{
> > +  return ((a ^ b) & c) ^ b;
> > +}
> > +
> > +/* { dg-final { scan-assembler-not "bsl\tv\[0-9\]" } } */
> > +/* { dg-final { scan-assembler-not "bif\tv\[0-9\]" } } */
> > +/* { dg-final { scan-assembler-not "bit\tv\[0-9\]" } } */
> > +/* { dg-final { scan-assembler-not "fmov\td\[0-9\]" } } */
> 


* Re: [Patch AArch64] Stop generating BSL for simple integer code
  2017-07-03 10:48     ` James Greenhalgh
@ 2017-07-27 17:49       ` James Greenhalgh
  2017-10-04 16:44         ` James Greenhalgh
  0 siblings, 1 reply; 7+ messages in thread
From: James Greenhalgh @ 2017-07-27 17:49 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd, richard.earnshaw, marcus.shawcroft

[-- Attachment #1: Type: text/plain, Size: 2517 bytes --]


On Mon, Jun 12, 2017 at 02:44:40PM +0100, James Greenhalgh wrote:
> [Sorry for the re-send. I spotted that the attributes were not right for the
>  new pattern I was adding. The change between this and the first version was:
>
>   +  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
>   +   (set_attr "length" "4,4,4,12")]
> ]
>
> ---
>
> Hi,
>
> In this testcase, all argument registers and the return register
> will be general purpose registers:
>
>   long long
>   foo (long long a, long long b, long long c)
>   {
>     return ((a ^ b) & c) ^ b;
>   }
>
> However, due to the implementation of aarch64_simd_bsl<mode>_internal
> we'll match that pattern and emit a BSL, necessitating moving all those
> arguments and results to the Advanced SIMD registers:
>
> 	fmov	d2, x0
> 	fmov	d0, x2
> 	fmov	d1, x1
> 	bsl	v0.8b, v2.8b, v1.8b
> 	fmov	x0, d0
>
> To fix this, we turn aarch64_simd_bsldi_internal in to an insn_and_split that
> knows to split back to integer operations if the register allocation
> falls that way.
>
> We could have used an unspec, but then we lose some of the nice
> simplifications that can be made from explicitly spelling out the semantics
> of BSL.

Off list, Richard and I found considerable issues with this patch, from
idioms failing to match in the testcase, to subtle register allocation bugs,
to potential for suboptimal code generation.

That's not good!

This spin of the patch corrects those issues by adding a second split
pattern, allocating a temporary register if we're permitted to, or
properly using tied output operands if we're not, and by generally playing
things a bit safer around the possibility of register overlaps.
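
As a rough C-level sketch (not part of the patch) of what the split now
computes when the value lives in the general purpose registers - the scratch
is a fresh pseudo before reload, or the early-clobber destination after
reload:

  /* Sketch only: the three operations the splitter emits.  "tmp" stands in
     for the scratch register; keeping the intermediate results there leaves
     op1 and op3 untouched until their last uses.  */
  unsigned long long
  bsl_split (unsigned long long op1, unsigned long long op2,
	     unsigned long long op3)
  {
    unsigned long long tmp = op2 ^ op3;	/* eor */
    tmp &= op1;				/* and */
    return tmp ^ op3;			/* eor */
  }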

The testcase is expanded to an execute test, giving a little more assurance
that we're doing the right thing - it now tests the BSL idiom generation in
both general-purpose and floating-point registers and compares the results.
Hopefully we're now a bit more robust!

Bootstrapped and tested on aarch64-none-linux-gnu and cross-tested on
aarch64-none-elf with no issues.

OK for trunk?

Thanks,
James

---
gcc/

2017-07-27  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/aarch64-simd.md
	(aarch64_simd_bsl<mode>_internal): Remove DImode.
	(*aarch64_simd_bsl<mode>_alt): Likewise.
	(aarch64_simd_bsldi_internal): New.
	(aarch64_simd_bsldi_alt): Likewise.

gcc/testsuite/

2017-07-27  James Greenhalgh  <james.greenhalgh@arm.com>

	* gcc.target/aarch64/bsl-idiom.c: New.
	* gcc.target/aarch64/copysign-bsl.c: New.


[-- Attachment #2: 0001-Re-Patch-AArch64-Stop-generating-BSL-for-simple-inte.patch --]
[-- Type: text/x-patch; name="0001-Re-Patch-AArch64-Stop-generating-BSL-for-simple-inte.patch", Size: 8539 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 011fcec0..a186eae 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2280,13 +2280,13 @@
 ;; in *aarch64_simd_bsl<mode>_alt.
 
 (define_insn "aarch64_simd_bsl<mode>_internal"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
-	(xor:VSDQ_I_DI
-	   (and:VSDQ_I_DI
-	     (xor:VSDQ_I_DI
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+	(xor:VDQ_I
+	   (and:VDQ_I
+	     (xor:VDQ_I
 	       (match_operand:<V_cmp_result> 3 "register_operand" "w,0,w")
-	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,w,0"))
-	     (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
+	       (match_operand:VDQ_I 2 "register_operand" "w,w,0"))
+	     (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
 	  (match_dup:<V_cmp_result> 3)
 	))]
   "TARGET_SIMD"
@@ -2304,14 +2304,14 @@
 ;; permutations of commutative operations, we have to have a separate pattern.
 
 (define_insn "*aarch64_simd_bsl<mode>_alt"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
-	(xor:VSDQ_I_DI
-	   (and:VSDQ_I_DI
-	     (xor:VSDQ_I_DI
-	       (match_operand:VSDQ_I_DI 3 "register_operand" "w,w,0")
-	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,0,w"))
-	      (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
-	  (match_dup:VSDQ_I_DI 2)))]
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+	(xor:VDQ_I
+	   (and:VDQ_I
+	     (xor:VDQ_I
+	       (match_operand:VDQ_I 3 "register_operand" "w,w,0")
+	       (match_operand:VDQ_I 2 "register_operand" "w,0,w"))
+	      (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
+	  (match_dup:VDQ_I 2)))]
   "TARGET_SIMD"
   "@
   bsl\\t%0.<Vbtype>, %3.<Vbtype>, %2.<Vbtype>
@@ -2320,6 +2320,100 @@
   [(set_attr "type" "neon_bsl<q>")]
 )
 
+;; DImode is special, we want to avoid computing operations which are
+;; more naturally computed in general purpose registers in the vector
+;; registers.  If we do that, we need to move all three operands from general
+;; purpose registers to vector registers, then back again.  However, we
+;; don't want to make this pattern an UNSPEC as we'd lose scope for
+;; optimizations based on the component operations of a BSL.
+;;
+;; That means we need a splitter back to the individual operations, if they
+;; would be better calculated on the integer side.
+
+(define_insn_and_split "aarch64_simd_bsldi_internal"
+  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
+	(xor:DI
+	   (and:DI
+	     (xor:DI
+	       (match_operand:DI 3 "register_operand" "w,0,w,r")
+	       (match_operand:DI 2 "register_operand" "w,w,0,r"))
+	     (match_operand:DI 1 "register_operand" "0,w,w,r"))
+	  (match_dup:DI 3)
+	))]
+  "TARGET_SIMD"
+  "@
+  bsl\\t%0.8b, %2.8b, %3.8b
+  bit\\t%0.8b, %2.8b, %1.8b
+  bif\\t%0.8b, %3.8b, %1.8b
+  #"
+  "&& GP_REGNUM_P (REGNO (operands[0]))"
+  [(match_dup 1) (match_dup 1) (match_dup 2) (match_dup 3)]
+{
+  /* Split back to individual operations.  If we're before reload, and
+     able to create a temporary register, do so.  If we're after reload,
+     we've got an early-clobber destination register, so use that.
+     Otherwise, we can't create pseudos and we can't yet guarantee that
+     operands[0] is safe to write, so FAIL to split.  */
+
+  rtx scratch;
+  if (reload_completed)
+    scratch = operands[0];
+  else if (can_create_pseudo_p ())
+    scratch = gen_reg_rtx (DImode);
+  else
+    FAIL;
+
+  emit_insn (gen_xordi3 (scratch, operands[2], operands[3]));
+  emit_insn (gen_anddi3 (scratch, scratch, operands[1]));
+  emit_insn (gen_xordi3 (operands[0], scratch, operands[3]));
+  DONE;
+}
+  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
+   (set_attr "length" "4,4,4,12")]
+)
+
+(define_insn_and_split "aarch64_simd_bsldi_alt"
+  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
+	(xor:DI
+	   (and:DI
+	     (xor:DI
+	       (match_operand:DI 3 "register_operand" "w,w,0,r")
+	       (match_operand:DI 2 "register_operand" "w,0,w,r"))
+	     (match_operand:DI 1 "register_operand" "0,w,w,r"))
+	  (match_dup:DI 2)
+	))]
+  "TARGET_SIMD"
+  "@
+  bsl\\t%0.8b, %3.8b, %2.8b
+  bit\\t%0.8b, %3.8b, %1.8b
+  bif\\t%0.8b, %2.8b, %1.8b
+  #"
+  "&& GP_REGNUM_P (REGNO (operands[0]))"
+  [(match_dup 0) (match_dup 1) (match_dup 2) (match_dup 3)]
+{
+  /* Split back to individual operations.  If we're before reload, and
+     able to create a temporary register, do so.  If we're after reload,
+     we've got an early-clobber destination register, so use that.
+     Otherwise, we can't create pseudos and we can't yet guarantee that
+     operands[0] is safe to write, so FAIL to split.  */
+
+  rtx scratch;
+  if (reload_completed)
+    scratch = operands[0];
+  else if (can_create_pseudo_p ())
+    scratch = gen_reg_rtx (DImode);
+  else
+    FAIL;
+
+  emit_insn (gen_xordi3 (scratch, operands[2], operands[3]));
+  emit_insn (gen_anddi3 (scratch, scratch, operands[1]));
+  emit_insn (gen_xordi3 (operands[0], scratch, operands[2]));
+  DONE;
+}
+  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
+   (set_attr "length" "4,4,4,12")]
+)
+
 (define_expand "aarch64_simd_bsl<mode>"
   [(match_operand:VALLDIF 0 "register_operand")
    (match_operand:<V_cmp_result> 1 "register_operand")
diff --git a/gcc/testsuite/gcc.target/aarch64/bsl-idiom.c b/gcc/testsuite/gcc.target/aarch64/bsl-idiom.c
new file mode 100644
index 0000000..8151387
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bsl-idiom.c
@@ -0,0 +1,88 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fdump-rtl-combine --save-temps" } */
+
+/* Test that we don't generate BSL when in DImode with values in integer
+   registers, and do generate it where we have values in floating-point
+   registers.  This is useful, as it allows us to avoid register moves
+   in the general case.
+
+   We want:
+	eor	x0, x0, x1
+	and	x0, x0, x2
+	eor	x0, x0, x1
+	ret
+
+   Rather than:
+	fmov	d2, x0
+	fmov	d0, x2
+	fmov	d1, x1
+	bsl	v0.8b, v2.8b, v1.8b
+	fmov	x0, d0
+	ret  */
+
+extern void abort (void);
+
+unsigned long long __attribute__ ((noinline))
+foo (unsigned long long a, unsigned long long b, unsigned long long c)
+{
+  return ((a ^ b) & c) ^ b;
+}
+
+unsigned long long __attribute__ ((noinline))
+foo2 (unsigned long long a, unsigned long long b, unsigned long long c)
+{
+  return ((a ^ b) & c) ^ a;
+}
+
+#define force_simd(V1)   asm volatile ("mov %d0, %1.d[0]"	\
+	   : "=w"(V1)						\
+	   : "w"(V1)						\
+	   : /* No clobbers */);
+
+unsigned long long __attribute__ ((noinline))
+bar (unsigned long long a, unsigned long long b, unsigned long long c)
+{
+  force_simd (a);
+  force_simd (b);
+  force_simd (c);
+  c = ((a ^ b) & c) ^ b;
+  force_simd (c);
+  return c;
+}
+
+unsigned long long __attribute__ ((noinline))
+bar2 (unsigned long long a, unsigned long long b, unsigned long long c)
+{
+  force_simd (a);
+  force_simd (b);
+  force_simd (c);
+  c = ((a ^ b) & c) ^ a;
+  force_simd (c);
+  return c;
+}
+
+int
+main (int argc, char** argv)
+{
+  unsigned long long a = 0x0123456789abcdefULL;
+  unsigned long long b = 0xfedcba9876543210ULL;
+  unsigned long long c = 0xaabbccddeeff7777ULL;
+  if (foo (a, b, c) != bar (a, b, c))
+    abort ();
+  if (foo2 (a, b, c) != bar2 (a, b, c))
+    abort ();
+  return 0;
+}
+
+/* 2 BSL, 6 FMOV (to floating-point registers), and 2 FMOV (to general
+purpose registers) for the "bar" tests, which should still use BSL.  */
+/* { dg-final { scan-assembler-times "bsl\tv\[0-9\]" 2 } } */
+/* { dg-final { scan-assembler-times "fmov\td\[0-9\]" 6 } } */
+/* { dg-final { scan-assembler-times "fmov\tx\[0-9\]" 2 } } */
+
+/* { dg-final { scan-assembler-not "bif\tv\[0-9\]" } } */
+/* { dg-final { scan-assembler-not "bit\tv\[0-9\]" } } */
+
+/* We always match the idiom during combine.  */
+/* { dg-final { scan-rtl-dump-times "aarch64_simd_bsldi_internal" 2 "combine" } } */
+/* { dg-final { scan-rtl-dump-times "aarch64_simd_bsldi_alt" 2 "combine" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/copysign-bsl.c b/gcc/testsuite/gcc.target/aarch64/copysign-bsl.c
new file mode 100644
index 0000000..4e63511
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/copysign-bsl.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* Test that we can generate DImode BSL when we are using
+   copysign.  */
+
+double
+foo (double a, double b)
+{
+  return __builtin_copysign (a, b);
+}
+
+/* { dg-final { scan-assembler "bsl\tv\[0-9\]" } } */


* Re: [Patch AArch64] Stop generating BSL for simple integer code
  2017-07-27 17:49       ` James Greenhalgh
@ 2017-10-04 16:44         ` James Greenhalgh
  2017-11-14 14:20           ` James Greenhalgh
  0 siblings, 1 reply; 7+ messages in thread
From: James Greenhalgh @ 2017-10-04 16:44 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd, richard.earnshaw, marcus.shawcroft

[-- Attachment #1: Type: text/plain, Size: 2954 bytes --]


On Thu, Jul 27, 2017 at 06:49:01PM +0100, James Greenhalgh wrote:
>
> On Mon, Jun 12, 2017 at 02:44:40PM +0100, James Greenhalgh wrote:
> > [Sorry for the re-send. I spotted that the attributes were not right for the
> >  new pattern I was adding. The change between this and the first version was:
> >
> >   +  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
> >   +   (set_attr "length" "4,4,4,12")]
> > ]
> >
> > ---
> >
> > Hi,
> >
> > In this testcase, all argument registers and the return register
> > will be general purpose registers:
> >
> >   long long
> >   foo (long long a, long long b, long long c)
> >   {
> >     return ((a ^ b) & c) ^ b;
> >   }
> >
> > However, due to the implementation of aarch64_simd_bsl<mode>_internal
> > we'll match that pattern and emit a BSL, necessitating moving all those
> > arguments and results to the Advanced SIMD registers:
> >
> > 	fmov	d2, x0
> > 	fmov	d0, x2
> > 	fmov	d1, x1
> > 	bsl	v0.8b, v2.8b, v1.8b
> > 	fmov	x0, d0
> >
> > To fix this, we turn aarch64_simd_bsldi_internal in to an insn_and_split that
> > knows to split back to integer operations if the register allocation
> > falls that way.
> >
> > We could have used an unspec, but then we lose some of the nice
> > simplifications that can be made from explicitly spelling out the semantics
> > of BSL.
>
> Off list, Richard and I found considerable issues with this patch. From
> idioms failing to match in the testcase, to subtle register allocation bugs,
> to potential for suboptimal code generation.
>
> That's not good!
>
> This spin of the patch corrects those issues by adding a second split
> pattern, allocating a temporary register if we're permitted to, or
> properly using tied output operands if we're not, and by generally playing
> things a bit safer around the possibility of register overlaps.
>
> The testcase is expanded to an execute test, hopefully giving a little
> more assurance that we're doing the right thing - now testing the BSL idiom
> generation in both general-purpose and floating-point registers and comparing
> the results. Hopefully we're now a bit more robust!
>
> Bootstrapped and tested on aarch64-none-linux-gnu and cross-tested on
> aarch64-none-elf with no issues.
>
> OK for trunk?

I'd like to ping this patch for review (I would like an explicit review, as
an earlier revision had bugs). Attached is a rebase of this patch on trunk.

OK? If so, please commit it on my behalf as I will be out of office for a
few weeks for some time off.

Thanks,
James

---
gcc/

2017-10-04  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/aarch64-simd.md
	(aarch64_simd_bsl<mode>_internal): Remove DImode.
	(*aarch64_simd_bsl<mode>_alt): Likewise.
	(aarch64_simd_bsldi_internal): New.
	(aarch64_simd_bsldi_alt): Likewise.

gcc/testsuite/

2017-10-04  James Greenhalgh  <james.greenhalgh@arm.com>

	* gcc.target/aarch64/bsl-idiom.c: New.
	* gcc.target/aarch64/copysign-bsl.c: New.


[-- Attachment #2: 0001-Re-Patch-AArch64-Stop-generating-BSL-for-simple-inte.patch --]
[-- Type: text/x-patch; name="0001-Re-Patch-AArch64-Stop-generating-BSL-for-simple-inte.patch", Size: 8551 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 70e9339..1888d2f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2322,13 +2322,13 @@
 ;; in *aarch64_simd_bsl<mode>_alt.
 
 (define_insn "aarch64_simd_bsl<mode>_internal"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
-	(xor:VSDQ_I_DI
-	   (and:VSDQ_I_DI
-	     (xor:VSDQ_I_DI
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+	(xor:VDQ_I
+	   (and:VDQ_I
+	     (xor:VDQ_I
 	       (match_operand:<V_INT_EQUIV> 3 "register_operand" "w,0,w")
-	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,w,0"))
-	     (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
+	       (match_operand:VDQ_I 2 "register_operand" "w,w,0"))
+	     (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
 	  (match_dup:<V_INT_EQUIV> 3)
 	))]
   "TARGET_SIMD"
@@ -2346,14 +2346,14 @@
 ;; permutations of commutative operations, we have to have a separate pattern.
 
 (define_insn "*aarch64_simd_bsl<mode>_alt"
-  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
-	(xor:VSDQ_I_DI
-	   (and:VSDQ_I_DI
-	     (xor:VSDQ_I_DI
-	       (match_operand:VSDQ_I_DI 3 "register_operand" "w,w,0")
-	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,0,w"))
-	      (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
-	  (match_dup:VSDQ_I_DI 2)))]
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
+	(xor:VDQ_I
+	   (and:VDQ_I
+	     (xor:VDQ_I
+	       (match_operand:VDQ_I 3 "register_operand" "w,w,0")
+	       (match_operand:<V_INT_EQUIV> 2 "register_operand" "w,0,w"))
+	      (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
+	  (match_dup:<V_INT_EQUIV> 2)))]
   "TARGET_SIMD"
   "@
   bsl\\t%0.<Vbtype>, %3.<Vbtype>, %2.<Vbtype>
@@ -2362,6 +2362,100 @@
   [(set_attr "type" "neon_bsl<q>")]
 )
 
+;; DImode is special, we want to avoid computing operations which are
+;; more naturally computed in general purpose registers in the vector
+;; registers.  If we do that, we need to move all three operands from general
+;; purpose registers to vector registers, then back again.  However, we
+;; don't want to make this pattern an UNSPEC as we'd lose scope for
+;; optimizations based on the component operations of a BSL.
+;;
+;; That means we need a splitter back to the individual operations, if they
+;; would be better calculated on the integer side.
+
+(define_insn_and_split "aarch64_simd_bsldi_internal"
+  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
+	(xor:DI
+	   (and:DI
+	     (xor:DI
+	       (match_operand:DI 3 "register_operand" "w,0,w,r")
+	       (match_operand:DI 2 "register_operand" "w,w,0,r"))
+	     (match_operand:DI 1 "register_operand" "0,w,w,r"))
+	  (match_dup:DI 3)
+	))]
+  "TARGET_SIMD"
+  "@
+  bsl\\t%0.8b, %2.8b, %3.8b
+  bit\\t%0.8b, %2.8b, %1.8b
+  bif\\t%0.8b, %3.8b, %1.8b
+  #"
+  "&& GP_REGNUM_P (REGNO (operands[0]))"
+  [(match_dup 1) (match_dup 1) (match_dup 2) (match_dup 3)]
+{
+  /* Split back to individual operations.  If we're before reload, and
+     able to create a temporary register, do so.  If we're after reload,
+     we've got an early-clobber destination register, so use that.
+     Otherwise, we can't create pseudos and we can't yet guarantee that
+     operands[0] is safe to write, so FAIL to split.  */
+
+  rtx scratch;
+  if (reload_completed)
+    scratch = operands[0];
+  else if (can_create_pseudo_p ())
+    scratch = gen_reg_rtx (DImode);
+  else
+    FAIL;
+
+  emit_insn (gen_xordi3 (scratch, operands[2], operands[3]));
+  emit_insn (gen_anddi3 (scratch, scratch, operands[1]));
+  emit_insn (gen_xordi3 (operands[0], scratch, operands[3]));
+  DONE;
+}
+  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
+   (set_attr "length" "4,4,4,12")]
+)
+
+(define_insn_and_split "aarch64_simd_bsldi_alt"
+  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
+	(xor:DI
+	   (and:DI
+	     (xor:DI
+	       (match_operand:DI 3 "register_operand" "w,w,0,r")
+	       (match_operand:DI 2 "register_operand" "w,0,w,r"))
+	     (match_operand:DI 1 "register_operand" "0,w,w,r"))
+	  (match_dup:DI 2)
+	))]
+  "TARGET_SIMD"
+  "@
+  bsl\\t%0.8b, %3.8b, %2.8b
+  bit\\t%0.8b, %3.8b, %1.8b
+  bif\\t%0.8b, %2.8b, %1.8b
+  #"
+  "&& GP_REGNUM_P (REGNO (operands[0]))"
+  [(match_dup 0) (match_dup 1) (match_dup 2) (match_dup 3)]
+{
+  /* Split back to individual operations.  If we're before reload, and
+     able to create a temporary register, do so.  If we're after reload,
+     we've got an early-clobber destination register, so use that.
+     Otherwise, we can't create pseudos and we can't yet guarantee that
+     operands[0] is safe to write, so FAIL to split.  */
+
+  rtx scratch;
+  if (reload_completed)
+    scratch = operands[0];
+  else if (can_create_pseudo_p ())
+    scratch = gen_reg_rtx (DImode);
+  else
+    FAIL;
+
+  emit_insn (gen_xordi3 (scratch, operands[2], operands[3]));
+  emit_insn (gen_anddi3 (scratch, scratch, operands[1]));
+  emit_insn (gen_xordi3 (operands[0], scratch, operands[2]));
+  DONE;
+}
+  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
+   (set_attr "length" "4,4,4,12")]
+)
+
 (define_expand "aarch64_simd_bsl<mode>"
   [(match_operand:VALLDIF 0 "register_operand")
    (match_operand:<V_INT_EQUIV> 1 "register_operand")
diff --git a/gcc/testsuite/gcc.target/aarch64/bsl-idiom.c b/gcc/testsuite/gcc.target/aarch64/bsl-idiom.c
new file mode 100644
index 0000000..8151387
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/bsl-idiom.c
@@ -0,0 +1,88 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -fdump-rtl-combine --save-temps" } */
+
+/* Test that we don't generate BSL when in DImode with values in integer
+   registers, and do generate it where we have values in floating-point
+   registers.  This is useful, as it allows us to avoid register moves
+   in the general case.
+
+   We want:
+	eor	x0, x0, x1
+	and	x0, x0, x2
+	eor	x0, x0, x1
+	ret
+
+   Rather than:
+	fmov	d2, x0
+	fmov	d0, x2
+	fmov	d1, x1
+	bsl	v0.8b, v2.8b, v1.8b
+	fmov	x0, d0
+	ret  */
+
+extern void abort (void);
+
+unsigned long long __attribute__ ((noinline))
+foo (unsigned long long a, unsigned long long b, unsigned long long c)
+{
+  return ((a ^ b) & c) ^ b;
+}
+
+unsigned long long __attribute__ ((noinline))
+foo2 (unsigned long long a, unsigned long long b, unsigned long long c)
+{
+  return ((a ^ b) & c) ^ a;
+}
+
+#define force_simd(V1)   asm volatile ("mov %d0, %1.d[0]"	\
+	   : "=w"(V1)						\
+	   : "w"(V1)						\
+	   : /* No clobbers */);
+
+unsigned long long __attribute__ ((noinline))
+bar (unsigned long long a, unsigned long long b, unsigned long long c)
+{
+  force_simd (a);
+  force_simd (b);
+  force_simd (c);
+  c = ((a ^ b) & c) ^ b;
+  force_simd (c);
+  return c;
+}
+
+unsigned long long __attribute__ ((noinline))
+bar2 (unsigned long long a, unsigned long long b, unsigned long long c)
+{
+  force_simd (a);
+  force_simd (b);
+  force_simd (c);
+  c = ((a ^ b) & c) ^ a;
+  force_simd (c);
+  return c;
+}
+
+int
+main (int argc, char** argv)
+{
+  unsigned long long a = 0x0123456789abcdefULL;
+  unsigned long long b = 0xfedcba9876543210ULL;
+  unsigned long long c = 0xaabbccddeeff7777ULL;
+  if (foo (a, b, c) != bar (a, b, c))
+    abort ();
+  if (foo2 (a, b, c) != bar2 (a, b, c))
+    abort ();
+  return 0;
+}
+
+/* 2 BSL, 6 FMOV (to floating-point registers), and 2 FMOV (to general
+purpose registers) for the "bar" tests, which should still use BSL.  */
+/* { dg-final { scan-assembler-times "bsl\tv\[0-9\]" 2 } } */
+/* { dg-final { scan-assembler-times "fmov\td\[0-9\]" 6 } } */
+/* { dg-final { scan-assembler-times "fmov\tx\[0-9\]" 2 } } */
+
+/* { dg-final { scan-assembler-not "bif\tv\[0-9\]" } } */
+/* { dg-final { scan-assembler-not "bit\tv\[0-9\]" } } */
+
+/* We always match the idiom during combine.  */
+/* { dg-final { scan-rtl-dump-times "aarch64_simd_bsldi_internal" 2 "combine" } } */
+/* { dg-final { scan-rtl-dump-times "aarch64_simd_bsldi_alt" 2 "combine" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/copysign-bsl.c b/gcc/testsuite/gcc.target/aarch64/copysign-bsl.c
new file mode 100644
index 0000000..4e63511
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/copysign-bsl.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+/* Test that we can generate DImode BSL when we are using
+   copysign.  */
+
+double
+foo (double a, double b)
+{
+  return __builtin_copysign (a, b);
+}
+
+/* { dg-final { scan-assembler "bsl\tv\[0-9\]" } } */


* Re: [Patch AArch64] Stop generating BSL for simple integer code
  2017-10-04 16:44         ` James Greenhalgh
@ 2017-11-14 14:20           ` James Greenhalgh
  0 siblings, 0 replies; 7+ messages in thread
From: James Greenhalgh @ 2017-11-14 14:20 UTC (permalink / raw)
  To: gcc-patches; +Cc: nd, Richard Earnshaw, Marcus Shawcroft

On Wed, Oct 04, 2017 at 05:44:07PM +0100, James Greenhalgh wrote:
> 
> On Thu, Jul 27, 2017 at 06:49:01PM +0100, James Greenhalgh wrote:
> >
> > On Mon, Jun 12, 2017 at 02:44:40PM +0100, James Greenhalgh wrote:
> > > [Sorry for the re-send. I spotted that the attributes were not right for the
> > >  new pattern I was adding. The change between this and the first version was:
> > >
> > >   +  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
> > >   +   (set_attr "length" "4,4,4,12")]
> > > ]
> > >
> > > ---
> > >
> > > Hi,
> > >
> > > In this testcase, all argument registers and the return register
> > > will be general purpose registers:
> > >
> > >   long long
> > >   foo (long long a, long long b, long long c)
> > >   {
> > >     return ((a ^ b) & c) ^ b;
> > >   }
> > >
> > > However, due to the implementation of aarch64_simd_bsl<mode>_internal
> > > we'll match that pattern and emit a BSL, necessitating moving all those
> > > arguments and results to the Advanced SIMD registers:
> > >
> > > 	fmov	d2, x0
> > > 	fmov	d0, x2
> > > 	fmov	d1, x1
> > > 	bsl	v0.8b, v2.8b, v1.8b
> > > 	fmov	x0, d0
> > >
> > > To fix this, we turn aarch64_simd_bsldi_internal in to an insn_and_split that
> > > knows to split back to integer operations if the register allocation
> > > falls that way.
> > >
> > > We could have used an unspec, but then we lose some of the nice
> > > simplifications that can be made from explicitly spelling out the semantics
> > > of BSL.
> >
> > Off list, Richard and I found considerable issues with this patch. From
> > idioms failing to match in the testcase, to subtle register allocation bugs,
> > to potential for suboptimal code generation.
> >
> > That's not good!
> >
> > This spin of the patch corrects those issues by adding a second split
> > pattern, allocating a temporary register if we're permitted to, or
> > properly using tied output operands if we're not, and by generally playing
> > things a bit safer around the possibility of register overlaps.
> >
> > The testcase is expanded to an execute test, hopefully giving a little
> > more assurance that we're doing the right thing - now testing the BSL idiom
> > generation in both general-purpose and floating-point registers and comparing
> > the results. Hopefully we're now a bit more robust!
> >
> > Bootstrapped and tested on aarch64-none-linux-gnu and cross-tested on
> > aarch64-none-elf with no issues.
> >
> > OK for trunk?
> 
> I'd like to ping this patch for review (I would like the explicit review as
> an earlier revision had bugs), attached is a rebase of this patch on trunk.
> 
> OK? If so, please commit it on my behalf as I will be out of office for a
> few weeks for some time off.

IMHO, this has had plenty of time for feedback from other AArch64 maintainers.
I've used my maintainer privileges and pushed this as r254727 after a
successful bootstrap and test cycle.

Thanks,
James

> gcc/
> 
> 2017-10-04  James Greenhalgh  <james.greenhalgh@arm.com>
> 
> 	* config/aarch64/aarch64-simd.md
> 	(aarch64_simd_bsl<mode>_internal): Remove DImode.
> 	(*aarch64_simd_bsl<mode>_alt): Likewise.
> 	(aarch64_simd_bsldi_internal): New.
> 	(aarch64_simd_bsldi_alt): Likewise.
> 
> gcc/testsuite/
> 
> 2017-10-04  James Greenhalgh  <james.greenhalgh@arm.com>
> 
> 	* gcc.target/aarch64/bsl-idiom.c: New.
> 	* gcc.target/aarch64/copysign-bsl.c: New.
> 

> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index 70e9339..1888d2f 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -2322,13 +2322,13 @@
>  ;; in *aarch64_simd_bsl<mode>_alt.
>  
>  (define_insn "aarch64_simd_bsl<mode>_internal"
> -  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
> -	(xor:VSDQ_I_DI
> -	   (and:VSDQ_I_DI
> -	     (xor:VSDQ_I_DI
> +  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
> +	(xor:VDQ_I
> +	   (and:VDQ_I
> +	     (xor:VDQ_I
>  	       (match_operand:<V_INT_EQUIV> 3 "register_operand" "w,0,w")
> -	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,w,0"))
> -	     (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
> +	       (match_operand:VDQ_I 2 "register_operand" "w,w,0"))
> +	     (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
>  	  (match_dup:<V_INT_EQUIV> 3)
>  	))]
>    "TARGET_SIMD"
> @@ -2346,14 +2346,14 @@
>  ;; permutations of commutative operations, we have to have a separate pattern.
>  
>  (define_insn "*aarch64_simd_bsl<mode>_alt"
> -  [(set (match_operand:VSDQ_I_DI 0 "register_operand" "=w,w,w")
> -	(xor:VSDQ_I_DI
> -	   (and:VSDQ_I_DI
> -	     (xor:VSDQ_I_DI
> -	       (match_operand:VSDQ_I_DI 3 "register_operand" "w,w,0")
> -	       (match_operand:VSDQ_I_DI 2 "register_operand" "w,0,w"))
> -	      (match_operand:VSDQ_I_DI 1 "register_operand" "0,w,w"))
> -	  (match_dup:VSDQ_I_DI 2)))]
> +  [(set (match_operand:VDQ_I 0 "register_operand" "=w,w,w")
> +	(xor:VDQ_I
> +	   (and:VDQ_I
> +	     (xor:VDQ_I
> +	       (match_operand:VDQ_I 3 "register_operand" "w,w,0")
> +	       (match_operand:<V_INT_EQUIV> 2 "register_operand" "w,0,w"))
> +	      (match_operand:VDQ_I 1 "register_operand" "0,w,w"))
> +	  (match_dup:<V_INT_EQUIV> 2)))]
>    "TARGET_SIMD"
>    "@
>    bsl\\t%0.<Vbtype>, %3.<Vbtype>, %2.<Vbtype>
> @@ -2362,6 +2362,100 @@
>    [(set_attr "type" "neon_bsl<q>")]
>  )
>  
> +;; DImode is special, we want to avoid computing operations which are
> +;; more naturally computed in general purpose registers in the vector
> +;; registers.  If we do that, we need to move all three operands from general
> +;; purpose registers to vector registers, then back again.  However, we
> +;; don't want to make this pattern an UNSPEC as we'd lose scope for
> +;; optimizations based on the component operations of a BSL.
> +;;
> +;; That means we need a splitter back to the individual operations, if they
> +;; would be better calculated on the integer side.
> +
> +(define_insn_and_split "aarch64_simd_bsldi_internal"
> +  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
> +	(xor:DI
> +	   (and:DI
> +	     (xor:DI
> +	       (match_operand:DI 3 "register_operand" "w,0,w,r")
> +	       (match_operand:DI 2 "register_operand" "w,w,0,r"))
> +	     (match_operand:DI 1 "register_operand" "0,w,w,r"))
> +	  (match_dup:DI 3)
> +	))]
> +  "TARGET_SIMD"
> +  "@
> +  bsl\\t%0.8b, %2.8b, %3.8b
> +  bit\\t%0.8b, %2.8b, %1.8b
> +  bif\\t%0.8b, %3.8b, %1.8b
> +  #"
> +  "&& GP_REGNUM_P (REGNO (operands[0]))"
> +  [(match_dup 1) (match_dup 1) (match_dup 2) (match_dup 3)]
> +{
> +  /* Split back to individual operations.  If we're before reload, and
> +     able to create a temporary register, do so.  If we're after reload,
> +     we've got an early-clobber destination register, so use that.
> +     Otherwise, we can't create pseudos and we can't yet guarantee that
> +     operands[0] is safe to write, so FAIL to split.  */
> +
> +  rtx scratch;
> +  if (reload_completed)
> +    scratch = operands[0];
> +  else if (can_create_pseudo_p ())
> +    scratch = gen_reg_rtx (DImode);
> +  else
> +    FAIL;
> +
> +  emit_insn (gen_xordi3 (scratch, operands[2], operands[3]));
> +  emit_insn (gen_anddi3 (scratch, scratch, operands[1]));
> +  emit_insn (gen_xordi3 (operands[0], scratch, operands[3]));
> +  DONE;
> +}
> +  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
> +   (set_attr "length" "4,4,4,12")]
> +)
> +
> +(define_insn_and_split "aarch64_simd_bsldi_alt"
> +  [(set (match_operand:DI 0 "register_operand" "=w,w,w,&r")
> +	(xor:DI
> +	   (and:DI
> +	     (xor:DI
> +	       (match_operand:DI 3 "register_operand" "w,w,0,r")
> +	       (match_operand:DI 2 "register_operand" "w,0,w,r"))
> +	     (match_operand:DI 1 "register_operand" "0,w,w,r"))
> +	  (match_dup:DI 2)
> +	))]
> +  "TARGET_SIMD"
> +  "@
> +  bsl\\t%0.8b, %3.8b, %2.8b
> +  bit\\t%0.8b, %3.8b, %1.8b
> +  bif\\t%0.8b, %2.8b, %1.8b
> +  #"
> +  "&& GP_REGNUM_P (REGNO (operands[0]))"
> +  [(match_dup 0) (match_dup 1) (match_dup 2) (match_dup 3)]
> +{
> +  /* Split back to individual operations.  If we're before reload, and
> +     able to create a temporary register, do so.  If we're after reload,
> +     we've got an early-clobber destination register, so use that.
> +     Otherwise, we can't create pseudos and we can't yet guarantee that
> +     operands[0] is safe to write, so FAIL to split.  */
> +
> +  rtx scratch;
> +  if (reload_completed)
> +    scratch = operands[0];
> +  else if (can_create_pseudo_p ())
> +    scratch = gen_reg_rtx (DImode);
> +  else
> +    FAIL;
> +
> +  emit_insn (gen_xordi3 (scratch, operands[2], operands[3]));
> +  emit_insn (gen_anddi3 (scratch, scratch, operands[1]));
> +  emit_insn (gen_xordi3 (operands[0], scratch, operands[2]));
> +  DONE;
> +}
> +  [(set_attr "type" "neon_bsl,neon_bsl,neon_bsl,multiple")
> +   (set_attr "length" "4,4,4,12")]
> +)
> +
>  (define_expand "aarch64_simd_bsl<mode>"
>    [(match_operand:VALLDIF 0 "register_operand")
>     (match_operand:<V_INT_EQUIV> 1 "register_operand")
> diff --git a/gcc/testsuite/gcc.target/aarch64/bsl-idiom.c b/gcc/testsuite/gcc.target/aarch64/bsl-idiom.c
> new file mode 100644
> index 0000000..8151387
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/bsl-idiom.c
> @@ -0,0 +1,88 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fdump-rtl-combine --save-temps" } */
> +
> +/* Test that we don't generate BSL when in DImode with values in integer
> +   registers, and do generate it where we have values in floating-point
> +   registers.  This is useful, as it allows us to avoid register moves
> +   in the general case.
> +
> +   We want:
> +	eor	x0, x0, x1
> +	and	x0, x0, x2
> +	eor	x0, x0, x1
> +	ret
> +
> +   Rather than:
> +	fmov	d2, x0
> +	fmov	d0, x2
> +	fmov	d1, x1
> +	bsl	v0.8b, v2.8b, v1.8b
> +	fmov	x0, d0
> +	ret  */
> +
> +extern void abort (void);
> +
> +unsigned long long __attribute__ ((noinline))
> +foo (unsigned long long a, unsigned long long b, unsigned long long c)
> +{
> +  return ((a ^ b) & c) ^ b;
> +}
> +
> +unsigned long long __attribute__ ((noinline))
> +foo2 (unsigned long long a, unsigned long long b, unsigned long long c)
> +{
> +  return ((a ^ b) & c) ^ a;
> +}
> +
> +#define force_simd(V1)   asm volatile ("mov %d0, %1.d[0]"	\
> +	   : "=w"(V1)						\
> +	   : "w"(V1)						\
> +	   : /* No clobbers */);
> +
> +unsigned long long __attribute__ ((noinline))
> +bar (unsigned long long a, unsigned long long b, unsigned long long c)
> +{
> +  force_simd (a);
> +  force_simd (b);
> +  force_simd (c);
> +  c = ((a ^ b) & c) ^ b;
> +  force_simd (c);
> +  return c;
> +}
> +
> +unsigned long long __attribute__ ((noinline))
> +bar2 (unsigned long long a, unsigned long long b, unsigned long long c)
> +{
> +  force_simd (a);
> +  force_simd (b);
> +  force_simd (c);
> +  c = ((a ^ b) & c) ^ a;
> +  force_simd (c);
> +  return c;
> +}
> +
> +int
> +main (int argc, char** argv)
> +{
> +  unsigned long long a = 0x0123456789abcdefULL;
> +  unsigned long long b = 0xfedcba9876543210ULL;
> +  unsigned long long c = 0xaabbccddeeff7777ULL;
> +  if (foo (a, b, c) != bar (a, b, c))
> +    abort ();
> +  if (foo2 (a, b, c) != bar2 (a, b, c))
> +    abort ();
> +  return 0;
> +}
> +
> +/* 2 BSL, 6 FMOV (to floating-point registers), and 2 FMOV (to general
> +purpose registers) for the "bar" tests, which should still use BSL.  */
> +/* { dg-final { scan-assembler-times "bsl\tv\[0-9\]" 2 } } */
> +/* { dg-final { scan-assembler-times "fmov\td\[0-9\]" 6 } } */
> +/* { dg-final { scan-assembler-times "fmov\tx\[0-9\]" 2 } } */
> +
> +/* { dg-final { scan-assembler-not "bif\tv\[0-9\]" } } */
> +/* { dg-final { scan-assembler-not "bit\tv\[0-9\]" } } */
> +
> +/* We always match the idiom during combine.  */
> +/* { dg-final { scan-rtl-dump-times "aarch64_simd_bsldi_internal" 2 "combine" } } */
> +/* { dg-final { scan-rtl-dump-times "aarch64_simd_bsldi_alt" 2 "combine" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/copysign-bsl.c b/gcc/testsuite/gcc.target/aarch64/copysign-bsl.c
> new file mode 100644
> index 0000000..4e63511
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/copysign-bsl.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +/* Test that we can generate DImode BSL when we are using
> +   copysign.  */
> +
> +double
> +foo (double a, double b)
> +{
> +  return __builtin_copysign (a, b);
> +}
> +
> +/* { dg-final { scan-assembler "bsl\tv\[0-9\]" } } */
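
A note on the copysign test above: __builtin_copysign keeps the
magnitude bits of its first argument and the sign bit of its second,
i.e. it is a bit-select with the sign bit as the mask - the same
((A ^ B) & MASK) ^ A shape the new DImode patterns match - so it should
still come out as a BSL on the floating-point side.  A rough,
self-contained sketch of that reduction (illustration only, not part of
the patch):

  #include <stdint.h>
  #include <string.h>
  #include <assert.h>

  static double
  copysign_via_bsl (double a, double b)
  {
    const uint64_t sign = 0x8000000000000000ULL;
    uint64_t ai, bi, ri;
    double r;
    memcpy (&ai, &a, sizeof ai);
    memcpy (&bi, &b, sizeof bi);
    /* Select the sign bit from B and every other bit from A.  */
    ri = ((ai ^ bi) & sign) ^ ai;
    memcpy (&r, &ri, sizeof r);
    return r;
  }

  int
  main (void)
  {
    assert (copysign_via_bsl (1.5, -2.0) == __builtin_copysign (1.5, -2.0));
    assert (copysign_via_bsl (-3.0, 4.0) == __builtin_copysign (-3.0, 4.0));
    return 0;
  }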

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2017-11-14 14:13 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-06-12 13:36 [Patch AArch64] Stop generating BSL for simple integer code James Greenhalgh
2017-06-12 13:45 ` James Greenhalgh
2017-06-21 10:49   ` James Greenhalgh
2017-07-03 10:48     ` James Greenhalgh
2017-07-27 17:49       ` James Greenhalgh
2017-10-04 16:44         ` James Greenhalgh
2017-11-14 14:20           ` James Greenhalgh

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).