public inbox for gcc-patches@gcc.gnu.org
* [PATCH][AArch64] Add combine pattern to fuse AESE/AESMC instructions
@ 2018-05-11 13:32 Kyrill Tkachov
  2018-05-14 16:23 ` Richard Earnshaw (lists)
  0 siblings, 1 reply; 2+ messages in thread
From: Kyrill Tkachov @ 2018-05-11 13:32 UTC (permalink / raw)
  To: gcc-patches; +Cc: Marcus Shawcroft, Richard Earnshaw (lists), James Greenhalgh

[-- Attachment #1: Type: text/plain, Size: 1527 bytes --]

Hi all,

When the AESE, AESD and AESMC, AESIMC instructions are generated through the appropriate arm_neon.h intrinsics
we really want to keep the AESE and AESMC (and, likewise, AESD and AESIMC) together when the former feeds the latter
and fusion is supported by the target CPU.
We have macro-fusion hooks and scheduling model forwarding paths defined to facilitate that.
They are, however, not always enough.

This patch adds another mechanism for doing that.
When we detect during combine that the required dependency exists (AESE -> AESMC, AESD -> AESIMC),
we keep the two together with a combine pattern throughout the rest of compilation.
We won't ever want to split them.
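
To illustrate the dependency being matched (a sketch of my own, not part
of the patch), the AESE result feeds straight into the AESMC:

   #include <arm_neon.h>

   uint8x16_t
   aes_round (uint8x16_t state, uint8x16_t key)
   {
     uint8x16_t tmp = vaeseq_u8 (state, key); /* AESE.  */
     return vaesmcq_u8 (tmp);                 /* AESMC consumes the AESE
                                                 result, so combine can
                                                 match the pair.  */
   }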

The testcases generate 4 AESE (AESD) instructions in a block followed by 4 AESMC (AESIMC) instructions
that consume the corresponding results, with a number of computations in between so that the
AESE and AESMC instructions are not trivially back-to-back, thus exercising the compiler's ability
to bring them together. The tests are built with -dp, which annotates each instruction in the
assembly with its insn pattern name; the scan-assembler-times directives count those annotations,
as sketched below.
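
With fusion the combined insn emits the two instructions back-to-back; a
sketch of the -dp-annotated output (register numbers and exact annotation
format illustrative, they vary between releases):

   aese    v0.16b, v4.16b
   aesmc   v0.16b, v0.16b   // *aarch64_crypto_aese_fused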

With this patch all 4 pairs are fused whereas before a couple of fusions would be missed due to intervening
arithmetic and memory instructions.

Bootstrapped and tested on aarch64-none-linux-gnu.

Ok for trunk?

Thanks,
Kyrill

2018-05-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

     * config/aarch64/aarch64-simd.md (*aarch64_crypto_aese_fused):
     New pattern.
     (aarch64_crypto_aesd_fused): Likewise.

2018-05-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>

     * gcc.target/aarch64/crypto-fuse-1.c: New test.
     * gcc.target/aarch64/crypto-fuse-2.c: Likewise.

[-- Attachment #2: fuse-combine.patch --]
[-- Type: text/x-patch, Size: 4693 bytes --]

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 7c166b6c8ec40475d1e01561b613b590b6690ad5..9a6ed304432af0ca23ec7d3797783a3128776a6e 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -5790,6 +5790,44 @@ (define_insn "aarch64_crypto_aes<aesmc_op>v16qi"
       (const_string "yes")])]
 )
 
+;; When AESE/AESMC fusion is enabled we really want to keep the two together
+;; and enforce the register dependency without scheduling or register
+;; allocation messing up the order or introducing moves in between.
+;; Mash the two together during combine.
+
+(define_insn "*aarch64_crypto_aese_fused"
+  [(set (match_operand:V16QI 0 "register_operand" "=&w")
+	(unspec:V16QI
+	  [(unspec:V16QI
+	    [(match_operand:V16QI 1 "register_operand" "0")
+	     (match_operand:V16QI 2 "register_operand" "w")] UNSPEC_AESE)
+	  ] UNSPEC_AESMC))]
+  "TARGET_SIMD && TARGET_AES
+   && aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)"
+  "aese\\t%0.16b, %2.16b\;aesmc\\t%0.16b, %0.16b"
+  [(set_attr "type" "crypto_aese")
+   (set_attr "length" "8")]
+)
+
+;; When AESD/AESIMC fusion is enabled we really want to keep the two together
+;; and enforce the register dependency without scheduling or register
+;; allocation messing up the order or introducing moves in between.
+;; Mash the two together during combine.
+
+(define_insn "*aarch64_crypto_aesd_fused"
+  [(set (match_operand:V16QI 0 "register_operand" "=&w")
+	(unspec:V16QI
+	  [(unspec:V16QI
+	    [(match_operand:V16QI 1 "register_operand" "0")
+	     (match_operand:V16QI 2 "register_operand" "w")] UNSPEC_AESD)
+	  ] UNSPEC_AESIMC))]
+  "TARGET_SIMD && TARGET_AES
+   && aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)"
+  "aesd\\t%0.16b, %2.16b\;aesimc\\t%0.16b, %0.16b"
+  [(set_attr "type" "crypto_aese")
+   (set_attr "length" "8")]
+)
+
 ;; sha1
 
 (define_insn "aarch64_crypto_sha1hsi"
diff --git a/gcc/testsuite/gcc.target/aarch64/crypto-fuse-1.c b/gcc/testsuite/gcc.target/aarch64/crypto-fuse-1.c
new file mode 100644
index 0000000000000000000000000000000000000000..79fd6011ed946d746ed5f03d26c7fe661f3f8154
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/crypto-fuse-1.c
@@ -0,0 +1,44 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mcpu=cortex-a72+crypto -dp" } */
+
+#include <arm_neon.h>
+
+#define AESE(r, v, key) (r = vaeseq_u8 ((v), (key)));
+#define AESMC(r, i) (r = vaesmcq_u8 (i))
+
+uint8x16_t dummy;
+uint8x16_t a;
+uint8x16_t b;
+uint8x16_t c;
+uint8x16_t d;
+uint8x16_t e;
+
+void
+foo (void)
+{
+  AESE (a, a, e);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESE (b, b, e);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESE (c, c, e);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESE (d, d, e);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+
+  AESMC (a, a);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESMC (b, b);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESMC (c, c);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESMC (d, d);
+}
+
+/* { dg-final { scan-assembler-times "crypto_aese_fused" 4 } } */
\ No newline at end of file
diff --git a/gcc/testsuite/gcc.target/aarch64/crypto-fuse-2.c b/gcc/testsuite/gcc.target/aarch64/crypto-fuse-2.c
new file mode 100644
index 0000000000000000000000000000000000000000..ed9eb69e803b24ec16a72075c46a9b6e6898c2fe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/crypto-fuse-2.c
@@ -0,0 +1,44 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -mcpu=cortex-a72+crypto -dp" } */
+
+#include <arm_neon.h>
+
+#define AESE(r, v, key) (r = vaesdq_u8 ((v), (key)));
+#define AESMC(r, i) (r = vaesimcq_u8 (i))
+
+uint8x16_t dummy;
+uint8x16_t a;
+uint8x16_t b;
+uint8x16_t c;
+uint8x16_t d;
+uint8x16_t e;
+
+void
+foo (void)
+{
+  AESE (a, a, e);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESE (b, b, e);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESE (c, c, e);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESE (d, d, e);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+
+  AESMC (a, a);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESMC (b, b);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESMC (c, c);
+  dummy = vaddq_u8 (dummy, dummy);
+  dummy = vaddq_u8 (dummy, dummy);
+  AESMC (d, d);
+}
+
+/* { dg-final { scan-assembler-times "crypto_aesd_fused" 4 } } */
\ No newline at end of file


* Re: [PATCH][AArch64] Add combine pattern to fuse AESE/AESMC instructions
  2018-05-11 13:32 [PATCH][AArch64] Add combine pattern to fuse AESE/AESMC instructions Kyrill Tkachov
@ 2018-05-14 16:23 ` Richard Earnshaw (lists)
  0 siblings, 0 replies; 2+ messages in thread
From: Richard Earnshaw (lists) @ 2018-05-14 16:23 UTC (permalink / raw)
  To: Kyrill Tkachov, gcc-patches; +Cc: Marcus Shawcroft, James Greenhalgh

On 11/05/18 14:29, Kyrill Tkachov wrote:
> Hi all,
> 
> When the AESE, AESD and AESMC, AESIMC instructions are generated through
> the appropriate arm_neon.h intrinsics
> we really want to keep the AESE and AESMC (and, likewise, AESD and
> AESIMC) together when the former feeds the latter
> and fusion is supported by the target CPU.
> We have macro-fusion hooks and scheduling model forwarding paths defined
> to facilitate that.
> They are, however, not always enough.
> 
> This patch adds another mechanism for doing that.
> When we detect during combine that the required dependency exists
> (AESE -> AESMC, AESD -> AESIMC),
> we keep the two together with a combine pattern throughout the rest of
> compilation.
> We won't ever want to split them.
> 
> The testcases generate 4 AESE (AESD) instructions in a block followed by
> 4 AESMC (AESIMC) instructions that
> consume the corresponding results, with a number of computations
> in between so that the
> AESE and AESMC instructions are not trivially back-to-back, thus
> exercising the compiler's ability
> to bring them together.
> 
> With this patch all 4 pairs are fused whereas before a couple of fusions
> would be missed due to intervening
> arithmetic and memory instructions.
> 
> Bootstrapped and tested on aarch64-none-linux-gnu.
> 
> Ok for trunk?
> 
> Thanks,
> Kyrill
> 
> 2018-05-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> 
>     * config/aarch64/aarch64-simd.md (*aarch64_crypto_aese_fused):
>     New pattern.
>     (aarch64_crypto_aesd_fused): Likewise.
> 
> 2018-05-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
> 
>     * gcc.target/aarch64/crypto-fuse-1.c: New test.
>     * gcc.target/aarch64/crypto-fuse-2.c: Likewise.

Your testcases are missing a newline at the end of each file.  Otherwise OK.

R.


