public inbox for gcc-cvs@sourceware.org help / color / mirror / Atom feed
From: Jonathan Wright <jonwri01@gcc.gnu.org> To: gcc-cvs@gcc.gnu.org Subject: [gcc r12-2783] aarch64: Use memcpy to copy structures in bfloat vst* intrinsics Date: Fri, 6 Aug 2021 10:05:02 +0000 (GMT) [thread overview] Message-ID: <20210806100502.7E89939A006B@sourceware.org> (raw) https://gcc.gnu.org/g:bc181adf26eae77eacb73d4397ac479dac114d2d commit r12-2783-gbc181adf26eae77eacb73d4397ac479dac114d2d Author: Jonathan Wright <jonathan.wright@arm.com> Date: Fri Jul 30 15:30:19 2021 +0100 aarch64: Use memcpy to copy structures in bfloat vst* intrinsics Use __builtin_memcpy to copy vector structures instead of using a union - or constructing a new opaque structure one vector at a time - in each of the vst[234][q] and vst1[q]_x[234] bfloat Neon intrinsics in arm_neon.h. Add new code generation tests to verify that superfluous move instructions are not generated for the vst[234]q or vst1q_x[234] bfloat intrinsics. gcc/ChangeLog: 2021-07-30 Jonathan Wright <jonathan.wright@arm.com> * config/aarch64/arm_neon.h (vst1_bf16_x2): Use __builtin_memcpy instead of constructing an additional __builtin_aarch64_simd_oi one vector at a time. (vst1q_bf16_x2): Likewise. (vst1_bf16_x3): Use __builtin_memcpy instead of constructing an additional __builtin_aarch64_simd_ci one vector at a time. (vst1q_bf16_x3): Likewise. (vst1_bf16_x4): Use __builtin_memcpy instead of a union. (vst1q_bf16_x4): Likewise. (vst2_bf16): Use __builtin_memcpy instead of constructing an additional __builtin_aarch64_simd_oi one vector at a time. (vst2q_bf16): Likewise. (vst3_bf16): Use __builtin_memcpy instead of constructing an additional __builtin_aarch64_simd_ci mode one vector at a time. (vst3q_bf16): Likewise. (vst4_bf16): Use __builtin_memcpy instead of constructing an additional __builtin_aarch64_simd_xi one vector at a time. (vst4q_bf16): Likewise. gcc/testsuite/ChangeLog: * gcc.target/aarch64/vector_structure_intrinsics.c: Add new tests. 
Diff: --- gcc/config/aarch64/arm_neon.h | 57 +++++++++------------- .../aarch64/vector_structure_intrinsics.c | 14 ++++-- 2 files changed, 33 insertions(+), 38 deletions(-) diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index cbae61d3c40..390cf9a7743 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -33839,8 +33839,7 @@ vst1_bf16_x2 (bfloat16_t * __a, bfloat16x4x2_t __val) bfloat16x8x2_t __temp; __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x2v4bf (__a, __o); } @@ -33849,8 +33848,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst1q_bf16_x2 (bfloat16_t * __a, bfloat16x8x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x2v8bf (__a, __o); } @@ -33863,9 +33861,7 @@ vst1_bf16_x3 (bfloat16_t * __a, bfloat16x4x3_t __val) __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st1x3v4bf ((__builtin_aarch64_simd_bf *) __a, __o); } @@ -33874,26 +33870,31 @@ __attribute__ 
((__always_inline__, __gnu_inline__, __artificial__)) vst1q_bf16_x3 (bfloat16_t * __a, bfloat16x8x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[2], 2); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st1x3v8bf ((__builtin_aarch64_simd_bf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1_bf16_x4 (bfloat16_t * __a, bfloat16x4x4_t val) +vst1_bf16_x4 (bfloat16_t * __a, bfloat16x4x4_t __val) { - union { bfloat16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v4bf ((__builtin_aarch64_simd_bf *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + bfloat16x8x4_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_bf16 (__val.val[3], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); + __builtin_aarch64_st1x4v4bf ((__builtin_aarch64_simd_bf *) __a, __o); } __extension__ extern __inline void __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vst1q_bf16_x4 (bfloat16_t * __a, bfloat16x8x4_t val) +vst1q_bf16_x4 (bfloat16_t * __a, bfloat16x8x4_t __val) { - union { bfloat16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; - __builtin_aarch64_st1x4v8bf ((__builtin_aarch64_simd_bf *) __a, __u.__o); + __builtin_aarch64_simd_xi __o; + __builtin_memcpy (&__o, &__val, sizeof (__val)); + __builtin_aarch64_st1x4v8bf ((__builtin_aarch64_simd_bf *) __a, __o); } __extension__ extern __inline void @@ -33925,8 +33926,7 @@ 
vst2_bf16 (bfloat16_t * __a, bfloat16x4x2_t __val) bfloat16x8x2_t __temp; __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[1], 1); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st2v4bf (__a, __o); } @@ -33935,8 +33935,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst2q_bf16 (bfloat16_t * __a, bfloat16x8x2_t __val) { __builtin_aarch64_simd_oi __o; - __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[0], 0); - __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[1], 1); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st2v8bf (__a, __o); } @@ -33949,9 +33948,7 @@ vst3_bf16 (bfloat16_t * __a, bfloat16x4x3_t __val) __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st3v4bf ((__builtin_aarch64_simd_bf *) __a, __o); } @@ -33960,9 +33957,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst3q_bf16 (bfloat16_t * __a, bfloat16x8x3_t __val) { __builtin_aarch64_simd_ci __o; - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[2], 2); + __builtin_memcpy 
(&__o, &__val, sizeof (__val)); __builtin_aarch64_st3v8bf ((__builtin_aarch64_simd_bf *) __a, __o); } @@ -33976,10 +33971,7 @@ vst4_bf16 (bfloat16_t * __a, bfloat16x4x4_t __val) __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); __temp.val[3] = vcombine_bf16 (__val.val[3], vcreate_bf16 (__AARCH64_UINT64_C (0))); - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[3], 3); + __builtin_memcpy (&__o, &__temp, sizeof (__temp)); __builtin_aarch64_st4v4bf ((__builtin_aarch64_simd_bf *) __a, __o); } @@ -33988,10 +33980,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vst4q_bf16 (bfloat16_t * __a, bfloat16x8x4_t __val) { __builtin_aarch64_simd_xi __o; - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[0], 0); - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[1], 1); - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[2], 2); - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[3], 3); + __builtin_memcpy (&__o, &__val, sizeof (__val)); __builtin_aarch64_st4v8bf ((__builtin_aarch64_simd_bf *) __a, __o); } diff --git a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c index e491d46394c..3e7e572bf39 100644 --- a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c +++ b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c @@ -95,6 +95,7 @@ TEST_STX (vst4q, int16x8x4_t, int16_t*, s16); TEST_STX (vst4q, uint16x8x4_t, uint16_t*, u16); TEST_STX (vst4q, poly16x8x4_t, poly16_t*, p16); TEST_STX (vst4q, 
float16x8x4_t, float16_t*, f16); +TEST_STX (vst4q, bfloat16x8x4_t, bfloat16_t*, bf16); TEST_STX (vst4q, int32x4x4_t, int32_t*, s32); TEST_STX (vst4q, uint32x4x4_t, uint32_t*, u32); TEST_STX (vst4q, float32x4x4_t, float32_t*, f32); @@ -110,6 +111,7 @@ TEST_STX (vst2q, int16x8x2_t, int16_t*, s16); TEST_STX (vst2q, uint16x8x2_t, uint16_t*, u16); TEST_STX (vst2q, poly16x8x2_t, poly16_t*, p16); TEST_STX (vst2q, float16x8x2_t, float16_t*, f16); +TEST_STX (vst2q, bfloat16x8x2_t, bfloat16_t*, bf16); TEST_STX (vst2q, int32x4x2_t, int32_t*, s32); TEST_STX (vst2q, uint32x4x2_t, uint32_t*, u32); TEST_STX (vst2q, float32x4x2_t, float32_t*, f32); @@ -131,6 +133,7 @@ TEST_ST3 (vst3q, int16x8x3_t, int16_t*, s16); TEST_ST3 (vst3q, uint16x8x3_t, uint16_t*, u16); TEST_ST3 (vst3q, poly16x8x3_t, poly16_t*, p16); TEST_ST3 (vst3q, float16x8x3_t, float16_t*, f16); +TEST_ST3 (vst3q, bfloat16x8x3_t, bfloat16_t*, bf16); TEST_ST3 (vst3q, int32x4x3_t, int32_t*, s32); TEST_ST3 (vst3q, uint32x4x3_t, uint32_t*, u32); TEST_ST3 (vst3q, float32x4x3_t, float32_t*, f32); @@ -212,6 +215,7 @@ TEST_ST1xN (vst1q, int16x8x4_t, int16_t*, s16, x4); TEST_ST1xN (vst1q, uint16x8x4_t, uint16_t*, u16, x4); TEST_ST1xN (vst1q, poly16x8x4_t, poly16_t*, p16, x4); TEST_ST1xN (vst1q, float16x8x4_t, float16_t*, f16, x4); +TEST_ST1xN (vst1q, bfloat16x8x4_t, bfloat16_t*, bf16, x4); TEST_ST1xN (vst1q, int32x4x4_t, int32_t*, s32, x4); TEST_ST1xN (vst1q, uint32x4x4_t, uint32_t*, u32, x4); TEST_ST1xN (vst1q, float32x4x4_t, float32_t*, f32, x4); @@ -227,6 +231,7 @@ TEST_ST1xN (vst1q, int16x8x2_t, int16_t*, s16, x2); TEST_ST1xN (vst1q, uint16x8x2_t, uint16_t*, u16, x2); TEST_ST1xN (vst1q, poly16x8x2_t, poly16_t*, p16, x2); TEST_ST1xN (vst1q, float16x8x2_t, float16_t*, f16, x2); +TEST_ST1xN (vst1q, bfloat16x8x2_t, bfloat16_t*, bf16, x2); TEST_ST1xN (vst1q, int32x4x2_t, int32_t*, s32, x2); TEST_ST1xN (vst1q, uint32x4x2_t, uint32_t*, u32, x2); TEST_ST1xN (vst1q, float32x4x2_t, float32_t*, f32, x2); @@ -249,6 +254,7 @@ TEST_ST1x3 
(vst1q, int16x8x3_t, int16_t*, s16, x3); TEST_ST1x3 (vst1q, uint16x8x3_t, uint16_t*, u16, x3); TEST_ST1x3 (vst1q, poly16x8x3_t, poly16_t*, p16, x3); TEST_ST1x3 (vst1q, float16x8x3_t, float16_t*, f16, x3); +TEST_ST1x3 (vst1q, bfloat16x8x3_t, bfloat16_t*, bf16, x3); TEST_ST1x3 (vst1q, int32x4x3_t, int32_t*, s32, x3); TEST_ST1x3 (vst1q, uint32x4x3_t, uint32_t*, u32, x3); TEST_ST1x3 (vst1q, float32x4x3_t, float32_t*, f32, x3); @@ -261,7 +267,7 @@ TEST_ST1x3 (vst1q, float64x2x3_t, float64_t*, f64, x3); /* { dg-final { scan-assembler-times "tbl\\t" 18} } */ /* { dg-final { scan-assembler-times "tbx\\t" 18} } */ -/* { dg-final { scan-assembler-times "st4\\t" 29} } */ -/* { dg-final { scan-assembler-times "st3\\t" 29} } */ -/* { dg-final { scan-assembler-times "st2\\t" 29} } */ -/* { dg-final { scan-assembler-times "st1\\t" 42} } */ +/* { dg-final { scan-assembler-times "st4\\t" 30} } */ +/* { dg-final { scan-assembler-times "st3\\t" 30} } */ +/* { dg-final { scan-assembler-times "st2\\t" 30} } */ +/* { dg-final { scan-assembler-times "st1\\t" 45} } */
reply other threads:[~2021-08-06 10:05 UTC|newest] Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20210806100502.7E89939A006B@sourceware.org \ --to=jonwri01@gcc.gnu.org \ --cc=gcc-cvs@gcc.gnu.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link. Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).