* [PATCH 4/4] aarch64: Use memcpy to copy structures in bfloat vst* intrinsics
@ 2021-08-05 17:17 Jonathan Wright
2021-08-06 9:34 ` Richard Sandiford
0 siblings, 1 reply; 2+ messages in thread
From: Jonathan Wright @ 2021-08-05 17:17 UTC (permalink / raw)
To: gcc-patches; +Cc: Richard Sandiford
[-- Attachment #1: Type: text/plain, Size: 1605 bytes --]
Hi,
As subject, this patch uses __builtin_memcpy to copy vector structures
instead of using a union - or constructing a new opaque structure one
vector at a time - in each of the vst[234][q] and vst1[q]_x[234] bfloat
Neon intrinsics in arm_neon.h.
It also adds new code generation tests to verify that superfluous move
instructions are not generated for the vst[234]q or vst1q_x[234] bfloat
intrinsics.
Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.
Ok for master?
Thanks,
Jonathan
---
gcc/ChangeLog:
2021-07-30 Jonathan Wright <jonathan.wright@arm.com>
* config/aarch64/arm_neon.h (vst1_bf16_x2): Use
__builtin_memcpy instead of constructing an additional
__builtin_aarch64_simd_oi one vector at a time.
(vst1q_bf16_x2): Likewise.
(vst1_bf16_x3): Use __builtin_memcpy instead of constructing
an additional __builtin_aarch64_simd_ci one vector at a time.
(vst1q_bf16_x3): Likewise.
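(vst1_bf16_x4): Use __builtin_memcpy instead of a union, first
widening each vector into a bfloat16x8x4_t temporary with
vcombine_bf16.
(vst1q_bf16_x4): Use __builtin_memcpy instead of a union.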
(vst2_bf16): Use __builtin_memcpy instead of constructing an
additional __builtin_aarch64_simd_oi one vector at a time.
(vst2q_bf16): Likewise.
(vst3_bf16): Use __builtin_memcpy instead of constructing an
additional __builtin_aarch64_simd_ci one vector at a time.
(vst3q_bf16): Likewise.
(vst4_bf16): Use __builtin_memcpy instead of constructing an
additional __builtin_aarch64_simd_xi one vector at a time.
(vst4q_bf16): Likewise.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/vector_structure_intrinsics.c: Add new
tests.
[-- Attachment #2: rb14731.patch --]
[-- Type: application/octet-stream, Size: 11231 bytes --]
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index ed6ce179d76f34e1f946adb75bb20a947b67ab82..a16ee4d534fb6f15047b08a951adcc87c5c9ac3f 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -33839,8 +33839,7 @@ vst1_bf16_x2 (bfloat16_t * __a, bfloat16x4x2_t __val)
bfloat16x8x2_t __temp;
__temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0)));
__temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0)));
- __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[1], 1);
+ __builtin_memcpy (&__o, &__temp, sizeof (__temp));
__builtin_aarch64_st1x2v4bf (__a, __o);
}
@@ -33849,8 +33848,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_bf16_x2 (bfloat16_t * __a, bfloat16x8x2_t __val)
{
__builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[1], 1);
+ __builtin_memcpy (&__o, &__val, sizeof (__val));
__builtin_aarch64_st1x2v8bf (__a, __o);
}
@@ -33863,9 +33861,7 @@ vst1_bf16_x3 (bfloat16_t * __a, bfloat16x4x3_t __val)
__temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0)));
__temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0)));
__temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0)));
- __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0], 0);
- __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1], 1);
- __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2], 2);
+ __builtin_memcpy (&__o, &__temp, sizeof (__temp));
__builtin_aarch64_st1x3v4bf ((__builtin_aarch64_simd_bf *) __a, __o);
}
@@ -33874,26 +33870,31 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_bf16_x3 (bfloat16_t * __a, bfloat16x8x3_t __val)
{
__builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[0], 0);
- __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[1], 1);
- __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[2], 2);
+ __builtin_memcpy (&__o, &__val, sizeof (__val));
__builtin_aarch64_st1x3v8bf ((__builtin_aarch64_simd_bf *) __a, __o);
}
__extension__ extern __inline void
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vst1_bf16_x4 (bfloat16_t * __a, bfloat16x4x4_t val)
+vst1_bf16_x4 (bfloat16_t * __a, bfloat16x4x4_t __val)
{
- union { bfloat16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val };
- __builtin_aarch64_st1x4v4bf ((__builtin_aarch64_simd_bf *) __a, __u.__o);
+ __builtin_aarch64_simd_xi __o;
+ bfloat16x8x4_t __temp;
+ __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ __temp.val[3] = vcombine_bf16 (__val.val[3], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+ __builtin_aarch64_st1x4v4bf ((__builtin_aarch64_simd_bf *) __a, __o);
}
__extension__ extern __inline void
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vst1q_bf16_x4 (bfloat16_t * __a, bfloat16x8x4_t val)
+vst1q_bf16_x4 (bfloat16_t * __a, bfloat16x8x4_t __val)
{
- union { bfloat16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val };
- __builtin_aarch64_st1x4v8bf ((__builtin_aarch64_simd_bf *) __a, __u.__o);
+ __builtin_aarch64_simd_xi __o;
+ __builtin_memcpy (&__o, &__val, sizeof (__val));
+ __builtin_aarch64_st1x4v8bf ((__builtin_aarch64_simd_bf *) __a, __o);
}
__extension__ extern __inline void
@@ -33925,8 +33926,7 @@ vst2_bf16 (bfloat16_t * __a, bfloat16x4x2_t __val)
bfloat16x8x2_t __temp;
__temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0)));
__temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0)));
- __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[1], 1);
+ __builtin_memcpy (&__o, &__temp, sizeof (__temp));
__builtin_aarch64_st2v4bf (__a, __o);
}
@@ -33935,8 +33935,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst2q_bf16 (bfloat16_t * __a, bfloat16x8x2_t __val)
{
__builtin_aarch64_simd_oi __o;
- __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[0], 0);
- __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[1], 1);
+ __builtin_memcpy (&__o, &__val, sizeof (__val));
__builtin_aarch64_st2v8bf (__a, __o);
}
@@ -33949,9 +33948,7 @@ vst3_bf16 (bfloat16_t * __a, bfloat16x4x3_t __val)
__temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0)));
__temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0)));
__temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0)));
- __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0], 0);
- __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1], 1);
- __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2], 2);
+ __builtin_memcpy (&__o, &__temp, sizeof (__temp));
__builtin_aarch64_st3v4bf ((__builtin_aarch64_simd_bf *) __a, __o);
}
@@ -33960,9 +33957,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst3q_bf16 (bfloat16_t * __a, bfloat16x8x3_t __val)
{
__builtin_aarch64_simd_ci __o;
- __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[0], 0);
- __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[1], 1);
- __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[2], 2);
+ __builtin_memcpy (&__o, &__val, sizeof (__val));
__builtin_aarch64_st3v8bf ((__builtin_aarch64_simd_bf *) __a, __o);
}
@@ -33976,10 +33971,7 @@ vst4_bf16 (bfloat16_t * __a, bfloat16x4x4_t __val)
__temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0)));
__temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0)));
__temp.val[3] = vcombine_bf16 (__val.val[3], vcreate_bf16 (__AARCH64_UINT64_C (0)));
- __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[0], 0);
- __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[1], 1);
- __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[3], 3);
+ __builtin_memcpy (&__o, &__temp, sizeof (__temp));
__builtin_aarch64_st4v4bf ((__builtin_aarch64_simd_bf *) __a, __o);
}
@@ -33988,10 +33980,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst4q_bf16 (bfloat16_t * __a, bfloat16x8x4_t __val)
{
__builtin_aarch64_simd_xi __o;
- __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[0], 0);
- __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[1], 1);
- __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[2], 2);
- __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[3], 3);
+ __builtin_memcpy (&__o, &__val, sizeof (__val));
__builtin_aarch64_st4v8bf ((__builtin_aarch64_simd_bf *) __a, __o);
}
diff --git a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
index e491d46394c7985d80930d1f7d9e8bd77f13c3c2..3e7e572bf39659ecf2f17751d92a4a99a4f2bf8b 100644
--- a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
+++ b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
@@ -95,6 +95,7 @@ TEST_STX (vst4q, int16x8x4_t, int16_t*, s16);
TEST_STX (vst4q, uint16x8x4_t, uint16_t*, u16);
TEST_STX (vst4q, poly16x8x4_t, poly16_t*, p16);
TEST_STX (vst4q, float16x8x4_t, float16_t*, f16);
+TEST_STX (vst4q, bfloat16x8x4_t, bfloat16_t*, bf16);
TEST_STX (vst4q, int32x4x4_t, int32_t*, s32);
TEST_STX (vst4q, uint32x4x4_t, uint32_t*, u32);
TEST_STX (vst4q, float32x4x4_t, float32_t*, f32);
@@ -110,6 +111,7 @@ TEST_STX (vst2q, int16x8x2_t, int16_t*, s16);
TEST_STX (vst2q, uint16x8x2_t, uint16_t*, u16);
TEST_STX (vst2q, poly16x8x2_t, poly16_t*, p16);
TEST_STX (vst2q, float16x8x2_t, float16_t*, f16);
+TEST_STX (vst2q, bfloat16x8x2_t, bfloat16_t*, bf16);
TEST_STX (vst2q, int32x4x2_t, int32_t*, s32);
TEST_STX (vst2q, uint32x4x2_t, uint32_t*, u32);
TEST_STX (vst2q, float32x4x2_t, float32_t*, f32);
@@ -131,6 +133,7 @@ TEST_ST3 (vst3q, int16x8x3_t, int16_t*, s16);
TEST_ST3 (vst3q, uint16x8x3_t, uint16_t*, u16);
TEST_ST3 (vst3q, poly16x8x3_t, poly16_t*, p16);
TEST_ST3 (vst3q, float16x8x3_t, float16_t*, f16);
+TEST_ST3 (vst3q, bfloat16x8x3_t, bfloat16_t*, bf16);
TEST_ST3 (vst3q, int32x4x3_t, int32_t*, s32);
TEST_ST3 (vst3q, uint32x4x3_t, uint32_t*, u32);
TEST_ST3 (vst3q, float32x4x3_t, float32_t*, f32);
@@ -212,6 +215,7 @@ TEST_ST1xN (vst1q, int16x8x4_t, int16_t*, s16, x4);
TEST_ST1xN (vst1q, uint16x8x4_t, uint16_t*, u16, x4);
TEST_ST1xN (vst1q, poly16x8x4_t, poly16_t*, p16, x4);
TEST_ST1xN (vst1q, float16x8x4_t, float16_t*, f16, x4);
+TEST_ST1xN (vst1q, bfloat16x8x4_t, bfloat16_t*, bf16, x4);
TEST_ST1xN (vst1q, int32x4x4_t, int32_t*, s32, x4);
TEST_ST1xN (vst1q, uint32x4x4_t, uint32_t*, u32, x4);
TEST_ST1xN (vst1q, float32x4x4_t, float32_t*, f32, x4);
@@ -227,6 +231,7 @@ TEST_ST1xN (vst1q, int16x8x2_t, int16_t*, s16, x2);
TEST_ST1xN (vst1q, uint16x8x2_t, uint16_t*, u16, x2);
TEST_ST1xN (vst1q, poly16x8x2_t, poly16_t*, p16, x2);
TEST_ST1xN (vst1q, float16x8x2_t, float16_t*, f16, x2);
+TEST_ST1xN (vst1q, bfloat16x8x2_t, bfloat16_t*, bf16, x2);
TEST_ST1xN (vst1q, int32x4x2_t, int32_t*, s32, x2);
TEST_ST1xN (vst1q, uint32x4x2_t, uint32_t*, u32, x2);
TEST_ST1xN (vst1q, float32x4x2_t, float32_t*, f32, x2);
@@ -249,6 +254,7 @@ TEST_ST1x3 (vst1q, int16x8x3_t, int16_t*, s16, x3);
TEST_ST1x3 (vst1q, uint16x8x3_t, uint16_t*, u16, x3);
TEST_ST1x3 (vst1q, poly16x8x3_t, poly16_t*, p16, x3);
TEST_ST1x3 (vst1q, float16x8x3_t, float16_t*, f16, x3);
+TEST_ST1x3 (vst1q, bfloat16x8x3_t, bfloat16_t*, bf16, x3);
TEST_ST1x3 (vst1q, int32x4x3_t, int32_t*, s32, x3);
TEST_ST1x3 (vst1q, uint32x4x3_t, uint32_t*, u32, x3);
TEST_ST1x3 (vst1q, float32x4x3_t, float32_t*, f32, x3);
@@ -261,7 +267,7 @@ TEST_ST1x3 (vst1q, float64x2x3_t, float64_t*, f64, x3);
/* { dg-final { scan-assembler-times "tbl\\t" 18} } */
/* { dg-final { scan-assembler-times "tbx\\t" 18} } */
-/* { dg-final { scan-assembler-times "st4\\t" 29} } */
-/* { dg-final { scan-assembler-times "st3\\t" 29} } */
-/* { dg-final { scan-assembler-times "st2\\t" 29} } */
-/* { dg-final { scan-assembler-times "st1\\t" 42} } */
+/* { dg-final { scan-assembler-times "st4\\t" 30} } */
+/* { dg-final { scan-assembler-times "st3\\t" 30} } */
+/* { dg-final { scan-assembler-times "st2\\t" 30} } */
+/* { dg-final { scan-assembler-times "st1\\t" 45} } */
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [PATCH 4/4] aarch64: Use memcpy to copy structures in bfloat vst* intrinsics
2021-08-05 17:17 [PATCH 4/4] aarch64: Use memcpy to copy structures in bfloat vst* intrinsics Jonathan Wright
@ 2021-08-06 9:34 ` Richard Sandiford
0 siblings, 0 replies; 2+ messages in thread
From: Richard Sandiford @ 2021-08-06 9:34 UTC (permalink / raw)
To: Jonathan Wright; +Cc: gcc-patches
Jonathan Wright <Jonathan.Wright@arm.com> writes:
> Hi,
>
> As subject, this patch uses __builtin_memcpy to copy vector structures
> instead of using a union - or constructing a new opaque structure one
> vector at a time - in each of the vst[234][q] and vst1[q]_x[234] bfloat
> Neon intrinsics in arm_neon.h.
>
> It also adds new code generation tests to verify that superfluous move
> instructions are not generated for the vst[234]q or vst1q_x[234] bfloat
> intrinsics.
>
> Regression tested and bootstrapped on aarch64-none-linux-gnu - no
> issues.
>
> Ok for master?
OK, thanks.
Richard
> Thanks,
> Jonathan
>
> ---
>
> gcc/ChangeLog:
>
> 2021-07-30 Jonathan Wright <jonathan.wright@arm.com>
>
> * config/aarch64/arm_neon.h (vst1_bf16_x2): Use
> __builtin_memcpy instead of constructing an additional
> __builtin_aarch64_simd_oi one vector at a time.
> (vst1q_bf16_x2): Likewise.
> (vst1_bf16_x3): Use __builtin_memcpy instead of constructing
> an additional __builtin_aarch64_simd_ci one vector at a time.
> (vst1q_bf16_x3): Likewise.
> (vst1_bf16_x4): Use __builtin_memcpy instead of a union.
> (vst1q_bf16_x4): Likewise.
> (vst2_bf16): Use __builtin_memcpy instead of constructing an
> additional __builtin_aarch64_simd_oi one vector at a time.
> (vst2q_bf16): Likewise.
> (vst3_bf16): Use __builtin_memcpy instead of constructing an
> additional __builtin_aarch64_simd_ci one vector at a time.
> (vst3q_bf16): Likewise.
> (vst4_bf16): Use __builtin_memcpy instead of constructing an
> additional __builtin_aarch64_simd_xi one vector at a time.
> (vst4q_bf16): Likewise.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/vector_structure_intrinsics.c: Add new
> tests.
>
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index ed6ce179d76f34e1f946adb75bb20a947b67ab82..a16ee4d534fb6f15047b08a951adcc87c5c9ac3f 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -33839,8 +33839,7 @@ vst1_bf16_x2 (bfloat16_t * __a, bfloat16x4x2_t __val)
> bfloat16x8x2_t __temp;
> __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0)));
> __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0)));
> - __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[0], 0);
> - __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[1], 1);
> + __builtin_memcpy (&__o, &__temp, sizeof (__temp));
> __builtin_aarch64_st1x2v4bf (__a, __o);
> }
>
> @@ -33849,8 +33848,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> vst1q_bf16_x2 (bfloat16_t * __a, bfloat16x8x2_t __val)
> {
> __builtin_aarch64_simd_oi __o;
> - __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[0], 0);
> - __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[1], 1);
> + __builtin_memcpy (&__o, &__val, sizeof (__val));
> __builtin_aarch64_st1x2v8bf (__a, __o);
> }
>
> @@ -33863,9 +33861,7 @@ vst1_bf16_x3 (bfloat16_t * __a, bfloat16x4x3_t __val)
> __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0)));
> __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0)));
> __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0)));
> - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0], 0);
> - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1], 1);
> - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2], 2);
> + __builtin_memcpy (&__o, &__temp, sizeof (__temp));
> __builtin_aarch64_st1x3v4bf ((__builtin_aarch64_simd_bf *) __a, __o);
> }
>
> @@ -33874,26 +33870,31 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> vst1q_bf16_x3 (bfloat16_t * __a, bfloat16x8x3_t __val)
> {
> __builtin_aarch64_simd_ci __o;
> - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[0], 0);
> - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[1], 1);
> - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[2], 2);
> + __builtin_memcpy (&__o, &__val, sizeof (__val));
> __builtin_aarch64_st1x3v8bf ((__builtin_aarch64_simd_bf *) __a, __o);
> }
>
> __extension__ extern __inline void
> __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -vst1_bf16_x4 (bfloat16_t * __a, bfloat16x4x4_t val)
> +vst1_bf16_x4 (bfloat16_t * __a, bfloat16x4x4_t __val)
> {
> - union { bfloat16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val };
> - __builtin_aarch64_st1x4v4bf ((__builtin_aarch64_simd_bf *) __a, __u.__o);
> + __builtin_aarch64_simd_xi __o;
> + bfloat16x8x4_t __temp;
> + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0)));
> + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0)));
> + __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0)));
> + __temp.val[3] = vcombine_bf16 (__val.val[3], vcreate_bf16 (__AARCH64_UINT64_C (0)));
> + __builtin_memcpy (&__o, &__temp, sizeof (__temp));
> + __builtin_aarch64_st1x4v4bf ((__builtin_aarch64_simd_bf *) __a, __o);
> }
>
> __extension__ extern __inline void
> __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> -vst1q_bf16_x4 (bfloat16_t * __a, bfloat16x8x4_t val)
> +vst1q_bf16_x4 (bfloat16_t * __a, bfloat16x8x4_t __val)
> {
> - union { bfloat16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val };
> - __builtin_aarch64_st1x4v8bf ((__builtin_aarch64_simd_bf *) __a, __u.__o);
> + __builtin_aarch64_simd_xi __o;
> + __builtin_memcpy (&__o, &__val, sizeof (__val));
> + __builtin_aarch64_st1x4v8bf ((__builtin_aarch64_simd_bf *) __a, __o);
> }
>
> __extension__ extern __inline void
> @@ -33925,8 +33926,7 @@ vst2_bf16 (bfloat16_t * __a, bfloat16x4x2_t __val)
> bfloat16x8x2_t __temp;
> __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0)));
> __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0)));
> - __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[0], 0);
> - __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[1], 1);
> + __builtin_memcpy (&__o, &__temp, sizeof (__temp));
> __builtin_aarch64_st2v4bf (__a, __o);
> }
>
> @@ -33935,8 +33935,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> vst2q_bf16 (bfloat16_t * __a, bfloat16x8x2_t __val)
> {
> __builtin_aarch64_simd_oi __o;
> - __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[0], 0);
> - __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[1], 1);
> + __builtin_memcpy (&__o, &__val, sizeof (__val));
> __builtin_aarch64_st2v8bf (__a, __o);
> }
>
> @@ -33949,9 +33948,7 @@ vst3_bf16 (bfloat16_t * __a, bfloat16x4x3_t __val)
> __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0)));
> __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0)));
> __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0)));
> - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0], 0);
> - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1], 1);
> - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2], 2);
> + __builtin_memcpy (&__o, &__temp, sizeof (__temp));
> __builtin_aarch64_st3v4bf ((__builtin_aarch64_simd_bf *) __a, __o);
> }
>
> @@ -33960,9 +33957,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> vst3q_bf16 (bfloat16_t * __a, bfloat16x8x3_t __val)
> {
> __builtin_aarch64_simd_ci __o;
> - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[0], 0);
> - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[1], 1);
> - __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[2], 2);
> + __builtin_memcpy (&__o, &__val, sizeof (__val));
> __builtin_aarch64_st3v8bf ((__builtin_aarch64_simd_bf *) __a, __o);
> }
>
> @@ -33976,10 +33971,7 @@ vst4_bf16 (bfloat16_t * __a, bfloat16x4x4_t __val)
> __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0)));
> __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0)));
> __temp.val[3] = vcombine_bf16 (__val.val[3], vcreate_bf16 (__AARCH64_UINT64_C (0)));
> - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[0], 0);
> - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[1], 1);
> - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[2], 2);
> - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[3], 3);
> + __builtin_memcpy (&__o, &__temp, sizeof (__temp));
> __builtin_aarch64_st4v4bf ((__builtin_aarch64_simd_bf *) __a, __o);
> }
>
> @@ -33988,10 +33980,7 @@ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> vst4q_bf16 (bfloat16_t * __a, bfloat16x8x4_t __val)
> {
> __builtin_aarch64_simd_xi __o;
> - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[0], 0);
> - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[1], 1);
> - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[2], 2);
> - __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[3], 3);
> + __builtin_memcpy (&__o, &__val, sizeof (__val));
> __builtin_aarch64_st4v8bf ((__builtin_aarch64_simd_bf *) __a, __o);
> }
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
> index e491d46394c7985d80930d1f7d9e8bd77f13c3c2..3e7e572bf39659ecf2f17751d92a4a99a4f2bf8b 100644
> --- a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
> +++ b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
> @@ -95,6 +95,7 @@ TEST_STX (vst4q, int16x8x4_t, int16_t*, s16);
> TEST_STX (vst4q, uint16x8x4_t, uint16_t*, u16);
> TEST_STX (vst4q, poly16x8x4_t, poly16_t*, p16);
> TEST_STX (vst4q, float16x8x4_t, float16_t*, f16);
> +TEST_STX (vst4q, bfloat16x8x4_t, bfloat16_t*, bf16);
> TEST_STX (vst4q, int32x4x4_t, int32_t*, s32);
> TEST_STX (vst4q, uint32x4x4_t, uint32_t*, u32);
> TEST_STX (vst4q, float32x4x4_t, float32_t*, f32);
> @@ -110,6 +111,7 @@ TEST_STX (vst2q, int16x8x2_t, int16_t*, s16);
> TEST_STX (vst2q, uint16x8x2_t, uint16_t*, u16);
> TEST_STX (vst2q, poly16x8x2_t, poly16_t*, p16);
> TEST_STX (vst2q, float16x8x2_t, float16_t*, f16);
> +TEST_STX (vst2q, bfloat16x8x2_t, bfloat16_t*, bf16);
> TEST_STX (vst2q, int32x4x2_t, int32_t*, s32);
> TEST_STX (vst2q, uint32x4x2_t, uint32_t*, u32);
> TEST_STX (vst2q, float32x4x2_t, float32_t*, f32);
> @@ -131,6 +133,7 @@ TEST_ST3 (vst3q, int16x8x3_t, int16_t*, s16);
> TEST_ST3 (vst3q, uint16x8x3_t, uint16_t*, u16);
> TEST_ST3 (vst3q, poly16x8x3_t, poly16_t*, p16);
> TEST_ST3 (vst3q, float16x8x3_t, float16_t*, f16);
> +TEST_ST3 (vst3q, bfloat16x8x3_t, bfloat16_t*, bf16);
> TEST_ST3 (vst3q, int32x4x3_t, int32_t*, s32);
> TEST_ST3 (vst3q, uint32x4x3_t, uint32_t*, u32);
> TEST_ST3 (vst3q, float32x4x3_t, float32_t*, f32);
> @@ -212,6 +215,7 @@ TEST_ST1xN (vst1q, int16x8x4_t, int16_t*, s16, x4);
> TEST_ST1xN (vst1q, uint16x8x4_t, uint16_t*, u16, x4);
> TEST_ST1xN (vst1q, poly16x8x4_t, poly16_t*, p16, x4);
> TEST_ST1xN (vst1q, float16x8x4_t, float16_t*, f16, x4);
> +TEST_ST1xN (vst1q, bfloat16x8x4_t, bfloat16_t*, bf16, x4);
> TEST_ST1xN (vst1q, int32x4x4_t, int32_t*, s32, x4);
> TEST_ST1xN (vst1q, uint32x4x4_t, uint32_t*, u32, x4);
> TEST_ST1xN (vst1q, float32x4x4_t, float32_t*, f32, x4);
> @@ -227,6 +231,7 @@ TEST_ST1xN (vst1q, int16x8x2_t, int16_t*, s16, x2);
> TEST_ST1xN (vst1q, uint16x8x2_t, uint16_t*, u16, x2);
> TEST_ST1xN (vst1q, poly16x8x2_t, poly16_t*, p16, x2);
> TEST_ST1xN (vst1q, float16x8x2_t, float16_t*, f16, x2);
> +TEST_ST1xN (vst1q, bfloat16x8x2_t, bfloat16_t*, bf16, x2);
> TEST_ST1xN (vst1q, int32x4x2_t, int32_t*, s32, x2);
> TEST_ST1xN (vst1q, uint32x4x2_t, uint32_t*, u32, x2);
> TEST_ST1xN (vst1q, float32x4x2_t, float32_t*, f32, x2);
> @@ -249,6 +254,7 @@ TEST_ST1x3 (vst1q, int16x8x3_t, int16_t*, s16, x3);
> TEST_ST1x3 (vst1q, uint16x8x3_t, uint16_t*, u16, x3);
> TEST_ST1x3 (vst1q, poly16x8x3_t, poly16_t*, p16, x3);
> TEST_ST1x3 (vst1q, float16x8x3_t, float16_t*, f16, x3);
> +TEST_ST1x3 (vst1q, bfloat16x8x3_t, bfloat16_t*, bf16, x3);
> TEST_ST1x3 (vst1q, int32x4x3_t, int32_t*, s32, x3);
> TEST_ST1x3 (vst1q, uint32x4x3_t, uint32_t*, u32, x3);
> TEST_ST1x3 (vst1q, float32x4x3_t, float32_t*, f32, x3);
> @@ -261,7 +267,7 @@ TEST_ST1x3 (vst1q, float64x2x3_t, float64_t*, f64, x3);
>
> /* { dg-final { scan-assembler-times "tbl\\t" 18} } */
> /* { dg-final { scan-assembler-times "tbx\\t" 18} } */
> -/* { dg-final { scan-assembler-times "st4\\t" 29} } */
> -/* { dg-final { scan-assembler-times "st3\\t" 29} } */
> -/* { dg-final { scan-assembler-times "st2\\t" 29} } */
> -/* { dg-final { scan-assembler-times "st1\\t" 42} } */
> +/* { dg-final { scan-assembler-times "st4\\t" 30} } */
> +/* { dg-final { scan-assembler-times "st3\\t" 30} } */
> +/* { dg-final { scan-assembler-times "st2\\t" 30} } */
> +/* { dg-final { scan-assembler-times "st1\\t" 45} } */
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2021-08-06 9:34 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-05 17:17 [PATCH 4/4] aarch64: Use memcpy to copy structures in bfloat vst* intrinsics Jonathan Wright
2021-08-06 9:34 ` Richard Sandiford
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).