public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH 3/4] aarch64: Use memcpy to copy structures in vst2[q]_lane intrinsics
@ 2021-08-05 17:15 Jonathan Wright
  2021-08-06  9:31 ` Richard Sandiford
  0 siblings, 1 reply; 2+ messages in thread
From: Jonathan Wright @ 2021-08-05 17:15 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Sandiford

[-- Attachment #1: Type: text/plain, Size: 1814 bytes --]

Hi,

As subject, this patch uses __builtin_memcpy to copy vector structures
instead of using a union - or constructing a new opaque structure one
vector at a time - in each of the vst2[q]_lane Neon intrinsics in
arm_neon.h.

It also adds new code generation tests to verify that superfluous move
instructions are not generated for the vst2q_lane intrinsics.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-30  Jonathan Wright  <jonathan.wright@arm.com>

	* config/aarch64/arm_neon.h (__ST2_LANE_FUNC): Delete.
	(__ST2Q_LANE_FUNC): Delete.
	(vst2_lane_f16): Use __builtin_memcpy to copy vector
	structure instead of constructing __builtin_aarch64_simd_oi
	one vector at a time.
	(vst2_lane_f32): Likewise.
	(vst2_lane_f64): Likewise.
	(vst2_lane_p8): Likewise.
	(vst2_lane_p16): Likewise.
	(vst2_lane_p64): Likewise.
	(vst2_lane_s8): Likewise.
	(vst2_lane_s16): Likewise.
	(vst2_lane_s32): Likewise.
	(vst2_lane_s64): Likewise.
	(vst2_lane_u8): Likewise.
	(vst2_lane_u16): Likewise.
	(vst2_lane_u32): Likewise.
	(vst2_lane_u64): Likewise.
	(vst2_lane_bf16): Likewise.
	(vst2q_lane_f16): Use __builtin_memcpy to copy vector
	structure instead of using a union.
	(vst2q_lane_f32): Likewise.
	(vst2q_lane_f64): Likewise.
	(vst2q_lane_p8): Likewise.
	(vst2q_lane_p16): Likewise.
	(vst2q_lane_p64): Likewise.
	(vst2q_lane_s8): Likewise.
	(vst2q_lane_s16): Likewise.
	(vst2q_lane_s32): Likewise.
	(vst2q_lane_s64): Likewise.
	(vst2q_lane_u8): Likewise.
	(vst2q_lane_u16): Likewise.
	(vst2q_lane_u32): Likewise.
	(vst2q_lane_u64): Likewise.
	(vst2q_lane_bf16): Likewise.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vector_structure_intrinsics.c: Add new
	tests.

[-- Attachment #2: rb14730.patch --]
[-- Type: application/octet-stream, Size: 20819 bytes --]

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index d78ced8968869d9317d76368554bf6ce8f7e3afe..ed6ce179d76f34e1f946adb75bb20a947b67ab82 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -9206,84 +9206,355 @@ __STRUCTN (float, 64, 4)
 #undef __STRUCTN
 
 
-#define __ST2_LANE_FUNC(intype, largetype, ptrtype, mode,		     \
-			qmode, ptr_mode, funcsuffix, signedtype)	     \
-__extension__ extern __inline void					     \
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
-vst2_lane_ ## funcsuffix (ptrtype *__ptr,				     \
-			  intype __b, const int __c)			     \
-{									     \
-  __builtin_aarch64_simd_oi __o;					     \
-  largetype __temp;							     \
-  __temp.val[0]								     \
-    = vcombine_##funcsuffix (__b.val[0],				     \
-			     vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-  __temp.val[1]								     \
-    = vcombine_##funcsuffix (__b.val[1],				     \
-			     vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
-  __o = __builtin_aarch64_set_qregoi##qmode (__o,			     \
-					     (signedtype) __temp.val[0], 0); \
-  __o = __builtin_aarch64_set_qregoi##qmode (__o,			     \
-					     (signedtype) __temp.val[1], 1); \
-  __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *)  \
-				     __ptr, __o, __c);			     \
-}
-
-__ST2_LANE_FUNC (float16x4x2_t, float16x8x2_t, float16_t, v4hf, v8hf, hf, f16,
-		 float16x8_t)
-__ST2_LANE_FUNC (float32x2x2_t, float32x4x2_t, float32_t, v2sf, v4sf, sf, f32,
-		 float32x4_t)
-__ST2_LANE_FUNC (float64x1x2_t, float64x2x2_t, float64_t, df, v2df, df, f64,
-		 float64x2_t)
-__ST2_LANE_FUNC (poly8x8x2_t, poly8x16x2_t, poly8_t, v8qi, v16qi, qi, p8,
-		 int8x16_t)
-__ST2_LANE_FUNC (poly16x4x2_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi, p16,
-		 int16x8_t)
-__ST2_LANE_FUNC (poly64x1x2_t, poly64x2x2_t, poly64_t, di, v2di_ssps, di, p64,
-		 poly64x2_t)
-__ST2_LANE_FUNC (int8x8x2_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8,
-		 int8x16_t)
-__ST2_LANE_FUNC (int16x4x2_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16,
-		 int16x8_t)
-__ST2_LANE_FUNC (int32x2x2_t, int32x4x2_t, int32_t, v2si, v4si, si, s32,
-		 int32x4_t)
-__ST2_LANE_FUNC (int64x1x2_t, int64x2x2_t, int64_t, di, v2di, di, s64,
-		 int64x2_t)
-__ST2_LANE_FUNC (uint8x8x2_t, uint8x16x2_t, uint8_t, v8qi, v16qi, qi, u8,
-		 int8x16_t)
-__ST2_LANE_FUNC (uint16x4x2_t, uint16x8x2_t, uint16_t, v4hi, v8hi, hi, u16,
-		 int16x8_t)
-__ST2_LANE_FUNC (uint32x2x2_t, uint32x4x2_t, uint32_t, v2si, v4si, si, u32,
-		 int32x4_t)
-__ST2_LANE_FUNC (uint64x1x2_t, uint64x2x2_t, uint64_t, di, v2di, di, u64,
-		 int64x2_t)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_f16 (float16_t *__ptr, float16x4x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  float16x8x2_t __temp;
+  __temp.val[0]	= vcombine_f16 (__val.val[0],
+				vcreate_f16 (__AARCH64_UINT64_C (0)));
+  __temp.val[1]	= vcombine_f16 (__val.val[1],
+				vcreate_f16 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanev4hf ((__builtin_aarch64_simd_hf *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_f32 (float32_t *__ptr, float32x2x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  float32x4x2_t __temp;
+  __temp.val[0]	= vcombine_f32 (__val.val[0],
+				vcreate_f32 (__AARCH64_UINT64_C (0)));
+  __temp.val[1]	= vcombine_f32 (__val.val[1],
+				vcreate_f32 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanev2sf ((__builtin_aarch64_simd_sf *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_f64 (float64_t *__ptr, float64x1x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  float64x2x2_t __temp;
+  __temp.val[0]	= vcombine_f64 (__val.val[0],
+				vcreate_f64 (__AARCH64_UINT64_C (0)));
+  __temp.val[1]	= vcombine_f64 (__val.val[1],
+				vcreate_f64 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanedf ((__builtin_aarch64_simd_df *) __ptr, __o,
+				__lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_p8 (poly8_t *__ptr, poly8x8x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly8x16x2_t __temp;
+  __temp.val[0]	= vcombine_p8 (__val.val[0],
+			       vcreate_p8 (__AARCH64_UINT64_C (0)));
+  __temp.val[1]	= vcombine_p8 (__val.val[1],
+			       vcreate_p8 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_p16 (poly16_t *__ptr, poly16x4x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly16x8x2_t __temp;
+  __temp.val[0]	= vcombine_p16 (__val.val[0],
+				vcreate_p16 (__AARCH64_UINT64_C (0)));
+  __temp.val[1]	= vcombine_p16 (__val.val[1],
+				vcreate_p16 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_p64 (poly64_t *__ptr, poly64x1x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  poly64x2x2_t __temp;
+  __temp.val[0]	= vcombine_p64 (__val.val[0],
+				vcreate_p64 (__AARCH64_UINT64_C (0)));
+  __temp.val[1]	= vcombine_p64 (__val.val[1],
+				vcreate_p64 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o,
+				__lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_s8 (int8_t *__ptr, int8x8x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  int8x16x2_t __temp;
+  __temp.val[0]	= vcombine_s8 (__val.val[0],
+			       vcreate_s8 (__AARCH64_UINT64_C (0)));
+  __temp.val[1]	= vcombine_s8 (__val.val[1],
+			       vcreate_s8 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_s16 (int16_t *__ptr, int16x4x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  int16x8x2_t __temp;
+  __temp.val[0]	= vcombine_s16 (__val.val[0],
+				vcreate_s16 (__AARCH64_UINT64_C (0)));
+  __temp.val[1]	= vcombine_s16 (__val.val[1],
+				vcreate_s16 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_s32 (int32_t *__ptr, int32x2x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  int32x4x2_t __temp;
+  __temp.val[0]	= vcombine_s32 (__val.val[0],
+				vcreate_s32 (__AARCH64_UINT64_C (0)));
+  __temp.val[1]	= vcombine_s32 (__val.val[1],
+				vcreate_s32 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanev2si ((__builtin_aarch64_simd_si *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_s64 (int64_t *__ptr, int64x1x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  int64x2x2_t __temp;
+  __temp.val[0]	= vcombine_s64 (__val.val[0],
+				vcreate_s64 (__AARCH64_UINT64_C (0)));
+  __temp.val[1]	= vcombine_s64 (__val.val[1],
+				vcreate_s64 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o,
+				__lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_u8 (uint8_t *__ptr, uint8x8x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint8x16x2_t __temp;
+  __temp.val[0]	= vcombine_u8 (__val.val[0],
+			       vcreate_u8 (__AARCH64_UINT64_C (0)));
+  __temp.val[1]	= vcombine_u8 (__val.val[1],
+			       vcreate_u8 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_u16 (uint16_t *__ptr, uint16x4x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint16x8x2_t __temp;
+  __temp.val[0]	= vcombine_u16 (__val.val[0],
+				vcreate_u16 (__AARCH64_UINT64_C (0)));
+  __temp.val[1]	= vcombine_u16 (__val.val[1],
+				vcreate_u16 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o,
+				  __lane);
+}
 
-#define __ST2Q_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix)	    \
-__extension__ extern __inline void					    \
-__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
-vst2q_lane_ ## funcsuffix (ptrtype *__ptr,				    \
-			   intype __b, const int __c)			    \
-{									    \
-  union { intype __i;							    \
-	  __builtin_aarch64_simd_oi __o; } __temp = { __b };		    \
-  __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
-				    __ptr, __temp.__o, __c);		    \
-}
-
-__ST2Q_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16)
-__ST2Q_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
-__ST2Q_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
-__ST2Q_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
-__ST2Q_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16)
-__ST2Q_LANE_FUNC (poly64x2x2_t, poly64_t, v2di, di, p64)
-__ST2Q_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8)
-__ST2Q_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16)
-__ST2Q_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32)
-__ST2Q_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64)
-__ST2Q_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8)
-__ST2Q_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16)
-__ST2Q_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32)
-__ST2Q_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_u32 (uint32_t *__ptr, uint32x2x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint32x4x2_t __temp;
+  __temp.val[0]	= vcombine_u32 (__val.val[0],
+				vcreate_u32 (__AARCH64_UINT64_C (0)));
+  __temp.val[1]	= vcombine_u32 (__val.val[1],
+				vcreate_u32 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanev2si ((__builtin_aarch64_simd_si *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_u64 (uint64_t *__ptr, uint64x1x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  uint64x2x2_t __temp;
+  __temp.val[0]	= vcombine_u64 (__val.val[0],
+				vcreate_u64 (__AARCH64_UINT64_C (0)));
+  __temp.val[1]	= vcombine_u64 (__val.val[1],
+				vcreate_u64 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o,
+				__lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_f16 (float16_t *__ptr, float16x8x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev8hf ((__builtin_aarch64_simd_hf *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_f32 (float32_t *__ptr, float32x4x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev4sf ((__builtin_aarch64_simd_sf *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_f64 (float64_t *__ptr, float64x2x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev2df ((__builtin_aarch64_simd_df *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_p8 (poly8_t *__ptr, poly8x16x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o,
+				   __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_p16 (poly16_t *__ptr, poly16x8x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_p64 (poly64_t *__ptr, poly64x2x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_s8 (int8_t *__ptr, int8x16x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o,
+				   __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_s16 (int16_t *__ptr, int16x8x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_s32 (int32_t *__ptr, int32x4x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev4si ((__builtin_aarch64_simd_si *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_s64 (int64_t *__ptr, int64x2x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_u8 (uint8_t *__ptr, uint8x16x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o,
+				   __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_u16 (uint16_t *__ptr, uint16x8x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_u32 (uint32_t *__ptr, uint32x4x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev4si ((__builtin_aarch64_simd_si *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_u64 (uint64_t *__ptr, uint64x2x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o,
+				  __lane);
+}
 
 __extension__ extern __inline void
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
@@ -34334,9 +34605,30 @@ __LD4_LANE_FUNC (bfloat16x4x4_t, bfloat16x4_t, bfloat16x8x4_t, bfloat16_t, v4bf,
 		 v8bf, bf, bf16, bfloat16x8_t)
 __LD4Q_LANE_FUNC (bfloat16x8x4_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16)
 
-__ST2_LANE_FUNC (bfloat16x4x2_t, bfloat16x8x2_t, bfloat16_t, v4bf, v8bf, bf,
-		 bf16, bfloat16x8_t)
-__ST2Q_LANE_FUNC (bfloat16x8x2_t, bfloat16_t, v8bf, bf, bf16)
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_lane_bf16 (bfloat16_t *__ptr, bfloat16x4x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  bfloat16x8x2_t __temp;
+  __temp.val[0]	= vcombine_bf16 (__val.val[0],
+				 vcreate_bf16 (__AARCH64_UINT64_C (0)));
+  __temp.val[1]	= vcombine_bf16 (__val.val[1],
+				 vcreate_bf16 (__AARCH64_UINT64_C (0)));
+  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
+  __builtin_aarch64_st2_lanev4bf ((__builtin_aarch64_simd_bf *) __ptr, __o,
+				  __lane);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_lane_bf16 (bfloat16_t *__ptr, bfloat16x8x2_t __val, const int __lane)
+{
+  __builtin_aarch64_simd_oi __o;
+  __builtin_memcpy (&__o, &__val, sizeof (__val));
+  __builtin_aarch64_st2_lanev8bf ((__builtin_aarch64_simd_bf *) __ptr, __o,
+				  __lane);
+}
 
 __extension__ extern __inline void
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
@@ -34613,7 +34905,5 @@ vaddq_p128 (poly128_t __a, poly128_t __b)
 #undef __LD3Q_LANE_FUNC
 #undef __LD4_LANE_FUNC
 #undef __LD4Q_LANE_FUNC
-#undef __ST2_LANE_FUNC
-#undef __ST2Q_LANE_FUNC
 
 #endif
diff --git a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
index b8f993b375a87c3559f1e1836af90a7d84b0621c..e491d46394c7985d80930d1f7d9e8bd77f13c3c2 100644
--- a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
+++ b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
@@ -161,6 +161,22 @@ TEST_STX_LANE (vst4q_lane, uint64x2x4_t, uint64_t*, u64);
 TEST_STX_LANE (vst4q_lane, float64x2x4_t, float64_t*, f64);
 TEST_STX_LANE (vst4q_lane, poly64x2x4_t, poly64_t*, p64);
 
+TEST_STX_LANE (vst2q_lane, int8x16x2_t, int8_t*, s8);
+TEST_STX_LANE (vst2q_lane, uint8x16x2_t, uint8_t*, u8);
+TEST_STX_LANE (vst2q_lane, poly8x16x2_t, poly8_t*, p8);
+TEST_STX_LANE (vst2q_lane, int16x8x2_t, int16_t*, s16);
+TEST_STX_LANE (vst2q_lane, uint16x8x2_t, uint16_t*, u16);
+TEST_STX_LANE (vst2q_lane, poly16x8x2_t, poly16_t*, p16);
+TEST_STX_LANE (vst2q_lane, float16x8x2_t, float16_t*, f16);
+TEST_STX_LANE (vst2q_lane, bfloat16x8x2_t, bfloat16_t*, bf16);
+TEST_STX_LANE (vst2q_lane, int32x4x2_t, int32_t*, s32);
+TEST_STX_LANE (vst2q_lane, uint32x4x2_t, uint32_t*, u32);
+TEST_STX_LANE (vst2q_lane, float32x4x2_t, float32_t*, f32);
+TEST_STX_LANE (vst2q_lane, int64x2x2_t, int64_t*, s64);
+TEST_STX_LANE (vst2q_lane, uint64x2x2_t, uint64_t*, u64);
+TEST_STX_LANE (vst2q_lane, float64x2x2_t, float64_t*, f64);
+TEST_STX_LANE (vst2q_lane, poly64x2x2_t, poly64_t*, p64);
+
 #define TEST_ST3_LANE(name, tbltype, ptrtype, ts) \
   void test_ ## name ## _ ## ts (ptrtype a, int8x8_t dummy, tbltype b) \
 	{ \
@@ -247,5 +263,5 @@ TEST_ST1x3 (vst1q, float64x2x3_t, float64_t*, f64, x3);
 /* { dg-final { scan-assembler-times "tbx\\t" 18} }  */
 /* { dg-final { scan-assembler-times "st4\\t" 29} }  */
 /* { dg-final { scan-assembler-times "st3\\t" 29} }  */
-/* { dg-final { scan-assembler-times "st2\\t" 14} }  */
+/* { dg-final { scan-assembler-times "st2\\t" 29} }  */
 /* { dg-final { scan-assembler-times "st1\\t" 42} }  */

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH 3/4] aarch64: Use memcpy to copy structures in vst2[q]_lane intrinsics
  2021-08-05 17:15 [PATCH 3/4] aarch64: Use memcpy to copy structures in vst2[q]_lane intrinsics Jonathan Wright
@ 2021-08-06  9:31 ` Richard Sandiford
  0 siblings, 0 replies; 2+ messages in thread
From: Richard Sandiford @ 2021-08-06  9:31 UTC (permalink / raw)
  To: Jonathan Wright; +Cc: gcc-patches

Jonathan Wright <Jonathan.Wright@arm.com> writes:
> Hi,
>
> As subject, this patch uses __builtin_memcpy to copy vector structures
> instead of using a union - or constructing a new opaque structure one
> vector at a time - in each of the vst2[q]_lane Neon intrinsics in
> arm_neon.h.
>
> It also adds new code generation tests to verify that superfluous move
> instructions are not generated for the vst2q_lane intrinsics.
>
> Regression tested and bootstrapped on aarch64-none-linux-gnu - no
> issues.
>
> Ok for master?

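Ok with the same s/\t=/ =/ comment as for 1/4 (i.e. use a space rather
than a tab before the "=" in the "__temp.val[N] = vcombine_..." assignments).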

Thanks,
Richard

> Thanks,
> Jonathan
>
> ---
>
> gcc/ChangeLog:
>
> 2021-07-30  Jonathan Wright  <jonathan.wright@arm.com>
>
>         * config/aarch64/arm_neon.h (__ST2_LANE_FUNC): Delete.
>         (__ST2Q_LANE_FUNC): Delete.
>         (vst2_lane_f16): Use __builtin_memcpy to copy vector
>         structure instead of constructing __builtin_aarch64_simd_oi
>         one vector at a time.
>         (vst2_lane_f32): Likewise.
>         (vst2_lane_f64): Likewise.
>         (vst2_lane_p8): Likewise.
>         (vst2_lane_p16): Likewise.
>         (vst2_lane_p64): Likewise.
>         (vst2_lane_s8): Likewise.
>         (vst2_lane_s16): Likewise.
>         (vst2_lane_s32): Likewise.
>         (vst2_lane_s64): Likewise.
>         (vst2_lane_u8): Likewise.
>         (vst2_lane_u16): Likewise.
>         (vst2_lane_u32): Likewise.
>         (vst2_lane_u64): Likewise.
>         (vst2_lane_bf16): Likewise.
>         (vst2q_lane_f16): Use __builtin_memcpy to copy vector
>         structure instead of using a union.
>         (vst2q_lane_f32): Likewise.
>         (vst2q_lane_f64): Likewise.
>         (vst2q_lane_p8): Likewise.
>         (vst2q_lane_p16): Likewise.
>         (vst2q_lane_p64): Likewise.
>         (vst2q_lane_s8): Likewise.
>         (vst2q_lane_s16): Likewise.
>         (vst2q_lane_s32): Likewise.
>         (vst2q_lane_s64): Likewise.
>         (vst2q_lane_u8): Likewise.
>         (vst2q_lane_u16): Likewise.
>         (vst2q_lane_u32): Likewise.
>         (vst2q_lane_u64): Likewise.
>         (vst2q_lane_bf16): Likewise.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/vector_structure_intrinsics.c: Add new
>         tests.
>
> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> index d78ced8968869d9317d76368554bf6ce8f7e3afe..ed6ce179d76f34e1f946adb75bb20a947b67ab82 100644
> --- a/gcc/config/aarch64/arm_neon.h
> +++ b/gcc/config/aarch64/arm_neon.h
> @@ -9206,84 +9206,355 @@ __STRUCTN (float, 64, 4)
>  #undef __STRUCTN
>  
>  
> -#define __ST2_LANE_FUNC(intype, largetype, ptrtype, mode,		     \
> -			qmode, ptr_mode, funcsuffix, signedtype)	     \
> -__extension__ extern __inline void					     \
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
> -vst2_lane_ ## funcsuffix (ptrtype *__ptr,				     \
> -			  intype __b, const int __c)			     \
> -{									     \
> -  __builtin_aarch64_simd_oi __o;					     \
> -  largetype __temp;							     \
> -  __temp.val[0]								     \
> -    = vcombine_##funcsuffix (__b.val[0],				     \
> -			     vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
> -  __temp.val[1]								     \
> -    = vcombine_##funcsuffix (__b.val[1],				     \
> -			     vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
> -  __o = __builtin_aarch64_set_qregoi##qmode (__o,			     \
> -					     (signedtype) __temp.val[0], 0); \
> -  __o = __builtin_aarch64_set_qregoi##qmode (__o,			     \
> -					     (signedtype) __temp.val[1], 1); \
> -  __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *)  \
> -				     __ptr, __o, __c);			     \
> -}
> -
> -__ST2_LANE_FUNC (float16x4x2_t, float16x8x2_t, float16_t, v4hf, v8hf, hf, f16,
> -		 float16x8_t)
> -__ST2_LANE_FUNC (float32x2x2_t, float32x4x2_t, float32_t, v2sf, v4sf, sf, f32,
> -		 float32x4_t)
> -__ST2_LANE_FUNC (float64x1x2_t, float64x2x2_t, float64_t, df, v2df, df, f64,
> -		 float64x2_t)
> -__ST2_LANE_FUNC (poly8x8x2_t, poly8x16x2_t, poly8_t, v8qi, v16qi, qi, p8,
> -		 int8x16_t)
> -__ST2_LANE_FUNC (poly16x4x2_t, poly16x8x2_t, poly16_t, v4hi, v8hi, hi, p16,
> -		 int16x8_t)
> -__ST2_LANE_FUNC (poly64x1x2_t, poly64x2x2_t, poly64_t, di, v2di_ssps, di, p64,
> -		 poly64x2_t)
> -__ST2_LANE_FUNC (int8x8x2_t, int8x16x2_t, int8_t, v8qi, v16qi, qi, s8,
> -		 int8x16_t)
> -__ST2_LANE_FUNC (int16x4x2_t, int16x8x2_t, int16_t, v4hi, v8hi, hi, s16,
> -		 int16x8_t)
> -__ST2_LANE_FUNC (int32x2x2_t, int32x4x2_t, int32_t, v2si, v4si, si, s32,
> -		 int32x4_t)
> -__ST2_LANE_FUNC (int64x1x2_t, int64x2x2_t, int64_t, di, v2di, di, s64,
> -		 int64x2_t)
> -__ST2_LANE_FUNC (uint8x8x2_t, uint8x16x2_t, uint8_t, v8qi, v16qi, qi, u8,
> -		 int8x16_t)
> -__ST2_LANE_FUNC (uint16x4x2_t, uint16x8x2_t, uint16_t, v4hi, v8hi, hi, u16,
> -		 int16x8_t)
> -__ST2_LANE_FUNC (uint32x2x2_t, uint32x4x2_t, uint32_t, v2si, v4si, si, u32,
> -		 int32x4_t)
> -__ST2_LANE_FUNC (uint64x1x2_t, uint64x2x2_t, uint64_t, di, v2di, di, u64,
> -		 int64x2_t)
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2_lane_f16 (float16_t *__ptr, float16x4x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  float16x8x2_t __temp;
> +  __temp.val[0]	= vcombine_f16 (__val.val[0],
> +				vcreate_f16 (__AARCH64_UINT64_C (0)));
> +  __temp.val[1]	= vcombine_f16 (__val.val[1],
> +				vcreate_f16 (__AARCH64_UINT64_C (0)));
> +  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
> +  __builtin_aarch64_st2_lanev4hf ((__builtin_aarch64_simd_hf *) __ptr, __o,
> +				  __lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2_lane_f32 (float32_t *__ptr, float32x2x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  float32x4x2_t __temp;
> +  __temp.val[0]	= vcombine_f32 (__val.val[0],
> +				vcreate_f32 (__AARCH64_UINT64_C (0)));
> +  __temp.val[1]	= vcombine_f32 (__val.val[1],
> +				vcreate_f32 (__AARCH64_UINT64_C (0)));
> +  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
> +  __builtin_aarch64_st2_lanev2sf ((__builtin_aarch64_simd_sf *) __ptr, __o,
> +				  __lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2_lane_f64 (float64_t *__ptr, float64x1x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  float64x2x2_t __temp;
> +  __temp.val[0]	= vcombine_f64 (__val.val[0],
> +				vcreate_f64 (__AARCH64_UINT64_C (0)));
> +  __temp.val[1]	= vcombine_f64 (__val.val[1],
> +				vcreate_f64 (__AARCH64_UINT64_C (0)));
> +  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
> +  __builtin_aarch64_st2_lanedf ((__builtin_aarch64_simd_df *) __ptr, __o,
> +				__lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2_lane_p8 (poly8_t *__ptr, poly8x8x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  poly8x16x2_t __temp;
> +  __temp.val[0]	= vcombine_p8 (__val.val[0],
> +			       vcreate_p8 (__AARCH64_UINT64_C (0)));
> +  __temp.val[1]	= vcombine_p8 (__val.val[1],
> +			       vcreate_p8 (__AARCH64_UINT64_C (0)));
> +  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
> +  __builtin_aarch64_st2_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o,
> +				  __lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2_lane_p16 (poly16_t *__ptr, poly16x4x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  poly16x8x2_t __temp;
> +  __temp.val[0]	= vcombine_p16 (__val.val[0],
> +				vcreate_p16 (__AARCH64_UINT64_C (0)));
> +  __temp.val[1]	= vcombine_p16 (__val.val[1],
> +				vcreate_p16 (__AARCH64_UINT64_C (0)));
> +  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
> +  __builtin_aarch64_st2_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o,
> +				  __lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2_lane_p64 (poly64_t *__ptr, poly64x1x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  poly64x2x2_t __temp;
> +  __temp.val[0]	= vcombine_p64 (__val.val[0],
> +				vcreate_p64 (__AARCH64_UINT64_C (0)));
> +  __temp.val[1]	= vcombine_p64 (__val.val[1],
> +				vcreate_p64 (__AARCH64_UINT64_C (0)));
> +  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
> +  __builtin_aarch64_st2_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o,
> +				__lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2_lane_s8 (int8_t *__ptr, int8x8x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  int8x16x2_t __temp;
> +  __temp.val[0]	= vcombine_s8 (__val.val[0],
> +			       vcreate_s8 (__AARCH64_UINT64_C (0)));
> +  __temp.val[1]	= vcombine_s8 (__val.val[1],
> +			       vcreate_s8 (__AARCH64_UINT64_C (0)));
> +  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
> +  __builtin_aarch64_st2_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o,
> +				  __lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2_lane_s16 (int16_t *__ptr, int16x4x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  int16x8x2_t __temp;
> +  __temp.val[0]	= vcombine_s16 (__val.val[0],
> +				vcreate_s16 (__AARCH64_UINT64_C (0)));
> +  __temp.val[1]	= vcombine_s16 (__val.val[1],
> +				vcreate_s16 (__AARCH64_UINT64_C (0)));
> +  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
> +  __builtin_aarch64_st2_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o,
> +				  __lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2_lane_s32 (int32_t *__ptr, int32x2x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  int32x4x2_t __temp;
> +  __temp.val[0]	= vcombine_s32 (__val.val[0],
> +				vcreate_s32 (__AARCH64_UINT64_C (0)));
> +  __temp.val[1]	= vcombine_s32 (__val.val[1],
> +				vcreate_s32 (__AARCH64_UINT64_C (0)));
> +  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
> +  __builtin_aarch64_st2_lanev2si ((__builtin_aarch64_simd_si *) __ptr, __o,
> +				  __lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2_lane_s64 (int64_t *__ptr, int64x1x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  int64x2x2_t __temp;
> +  __temp.val[0]	= vcombine_s64 (__val.val[0],
> +				vcreate_s64 (__AARCH64_UINT64_C (0)));
> +  __temp.val[1]	= vcombine_s64 (__val.val[1],
> +				vcreate_s64 (__AARCH64_UINT64_C (0)));
> +  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
> +  __builtin_aarch64_st2_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o,
> +				__lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2_lane_u8 (uint8_t *__ptr, uint8x8x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  uint8x16x2_t __temp;
> +  __temp.val[0]	= vcombine_u8 (__val.val[0],
> +			       vcreate_u8 (__AARCH64_UINT64_C (0)));
> +  __temp.val[1]	= vcombine_u8 (__val.val[1],
> +			       vcreate_u8 (__AARCH64_UINT64_C (0)));
> +  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
> +  __builtin_aarch64_st2_lanev8qi ((__builtin_aarch64_simd_qi *) __ptr, __o,
> +				  __lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2_lane_u16 (uint16_t *__ptr, uint16x4x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  uint16x8x2_t __temp;
> +  __temp.val[0]	= vcombine_u16 (__val.val[0],
> +				vcreate_u16 (__AARCH64_UINT64_C (0)));
> +  __temp.val[1]	= vcombine_u16 (__val.val[1],
> +				vcreate_u16 (__AARCH64_UINT64_C (0)));
> +  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
> +  __builtin_aarch64_st2_lanev4hi ((__builtin_aarch64_simd_hi *) __ptr, __o,
> +				  __lane);
> +}
>  
> -#define __ST2Q_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix)	    \
> -__extension__ extern __inline void					    \
> -__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
> -vst2q_lane_ ## funcsuffix (ptrtype *__ptr,				    \
> -			   intype __b, const int __c)			    \
> -{									    \
> -  union { intype __i;							    \
> -	  __builtin_aarch64_simd_oi __o; } __temp = { __b };		    \
> -  __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
> -				    __ptr, __temp.__o, __c);		    \
> -}
> -
> -__ST2Q_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16)
> -__ST2Q_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
> -__ST2Q_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
> -__ST2Q_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
> -__ST2Q_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16)
> -__ST2Q_LANE_FUNC (poly64x2x2_t, poly64_t, v2di, di, p64)
> -__ST2Q_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8)
> -__ST2Q_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16)
> -__ST2Q_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32)
> -__ST2Q_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64)
> -__ST2Q_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8)
> -__ST2Q_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16)
> -__ST2Q_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32)
> -__ST2Q_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64)
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2_lane_u32 (uint32_t *__ptr, uint32x2x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  uint32x4x2_t __temp;
> +  __temp.val[0]	= vcombine_u32 (__val.val[0],
> +				vcreate_u32 (__AARCH64_UINT64_C (0)));
> +  __temp.val[1]	= vcombine_u32 (__val.val[1],
> +				vcreate_u32 (__AARCH64_UINT64_C (0)));
> +  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
> +  __builtin_aarch64_st2_lanev2si ((__builtin_aarch64_simd_si *) __ptr, __o,
> +				  __lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2_lane_u64 (uint64_t *__ptr, uint64x1x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  uint64x2x2_t __temp;
> +  __temp.val[0]	= vcombine_u64 (__val.val[0],
> +				vcreate_u64 (__AARCH64_UINT64_C (0)));
> +  __temp.val[1]	= vcombine_u64 (__val.val[1],
> +				vcreate_u64 (__AARCH64_UINT64_C (0)));
> +  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
> +  __builtin_aarch64_st2_lanedi ((__builtin_aarch64_simd_di *) __ptr, __o,
> +				__lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2q_lane_f16 (float16_t *__ptr, float16x8x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  __builtin_memcpy (&__o, &__val, sizeof (__val));
> +  __builtin_aarch64_st2_lanev8hf ((__builtin_aarch64_simd_hf *) __ptr, __o,
> +				  __lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2q_lane_f32 (float32_t *__ptr, float32x4x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  __builtin_memcpy (&__o, &__val, sizeof (__val));
> +  __builtin_aarch64_st2_lanev4sf ((__builtin_aarch64_simd_sf *) __ptr, __o,
> +				  __lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2q_lane_f64 (float64_t *__ptr, float64x2x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  __builtin_memcpy (&__o, &__val, sizeof (__val));
> +  __builtin_aarch64_st2_lanev2df ((__builtin_aarch64_simd_df *) __ptr, __o,
> +				  __lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2q_lane_p8 (poly8_t *__ptr, poly8x16x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  __builtin_memcpy (&__o, &__val, sizeof (__val));
> +  __builtin_aarch64_st2_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o,
> +				   __lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2q_lane_p16 (poly16_t *__ptr, poly16x8x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  __builtin_memcpy (&__o, &__val, sizeof (__val));
> +  __builtin_aarch64_st2_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o,
> +				  __lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2q_lane_p64 (poly64_t *__ptr, poly64x2x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  __builtin_memcpy (&__o, &__val, sizeof (__val));
> +  __builtin_aarch64_st2_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o,
> +				  __lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2q_lane_s8 (int8_t *__ptr, int8x16x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  __builtin_memcpy (&__o, &__val, sizeof (__val));
> +  __builtin_aarch64_st2_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o,
> +				   __lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2q_lane_s16 (int16_t *__ptr, int16x8x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  __builtin_memcpy (&__o, &__val, sizeof (__val));
> +  __builtin_aarch64_st2_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o,
> +				  __lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2q_lane_s32 (int32_t *__ptr, int32x4x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  __builtin_memcpy (&__o, &__val, sizeof (__val));
> +  __builtin_aarch64_st2_lanev4si ((__builtin_aarch64_simd_si *) __ptr, __o,
> +				  __lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2q_lane_s64 (int64_t *__ptr, int64x2x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  __builtin_memcpy (&__o, &__val, sizeof (__val));
> +  __builtin_aarch64_st2_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o,
> +				  __lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2q_lane_u8 (uint8_t *__ptr, uint8x16x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  __builtin_memcpy (&__o, &__val, sizeof (__val));
> +  __builtin_aarch64_st2_lanev16qi ((__builtin_aarch64_simd_qi *) __ptr, __o,
> +				   __lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2q_lane_u16 (uint16_t *__ptr, uint16x8x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  __builtin_memcpy (&__o, &__val, sizeof (__val));
> +  __builtin_aarch64_st2_lanev8hi ((__builtin_aarch64_simd_hi *) __ptr, __o,
> +				  __lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2q_lane_u32 (uint32_t *__ptr, uint32x4x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  __builtin_memcpy (&__o, &__val, sizeof (__val));
> +  __builtin_aarch64_st2_lanev4si ((__builtin_aarch64_simd_si *) __ptr, __o,
> +				  __lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2q_lane_u64 (uint64_t *__ptr, uint64x2x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  __builtin_memcpy (&__o, &__val, sizeof (__val));
> +  __builtin_aarch64_st2_lanev2di ((__builtin_aarch64_simd_di *) __ptr, __o,
> +				  __lane);
> +}
>  
>  __extension__ extern __inline void
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> @@ -34334,9 +34605,30 @@ __LD4_LANE_FUNC (bfloat16x4x4_t, bfloat16x4_t, bfloat16x8x4_t, bfloat16_t, v4bf,
>  		 v8bf, bf, bf16, bfloat16x8_t)
>  __LD4Q_LANE_FUNC (bfloat16x8x4_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16)
>  
> -__ST2_LANE_FUNC (bfloat16x4x2_t, bfloat16x8x2_t, bfloat16_t, v4bf, v8bf, bf,
> -		 bf16, bfloat16x8_t)
> -__ST2Q_LANE_FUNC (bfloat16x8x2_t, bfloat16_t, v8bf, bf, bf16)
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2_lane_bf16 (bfloat16_t *__ptr, bfloat16x4x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  bfloat16x8x2_t __temp;
> +  __temp.val[0]	= vcombine_bf16 (__val.val[0],
> +				 vcreate_bf16 (__AARCH64_UINT64_C (0)));
> +  __temp.val[1]	= vcombine_bf16 (__val.val[1],
> +				 vcreate_bf16 (__AARCH64_UINT64_C (0)));
> +  __builtin_memcpy (&__o, &__temp, sizeof (__temp));
> +  __builtin_aarch64_st2_lanev4bf ((__builtin_aarch64_simd_bf *) __ptr, __o,
> +				  __lane);
> +}
> +
> +__extension__ extern __inline void
> +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> +vst2q_lane_bf16 (bfloat16_t *__ptr, bfloat16x8x2_t __val, const int __lane)
> +{
> +  __builtin_aarch64_simd_oi __o;
> +  __builtin_memcpy (&__o, &__val, sizeof (__val));
> +  __builtin_aarch64_st2_lanev8bf ((__builtin_aarch64_simd_bf *) __ptr, __o,
> +				  __lane);
> +}
>  
>  __extension__ extern __inline void
>  __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
> @@ -34613,7 +34905,5 @@ vaddq_p128 (poly128_t __a, poly128_t __b)
>  #undef __LD3Q_LANE_FUNC
>  #undef __LD4_LANE_FUNC
>  #undef __LD4Q_LANE_FUNC
> -#undef __ST2_LANE_FUNC
> -#undef __ST2Q_LANE_FUNC
>  
>  #endif
> diff --git a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
> index b8f993b375a87c3559f1e1836af90a7d84b0621c..e491d46394c7985d80930d1f7d9e8bd77f13c3c2 100644
> --- a/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
> +++ b/gcc/testsuite/gcc.target/aarch64/vector_structure_intrinsics.c
> @@ -161,6 +161,22 @@ TEST_STX_LANE (vst4q_lane, uint64x2x4_t, uint64_t*, u64);
>  TEST_STX_LANE (vst4q_lane, float64x2x4_t, float64_t*, f64);
>  TEST_STX_LANE (vst4q_lane, poly64x2x4_t, poly64_t*, p64);
>  
> +TEST_STX_LANE (vst2q_lane, int8x16x2_t, int8_t*, s8);
> +TEST_STX_LANE (vst2q_lane, uint8x16x2_t, uint8_t*, u8);
> +TEST_STX_LANE (vst2q_lane, poly8x16x2_t, poly8_t*, p8);
> +TEST_STX_LANE (vst2q_lane, int16x8x2_t, int16_t*, s16);
> +TEST_STX_LANE (vst2q_lane, uint16x8x2_t, uint16_t*, u16);
> +TEST_STX_LANE (vst2q_lane, poly16x8x2_t, poly16_t*, p16);
> +TEST_STX_LANE (vst2q_lane, float16x8x2_t, float16_t*, f16);
> +TEST_STX_LANE (vst2q_lane, bfloat16x8x2_t, bfloat16_t*, bf16);
> +TEST_STX_LANE (vst2q_lane, int32x4x2_t, int32_t*, s32);
> +TEST_STX_LANE (vst2q_lane, uint32x4x2_t, uint32_t*, u32);
> +TEST_STX_LANE (vst2q_lane, float32x4x2_t, float32_t*, f32);
> +TEST_STX_LANE (vst2q_lane, int64x2x2_t, int64_t*, s64);
> +TEST_STX_LANE (vst2q_lane, uint64x2x2_t, uint64_t*, u64);
> +TEST_STX_LANE (vst2q_lane, float64x2x2_t, float64_t*, f64);
> +TEST_STX_LANE (vst2q_lane, poly64x2x2_t, poly64_t*, p64);
> +
>  #define TEST_ST3_LANE(name, tbltype, ptrtype, ts) \
>    void test_ ## name ## _ ## ts (ptrtype a, int8x8_t dummy, tbltype b) \
>  	{ \
> @@ -247,5 +263,5 @@ TEST_ST1x3 (vst1q, float64x2x3_t, float64_t*, f64, x3);
>  /* { dg-final { scan-assembler-times "tbx\\t" 18} }  */
>  /* { dg-final { scan-assembler-times "st4\\t" 29} }  */
>  /* { dg-final { scan-assembler-times "st3\\t" 29} }  */
> -/* { dg-final { scan-assembler-times "st2\\t" 14} }  */
> +/* { dg-final { scan-assembler-times "st2\\t" 29} }  */
>  /* { dg-final { scan-assembler-times "st1\\t" 42} }  */

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2021-08-06  9:31 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-05 17:15 [PATCH 3/4] aarch64: Use memcpy to copy structures in vst2[q]_lane intrinsics Jonathan Wright
2021-08-06  9:31 ` Richard Sandiford

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as for URLs of the read-only IMAP folder(s) and NNTP newsgroup(s).