* Re: [PR66791][ARM] Replace __builtin_vext* with __builtin_shuffle in vext intrinsics
2021-01-04 10:30 ` Kyrylo Tkachov
@ 2021-01-05 11:41 ` Prathamesh Kulkarni
2021-01-05 13:07 ` Kyrylo Tkachov
0 siblings, 1 reply; 4+ messages in thread
From: Prathamesh Kulkarni @ 2021-01-05 11:41 UTC (permalink / raw)
To: Kyrylo Tkachov; +Cc: gcc Patches
[-- Attachment #1: Type: text/plain, Size: 938 bytes --]
On Mon, 4 Jan 2021 at 16:01, Kyrylo Tkachov <Kyrylo.Tkachov@arm.com> wrote:
>
> Hi Prathamesh
>
> > -----Original Message-----
> > From: Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
> > Sent: 04 January 2021 10:27
> > To: gcc Patches <gcc-patches@gcc.gnu.org>; Kyrylo Tkachov
> > <Kyrylo.Tkachov@arm.com>
> > Subject: [PR66791][ARM] Replace __builtin_vext* with __builtin_shuffle in
> > vext intrinsics
> >
> > Hi Kyrill,
> > The attached patch replaces __builtin_vextv8qi with __builtin_shuffle
> > for vext_s8.
> > Just wanted to confirm if this is in the correct direction ?
> > If yes, I will send a follow up patch that converts for all vext intrinsics.
>
> Yeah, that does look correct (aarch64 does it that way).
> As before, please make sure to delete any now-unused builtins as well.
Thanks, does the attached patch look OK ?
Testing in progress.
Thanks,
Prathamesh
>
> Thanks,
> Kyrill
>
> >
> > Thanks,
> > Prathamesh
[-- Attachment #2: vext-2.diff --]
[-- Type: application/octet-stream, Size: 13083 bytes --]
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 3efcfa45229..efdaceafcb5 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -8733,77 +8733,131 @@ __extension__ extern __inline int8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vext_s8 (int8x8_t __a, int8x8_t __b, const int __c)
{
- return (int8x8_t)__builtin_neon_vextv8qi (__a, __b, __c);
+#ifdef __ARMEB__
+ return __builtin_shuffle (__b, __a, (uint8x8_t)
+ {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x8_t)
+ {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
+#endif
}
__extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vext_s16 (int16x4_t __a, int16x4_t __b, const int __c)
{
- return (int16x4_t)__builtin_neon_vextv4hi (__a, __b, __c);
+#ifdef __ARMEB__
+ return __builtin_shuffle (__b, __a, (uint16x4_t)
+ {4-__c, 5-__c, 6-__c, 7-__c});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x4_t)
+ {__c, __c+1, __c+2, __c+3});
+#endif
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vext_s32 (int32x2_t __a, int32x2_t __b, const int __c)
{
- return (int32x2_t)__builtin_neon_vextv2si (__a, __b, __c);
+#ifdef __ARMEB__
+ return __builtin_shuffle (__b, __a, (uint32x2_t)
+ {2-__c, 3-__c});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x2_t)
+ {__c, __c+1});
+#endif
}
__extension__ extern __inline int64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vext_s64 (int64x1_t __a, int64x1_t __b, const int __c)
{
- return (int64x1_t)__builtin_neon_vextdi (__a, __b, __c);
+ return __a;
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vext_f32 (float32x2_t __a, float32x2_t __b, const int __c)
{
- return (float32x2_t)__builtin_neon_vextv2sf (__a, __b, __c);
+#ifdef __ARMEB__
+ return __builtin_shuffle (__b, __a, (uint32x2_t)
+ {2-__c, 3-__c});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x2_t)
+ {__c, __c+1});
+#endif
}
__extension__ extern __inline uint8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vext_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
{
- return (uint8x8_t)__builtin_neon_vextv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
+#ifdef __ARMEB__
+ return __builtin_shuffle (__b, __a, (uint8x8_t)
+ {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x8_t)
+ {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
+#endif
}
__extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vext_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
{
- return (uint16x4_t)__builtin_neon_vextv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
+#ifdef __ARMEB__
+ return __builtin_shuffle (__b, __a, (uint16x4_t)
+ {4-__c, 5-__c, 6-__c, 7-__c});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x4_t)
+ {__c, __c+1, __c+2, __c+3});
+#endif
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vext_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
{
- return (uint32x2_t)__builtin_neon_vextv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
+#ifdef __ARMEB__
+ return __builtin_shuffle (__b, __a, (uint32x2_t)
+ {2-__c, 3-__c});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x2_t)
+ {__c, __c+1});
+#endif
}
__extension__ extern __inline uint64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vext_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
{
- return (uint64x1_t)__builtin_neon_vextdi ((int64x1_t) __a, (int64x1_t) __b, __c);
+ return __a;
}
__extension__ extern __inline poly8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vext_p8 (poly8x8_t __a, poly8x8_t __b, const int __c)
{
- return (poly8x8_t)__builtin_neon_vextv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
+#ifdef __ARMEB__
+ return __builtin_shuffle (__b, __a, (uint8x8_t)
+ {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x8_t)
+ {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
+#endif
}
__extension__ extern __inline poly16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vext_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
{
- return (poly16x4_t)__builtin_neon_vextv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
+#ifdef __ARMEB__
+ return __builtin_shuffle (__b, __a, (uint16x4_t)
+ {4-__c, 5-__c, 6-__c, 7-__c});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x4_t)
+ {__c, __c+1, __c+2, __c+3});
+#endif
}
#pragma GCC push_options
@@ -8812,7 +8866,13 @@ __extension__ extern __inline poly64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vextq_p64 (poly64x2_t __a, poly64x2_t __b, const int __c)
{
- return (poly64x2_t)__builtin_neon_vextv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
+#ifdef __ARMEB__
+ return __builtin_shuffle (__b, __a, (uint64x2_t)
+ {2-__c, 3-__c});
+#else
+ return __builtin_shuffle (__a, __b, (uint64x2_t)
+ {__c, __c+1});
+#endif
}
#pragma GCC pop_options
@@ -8820,77 +8880,145 @@ __extension__ extern __inline int8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vextq_s8 (int8x16_t __a, int8x16_t __b, const int __c)
{
- return (int8x16_t)__builtin_neon_vextv16qi (__a, __b, __c);
+#ifdef __ARMEB__
+ return __builtin_shuffle (__b, __a, (uint8x16_t)
+ {16-__c, 17-__c, 18-__c, 19-__c, 20-__c, 21-__c, 22-__c, 23-__c,
+ 24-__c, 25-__c, 26-__c, 27-__c, 28-__c, 29-__c, 30-__c, 31-__c});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x16_t)
+ {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7,
+ __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15});
+#endif
}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vextq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
{
- return (int16x8_t)__builtin_neon_vextv8hi (__a, __b, __c);
+#ifdef __ARMEB__
+ return __builtin_shuffle (__b, __a, (uint16x8_t)
+ {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x8_t)
+ {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
+#endif
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vextq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
{
- return (int32x4_t)__builtin_neon_vextv4si (__a, __b, __c);
+#ifdef __ARMEB__
+ return __builtin_shuffle (__b, __a, (uint32x4_t)
+ {4-__c, 5-__c, 6-__c, 7-__c});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x4_t)
+ {__c, __c+1, __c+2, __c+3});
+#endif
}
__extension__ extern __inline int64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vextq_s64 (int64x2_t __a, int64x2_t __b, const int __c)
{
- return (int64x2_t)__builtin_neon_vextv2di (__a, __b, __c);
+#ifdef __ARMEB__
+ return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c});
+#else
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {__c, __c+1});
+#endif
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vextq_f32 (float32x4_t __a, float32x4_t __b, const int __c)
{
- return (float32x4_t)__builtin_neon_vextv4sf (__a, __b, __c);
+#ifdef __ARMEB__
+ return __builtin_shuffle (__b, __a, (uint32x4_t)
+ {4-__c, 5-__c, 6-__c, 7-__c});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x4_t)
+ {__c, __c+1, __c+2, __c+3});
+#endif
}
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vextq_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
{
- return (uint8x16_t)__builtin_neon_vextv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
+#ifdef __ARMEB__
+ return __builtin_shuffle (__b, __a, (uint8x16_t)
+ {16-__c, 17-__c, 18-__c, 19-__c, 20-__c, 21-__c, 22-__c, 23-__c,
+ 24-__c, 25-__c, 26-__c, 27-__c, 28-__c, 29-__c, 30-__c, 31-__c});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x16_t)
+ {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7,
+ __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15});
+#endif
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vextq_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
{
- return (uint16x8_t)__builtin_neon_vextv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
+#ifdef __ARMEB__
+ return __builtin_shuffle (__b, __a, (uint16x8_t)
+ {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x8_t)
+ {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
+#endif
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vextq_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
{
- return (uint32x4_t)__builtin_neon_vextv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
+#ifdef __ARMEB__
+ return __builtin_shuffle (__b, __a, (uint32x4_t)
+ {4-__c, 5-__c, 6-__c, 7-__c});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x4_t)
+ {__c, __c+1, __c+2, __c+3});
+#endif
}
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vextq_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
{
- return (uint64x2_t)__builtin_neon_vextv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
+#ifdef __ARMEB__
+ return __builtin_shuffle (__b, __a, (uint64x2_t) {2-__c, 3-__c});
+#else
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {__c, __c+1});
+#endif
}
__extension__ extern __inline poly8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vextq_p8 (poly8x16_t __a, poly8x16_t __b, const int __c)
{
- return (poly8x16_t)__builtin_neon_vextv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
+#ifdef __ARMEB__
+ return __builtin_shuffle (__b, __a, (uint8x16_t)
+ {16-__c, 17-__c, 18-__c, 19-__c, 20-__c, 21-__c, 22-__c, 23-__c,
+ 24-__c, 25-__c, 26-__c, 27-__c, 28-__c, 29-__c, 30-__c, 31-__c});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x16_t)
+ {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7,
+ __c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15});
+#endif
}
__extension__ extern __inline poly16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vextq_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
{
- return (poly16x8_t)__builtin_neon_vextv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
+#ifdef __ARMEB__
+ return __builtin_shuffle (__b, __a, (uint16x8_t)
+ {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x8_t)
+ {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
+#endif
}
__extension__ extern __inline int8x8_t
@@ -17907,14 +18035,26 @@ __extension__ extern __inline float16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vext_f16 (float16x4_t __a, float16x4_t __b, const int __c)
{
- return __builtin_neon_vextv4hf (__a, __b, __c);
+#ifdef __ARMEB__
+ return __builtin_shuffle (__b, __a, (uint16x4_t)
+ {4-__c, 5-__c, 6-__c, 7-__c});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x4_t)
+ {__c, __c+1, __c+2, __c+3});
+#endif
}
__extension__ extern __inline float16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vextq_f16 (float16x8_t __a, float16x8_t __b, const int __c)
{
- return __builtin_neon_vextv8hf (__a, __b, __c);
+#ifdef __ARMEB__
+ return __builtin_shuffle (__b, __a, (uint16x8_t)
+ {8-__c, 9-__c, 10-__c, 11-__c, 12-__c, 13-__c, 14-__c, 15-__c});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x8_t)
+ {__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7});
+#endif
}
__extension__ extern __inline float16x4_t
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index ae104d5ba1b..6834b8dfdcd 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -247,9 +247,6 @@ VAR6 (MAC_N, vmls_n, v4hi, v2si, v2sf, v8hi, v4si, v4sf)
VAR2 (MAC_N, vmlsls_n, v4hi, v2si)
VAR2 (MAC_N, vmlslu_n, v4hi, v2si)
VAR2 (MAC_N, vqdmlsl_n, v4hi, v2si)
-VAR10 (SETLANE, vext,
- v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
-VAR2 (SETLANE, vext, v8hf, v4hf)
VAR8 (UNOP, vrev64, v8qi, v4hi, v2si, v2sf, v16qi, v8hi, v4si, v4sf)
VAR4 (UNOP, vrev32, v8qi, v4hi, v16qi, v8hi)
VAR2 (UNOP, vrev16, v8qi, v16qi)
^ permalink raw reply [flat|nested] 4+ messages in thread