[ARM] PR66791: Replace builtins for vdup_n and vmov

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [ARM] PR66791: Replace builtins for vdup_n and vmov_n intrinsics
@ 2021-06-24 11:11 Prathamesh Kulkarni
  2021-06-24 16:28 ` Kyrylo Tkachov
  0 siblings, 1 reply; 11+ messages in thread
From: Prathamesh Kulkarni @ 2021-06-24 11:11 UTC (permalink / raw)
  To: gcc Patches, Kyrill Tkachov

[-- Attachment #1: Type: text/plain, Size: 703 bytes --]

Hi,
This patch replaces builtins for vdup_n and vmov_n.
The patch results in regression for pr51534.c.
Consider following function:

uint8x8_t f1 (uint8x8_t a) {
  return vcgt_u8(a, vdup_n_u8(0));
}

code-gen before patch:
f1:
        vmov.i32  d16, #0  @ v8qi
        vcgt.u8     d0, d0, d16
        bx             lr

code-gen after patch:
f1:
        vceq.i8 d0, d0, #0
        vmvn    d0, d0
        bx         lr

I am not sure which one is better tho ?

Also, this patch regressed bf16_dup.c on arm-linux-gnueabi,
which is due to a missed opt in lowering. I had filed it as
PR98435, and posted a fix for it here:
https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572648.html

Thanks,
Prathamesh

[-- Attachment #2: vdup-1.txt --]
[-- Type: text/plain, Size: 16366 bytes --]

2021-06-24  Prathamesh Kulkarni  <prathamesh.kulkarni@linaro.org>

	PR target/66791
	* gcc/config/arm/arm_neon.h (vdup_n_s8): Replace call to builtin
	with constructor.
	(vdup_n_s16): Likewise.
	(vdup_n_s32): Likewise.
	(vdup_n_s64): Likewise.
	(vdup_n_u8): Likewise.
	(vdup_n_u16): Likewise.
	(vdup_n_u32): Likewise.
	(vdup_n_u64): Likewise.
	(vdup_n_p8): Likewise.
	(vdup_n_p16): Likewise.
	(vdup_n_p64): Likewise.
	(vdup_n_f16): Likewise.
	(vdup_n_f32): Likewise.
	(vdupq_n_s8): Likewise.
	(vdupq_n_s16): Likewise.
	(vdupq_n_s32): Likewise.
	(vdupq_n_s64): Likewise.
	(vdupq_n_u8): Likewise.
	(vdupq_n_u16): Likewise.
	(vdupq_n_u32): Likewise.
	(vdupq_n_u64): Likewise.
	(vdupq_n_p8): Likewise.
	(vdupq_n_p16): Likewise.
	(vdupq_n_p64): Likewise.
	(vdupq_n_f16): Likewise.
	(vdupq_n_f32): Likewise.
	(vmov_n_s8): Replace call to builtin with call to corresponding
	vdup intrinsic.
	(vmov_n_s16): Likewise.
	(vmov_n_s32): Likewise.
	(vmov_n_s64): Likewise.
	(vmov_n_u8): Likewise.
	(vmov_n_u16): Likewise.
	(vmov_n_u32): Likewise.
	(vmov_n_u64): Likewise.
	(vmov_n_p8): Likewise.
	(vmov_n_p16): Likewise.
	(vmov_n_f16): Likewise.
	(vmov_n_f32): Likewise.	
	(vmovq_n_s8): Likewise. 
	(vmovq_n_s16): Likewise.
	(vmovq_n_s32): Likewise.
	(vmovq_n_s64): Likewise.
	(vmovq_n_u8): Likewise.
	(vmovq_n_u16): Likewise.
	(vmovq_n_u32): Likewise.
	(vmovq_n_u64): Likewise.
	(vmovq_n_p8): Likewise.
	(vmovq_n_p16): Likewise.
	(vmovq_n_f16): Likewise.
	(vmovq_n_f32): Likewise.
	* config/arm/arm_neon_builtins.def: Remove entries for vdup_n.

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 3efcfa45229..bf26cd49d53 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -6625,63 +6625,63 @@ __extension__ extern __inline int8x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_s8 (int8_t __a)
 {
-  return (int8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
+  return (int8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
 }
 
 __extension__ extern __inline int16x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_s16 (int16_t __a)
 {
-  return (int16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
+  return (int16x4_t) {__a, __a, __a, __a};
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_s32 (int32_t __a)
 {
-  return (int32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
+  return (int32x2_t) {__a, __a};
 }
 
 __extension__ extern __inline float32x2_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_f32 (float32_t __a)
 {
-  return (float32x2_t)__builtin_neon_vdup_nv2sf ((__builtin_neon_sf) __a);
+  return (float32x2_t) {__a, __a};
 }
 
 __extension__ extern __inline uint8x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_u8 (uint8_t __a)
 {
-  return (uint8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
+  return (uint8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
 }
 
 __extension__ extern __inline uint16x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_u16 (uint16_t __a)
 {
-  return (uint16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
+  return (uint16x4_t) {__a, __a, __a, __a};
 }
 
 __extension__ extern __inline uint32x2_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_u32 (uint32_t __a)
 {
-  return (uint32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
+  return (uint32x2_t) {__a, __a};
 }
 
 __extension__ extern __inline poly8x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_p8 (poly8_t __a)
 {
-  return (poly8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
+  return (poly8x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
 }
 
 __extension__ extern __inline poly16x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_p16 (poly16_t __a)
 {
-  return (poly16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
+  return (poly16x4_t) {__a, __a, __a, __a};
 }
 
 #pragma GCC push_options
@@ -6690,7 +6690,7 @@ __extension__ extern __inline poly64x1_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_p64 (poly64_t __a)
 {
-  return (poly64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
+  return (poly64x1_t) {__a};
 }
 
 #pragma GCC pop_options
@@ -6698,14 +6698,14 @@ __extension__ extern __inline int64x1_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_s64 (int64_t __a)
 {
-  return (int64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
+  return (int64x1_t) {__a};
 }
 
 __extension__ extern __inline uint64x1_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_u64 (uint64_t __a)
 {
-  return (uint64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
+  return (uint64x1_t) {__a};
 }
 
 #pragma GCC push_options
@@ -6714,7 +6714,7 @@ __extension__ extern __inline poly64x2_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_p64 (poly64_t __a)
 {
-  return (poly64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
+  return (poly64x2_t) {__a, __a};
 }
 
 #pragma GCC pop_options
@@ -6722,231 +6722,234 @@ __extension__ extern __inline int8x16_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_s8 (int8_t __a)
 {
-  return (int8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
+  return (int8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
+		      __a, __a, __a, __a, __a, __a, __a, __a};
 }
 
 __extension__ extern __inline int16x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_s16 (int16_t __a)
 {
-  return (int16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
+  return (int16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_s32 (int32_t __a)
 {
-  return (int32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
+  return (int32x4_t) {__a, __a, __a, __a};
 }
 
 __extension__ extern __inline float32x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_f32 (float32_t __a)
 {
-  return (float32x4_t)__builtin_neon_vdup_nv4sf ((__builtin_neon_sf) __a);
+  return (float32x4_t) {__a, __a, __a, __a};
 }
 
 __extension__ extern __inline uint8x16_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_u8 (uint8_t __a)
 {
-  return (uint8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
+  return (uint8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
+		       __a, __a, __a, __a, __a, __a, __a, __a};
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_u16 (uint16_t __a)
 {
-  return (uint16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
+  return (uint16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_u32 (uint32_t __a)
 {
-  return (uint32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
+  return (uint32x4_t) {__a, __a, __a, __a};
 }
 
 __extension__ extern __inline poly8x16_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_p8 (poly8_t __a)
 {
-  return (poly8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
+  return (poly8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
+		       __a, __a, __a, __a, __a, __a, __a, __a};
 }
 
 __extension__ extern __inline poly16x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_p16 (poly16_t __a)
 {
-  return (poly16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
+  return (poly16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
 }
 
 __extension__ extern __inline int64x2_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_s64 (int64_t __a)
 {
-  return (int64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
+  return (int64x2_t) {__a, __a};
 }
 
 __extension__ extern __inline uint64x2_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_u64 (uint64_t __a)
 {
-  return (uint64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
+  return (uint64x2_t) {__a, __a};
 }
 
 __extension__ extern __inline int8x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_s8 (int8_t __a)
 {
-  return (int8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
+  return vdup_n_s8 (__a);
 }
 
 __extension__ extern __inline int16x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_s16 (int16_t __a)
 {
-  return (int16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
+  return vdup_n_s16 (__a);
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_s32 (int32_t __a)
 {
-  return (int32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
+  return vdup_n_s32 (__a);
 }
 
 __extension__ extern __inline float32x2_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_f32 (float32_t __a)
 {
-  return (float32x2_t)__builtin_neon_vdup_nv2sf ((__builtin_neon_sf) __a);
+  return vdup_n_f32 (__a);
 }
 
 __extension__ extern __inline uint8x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_u8 (uint8_t __a)
 {
-  return (uint8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
+  return vdup_n_u8 (__a);
 }
 
 __extension__ extern __inline uint16x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_u16 (uint16_t __a)
 {
-  return (uint16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
+  return vdup_n_u16 (__a);
 }
 
 __extension__ extern __inline uint32x2_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_u32 (uint32_t __a)
 {
-  return (uint32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
+  return vdup_n_u32 (__a);
 }
 
 __extension__ extern __inline poly8x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_p8 (poly8_t __a)
 {
-  return (poly8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
+  return vdup_n_p8 (__a);
 }
 
 __extension__ extern __inline poly16x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_p16 (poly16_t __a)
 {
-  return (poly16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
+  return vdup_n_p16 (__a);
 }
 
 __extension__ extern __inline int64x1_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_s64 (int64_t __a)
 {
-  return (int64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
+  return vdup_n_s64 (__a);
 }
 
 __extension__ extern __inline uint64x1_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_u64 (uint64_t __a)
 {
-  return (uint64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
+  return vdup_n_u64 (__a);
 }
 
 __extension__ extern __inline int8x16_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmovq_n_s8 (int8_t __a)
 {
-  return (int8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
+  return vdupq_n_s8 (__a);
 }
 
 __extension__ extern __inline int16x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmovq_n_s16 (int16_t __a)
 {
-  return (int16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
+  return vdupq_n_s16 (__a);
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmovq_n_s32 (int32_t __a)
 {
-  return (int32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
+  return vdupq_n_s32 (__a);
 }
 
 __extension__ extern __inline float32x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmovq_n_f32 (float32_t __a)
 {
-  return (float32x4_t)__builtin_neon_vdup_nv4sf ((__builtin_neon_sf) __a);
+  return vdupq_n_f32 (__a);
 }
 
 __extension__ extern __inline uint8x16_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmovq_n_u8 (uint8_t __a)
 {
-  return (uint8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
+  return vdupq_n_u8 (__a);
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmovq_n_u16 (uint16_t __a)
 {
-  return (uint16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
+  return vdupq_n_u16 (__a);
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmovq_n_u32 (uint32_t __a)
 {
-  return (uint32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
+  return vdupq_n_u32 (__a);
 }
 
 __extension__ extern __inline poly8x16_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmovq_n_p8 (poly8_t __a)
 {
-  return (poly8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
+  return vdupq_n_p8 (__a);
 }
 
 __extension__ extern __inline poly16x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmovq_n_p16 (poly16_t __a)
 {
-  return (poly16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
+  return vdupq_n_p16 (__a);
 }
 
 __extension__ extern __inline int64x2_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmovq_n_s64 (int64_t __a)
 {
-  return (int64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
+  return vdupq_n_s64 (__a);
 }
 
 __extension__ extern __inline uint64x2_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmovq_n_u64 (uint64_t __a)
 {
-  return (uint64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
+  return vdupq_n_u64 (__a);
 }
 
 __extension__ extern __inline int8x8_t
@@ -17879,14 +17882,14 @@ __extension__ extern __inline float16x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_f16 (float16_t __a)
 {
-  return __builtin_neon_vdup_nv4hf (__a);
+  return (float16x4_t) {__a, __a, __a, __a};
 }
 
 __extension__ extern __inline float16x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_f16 (float16_t __a)
 {
-  return __builtin_neon_vdup_nv8hf (__a);
+  return (float16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
 }
 
 __extension__ extern __inline float16x4_t
@@ -17921,14 +17924,14 @@ __extension__ extern __inline float16x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmov_n_f16 (float16_t __a)
 {
-  return __builtin_neon_vdup_nv4hf (__a);
+  return vdup_n_f16 (__a);
 }
 
 __extension__ extern __inline float16x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmovq_n_f16 (float16_t __a)
 {
-  return __builtin_neon_vdup_nv8hf (__a);
+  return vdupq_n_f16 (__a);
 }
 
 __extension__ extern __inline float16x4_t
@@ -18852,14 +18855,14 @@ __extension__ extern __inline bfloat16x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdup_n_bf16 (bfloat16_t __a)
 {
-  return __builtin_neon_vdup_nv4bf (__a);
+  return (bfloat16x4_t) {__a, __a, __a, __a};
 }
 
 __extension__ extern __inline bfloat16x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vdupq_n_bf16 (bfloat16_t __a)
 {
-  return __builtin_neon_vdup_nv8bf (__a);
+  return (bfloat16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
 }
 
 __extension__ extern __inline bfloat16x4_t
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def
index ae104d5ba1b..a233e9bbd9e 100644
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -214,9 +214,6 @@ VAR10 (GETLANE, vget_lane,
 VAR6 (GETLANE, vget_laneu, v8qi, v4hi, v2si, v16qi, v8hi, v4si)
 VAR10 (SETLANE, vset_lane,
 	 v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
-VAR10 (UNOP, vdup_n,
-	 v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
-VAR4 (UNOP, vdup_n, v8hf, v4hf, v8bf, v4bf)
 VAR10 (GETLANE, vdup_lane,
 	 v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di)
 VAR4 (GETLANE, vdup_lane, v8hf, v4hf, v8bf, v4bf)

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [ARM] PR66791: Replace builtins for vdup_n and vmov_n intrinsics
  2021-06-24 11:11 [ARM] PR66791: Replace builtins for vdup_n and vmov_n intrinsics Prathamesh Kulkarni
@ 2021-06-24 16:28 ` Kyrylo Tkachov
  2021-08-11 16:53   ` Christophe Lyon
  0 siblings, 1 reply; 11+ messages in thread
From: Kyrylo Tkachov @ 2021-06-24 16:28 UTC (permalink / raw)
  To: Prathamesh Kulkarni, gcc Patches



> -----Original Message-----
> From: Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
> Sent: 24 June 2021 12:11
> To: gcc Patches <gcc-patches@gcc.gnu.org>; Kyrylo Tkachov
> <Kyrylo.Tkachov@arm.com>
> Subject: [ARM] PR66791: Replace builtins for vdup_n and vmov_n intrinsics
> 
> Hi,
> This patch replaces builtins for vdup_n and vmov_n.
> The patch results in regression for pr51534.c.
> Consider following function:
> 
> uint8x8_t f1 (uint8x8_t a) {
>   return vcgt_u8(a, vdup_n_u8(0));
> }
> 
> code-gen before patch:
> f1:
>         vmov.i32  d16, #0  @ v8qi
>         vcgt.u8     d0, d0, d16
>         bx             lr
> 
> code-gen after patch:
> f1:
>         vceq.i8 d0, d0, #0
>         vmvn    d0, d0
>         bx         lr
> 
> I am not sure which one is better tho ?

I think they're equivalent in practice, in any case the patch itself is good (move away from RTL builtins).
Ok.
Thanks,
Kyrill

> 
> Also, this patch regressed bf16_dup.c on arm-linux-gnueabi,
> which is due to a missed opt in lowering. I had filed it as
> PR98435, and posted a fix for it here:
> https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572648.html
> 
> Thanks,
> Prathamesh

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [ARM] PR66791: Replace builtins for vdup_n and vmov_n intrinsics
  2021-06-24 16:28 ` Kyrylo Tkachov
@ 2021-08-11 16:53   ` Christophe Lyon
  2021-08-12 11:54     ` Prathamesh Kulkarni
  0 siblings, 1 reply; 11+ messages in thread
From: Christophe Lyon @ 2021-08-11 16:53 UTC (permalink / raw)
  To: Kyrylo Tkachov; +Cc: Prathamesh Kulkarni, gcc Patches

On Thu, Jun 24, 2021 at 6:29 PM Kyrylo Tkachov via Gcc-patches <
gcc-patches@gcc.gnu.org> wrote:

>
>
> > -----Original Message-----
> > From: Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
> > Sent: 24 June 2021 12:11
> > To: gcc Patches <gcc-patches@gcc.gnu.org>; Kyrylo Tkachov
> > <Kyrylo.Tkachov@arm.com>
> > Subject: [ARM] PR66791: Replace builtins for vdup_n and vmov_n intrinsics
> >
> > Hi,
> > This patch replaces builtins for vdup_n and vmov_n.
> > The patch results in regression for pr51534.c.
> > Consider following function:
> >
> > uint8x8_t f1 (uint8x8_t a) {
> >   return vcgt_u8(a, vdup_n_u8(0));
> > }
> >
> > code-gen before patch:
> > f1:
> >         vmov.i32  d16, #0  @ v8qi
> >         vcgt.u8     d0, d0, d16
> >         bx             lr
> >
> > code-gen after patch:
> > f1:
> >         vceq.i8 d0, d0, #0
> >         vmvn    d0, d0
> >         bx         lr
> >
> > I am not sure which one is better tho ?
>
>
Hi Prathamesh,

This patch introduces a regression on non-hardfp configs (eg
arm-linux-gnueabi or arm-eabi):
FAIL:  gcc:gcc.target/arm/arm.exp=gcc.target/arm/pr51534.c
scan-assembler-times vmov.i32[ \t]+[dD][0-9]+, #0xffffffff 3
FAIL:  gcc:gcc.target/arm/arm.exp=gcc.target/arm/pr51534.c
scan-assembler-times vmov.i32[ \t]+[qQ][0-9]+, #4294967295 3

Can you fix this?

Thanks

Christophe



> I think they're equivalent in practice, in any case the patch itself is
> good (move away from RTL builtins).
> Ok.
> Thanks,
> Kyrill
>
> >
> > Also, this patch regressed bf16_dup.c on arm-linux-gnueabi,
> > which is due to a missed opt in lowering. I had filed it as
> > PR98435, and posted a fix for it here:
> > https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572648.html
> >
> > Thanks,
> > Prathamesh
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [ARM] PR66791: Replace builtins for vdup_n and vmov_n intrinsics
  2021-08-11 16:53   ` Christophe Lyon
@ 2021-08-12 11:54     ` Prathamesh Kulkarni
  2021-08-12 13:34       ` Christophe Lyon
  0 siblings, 1 reply; 11+ messages in thread
From: Prathamesh Kulkarni @ 2021-08-12 11:54 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: Kyrylo Tkachov, gcc Patches

On Wed, 11 Aug 2021 at 22:23, Christophe Lyon
<christophe.lyon.oss@gmail.com> wrote:
>
>
>
> On Thu, Jun 24, 2021 at 6:29 PM Kyrylo Tkachov via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
>>
>>
>>
>> > -----Original Message-----
>> > From: Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
>> > Sent: 24 June 2021 12:11
>> > To: gcc Patches <gcc-patches@gcc.gnu.org>; Kyrylo Tkachov
>> > <Kyrylo.Tkachov@arm.com>
>> > Subject: [ARM] PR66791: Replace builtins for vdup_n and vmov_n intrinsics
>> >
>> > Hi,
>> > This patch replaces builtins for vdup_n and vmov_n.
>> > The patch results in regression for pr51534.c.
>> > Consider following function:
>> >
>> > uint8x8_t f1 (uint8x8_t a) {
>> >   return vcgt_u8(a, vdup_n_u8(0));
>> > }
>> >
>> > code-gen before patch:
>> > f1:
>> >         vmov.i32  d16, #0  @ v8qi
>> >         vcgt.u8     d0, d0, d16
>> >         bx             lr
>> >
>> > code-gen after patch:
>> > f1:
>> >         vceq.i8 d0, d0, #0
>> >         vmvn    d0, d0
>> >         bx         lr
>> >
>> > I am not sure which one is better tho ?
>>
>
> Hi Prathamesh,
>
> This patch introduces a regression on non-hardfp configs (eg arm-linux-gnueabi or arm-eabi):
> FAIL:  gcc:gcc.target/arm/arm.exp=gcc.target/arm/pr51534.c scan-assembler-times vmov.i32[ \t]+[dD][0-9]+, #0xffffffff 3
> FAIL:  gcc:gcc.target/arm/arm.exp=gcc.target/arm/pr51534.c scan-assembler-times vmov.i32[ \t]+[qQ][0-9]+, #4294967295 3
>
> Can you fix this?
The issue is, for following test:

#include <arm_neon.h>

uint8x8_t f1 (uint8x8_t a) {
  return vcge_u8(a, vdup_n_u8(0));
}

armhf code-gen:
f1:
        vmov.i32  d0, #0xffffffff  @ v8qi
        bx            lr

arm softfp code-gen:
f1:
        mov     r0, #-1
        mov     r1, #-1
        bx      lr

The code-gen for both is same upto split2 pass:

(insn 10 6 11 2 (set (reg/i:V8QI 16 s0)
        (const_vector:V8QI [
                (const_int -1 [0xffffffffffffffff]) repeated x8
            ])) "foo.c":5:1 1052 {*neon_movv8qi}
     (expr_list:REG_EQUAL (const_vector:V8QI [
                (const_int -1 [0xffffffffffffffff]) repeated x8
            ])
        (nil)))
(insn 11 10 13 2 (use (reg/i:V8QI 16 s0)) "foo.c":5:1 -1
     (nil))

and for softfp target, split2 pass splits the assignment to r0 and r1:

(insn 15 6 16 2 (set (reg:SI 0 r0)
        (const_int -1 [0xffffffffffffffff])) "foo.c":5:1 740 {*thumb2_movsi_vfp}
     (nil))
(insn 16 15 11 2 (set (reg:SI 1 r1 [+4 ])
        (const_int -1 [0xffffffffffffffff])) "foo.c":5:1 740 {*thumb2_movsi_vfp}
     (nil))
(insn 11 16 13 2 (use (reg/i:V8QI 0 r0)) "foo.c":5:1 -1
     (nil))

I suppose we could use a dg-scan for r[0-9]+, #-1 for softfp targets ?

Thanks,
Prathamesh
>
> Thanks
>
> Christophe
>
>
>>
>> I think they're equivalent in practice, in any case the patch itself is good (move away from RTL builtins).
>> Ok.
>> Thanks,
>> Kyrill
>>
>> >
>> > Also, this patch regressed bf16_dup.c on arm-linux-gnueabi,
>> > which is due to a missed opt in lowering. I had filed it as
>> > PR98435, and posted a fix for it here:
>> > https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572648.html
>> >
>> > Thanks,
>> > Prathamesh

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [ARM] PR66791: Replace builtins for vdup_n and vmov_n intrinsics
  2021-08-12 11:54     ` Prathamesh Kulkarni
@ 2021-08-12 13:34       ` Christophe Lyon
  2021-08-17  6:25         ` Prathamesh Kulkarni
  0 siblings, 1 reply; 11+ messages in thread
From: Christophe Lyon @ 2021-08-12 13:34 UTC (permalink / raw)
  To: Prathamesh Kulkarni; +Cc: Kyrylo Tkachov, gcc Patches

On Thu, Aug 12, 2021 at 1:54 PM Prathamesh Kulkarni <
prathamesh.kulkarni@linaro.org> wrote:

> On Wed, 11 Aug 2021 at 22:23, Christophe Lyon
> <christophe.lyon.oss@gmail.com> wrote:
> >
> >
> >
> > On Thu, Jun 24, 2021 at 6:29 PM Kyrylo Tkachov via Gcc-patches <
> gcc-patches@gcc.gnu.org> wrote:
> >>
> >>
> >>
> >> > -----Original Message-----
> >> > From: Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
> >> > Sent: 24 June 2021 12:11
> >> > To: gcc Patches <gcc-patches@gcc.gnu.org>; Kyrylo Tkachov
> >> > <Kyrylo.Tkachov@arm.com>
> >> > Subject: [ARM] PR66791: Replace builtins for vdup_n and vmov_n
> intrinsics
> >> >
> >> > Hi,
> >> > This patch replaces builtins for vdup_n and vmov_n.
> >> > The patch results in regression for pr51534.c.
> >> > Consider following function:
> >> >
> >> > uint8x8_t f1 (uint8x8_t a) {
> >> >   return vcgt_u8(a, vdup_n_u8(0));
> >> > }
> >> >
> >> > code-gen before patch:
> >> > f1:
> >> >         vmov.i32  d16, #0  @ v8qi
> >> >         vcgt.u8     d0, d0, d16
> >> >         bx             lr
> >> >
> >> > code-gen after patch:
> >> > f1:
> >> >         vceq.i8 d0, d0, #0
> >> >         vmvn    d0, d0
> >> >         bx         lr
> >> >
> >> > I am not sure which one is better tho ?
> >>
> >
> > Hi Prathamesh,
> >
> > This patch introduces a regression on non-hardfp configs (eg
> arm-linux-gnueabi or arm-eabi):
> > FAIL:  gcc:gcc.target/arm/arm.exp=gcc.target/arm/pr51534.c
> scan-assembler-times vmov.i32[ \t]+[dD][0-9]+, #0xffffffff 3
> > FAIL:  gcc:gcc.target/arm/arm.exp=gcc.target/arm/pr51534.c
> scan-assembler-times vmov.i32[ \t]+[qQ][0-9]+, #4294967295 3
> >
> > Can you fix this?
> The issue is, for following test:
>
> #include <arm_neon.h>
>
> uint8x8_t f1 (uint8x8_t a) {
>   return vcge_u8(a, vdup_n_u8(0));
> }
>
> armhf code-gen:
> f1:
>         vmov.i32  d0, #0xffffffff  @ v8qi
>         bx            lr
>
> arm softfp code-gen:
> f1:
>         mov     r0, #-1
>         mov     r1, #-1
>         bx      lr
>
> The code-gen for both is same upto split2 pass:
>
> (insn 10 6 11 2 (set (reg/i:V8QI 16 s0)
>         (const_vector:V8QI [
>                 (const_int -1 [0xffffffffffffffff]) repeated x8
>             ])) "foo.c":5:1 1052 {*neon_movv8qi}
>      (expr_list:REG_EQUAL (const_vector:V8QI [
>                 (const_int -1 [0xffffffffffffffff]) repeated x8
>             ])
>         (nil)))
> (insn 11 10 13 2 (use (reg/i:V8QI 16 s0)) "foo.c":5:1 -1
>      (nil))
>
> and for softfp target, split2 pass splits the assignment to r0 and r1:
>
> (insn 15 6 16 2 (set (reg:SI 0 r0)
>         (const_int -1 [0xffffffffffffffff])) "foo.c":5:1 740
> {*thumb2_movsi_vfp}
>      (nil))
> (insn 16 15 11 2 (set (reg:SI 1 r1 [+4 ])
>         (const_int -1 [0xffffffffffffffff])) "foo.c":5:1 740
> {*thumb2_movsi_vfp}
>      (nil))
> (insn 11 16 13 2 (use (reg/i:V8QI 0 r0)) "foo.c":5:1 -1
>      (nil))
>
> I suppose we could use a dg-scan for r[0-9]+, #-1 for softfp targets ?
>
> Yes, probably, or try with check-function-bodies.

 Christophe

Thanks,
> Prathamesh
> >
> > Thanks
> >
> > Christophe
> >
> >
> >>
> >> I think they're equivalent in practice, in any case the patch itself is
> good (move away from RTL builtins).
> >> Ok.
> >> Thanks,
> >> Kyrill
> >>
> >> >
> >> > Also, this patch regressed bf16_dup.c on arm-linux-gnueabi,
> >> > which is due to a missed opt in lowering. I had filed it as
> >> > PR98435, and posted a fix for it here:
> >> > https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572648.html
> >> >
> >> > Thanks,
> >> > Prathamesh
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [ARM] PR66791: Replace builtins for vdup_n and vmov_n intrinsics
  2021-08-12 13:34       ` Christophe Lyon
@ 2021-08-17  6:25         ` Prathamesh Kulkarni
  2021-08-24  8:00           ` Prathamesh Kulkarni
  0 siblings, 1 reply; 11+ messages in thread
From: Prathamesh Kulkarni @ 2021-08-17  6:25 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: Kyrylo Tkachov, gcc Patches

[-- Attachment #1: Type: text/plain, Size: 3829 bytes --]

On Thu, 12 Aug 2021 at 19:04, Christophe Lyon
<christophe.lyon.oss@gmail.com> wrote:
>
>
>
> On Thu, Aug 12, 2021 at 1:54 PM Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> wrote:
>>
>> On Wed, 11 Aug 2021 at 22:23, Christophe Lyon
>> <christophe.lyon.oss@gmail.com> wrote:
>> >
>> >
>> >
>> > On Thu, Jun 24, 2021 at 6:29 PM Kyrylo Tkachov via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
>> >>
>> >>
>> >>
>> >> > -----Original Message-----
>> >> > From: Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
>> >> > Sent: 24 June 2021 12:11
>> >> > To: gcc Patches <gcc-patches@gcc.gnu.org>; Kyrylo Tkachov
>> >> > <Kyrylo.Tkachov@arm.com>
>> >> > Subject: [ARM] PR66791: Replace builtins for vdup_n and vmov_n intrinsics
>> >> >
>> >> > Hi,
>> >> > This patch replaces builtins for vdup_n and vmov_n.
>> >> > The patch results in regression for pr51534.c.
>> >> > Consider following function:
>> >> >
>> >> > uint8x8_t f1 (uint8x8_t a) {
>> >> >   return vcgt_u8(a, vdup_n_u8(0));
>> >> > }
>> >> >
>> >> > code-gen before patch:
>> >> > f1:
>> >> >         vmov.i32  d16, #0  @ v8qi
>> >> >         vcgt.u8     d0, d0, d16
>> >> >         bx             lr
>> >> >
>> >> > code-gen after patch:
>> >> > f1:
>> >> >         vceq.i8 d0, d0, #0
>> >> >         vmvn    d0, d0
>> >> >         bx         lr
>> >> >
>> >> > I am not sure which one is better tho ?
>> >>
>> >
>> > Hi Prathamesh,
>> >
>> > This patch introduces a regression on non-hardfp configs (eg arm-linux-gnueabi or arm-eabi):
>> > FAIL:  gcc:gcc.target/arm/arm.exp=gcc.target/arm/pr51534.c scan-assembler-times vmov.i32[ \t]+[dD][0-9]+, #0xffffffff 3
>> > FAIL:  gcc:gcc.target/arm/arm.exp=gcc.target/arm/pr51534.c scan-assembler-times vmov.i32[ \t]+[qQ][0-9]+, #4294967295 3
>> >
>> > Can you fix this?
>> The issue is, for following test:
>>
>> #include <arm_neon.h>
>>
>> uint8x8_t f1 (uint8x8_t a) {
>>   return vcge_u8(a, vdup_n_u8(0));
>> }
>>
>> armhf code-gen:
>> f1:
>>         vmov.i32  d0, #0xffffffff  @ v8qi
>>         bx            lr
>>
>> arm softfp code-gen:
>> f1:
>>         mov     r0, #-1
>>         mov     r1, #-1
>>         bx      lr
>>
>> The code-gen for both is same upto split2 pass:
>>
>> (insn 10 6 11 2 (set (reg/i:V8QI 16 s0)
>>         (const_vector:V8QI [
>>                 (const_int -1 [0xffffffffffffffff]) repeated x8
>>             ])) "foo.c":5:1 1052 {*neon_movv8qi}
>>      (expr_list:REG_EQUAL (const_vector:V8QI [
>>                 (const_int -1 [0xffffffffffffffff]) repeated x8
>>             ])
>>         (nil)))
>> (insn 11 10 13 2 (use (reg/i:V8QI 16 s0)) "foo.c":5:1 -1
>>      (nil))
>>
>> and for softfp target, split2 pass splits the assignment to r0 and r1:
>>
>> (insn 15 6 16 2 (set (reg:SI 0 r0)
>>         (const_int -1 [0xffffffffffffffff])) "foo.c":5:1 740 {*thumb2_movsi_vfp}
>>      (nil))
>> (insn 16 15 11 2 (set (reg:SI 1 r1 [+4 ])
>>         (const_int -1 [0xffffffffffffffff])) "foo.c":5:1 740 {*thumb2_movsi_vfp}
>>      (nil))
>> (insn 11 16 13 2 (use (reg/i:V8QI 0 r0)) "foo.c":5:1 -1
>>      (nil))
>>
>> I suppose we could use a dg-scan for r[0-9]+, #-1 for softfp targets ?
>>
> Yes, probably, or try with check-function-bodies.
Hi,
Sorry for the late response. Does the attached patch look OK ?

Thanks,
Prathamesh
>
>  Christophe
>
>> Thanks,
>> Prathamesh
>> >
>> > Thanks
>> >
>> > Christophe
>> >
>> >
>> >>
>> >> I think they're equivalent in practice, in any case the patch itself is good (move away from RTL builtins).
>> >> Ok.
>> >> Thanks,
>> >> Kyrill
>> >>
>> >> >
>> >> > Also, this patch regressed bf16_dup.c on arm-linux-gnueabi,
>> >> > which is due to a missed opt in lowering. I had filed it as
>> >> > PR98435, and posted a fix for it here:
>> >> > https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572648.html
>> >> >
>> >> > Thanks,
>> >> > Prathamesh

[-- Attachment #2: vdup_n-test-fix-1.txt --]
[-- Type: text/plain, Size: 1262 bytes --]

diff --git a/gcc/testsuite/gcc.target/arm/pr51534.c b/gcc/testsuite/gcc.target/arm/pr51534.c
index ac7f1ea4722..5e121f5fb99 100644
--- a/gcc/testsuite/gcc.target/arm/pr51534.c
+++ b/gcc/testsuite/gcc.target/arm/pr51534.c
@@ -64,8 +64,9 @@ GEN_COND_TESTS(vceq)
 /* { dg-final { scan-assembler-times "vceq\.i8\[ 	\]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" 4 } } */
 /* { dg-final { scan-assembler-times "vceq\.i16\[ 	\]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" 4 } } */
 /* { dg-final { scan-assembler-times "vceq\.i32\[ 	\]+\[qQ\]\[0-9\]+, \[qQ\]\[0-9\]+, #0" 4 } } */
-/* { dg-final { scan-assembler-times "vmov\.i32\[ 	\]+\[dD\]\[0-9\]+, #0xffffffff" 3 } } */
-/* { dg-final { scan-assembler-times "vmov\.i32\[ 	\]+\[qQ\]\[0-9\]+, #4294967295" 3 } } */
+/* { dg-final { scan-assembler-times "vmov\.i32\[ 	\]+\[dD\]\[0-9\]+, #0xffffffff" 3 { target { arm_hard_ok } } } } */
+/* { dg-final { scan-assembler-times "vmov\.i32\[ 	\]+\[qQ\]\[0-9\]+, #4294967295" 3 { target { arm_hard_ok } } } } */
+/* { dg-final { scan-assembler-times "mov\[ 	\]+r\[0-9\]+, #-1" 6 { target { arm_softfp_ok } } } } */
 
 /* And ensure we don't have unexpected output too.  */
 /* { dg-final { scan-assembler-not "vc\[gl\]\[te\]\.u\[0-9\]+\[ 	\]+\[qQdD\]\[0-9\]+, \[qQdD\]\[0-9\]+, #0" } } */

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [ARM] PR66791: Replace builtins for vdup_n and vmov_n intrinsics
  2021-08-17  6:25         ` Prathamesh Kulkarni
@ 2021-08-24  8:00           ` Prathamesh Kulkarni
  2021-08-24  8:17             ` Kyrylo Tkachov
  0 siblings, 1 reply; 11+ messages in thread
From: Prathamesh Kulkarni @ 2021-08-24  8:00 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: Kyrylo Tkachov, gcc Patches

On Tue, 17 Aug 2021 at 11:55, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
>
> On Thu, 12 Aug 2021 at 19:04, Christophe Lyon
> <christophe.lyon.oss@gmail.com> wrote:
> >
> >
> >
> > On Thu, Aug 12, 2021 at 1:54 PM Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> wrote:
> >>
> >> On Wed, 11 Aug 2021 at 22:23, Christophe Lyon
> >> <christophe.lyon.oss@gmail.com> wrote:
> >> >
> >> >
> >> >
> >> > On Thu, Jun 24, 2021 at 6:29 PM Kyrylo Tkachov via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
> >> >>
> >> >>
> >> >>
> >> >> > -----Original Message-----
> >> >> > From: Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
> >> >> > Sent: 24 June 2021 12:11
> >> >> > To: gcc Patches <gcc-patches@gcc.gnu.org>; Kyrylo Tkachov
> >> >> > <Kyrylo.Tkachov@arm.com>
> >> >> > Subject: [ARM] PR66791: Replace builtins for vdup_n and vmov_n intrinsics
> >> >> >
> >> >> > Hi,
> >> >> > This patch replaces builtins for vdup_n and vmov_n.
> >> >> > The patch results in regression for pr51534.c.
> >> >> > Consider following function:
> >> >> >
> >> >> > uint8x8_t f1 (uint8x8_t a) {
> >> >> >   return vcgt_u8(a, vdup_n_u8(0));
> >> >> > }
> >> >> >
> >> >> > code-gen before patch:
> >> >> > f1:
> >> >> >         vmov.i32  d16, #0  @ v8qi
> >> >> >         vcgt.u8     d0, d0, d16
> >> >> >         bx             lr
> >> >> >
> >> >> > code-gen after patch:
> >> >> > f1:
> >> >> >         vceq.i8 d0, d0, #0
> >> >> >         vmvn    d0, d0
> >> >> >         bx         lr
> >> >> >
> >> >> > I am not sure which one is better tho ?
> >> >>
> >> >
> >> > Hi Prathamesh,
> >> >
> >> > This patch introduces a regression on non-hardfp configs (eg arm-linux-gnueabi or arm-eabi):
> >> > FAIL:  gcc:gcc.target/arm/arm.exp=gcc.target/arm/pr51534.c scan-assembler-times vmov.i32[ \t]+[dD][0-9]+, #0xffffffff 3
> >> > FAIL:  gcc:gcc.target/arm/arm.exp=gcc.target/arm/pr51534.c scan-assembler-times vmov.i32[ \t]+[qQ][0-9]+, #4294967295 3
> >> >
> >> > Can you fix this?
> >> The issue is, for following test:
> >>
> >> #include <arm_neon.h>
> >>
> >> uint8x8_t f1 (uint8x8_t a) {
> >>   return vcge_u8(a, vdup_n_u8(0));
> >> }
> >>
> >> armhf code-gen:
> >> f1:
> >>         vmov.i32  d0, #0xffffffff  @ v8qi
> >>         bx            lr
> >>
> >> arm softfp code-gen:
> >> f1:
> >>         mov     r0, #-1
> >>         mov     r1, #-1
> >>         bx      lr
> >>
> >> The code-gen for both is same upto split2 pass:
> >>
> >> (insn 10 6 11 2 (set (reg/i:V8QI 16 s0)
> >>         (const_vector:V8QI [
> >>                 (const_int -1 [0xffffffffffffffff]) repeated x8
> >>             ])) "foo.c":5:1 1052 {*neon_movv8qi}
> >>      (expr_list:REG_EQUAL (const_vector:V8QI [
> >>                 (const_int -1 [0xffffffffffffffff]) repeated x8
> >>             ])
> >>         (nil)))
> >> (insn 11 10 13 2 (use (reg/i:V8QI 16 s0)) "foo.c":5:1 -1
> >>      (nil))
> >>
> >> and for softfp target, split2 pass splits the assignment to r0 and r1:
> >>
> >> (insn 15 6 16 2 (set (reg:SI 0 r0)
> >>         (const_int -1 [0xffffffffffffffff])) "foo.c":5:1 740 {*thumb2_movsi_vfp}
> >>      (nil))
> >> (insn 16 15 11 2 (set (reg:SI 1 r1 [+4 ])
> >>         (const_int -1 [0xffffffffffffffff])) "foo.c":5:1 740 {*thumb2_movsi_vfp}
> >>      (nil))
> >> (insn 11 16 13 2 (use (reg/i:V8QI 0 r0)) "foo.c":5:1 -1
> >>      (nil))
> >>
> >> I suppose we could use a dg-scan for r[0-9]+, #-1 for softfp targets ?
> >>
> > Yes, probably, or try with check-function-bodies.
> Hi,
> Sorry for the late response. Does the attached patch look OK ?
ping https://gcc.gnu.org/pipermail/gcc-patches/2021-August/577532.html

Thanks,
Prathamesh
>
> Thanks,
> Prathamesh
> >
> >  Christophe
> >
> >> Thanks,
> >> Prathamesh
> >> >
> >> > Thanks
> >> >
> >> > Christophe
> >> >
> >> >
> >> >>
> >> >> I think they're equivalent in practice, in any case the patch itself is good (move away from RTL builtins).
> >> >> Ok.
> >> >> Thanks,
> >> >> Kyrill
> >> >>
> >> >> >
> >> >> > Also, this patch regressed bf16_dup.c on arm-linux-gnueabi,
> >> >> > which is due to a missed opt in lowering. I had filed it as
> >> >> > PR98435, and posted a fix for it here:
> >> >> > https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572648.html
> >> >> >
> >> >> > Thanks,
> >> >> > Prathamesh

^ permalink raw reply	[flat|nested] 11+ messages in thread

* RE: [ARM] PR66791: Replace builtins for vdup_n and vmov_n intrinsics
  2021-08-24  8:00           ` Prathamesh Kulkarni
@ 2021-08-24  8:17             ` Kyrylo Tkachov
  2021-09-02  9:02               ` Christophe Lyon
  0 siblings, 1 reply; 11+ messages in thread
From: Kyrylo Tkachov @ 2021-08-24  8:17 UTC (permalink / raw)
  To: Prathamesh Kulkarni, Christophe Lyon; +Cc: gcc Patches



> -----Original Message-----
> From: Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
> Sent: 24 August 2021 09:01
> To: Christophe Lyon <christophe.lyon.oss@gmail.com>
> Cc: Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>; gcc Patches <gcc-
> patches@gcc.gnu.org>
> Subject: Re: [ARM] PR66791: Replace builtins for vdup_n and vmov_n
> intrinsics
> 
> On Tue, 17 Aug 2021 at 11:55, Prathamesh Kulkarni
> <prathamesh.kulkarni@linaro.org> wrote:
> >
> > On Thu, 12 Aug 2021 at 19:04, Christophe Lyon
> > <christophe.lyon.oss@gmail.com> wrote:
> > >
> > >
> > >
> > > On Thu, Aug 12, 2021 at 1:54 PM Prathamesh Kulkarni
> <prathamesh.kulkarni@linaro.org> wrote:
> > >>
> > >> On Wed, 11 Aug 2021 at 22:23, Christophe Lyon
> > >> <christophe.lyon.oss@gmail.com> wrote:
> > >> >
> > >> >
> > >> >
> > >> > On Thu, Jun 24, 2021 at 6:29 PM Kyrylo Tkachov via Gcc-patches <gcc-
> patches@gcc.gnu.org> wrote:
> > >> >>
> > >> >>
> > >> >>
> > >> >> > -----Original Message-----
> > >> >> > From: Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
> > >> >> > Sent: 24 June 2021 12:11
> > >> >> > To: gcc Patches <gcc-patches@gcc.gnu.org>; Kyrylo Tkachov
> > >> >> > <Kyrylo.Tkachov@arm.com>
> > >> >> > Subject: [ARM] PR66791: Replace builtins for vdup_n and vmov_n
> intrinsics
> > >> >> >
> > >> >> > Hi,
> > >> >> > This patch replaces builtins for vdup_n and vmov_n.
> > >> >> > The patch results in regression for pr51534.c.
> > >> >> > Consider following function:
> > >> >> >
> > >> >> > uint8x8_t f1 (uint8x8_t a) {
> > >> >> >   return vcgt_u8(a, vdup_n_u8(0));
> > >> >> > }
> > >> >> >
> > >> >> > code-gen before patch:
> > >> >> > f1:
> > >> >> >         vmov.i32  d16, #0  @ v8qi
> > >> >> >         vcgt.u8     d0, d0, d16
> > >> >> >         bx             lr
> > >> >> >
> > >> >> > code-gen after patch:
> > >> >> > f1:
> > >> >> >         vceq.i8 d0, d0, #0
> > >> >> >         vmvn    d0, d0
> > >> >> >         bx         lr
> > >> >> >
> > >> >> > I am not sure which one is better tho ?
> > >> >>
> > >> >
> > >> > Hi Prathamesh,
> > >> >
> > >> > This patch introduces a regression on non-hardfp configs (eg arm-
> linux-gnueabi or arm-eabi):
> > >> > FAIL:  gcc:gcc.target/arm/arm.exp=gcc.target/arm/pr51534.c scan-
> assembler-times vmov.i32[ \t]+[dD][0-9]+, #0xffffffff 3
> > >> > FAIL:  gcc:gcc.target/arm/arm.exp=gcc.target/arm/pr51534.c scan-
> assembler-times vmov.i32[ \t]+[qQ][0-9]+, #4294967295 3
> > >> >
> > >> > Can you fix this?
> > >> The issue is, for following test:
> > >>
> > >> #include <arm_neon.h>
> > >>
> > >> uint8x8_t f1 (uint8x8_t a) {
> > >>   return vcge_u8(a, vdup_n_u8(0));
> > >> }
> > >>
> > >> armhf code-gen:
> > >> f1:
> > >>         vmov.i32  d0, #0xffffffff  @ v8qi
> > >>         bx            lr
> > >>
> > >> arm softfp code-gen:
> > >> f1:
> > >>         mov     r0, #-1
> > >>         mov     r1, #-1
> > >>         bx      lr
> > >>
> > >> The code-gen for both is same upto split2 pass:
> > >>
> > >> (insn 10 6 11 2 (set (reg/i:V8QI 16 s0)
> > >>         (const_vector:V8QI [
> > >>                 (const_int -1 [0xffffffffffffffff]) repeated x8
> > >>             ])) "foo.c":5:1 1052 {*neon_movv8qi}
> > >>      (expr_list:REG_EQUAL (const_vector:V8QI [
> > >>                 (const_int -1 [0xffffffffffffffff]) repeated x8
> > >>             ])
> > >>         (nil)))
> > >> (insn 11 10 13 2 (use (reg/i:V8QI 16 s0)) "foo.c":5:1 -1
> > >>      (nil))
> > >>
> > >> and for softfp target, split2 pass splits the assignment to r0 and r1:
> > >>
> > >> (insn 15 6 16 2 (set (reg:SI 0 r0)
> > >>         (const_int -1 [0xffffffffffffffff])) "foo.c":5:1 740
> {*thumb2_movsi_vfp}
> > >>      (nil))
> > >> (insn 16 15 11 2 (set (reg:SI 1 r1 [+4 ])
> > >>         (const_int -1 [0xffffffffffffffff])) "foo.c":5:1 740
> {*thumb2_movsi_vfp}
> > >>      (nil))
> > >> (insn 11 16 13 2 (use (reg/i:V8QI 0 r0)) "foo.c":5:1 -1
> > >>      (nil))
> > >>
> > >> I suppose we could use a dg-scan for r[0-9]+, #-1 for softfp targets ?
> > >>
> > > Yes, probably, or try with check-function-bodies.
> > Hi,
> > Sorry for the late response. Does the attached patch look OK ?
> ping https://gcc.gnu.org/pipermail/gcc-patches/2021-August/577532.html

Ok.
Thanks,
Kyrill

> 
> Thanks,
> Prathamesh
> >
> > Thanks,
> > Prathamesh
> > >
> > >  Christophe
> > >
> > >> Thanks,
> > >> Prathamesh
> > >> >
> > >> > Thanks
> > >> >
> > >> > Christophe
> > >> >
> > >> >
> > >> >>
> > >> >> I think they're equivalent in practice, in any case the patch itself is
> good (move away from RTL builtins).
> > >> >> Ok.
> > >> >> Thanks,
> > >> >> Kyrill
> > >> >>
> > >> >> >
> > >> >> > Also, this patch regressed bf16_dup.c on arm-linux-gnueabi,
> > >> >> > which is due to a missed opt in lowering. I had filed it as
> > >> >> > PR98435, and posted a fix for it here:
> > >> >> > https://gcc.gnu.org/pipermail/gcc-patches/2021-
> June/572648.html
> > >> >> >
> > >> >> > Thanks,
> > >> >> > Prathamesh

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [ARM] PR66791: Replace builtins for vdup_n and vmov_n intrinsics
  2021-08-24  8:17             ` Kyrylo Tkachov
@ 2021-09-02  9:02               ` Christophe Lyon
  2021-09-03  8:35                 ` Prathamesh Kulkarni
  0 siblings, 1 reply; 11+ messages in thread
From: Christophe Lyon @ 2021-09-02  9:02 UTC (permalink / raw)
  To: Kyrylo Tkachov; +Cc: Prathamesh Kulkarni, gcc Patches

On Tue, Aug 24, 2021 at 10:17 AM Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
wrote:

>
>
> > -----Original Message-----
> > From: Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
> > Sent: 24 August 2021 09:01
> > To: Christophe Lyon <christophe.lyon.oss@gmail.com>
> > Cc: Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>; gcc Patches <gcc-
> > patches@gcc.gnu.org>
> > Subject: Re: [ARM] PR66791: Replace builtins for vdup_n and vmov_n
> > intrinsics
> >
> > On Tue, 17 Aug 2021 at 11:55, Prathamesh Kulkarni
> > <prathamesh.kulkarni@linaro.org> wrote:
> > >
> > > On Thu, 12 Aug 2021 at 19:04, Christophe Lyon
> > > <christophe.lyon.oss@gmail.com> wrote:
> > > >
> > > >
> > > >
> > > > On Thu, Aug 12, 2021 at 1:54 PM Prathamesh Kulkarni
> > <prathamesh.kulkarni@linaro.org> wrote:
> > > >>
> > > >> On Wed, 11 Aug 2021 at 22:23, Christophe Lyon
> > > >> <christophe.lyon.oss@gmail.com> wrote:
> > > >> >
> > > >> >
> > > >> >
> > > >> > On Thu, Jun 24, 2021 at 6:29 PM Kyrylo Tkachov via Gcc-patches
> <gcc-
> > patches@gcc.gnu.org> wrote:
> > > >> >>
> > > >> >>
> > > >> >>
> > > >> >> > -----Original Message-----
> > > >> >> > From: Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
> > > >> >> > Sent: 24 June 2021 12:11
> > > >> >> > To: gcc Patches <gcc-patches@gcc.gnu.org>; Kyrylo Tkachov
> > > >> >> > <Kyrylo.Tkachov@arm.com>
> > > >> >> > Subject: [ARM] PR66791: Replace builtins for vdup_n and vmov_n
> > intrinsics
> > > >> >> >
> > > >> >> > Hi,
> > > >> >> > This patch replaces builtins for vdup_n and vmov_n.
> > > >> >> > The patch results in regression for pr51534.c.
> > > >> >> > Consider following function:
> > > >> >> >
> > > >> >> > uint8x8_t f1 (uint8x8_t a) {
> > > >> >> >   return vcgt_u8(a, vdup_n_u8(0));
> > > >> >> > }
> > > >> >> >
> > > >> >> > code-gen before patch:
> > > >> >> > f1:
> > > >> >> >         vmov.i32  d16, #0  @ v8qi
> > > >> >> >         vcgt.u8     d0, d0, d16
> > > >> >> >         bx             lr
> > > >> >> >
> > > >> >> > code-gen after patch:
> > > >> >> > f1:
> > > >> >> >         vceq.i8 d0, d0, #0
> > > >> >> >         vmvn    d0, d0
> > > >> >> >         bx         lr
> > > >> >> >
> > > >> >> > I am not sure which one is better tho ?
> > > >> >>
> > > >> >
> > > >> > Hi Prathamesh,
> > > >> >
> > > >> > This patch introduces a regression on non-hardfp configs (eg arm-
> > linux-gnueabi or arm-eabi):
> > > >> > FAIL:  gcc:gcc.target/arm/arm.exp=gcc.target/arm/pr51534.c scan-
> > assembler-times vmov.i32[ \t]+[dD][0-9]+, #0xffffffff 3
> > > >> > FAIL:  gcc:gcc.target/arm/arm.exp=gcc.target/arm/pr51534.c scan-
> > assembler-times vmov.i32[ \t]+[qQ][0-9]+, #4294967295 3
> > > >> >
> > > >> > Can you fix this?
> > > >> The issue is, for following test:
> > > >>
> > > >> #include <arm_neon.h>
> > > >>
> > > >> uint8x8_t f1 (uint8x8_t a) {
> > > >>   return vcge_u8(a, vdup_n_u8(0));
> > > >> }
> > > >>
> > > >> armhf code-gen:
> > > >> f1:
> > > >>         vmov.i32  d0, #0xffffffff  @ v8qi
> > > >>         bx            lr
> > > >>
> > > >> arm softfp code-gen:
> > > >> f1:
> > > >>         mov     r0, #-1
> > > >>         mov     r1, #-1
> > > >>         bx      lr
> > > >>
> > > >> The code-gen for both is same upto split2 pass:
> > > >>
> > > >> (insn 10 6 11 2 (set (reg/i:V8QI 16 s0)
> > > >>         (const_vector:V8QI [
> > > >>                 (const_int -1 [0xffffffffffffffff]) repeated x8
> > > >>             ])) "foo.c":5:1 1052 {*neon_movv8qi}
> > > >>      (expr_list:REG_EQUAL (const_vector:V8QI [
> > > >>                 (const_int -1 [0xffffffffffffffff]) repeated x8
> > > >>             ])
> > > >>         (nil)))
> > > >> (insn 11 10 13 2 (use (reg/i:V8QI 16 s0)) "foo.c":5:1 -1
> > > >>      (nil))
> > > >>
> > > >> and for softfp target, split2 pass splits the assignment to r0 and
> r1:
> > > >>
> > > >> (insn 15 6 16 2 (set (reg:SI 0 r0)
> > > >>         (const_int -1 [0xffffffffffffffff])) "foo.c":5:1 740
> > {*thumb2_movsi_vfp}
> > > >>      (nil))
> > > >> (insn 16 15 11 2 (set (reg:SI 1 r1 [+4 ])
> > > >>         (const_int -1 [0xffffffffffffffff])) "foo.c":5:1 740
> > {*thumb2_movsi_vfp}
> > > >>      (nil))
> > > >> (insn 11 16 13 2 (use (reg/i:V8QI 0 r0)) "foo.c":5:1 -1
> > > >>      (nil))
> > > >>
> > > >> I suppose we could use a dg-scan for r[0-9]+, #-1 for softfp
> targets ?
> > > >>
> > > > Yes, probably, or try with check-function-bodies.
> > > Hi,
> > > Sorry for the late response. Does the attached patch look OK ?
> > ping https://gcc.gnu.org/pipermail/gcc-patches/2021-August/577532.html
>
> Ok.
>


Sorry Prathamesh, this does not quite work. See
https://people.linaro.org/~christophe.lyon/cross-validation/gcc/trunk/r12-3294-g7a6f40d0452ec76e126c2612dcfa32f3c73e2315/report-build-info.html
(red cells in the gcc column)

Can you have a look?

Thanks

Christophe

Thanks,
> Kyrill
>
> >
> > Thanks,
> > Prathamesh
> > >
> > > Thanks,
> > > Prathamesh
> > > >
> > > >  Christophe
> > > >
> > > >> Thanks,
> > > >> Prathamesh
> > > >> >
> > > >> > Thanks
> > > >> >
> > > >> > Christophe
> > > >> >
> > > >> >
> > > >> >>
> > > >> >> I think they're equivalent in practice, in any case the patch
> itself is
> > good (move away from RTL builtins).
> > > >> >> Ok.
> > > >> >> Thanks,
> > > >> >> Kyrill
> > > >> >>
> > > >> >> >
> > > >> >> > Also, this patch regressed bf16_dup.c on arm-linux-gnueabi,
> > > >> >> > which is due to a missed opt in lowering. I had filed it as
> > > >> >> > PR98435, and posted a fix for it here:
> > > >> >> > https://gcc.gnu.org/pipermail/gcc-patches/2021-
> > June/572648.html
> > > >> >> >
> > > >> >> > Thanks,
> > > >> >> > Prathamesh
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [ARM] PR66791: Replace builtins for vdup_n and vmov_n intrinsics
  2021-09-02  9:02               ` Christophe Lyon
@ 2021-09-03  8:35                 ` Prathamesh Kulkarni
  2021-09-03 10:58                   ` Christophe LYON
  0 siblings, 1 reply; 11+ messages in thread
From: Prathamesh Kulkarni @ 2021-09-03  8:35 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: Kyrylo Tkachov, gcc Patches

On Thu, 2 Sept 2021 at 14:32, Christophe Lyon
<christophe.lyon.oss@gmail.com> wrote:
>
>
>
> On Tue, Aug 24, 2021 at 10:17 AM Kyrylo Tkachov <Kyrylo.Tkachov@arm.com> wrote:
>>
>>
>>
>> > -----Original Message-----
>> > From: Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
>> > Sent: 24 August 2021 09:01
>> > To: Christophe Lyon <christophe.lyon.oss@gmail.com>
>> > Cc: Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>; gcc Patches <gcc-
>> > patches@gcc.gnu.org>
>> > Subject: Re: [ARM] PR66791: Replace builtins for vdup_n and vmov_n
>> > intrinsics
>> >
>> > On Tue, 17 Aug 2021 at 11:55, Prathamesh Kulkarni
>> > <prathamesh.kulkarni@linaro.org> wrote:
>> > >
>> > > On Thu, 12 Aug 2021 at 19:04, Christophe Lyon
>> > > <christophe.lyon.oss@gmail.com> wrote:
>> > > >
>> > > >
>> > > >
>> > > > On Thu, Aug 12, 2021 at 1:54 PM Prathamesh Kulkarni
>> > <prathamesh.kulkarni@linaro.org> wrote:
>> > > >>
>> > > >> On Wed, 11 Aug 2021 at 22:23, Christophe Lyon
>> > > >> <christophe.lyon.oss@gmail.com> wrote:
>> > > >> >
>> > > >> >
>> > > >> >
>> > > >> > On Thu, Jun 24, 2021 at 6:29 PM Kyrylo Tkachov via Gcc-patches <gcc-
>> > patches@gcc.gnu.org> wrote:
>> > > >> >>
>> > > >> >>
>> > > >> >>
>> > > >> >> > -----Original Message-----
>> > > >> >> > From: Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
>> > > >> >> > Sent: 24 June 2021 12:11
>> > > >> >> > To: gcc Patches <gcc-patches@gcc.gnu.org>; Kyrylo Tkachov
>> > > >> >> > <Kyrylo.Tkachov@arm.com>
>> > > >> >> > Subject: [ARM] PR66791: Replace builtins for vdup_n and vmov_n
>> > intrinsics
>> > > >> >> >
>> > > >> >> > Hi,
>> > > >> >> > This patch replaces builtins for vdup_n and vmov_n.
>> > > >> >> > The patch results in regression for pr51534.c.
>> > > >> >> > Consider following function:
>> > > >> >> >
>> > > >> >> > uint8x8_t f1 (uint8x8_t a) {
>> > > >> >> >   return vcgt_u8(a, vdup_n_u8(0));
>> > > >> >> > }
>> > > >> >> >
>> > > >> >> > code-gen before patch:
>> > > >> >> > f1:
>> > > >> >> >         vmov.i32  d16, #0  @ v8qi
>> > > >> >> >         vcgt.u8     d0, d0, d16
>> > > >> >> >         bx             lr
>> > > >> >> >
>> > > >> >> > code-gen after patch:
>> > > >> >> > f1:
>> > > >> >> >         vceq.i8 d0, d0, #0
>> > > >> >> >         vmvn    d0, d0
>> > > >> >> >         bx         lr
>> > > >> >> >
>> > > >> >> > I am not sure which one is better tho ?
>> > > >> >>
>> > > >> >
>> > > >> > Hi Prathamesh,
>> > > >> >
>> > > >> > This patch introduces a regression on non-hardfp configs (eg arm-
>> > linux-gnueabi or arm-eabi):
>> > > >> > FAIL:  gcc:gcc.target/arm/arm.exp=gcc.target/arm/pr51534.c scan-
>> > assembler-times vmov.i32[ \t]+[dD][0-9]+, #0xffffffff 3
>> > > >> > FAIL:  gcc:gcc.target/arm/arm.exp=gcc.target/arm/pr51534.c scan-
>> > assembler-times vmov.i32[ \t]+[qQ][0-9]+, #4294967295 3
>> > > >> >
>> > > >> > Can you fix this?
>> > > >> The issue is, for following test:
>> > > >>
>> > > >> #include <arm_neon.h>
>> > > >>
>> > > >> uint8x8_t f1 (uint8x8_t a) {
>> > > >>   return vcge_u8(a, vdup_n_u8(0));
>> > > >> }
>> > > >>
>> > > >> armhf code-gen:
>> > > >> f1:
>> > > >>         vmov.i32  d0, #0xffffffff  @ v8qi
>> > > >>         bx            lr
>> > > >>
>> > > >> arm softfp code-gen:
>> > > >> f1:
>> > > >>         mov     r0, #-1
>> > > >>         mov     r1, #-1
>> > > >>         bx      lr
>> > > >>
>> > > >> The code-gen for both is same upto split2 pass:
>> > > >>
>> > > >> (insn 10 6 11 2 (set (reg/i:V8QI 16 s0)
>> > > >>         (const_vector:V8QI [
>> > > >>                 (const_int -1 [0xffffffffffffffff]) repeated x8
>> > > >>             ])) "foo.c":5:1 1052 {*neon_movv8qi}
>> > > >>      (expr_list:REG_EQUAL (const_vector:V8QI [
>> > > >>                 (const_int -1 [0xffffffffffffffff]) repeated x8
>> > > >>             ])
>> > > >>         (nil)))
>> > > >> (insn 11 10 13 2 (use (reg/i:V8QI 16 s0)) "foo.c":5:1 -1
>> > > >>      (nil))
>> > > >>
>> > > >> and for softfp target, split2 pass splits the assignment to r0 and r1:
>> > > >>
>> > > >> (insn 15 6 16 2 (set (reg:SI 0 r0)
>> > > >>         (const_int -1 [0xffffffffffffffff])) "foo.c":5:1 740
>> > {*thumb2_movsi_vfp}
>> > > >>      (nil))
>> > > >> (insn 16 15 11 2 (set (reg:SI 1 r1 [+4 ])
>> > > >>         (const_int -1 [0xffffffffffffffff])) "foo.c":5:1 740
>> > {*thumb2_movsi_vfp}
>> > > >>      (nil))
>> > > >> (insn 11 16 13 2 (use (reg/i:V8QI 0 r0)) "foo.c":5:1 -1
>> > > >>      (nil))
>> > > >>
>> > > >> I suppose we could use a dg-scan for r[0-9]+, #-1 for softfp targets ?
>> > > >>
>> > > > Yes, probably, or try with check-function-bodies.
>> > > Hi,
>> > > Sorry for the late response. Does the attached patch look OK ?
>> > ping https://gcc.gnu.org/pipermail/gcc-patches/2021-August/577532.html
>>
>> Ok.
>
>
>
> Sorry Prathamesh, this does not quite work. See  https://people.linaro.org/~christophe.lyon/cross-validation/gcc/trunk/r12-3294-g7a6f40d0452ec76e126c2612dcfa32f3c73e2315/report-build-info.html
> (red cells in the gcc column)
>
> Can you have a look?
Ah, for arm-none-eabi target the code-gen for test_vcge_u8_uint8x8_t is:
        mvn     r0, #0
        mvn     r1, #0
        bx      lr

instead of:
        mov     r0, #-1
        mov     r1, #-1
        bx      lr

I guess both code-gen sequences are correct, but I am not sure under
which circumstance either sequence gets generated.
To accomodate for both, I tried to check for either pattern with OR:
/* { dg-final { scan-assembler-times "(mov\[    \]+r\[0-9\]+,
#-1)|(mvn\[    \]+r\[0-9\]+, #0)" 6 { target { arm_softfp_ok } } } }
*/
but that doesn't seem to work.

Do you have any suggestions on how to proceed ?

Thanks,
Prathamesh
>
> Thanks
>
> Christophe
>
>> Thanks,
>> Kyrill
>>
>> >
>> > Thanks,
>> > Prathamesh
>> > >
>> > > Thanks,
>> > > Prathamesh
>> > > >
>> > > >  Christophe
>> > > >
>> > > >> Thanks,
>> > > >> Prathamesh
>> > > >> >
>> > > >> > Thanks
>> > > >> >
>> > > >> > Christophe
>> > > >> >
>> > > >> >
>> > > >> >>
>> > > >> >> I think they're equivalent in practice, in any case the patch itself is
>> > good (move away from RTL builtins).
>> > > >> >> Ok.
>> > > >> >> Thanks,
>> > > >> >> Kyrill
>> > > >> >>
>> > > >> >> >
>> > > >> >> > Also, this patch regressed bf16_dup.c on arm-linux-gnueabi,
>> > > >> >> > which is due to a missed opt in lowering. I had filed it as
>> > > >> >> > PR98435, and posted a fix for it here:
>> > > >> >> > https://gcc.gnu.org/pipermail/gcc-patches/2021-
>> > June/572648.html
>> > > >> >> >
>> > > >> >> > Thanks,
>> > > >> >> > Prathamesh

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [ARM] PR66791: Replace builtins for vdup_n and vmov_n intrinsics
  2021-09-03  8:35                 ` Prathamesh Kulkarni
@ 2021-09-03 10:58                   ` Christophe LYON
  0 siblings, 0 replies; 11+ messages in thread
From: Christophe LYON @ 2021-09-03 10:58 UTC (permalink / raw)
  To: gcc-patches


On 03/09/2021 10:35, Prathamesh Kulkarni via Gcc-patches wrote:
> On Thu, 2 Sept 2021 at 14:32, Christophe Lyon
> <christophe.lyon.oss@gmail.com> wrote:
>>
>>
>> On Tue, Aug 24, 2021 at 10:17 AM Kyrylo Tkachov <Kyrylo.Tkachov@arm.com> wrote:
>>>
>>>
>>>> -----Original Message-----
>>>> From: Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
>>>> Sent: 24 August 2021 09:01
>>>> To: Christophe Lyon <christophe.lyon.oss@gmail.com>
>>>> Cc: Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>; gcc Patches <gcc-
>>>> patches@gcc.gnu.org>
>>>> Subject: Re: [ARM] PR66791: Replace builtins for vdup_n and vmov_n
>>>> intrinsics
>>>>
>>>> On Tue, 17 Aug 2021 at 11:55, Prathamesh Kulkarni
>>>> <prathamesh.kulkarni@linaro.org> wrote:
>>>>> On Thu, 12 Aug 2021 at 19:04, Christophe Lyon
>>>>> <christophe.lyon.oss@gmail.com> wrote:
>>>>>>
>>>>>>
>>>>>> On Thu, Aug 12, 2021 at 1:54 PM Prathamesh Kulkarni
>>>> <prathamesh.kulkarni@linaro.org> wrote:
>>>>>>> On Wed, 11 Aug 2021 at 22:23, Christophe Lyon
>>>>>>> <christophe.lyon.oss@gmail.com> wrote:
>>>>>>>>
>>>>>>>>
>>>>>>>> On Thu, Jun 24, 2021 at 6:29 PM Kyrylo Tkachov via Gcc-patches <gcc-
>>>> patches@gcc.gnu.org> wrote:
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>> -----Original Message-----
>>>>>>>>>> From: Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
>>>>>>>>>> Sent: 24 June 2021 12:11
>>>>>>>>>> To: gcc Patches <gcc-patches@gcc.gnu.org>; Kyrylo Tkachov
>>>>>>>>>> <Kyrylo.Tkachov@arm.com>
>>>>>>>>>> Subject: [ARM] PR66791: Replace builtins for vdup_n and vmov_n
>>>> intrinsics
>>>>>>>>>> Hi,
>>>>>>>>>> This patch replaces builtins for vdup_n and vmov_n.
>>>>>>>>>> The patch results in regression for pr51534.c.
>>>>>>>>>> Consider following function:
>>>>>>>>>>
>>>>>>>>>> uint8x8_t f1 (uint8x8_t a) {
>>>>>>>>>>    return vcgt_u8(a, vdup_n_u8(0));
>>>>>>>>>> }
>>>>>>>>>>
>>>>>>>>>> code-gen before patch:
>>>>>>>>>> f1:
>>>>>>>>>>          vmov.i32  d16, #0  @ v8qi
>>>>>>>>>>          vcgt.u8     d0, d0, d16
>>>>>>>>>>          bx             lr
>>>>>>>>>>
>>>>>>>>>> code-gen after patch:
>>>>>>>>>> f1:
>>>>>>>>>>          vceq.i8 d0, d0, #0
>>>>>>>>>>          vmvn    d0, d0
>>>>>>>>>>          bx         lr
>>>>>>>>>>
>>>>>>>>>> I am not sure which one is better tho ?
>>>>>>>> Hi Prathamesh,
>>>>>>>>
>>>>>>>> This patch introduces a regression on non-hardfp configs (eg arm-
>>>> linux-gnueabi or arm-eabi):
>>>>>>>> FAIL:  gcc:gcc.target/arm/arm.exp=gcc.target/arm/pr51534.c scan-
>>>> assembler-times vmov.i32[ \t]+[dD][0-9]+, #0xffffffff 3
>>>>>>>> FAIL:  gcc:gcc.target/arm/arm.exp=gcc.target/arm/pr51534.c scan-
>>>> assembler-times vmov.i32[ \t]+[qQ][0-9]+, #4294967295 3
>>>>>>>> Can you fix this?
>>>>>>> The issue is, for following test:
>>>>>>>
>>>>>>> #include <arm_neon.h>
>>>>>>>
>>>>>>> uint8x8_t f1 (uint8x8_t a) {
>>>>>>>    return vcge_u8(a, vdup_n_u8(0));
>>>>>>> }
>>>>>>>
>>>>>>> armhf code-gen:
>>>>>>> f1:
>>>>>>>          vmov.i32  d0, #0xffffffff  @ v8qi
>>>>>>>          bx            lr
>>>>>>>
>>>>>>> arm softfp code-gen:
>>>>>>> f1:
>>>>>>>          mov     r0, #-1
>>>>>>>          mov     r1, #-1
>>>>>>>          bx      lr
>>>>>>>
>>>>>>> The code-gen for both is same upto split2 pass:
>>>>>>>
>>>>>>> (insn 10 6 11 2 (set (reg/i:V8QI 16 s0)
>>>>>>>          (const_vector:V8QI [
>>>>>>>                  (const_int -1 [0xffffffffffffffff]) repeated x8
>>>>>>>              ])) "foo.c":5:1 1052 {*neon_movv8qi}
>>>>>>>       (expr_list:REG_EQUAL (const_vector:V8QI [
>>>>>>>                  (const_int -1 [0xffffffffffffffff]) repeated x8
>>>>>>>              ])
>>>>>>>          (nil)))
>>>>>>> (insn 11 10 13 2 (use (reg/i:V8QI 16 s0)) "foo.c":5:1 -1
>>>>>>>       (nil))
>>>>>>>
>>>>>>> and for softfp target, split2 pass splits the assignment to r0 and r1:
>>>>>>>
>>>>>>> (insn 15 6 16 2 (set (reg:SI 0 r0)
>>>>>>>          (const_int -1 [0xffffffffffffffff])) "foo.c":5:1 740
>>>> {*thumb2_movsi_vfp}
>>>>>>>       (nil))
>>>>>>> (insn 16 15 11 2 (set (reg:SI 1 r1 [+4 ])
>>>>>>>          (const_int -1 [0xffffffffffffffff])) "foo.c":5:1 740
>>>> {*thumb2_movsi_vfp}
>>>>>>>       (nil))
>>>>>>> (insn 11 16 13 2 (use (reg/i:V8QI 0 r0)) "foo.c":5:1 -1
>>>>>>>       (nil))
>>>>>>>
>>>>>>> I suppose we could use a dg-scan for r[0-9]+, #-1 for softfp targets ?
>>>>>>>
>>>>>> Yes, probably, or try with check-function-bodies.
>>>>> Hi,
>>>>> Sorry for the late response. Does the attached patch look OK ?
>>>> ping https://gcc.gnu.org/pipermail/gcc-patches/2021-August/577532.html
>>> Ok.
>>
>>
>> Sorry Prathamesh, this does not quite work. See  https://people.linaro.org/~christophe.lyon/cross-validation/gcc/trunk/r12-3294-g7a6f40d0452ec76e126c2612dcfa32f3c73e2315/report-build-info.html
>> (red cells in the gcc column)
>>
>> Can you have a look?
> Ah, for arm-none-eabi target the code-gen for test_vcge_u8_uint8x8_t is:
>          mvn     r0, #0
>          mvn     r1, #0
>          bx      lr
>
> instead of:
>          mov     r0, #-1
>          mov     r1, #-1
>          bx      lr
>
> I guess both code-gen sequences are correct, but I am not sure under
> which circumstance either sequence gets generated.
> To accomodate for both, I tried to check for either pattern with OR:
> /* { dg-final { scan-assembler-times "(mov\[    \]+r\[0-9\]+,
> #-1)|(mvn\[    \]+r\[0-9\]+, #0)" 6 { target { arm_softfp_ok } } } }
> */
> but that doesn't seem to work.
>
> Do you have any suggestions on how to proceed ?

Sorry I have no magic syntax for this :-)

Maybe it's also worth checking why we have this codegen difference?


Thanks,


Christophe



>
> Thanks,
> Prathamesh
>> Thanks
>>
>> Christophe
>>
>>> Thanks,
>>> Kyrill
>>>
>>>> Thanks,
>>>> Prathamesh
>>>>> Thanks,
>>>>> Prathamesh
>>>>>>   Christophe
>>>>>>
>>>>>>> Thanks,
>>>>>>> Prathamesh
>>>>>>>> Thanks
>>>>>>>>
>>>>>>>> Christophe
>>>>>>>>
>>>>>>>>
>>>>>>>>> I think they're equivalent in practice, in any case the patch itself is
>>>> good (move away from RTL builtins).
>>>>>>>>> Ok.
>>>>>>>>> Thanks,
>>>>>>>>> Kyrill
>>>>>>>>>
>>>>>>>>>> Also, this patch regressed bf16_dup.c on arm-linux-gnueabi,
>>>>>>>>>> which is due to a missed opt in lowering. I had filed it as
>>>>>>>>>> PR98435, and posted a fix for it here:
>>>>>>>>>> https://gcc.gnu.org/pipermail/gcc-patches/2021-
>>>> June/572648.html
>>>>>>>>>> Thanks,
>>>>>>>>>> Prathamesh

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2021-09-03 10:58 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-24 11:11 [ARM] PR66791: Replace builtins for vdup_n and vmov_n intrinsics Prathamesh Kulkarni
2021-06-24 16:28 ` Kyrylo Tkachov
2021-08-11 16:53   ` Christophe Lyon
2021-08-12 11:54     ` Prathamesh Kulkarni
2021-08-12 13:34       ` Christophe Lyon
2021-08-17  6:25         ` Prathamesh Kulkarni
2021-08-24  8:00           ` Prathamesh Kulkarni
2021-08-24  8:17             ` Kyrylo Tkachov
2021-09-02  9:02               ` Christophe Lyon
2021-09-03  8:35                 ` Prathamesh Kulkarni
2021-09-03 10:58                   ` Christophe LYON

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).