* [ARM] PR66791: Replace calls to builtin in vmul_n (a, b) intrinsics with __a * __b
@ 2021-05-26 7:11 Prathamesh Kulkarni
2021-05-26 8:37 ` Marc Glisse
0 siblings, 1 reply; 9+ messages in thread
From: Prathamesh Kulkarni @ 2021-05-26 7:11 UTC (permalink / raw)
To: Kyrill Tkachov, gcc Patches
[-- Attachment #1: Type: text/plain, Size: 2060 bytes --]
Hi,
The attached patch removes calls to builtins in vmul_n* (a, b) with __a * __b.
For float variants, it gates multiplication on __FAST_MATH__.
Since we are not removing all calls to builtins, I am not sure if we
should remove
entry for vmul_n from arm_neon_builtins.def ?
Testing the patch showed fallout for armv8_2-fp16-neon-2.c, because the patch
generates better code.
Code-gen diff:
--- armv8_2-fp16-neon-2.s 2021-05-26 11:34:30.870304900 +0530
+++ armv8_2-fp16-neon-2-after.s 2021-05-26 11:19:13.990304900 +0530
@@ -84,21 +84,9 @@
test_vmul_n_16x4:
- @ args = 0, pretend = 0, frame = 8
+ @ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
- sub sp, sp, #8
- vldr d7, .L8
- add r3, sp, #6
- vst1.16 {d1[0]}, [r3]
- vld1.16 {d7[0]}, [r3]
- vmul.f16 d0, d0, d7[0]
- add sp, sp, #8
- @ sp needed
+ vmov.f16 r3, s2 @ __fp16
+ vdup.16 d16, r3
+ vmul.f16 d0, d16, d0
bx lr
-.L9:
- .align 3
-.L8:
- .short 0
- .short 0
- .short 0
- .short 0
.size test_vmul_n_16x4, .-test_vmul_n_16x4
@@ -113,21 +101,9 @@
test_vmul_n_16x8:
- @ args = 0, pretend = 0, frame = 8
+ @ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
@ link register save eliminated.
- sub sp, sp, #8
- vldr d7, .L12
- add r3, sp, #6
- vst1.16 {d2[0]}, [r3]
- vld1.16 {d7[0]}, [r3]
- vmul.f16 q0, q0, d7[0]
- add sp, sp, #8
- @ sp needed
+ vmov.f16 r3, s4 @ __fp16
+ vdup.16 q8, r3
+ vmul.f16 q0, q8, q0
bx lr
-.L13:
- .align 3
-.L12:
- .short 0
- .short 0
- .short 0
- .short 0
.size test_vmul_n_16x8, .-test_vmul_n_16x8
Adjusted the test, to fix the failing tests.
OK to commit if testing passes ?
Thanks,
Prathamesh
[-- Attachment #2: vmul_n-4.txt --]
[-- Type: text/plain, Size: 5723 bytes --]
2021-26-05 Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
PR target/66791
* config/arm/arm_neon.h (vmul_n_s16): Replace call to builtin with __a * __b.
(vmul_n_s32): Likewise.
(vmul_n_u16): Likewise.
(vmul_n_u32): Likewise.
(vmulq_n_s16): Likewise.
(vmulq_n_s32): Likewise.
(vmulq_n_u16): Likewise.
(vmulq_n_u32): Likewise.
(vmul_n_f32): Gate __a * __b conditionally on __FAST_MATH__.
(vmulq_n_f32): Likewise.
(vmul_n_f16): Likewise.
(vmulq_n_f16): Likewise.
testsuite/
* gcc.target/arm/armv8_2-fp16-neon-2.c: Adjust.
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index dcd533fd003..8ac00774e6c 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -8331,70 +8331,78 @@ __extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmul_n_s16 (int16x4_t __a, int16_t __b)
{
- return (int16x4_t)__builtin_neon_vmul_nv4hi (__a, (__builtin_neon_hi) __b);
+ return __a * __b;
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmul_n_s32 (int32x2_t __a, int32_t __b)
{
- return (int32x2_t)__builtin_neon_vmul_nv2si (__a, (__builtin_neon_si) __b);
+ return __a * __b;
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmul_n_f32 (float32x2_t __a, float32_t __b)
{
+#ifdef __FAST_MATH__
+ return __a * __b;
+#else
return (float32x2_t)__builtin_neon_vmul_nv2sf (__a, (__builtin_neon_sf) __b);
+#endif
}
__extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmul_n_u16 (uint16x4_t __a, uint16_t __b)
{
- return (uint16x4_t)__builtin_neon_vmul_nv4hi ((int16x4_t) __a, (__builtin_neon_hi) __b);
+ return __a * __b;
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmul_n_u32 (uint32x2_t __a, uint32_t __b)
{
- return (uint32x2_t)__builtin_neon_vmul_nv2si ((int32x2_t) __a, (__builtin_neon_si) __b);
+ return __a * __b;
}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmulq_n_s16 (int16x8_t __a, int16_t __b)
{
- return (int16x8_t)__builtin_neon_vmul_nv8hi (__a, (__builtin_neon_hi) __b);
+ return __a * __b;
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmulq_n_s32 (int32x4_t __a, int32_t __b)
{
- return (int32x4_t)__builtin_neon_vmul_nv4si (__a, (__builtin_neon_si) __b);
+ return __a * __b;
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmulq_n_f32 (float32x4_t __a, float32_t __b)
{
+#ifdef __FAST_MATH__
+ return __a * __b;
+#else
return (float32x4_t)__builtin_neon_vmul_nv4sf (__a, (__builtin_neon_sf) __b);
+#endif
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
{
- return (uint16x8_t)__builtin_neon_vmul_nv8hi ((int16x8_t) __a, (__builtin_neon_hi) __b);
+ return __a * __b;
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
{
- return (uint32x4_t)__builtin_neon_vmul_nv4si ((int32x4_t) __a, (__builtin_neon_si) __b);
+ return __a * __b;
}
__extension__ extern __inline int32x4_t
@@ -17661,7 +17669,11 @@ __extension__ extern __inline float16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmul_n_f16 (float16x4_t __a, float16_t __b)
{
+#ifdef __FAST_MATH__
+ return __a * __b;
+#else
return __builtin_neon_vmul_nv4hf (__a, __b);
+#endif
}
__extension__ extern __inline float16x8_t
@@ -17686,7 +17698,11 @@ __extension__ extern __inline float16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmulq_n_f16 (float16x8_t __a, float16_t __b)
{
+#ifdef __FAST_MATH__
+ return __a * __b;
+#else
return __builtin_neon_vmul_nv8hf (__a, __b);
+#endif
}
__extension__ extern __inline float16x4_t
diff --git a/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c
index 50f689352ca..2d26bc0ac26 100644
--- a/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c
+++ b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c
@@ -327,13 +327,13 @@ BINOP_TEST (vminnm)
BINOP_TEST (vmul)
/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 3 } }
- { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+ { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } } */
BINOP_LANE_TEST (vmul, 2)
/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+\[2\]} 1 } }
{ dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, d[0-9]+\[2\]} 1 } } */
BINOP_N_TEST (vmul)
-/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+\[0\]} 1 } }
- { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, d[0-9]+\[0\]} 1 } }*/
+/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 3 } }
+ { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } }*/
float16x4_t
test_vpadd_16x4 (float16x4_t a, float16x4_t b)
@@ -387,7 +387,7 @@ test_vdup_n_f16 (float16_t a)
{
return vdup_n_f16 (a);
}
-/* { dg-final { scan-assembler-times {vdup\.16\td[0-9]+, r[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {vdup\.16\td[0-9]+, r[0-9]+} 3 } } */
float16x8_t
test_vmovq_n_f16 (float16_t a)
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [ARM] PR66791: Replace calls to builtin in vmul_n (a, b) intrinsics with __a * __b
2021-05-26 7:11 [ARM] PR66791: Replace calls to builtin in vmul_n (a, b) intrinsics with __a * __b Prathamesh Kulkarni
@ 2021-05-26 8:37 ` Marc Glisse
2021-05-31 9:52 ` Prathamesh Kulkarni
0 siblings, 1 reply; 9+ messages in thread
From: Marc Glisse @ 2021-05-26 8:37 UTC (permalink / raw)
To: Prathamesh Kulkarni; +Cc: Kyrill Tkachov, gcc Patches
On Wed, 26 May 2021, Prathamesh Kulkarni via Gcc-patches wrote:
> The attached patch removes calls to builtins in vmul_n* (a, b) with __a * __b.
I am not familiar with neon, but are __a and __b unsigned here? Otherwise,
is vmul_n already undefined in case of overflow?
--
Marc Glisse
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [ARM] PR66791: Replace calls to builtin in vmul_n (a, b) intrinsics with __a * __b
2021-05-26 8:37 ` Marc Glisse
@ 2021-05-31 9:52 ` Prathamesh Kulkarni
2021-05-31 10:31 ` Prathamesh Kulkarni
0 siblings, 1 reply; 9+ messages in thread
From: Prathamesh Kulkarni @ 2021-05-31 9:52 UTC (permalink / raw)
To: gcc Patches
On Wed, 26 May 2021 at 14:07, Marc Glisse <marc.glisse@inria.fr> wrote:
>
> On Wed, 26 May 2021, Prathamesh Kulkarni via Gcc-patches wrote:
>
> > The attached patch removes calls to builtins in vmul_n* (a, b) with __a * __b.
>
> I am not familiar with neon, but are __a and __b unsigned here? Otherwise,
> is vmul_n already undefined in case of overflow?
Hi Marc,
Sorry for late reply, for vmul_n_s*, I think they are signed
(int<width>x<width>_t).
I am not sure how should the intrinsic behave in case of signed overflow,
but I am assuming it's OK since vmul_s* intrinsics leave it undefined too.
Kyrill, is it OK to leave vmul_s* and vmul_n_s* undefined in case of overflow ?
Thanks,
Prathamesh
>
> --
> Marc Glisse
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [ARM] PR66791: Replace calls to builtin in vmul_n (a, b) intrinsics with __a * __b
2021-05-31 9:52 ` Prathamesh Kulkarni
@ 2021-05-31 10:31 ` Prathamesh Kulkarni
2021-06-07 7:15 ` Prathamesh Kulkarni
0 siblings, 1 reply; 9+ messages in thread
From: Prathamesh Kulkarni @ 2021-05-31 10:31 UTC (permalink / raw)
To: gcc Patches
On Mon, 31 May 2021 at 15:22, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
>
> On Wed, 26 May 2021 at 14:07, Marc Glisse <marc.glisse@inria.fr> wrote:
> >
> > On Wed, 26 May 2021, Prathamesh Kulkarni via Gcc-patches wrote:
> >
> > > The attached patch removes calls to builtins in vmul_n* (a, b) with __a * __b.
> >
> > I am not familiar with neon, but are __a and __b unsigned here? Otherwise,
> > is vmul_n already undefined in case of overflow?
> Hi Marc,
> Sorry for late reply, for vmul_n_s*, I think they are signed
> (int<width>x<width>_t).
Oops, I meant int<width>x<nelems>_t.
> I am not sure how should the intrinsic behave in case of signed overflow,
> but I am assuming it's OK since vmul_s* intrinsics leave it undefined too.
> Kyrill, is it OK to leave vmul_s* and vmul_n_s* undefined in case of overflow ?
>
> Thanks,
> Prathamesh
> >
> > --
> > Marc Glisse
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [ARM] PR66791: Replace calls to builtin in vmul_n (a, b) intrinsics with __a * __b
2021-05-31 10:31 ` Prathamesh Kulkarni
@ 2021-06-07 7:15 ` Prathamesh Kulkarni
2021-06-14 7:57 ` Prathamesh Kulkarni
0 siblings, 1 reply; 9+ messages in thread
From: Prathamesh Kulkarni @ 2021-06-07 7:15 UTC (permalink / raw)
To: gcc Patches
[-- Attachment #1: Type: text/plain, Size: 1127 bytes --]
On Mon, 31 May 2021 at 16:01, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
>
> On Mon, 31 May 2021 at 15:22, Prathamesh Kulkarni
> <prathamesh.kulkarni@linaro.org> wrote:
> >
> > On Wed, 26 May 2021 at 14:07, Marc Glisse <marc.glisse@inria.fr> wrote:
> > >
> > > On Wed, 26 May 2021, Prathamesh Kulkarni via Gcc-patches wrote:
> > >
> > > > The attached patch removes calls to builtins in vmul_n* (a, b) with __a * __b.
> > >
> > > I am not familiar with neon, but are __a and __b unsigned here? Otherwise,
> > > is vmul_n already undefined in case of overflow?
> > Hi Marc,
> > Sorry for late reply, for vmul_n_s*, I think they are signed
> > (int<width>x<width>_t).
> Oops, I meant int<width>x<nelems>_t.
> > I am not sure how should the intrinsic behave in case of signed overflow,
> > but I am assuming it's OK since vmul_s* intrinsics leave it undefined too.
> > Kyrill, is it OK to leave vmul_s* and vmul_n_s* undefined in case of overflow ?
The attached version fixes one fallout I missed earlier.
Is this OK to commit ?
Thanks,
Prathamesh
> >
> > Thanks,
> > Prathamesh
> > >
> > > --
> > > Marc Glisse
[-- Attachment #2: vmul_n-5.txt --]
[-- Type: text/plain, Size: 6111 bytes --]
2021-06-07 Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
PR target/66791
* config/arm/arm_neon.h (vmul_n_s16): Replace call to builtin
with __a * __b.
(vmul_n_s32): Likewise.
(vmul_n_u16): Likewise.
(vmul_n_u32): Likewise.
(vmulq_n_s16): Likewise.
(vmulq_n_s32): Likewise.
(vmulq_n_u16): Likewise.
(vmulq_n_u32): Likewise.
(vmul_n_f32): Gate __a * __b conditionally on __FAST_MATH__.
(vmulq_n_f32): Likewise.
(vmul_n_f16): Likewise.
(vmulq_n_f16): Likewise.
testsuite/
* gcc.target/arm/armv8_2-fp16-neon-2.c: Adjust.
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index dcd533fd003..8ac00774e6c 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -8331,70 +8331,78 @@ __extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmul_n_s16 (int16x4_t __a, int16_t __b)
{
- return (int16x4_t)__builtin_neon_vmul_nv4hi (__a, (__builtin_neon_hi) __b);
+ return __a * __b;
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmul_n_s32 (int32x2_t __a, int32_t __b)
{
- return (int32x2_t)__builtin_neon_vmul_nv2si (__a, (__builtin_neon_si) __b);
+ return __a * __b;
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmul_n_f32 (float32x2_t __a, float32_t __b)
{
+#ifdef __FAST_MATH__
+ return __a * __b;
+#else
return (float32x2_t)__builtin_neon_vmul_nv2sf (__a, (__builtin_neon_sf) __b);
+#endif
}
__extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmul_n_u16 (uint16x4_t __a, uint16_t __b)
{
- return (uint16x4_t)__builtin_neon_vmul_nv4hi ((int16x4_t) __a, (__builtin_neon_hi) __b);
+ return __a * __b;
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmul_n_u32 (uint32x2_t __a, uint32_t __b)
{
- return (uint32x2_t)__builtin_neon_vmul_nv2si ((int32x2_t) __a, (__builtin_neon_si) __b);
+ return __a * __b;
}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmulq_n_s16 (int16x8_t __a, int16_t __b)
{
- return (int16x8_t)__builtin_neon_vmul_nv8hi (__a, (__builtin_neon_hi) __b);
+ return __a * __b;
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmulq_n_s32 (int32x4_t __a, int32_t __b)
{
- return (int32x4_t)__builtin_neon_vmul_nv4si (__a, (__builtin_neon_si) __b);
+ return __a * __b;
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmulq_n_f32 (float32x4_t __a, float32_t __b)
{
+#ifdef __FAST_MATH__
+ return __a * __b;
+#else
return (float32x4_t)__builtin_neon_vmul_nv4sf (__a, (__builtin_neon_sf) __b);
+#endif
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
{
- return (uint16x8_t)__builtin_neon_vmul_nv8hi ((int16x8_t) __a, (__builtin_neon_hi) __b);
+ return __a * __b;
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
{
- return (uint32x4_t)__builtin_neon_vmul_nv4si ((int32x4_t) __a, (__builtin_neon_si) __b);
+ return __a * __b;
}
__extension__ extern __inline int32x4_t
@@ -17661,7 +17669,11 @@ __extension__ extern __inline float16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmul_n_f16 (float16x4_t __a, float16_t __b)
{
+#ifdef __FAST_MATH__
+ return __a * __b;
+#else
return __builtin_neon_vmul_nv4hf (__a, __b);
+#endif
}
__extension__ extern __inline float16x8_t
@@ -17686,7 +17698,11 @@ __extension__ extern __inline float16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmulq_n_f16 (float16x8_t __a, float16_t __b)
{
+#ifdef __FAST_MATH__
+ return __a * __b;
+#else
return __builtin_neon_vmul_nv8hf (__a, __b);
+#endif
}
__extension__ extern __inline float16x4_t
diff --git a/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c
index 50f689352ca..6808576ce59 100644
--- a/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c
+++ b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c
@@ -327,13 +327,13 @@ BINOP_TEST (vminnm)
BINOP_TEST (vmul)
/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 3 } }
- { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } } */
+ { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } } */
BINOP_LANE_TEST (vmul, 2)
/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+\[2\]} 1 } }
{ dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, d[0-9]+\[2\]} 1 } } */
BINOP_N_TEST (vmul)
-/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+\[0\]} 1 } }
- { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, d[0-9]+\[0\]} 1 } }*/
+/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 3 } }
+ { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } }*/
float16x4_t
test_vpadd_16x4 (float16x4_t a, float16x4_t b)
@@ -387,7 +387,7 @@ test_vdup_n_f16 (float16_t a)
{
return vdup_n_f16 (a);
}
-/* { dg-final { scan-assembler-times {vdup\.16\td[0-9]+, r[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {vdup\.16\td[0-9]+, r[0-9]+} 3 } } */
float16x8_t
test_vmovq_n_f16 (float16_t a)
@@ -400,7 +400,7 @@ test_vdupq_n_f16 (float16_t a)
{
return vdupq_n_f16 (a);
}
-/* { dg-final { scan-assembler-times {vdup\.16\tq[0-9]+, r[0-9]+} 2 } } */
+/* { dg-final { scan-assembler-times {vdup\.16\tq[0-9]+, r[0-9]+} 3 } } */
float16x4_t
test_vdup_lane_f16 (float16x4_t a)
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [ARM] PR66791: Replace calls to builtin in vmul_n (a, b) intrinsics with __a * __b
2021-06-07 7:15 ` Prathamesh Kulkarni
@ 2021-06-14 7:57 ` Prathamesh Kulkarni
2021-06-21 8:34 ` Prathamesh Kulkarni
0 siblings, 1 reply; 9+ messages in thread
From: Prathamesh Kulkarni @ 2021-06-14 7:57 UTC (permalink / raw)
To: gcc Patches
On Mon, 7 Jun 2021 at 12:45, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
>
> On Mon, 31 May 2021 at 16:01, Prathamesh Kulkarni
> <prathamesh.kulkarni@linaro.org> wrote:
> >
> > On Mon, 31 May 2021 at 15:22, Prathamesh Kulkarni
> > <prathamesh.kulkarni@linaro.org> wrote:
> > >
> > > On Wed, 26 May 2021 at 14:07, Marc Glisse <marc.glisse@inria.fr> wrote:
> > > >
> > > > On Wed, 26 May 2021, Prathamesh Kulkarni via Gcc-patches wrote:
> > > >
> > > > > The attached patch removes calls to builtins in vmul_n* (a, b) with __a * __b.
> > > >
> > > > I am not familiar with neon, but are __a and __b unsigned here? Otherwise,
> > > > is vmul_n already undefined in case of overflow?
> > > Hi Marc,
> > > Sorry for late reply, for vmul_n_s*, I think they are signed
> > > (int<width>x<width>_t).
> > Oops, I meant int<width>x<nelems>_t.
> > > I am not sure how should the intrinsic behave in case of signed overflow,
> > > but I am assuming it's OK since vmul_s* intrinsics leave it undefined too.
> > > Kyrill, is it OK to leave vmul_s* and vmul_n_s* undefined in case of overflow ?
> The attached version fixes one fallout I missed earlier.
> Is this OK to commit ?
ping https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572037.html
Thanks,
Prathamesh
>
> Thanks,
> Prathamesh
> > >
> > > Thanks,
> > > Prathamesh
> > > >
> > > > --
> > > > Marc Glisse
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [ARM] PR66791: Replace calls to builtin in vmul_n (a, b) intrinsics with __a * __b
2021-06-14 7:57 ` Prathamesh Kulkarni
@ 2021-06-21 8:34 ` Prathamesh Kulkarni
2021-06-29 7:21 ` Prathamesh Kulkarni
0 siblings, 1 reply; 9+ messages in thread
From: Prathamesh Kulkarni @ 2021-06-21 8:34 UTC (permalink / raw)
To: gcc Patches
On Mon, 14 Jun 2021 at 13:27, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
>
> On Mon, 7 Jun 2021 at 12:45, Prathamesh Kulkarni
> <prathamesh.kulkarni@linaro.org> wrote:
> >
> > On Mon, 31 May 2021 at 16:01, Prathamesh Kulkarni
> > <prathamesh.kulkarni@linaro.org> wrote:
> > >
> > > On Mon, 31 May 2021 at 15:22, Prathamesh Kulkarni
> > > <prathamesh.kulkarni@linaro.org> wrote:
> > > >
> > > > On Wed, 26 May 2021 at 14:07, Marc Glisse <marc.glisse@inria.fr> wrote:
> > > > >
> > > > > On Wed, 26 May 2021, Prathamesh Kulkarni via Gcc-patches wrote:
> > > > >
> > > > > > The attached patch removes calls to builtins in vmul_n* (a, b) with __a * __b.
> > > > >
> > > > > I am not familiar with neon, but are __a and __b unsigned here? Otherwise,
> > > > > is vmul_n already undefined in case of overflow?
> > > > Hi Marc,
> > > > Sorry for late reply, for vmul_n_s*, I think they are signed
> > > > (int<width>x<width>_t).
> > > Oops, I meant int<width>x<nelems>_t.
> > > > I am not sure how should the intrinsic behave in case of signed overflow,
> > > > but I am assuming it's OK since vmul_s* intrinsics leave it undefined too.
> > > > Kyrill, is it OK to leave vmul_s* and vmul_n_s* undefined in case of overflow ?
> > The attached version fixes one fallout I missed earlier.
> > Is this OK to commit ?
> ping https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572037.html
ping * 2 https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572037.html
Thanks,
Prathamesh
>
> Thanks,
> Prathamesh
> >
> > Thanks,
> > Prathamesh
> > > >
> > > > Thanks,
> > > > Prathamesh
> > > > >
> > > > > --
> > > > > Marc Glisse
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [ARM] PR66791: Replace calls to builtin in vmul_n (a, b) intrinsics with __a * __b
2021-06-21 8:34 ` Prathamesh Kulkarni
@ 2021-06-29 7:21 ` Prathamesh Kulkarni
2021-06-30 8:27 ` Kyrylo Tkachov
0 siblings, 1 reply; 9+ messages in thread
From: Prathamesh Kulkarni @ 2021-06-29 7:21 UTC (permalink / raw)
To: gcc Patches
On Mon, 21 Jun 2021 at 14:04, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
>
> On Mon, 14 Jun 2021 at 13:27, Prathamesh Kulkarni
> <prathamesh.kulkarni@linaro.org> wrote:
> >
> > On Mon, 7 Jun 2021 at 12:45, Prathamesh Kulkarni
> > <prathamesh.kulkarni@linaro.org> wrote:
> > >
> > > On Mon, 31 May 2021 at 16:01, Prathamesh Kulkarni
> > > <prathamesh.kulkarni@linaro.org> wrote:
> > > >
> > > > On Mon, 31 May 2021 at 15:22, Prathamesh Kulkarni
> > > > <prathamesh.kulkarni@linaro.org> wrote:
> > > > >
> > > > > On Wed, 26 May 2021 at 14:07, Marc Glisse <marc.glisse@inria.fr> wrote:
> > > > > >
> > > > > > On Wed, 26 May 2021, Prathamesh Kulkarni via Gcc-patches wrote:
> > > > > >
> > > > > > > The attached patch removes calls to builtins in vmul_n* (a, b) with __a * __b.
> > > > > >
> > > > > > I am not familiar with neon, but are __a and __b unsigned here? Otherwise,
> > > > > > is vmul_n already undefined in case of overflow?
> > > > > Hi Marc,
> > > > > Sorry for late reply, for vmul_n_s*, I think they are signed
> > > > > (int<width>x<width>_t).
> > > > Oops, I meant int<width>x<nelems>_t.
> > > > > I am not sure how should the intrinsic behave in case of signed overflow,
> > > > > but I am assuming it's OK since vmul_s* intrinsics leave it undefined too.
> > > > > Kyrill, is it OK to leave vmul_s* and vmul_n_s* undefined in case of overflow ?
> > > The attached version fixes one fallout I missed earlier.
> > > Is this OK to commit ?
> > ping https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572037.html
> ping * 2 https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572037.html
ping * 3 https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572037.html
Thanks,
Prathamesh
>
> Thanks,
> Prathamesh
> >
> > Thanks,
> > Prathamesh
> > >
> > > Thanks,
> > > Prathamesh
> > > > >
> > > > > Thanks,
> > > > > Prathamesh
> > > > > >
> > > > > > --
> > > > > > Marc Glisse
^ permalink raw reply [flat|nested] 9+ messages in thread
* RE: [ARM] PR66791: Replace calls to builtin in vmul_n (a, b) intrinsics with __a * __b
2021-06-29 7:21 ` Prathamesh Kulkarni
@ 2021-06-30 8:27 ` Kyrylo Tkachov
0 siblings, 0 replies; 9+ messages in thread
From: Kyrylo Tkachov @ 2021-06-30 8:27 UTC (permalink / raw)
To: Prathamesh Kulkarni, gcc Patches
Hi Prathamesh,
> -----Original Message-----
> From: Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
> Sent: 29 June 2021 08:22
> To: gcc Patches <gcc-patches@gcc.gnu.org>
> Cc: Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: Re: [ARM] PR66791: Replace calls to builtin in vmul_n (a, b) intrinsics
> with __a * __b
>
> On Mon, 21 Jun 2021 at 14:04, Prathamesh Kulkarni
> <prathamesh.kulkarni@linaro.org> wrote:
> >
> > On Mon, 14 Jun 2021 at 13:27, Prathamesh Kulkarni
> > <prathamesh.kulkarni@linaro.org> wrote:
> > >
> > > On Mon, 7 Jun 2021 at 12:45, Prathamesh Kulkarni
> > > <prathamesh.kulkarni@linaro.org> wrote:
> > > >
> > > > On Mon, 31 May 2021 at 16:01, Prathamesh Kulkarni
> > > > <prathamesh.kulkarni@linaro.org> wrote:
> > > > >
> > > > > On Mon, 31 May 2021 at 15:22, Prathamesh Kulkarni
> > > > > <prathamesh.kulkarni@linaro.org> wrote:
> > > > > >
> > > > > > On Wed, 26 May 2021 at 14:07, Marc Glisse <marc.glisse@inria.fr>
> wrote:
> > > > > > >
> > > > > > > On Wed, 26 May 2021, Prathamesh Kulkarni via Gcc-patches
> wrote:
> > > > > > >
> > > > > > > > The attached patch removes calls to builtins in vmul_n* (a, b)
> with __a * __b.
> > > > > > >
> > > > > > > I am not familiar with neon, but are __a and __b unsigned here?
> Otherwise,
> > > > > > > is vmul_n already undefined in case of overflow?
> > > > > > Hi Marc,
> > > > > > Sorry for late reply, for vmul_n_s*, I think they are signed
> > > > > > (int<width>x<width>_t).
> > > > > Oops, I meant int<width>x<nelems>_t.
> > > > > > I am not sure how should the intrinsic behave in case of signed
> overflow,
> > > > > > but I am assuming it's OK since vmul_s* intrinsics leave it undefined
> too.
> > > > > > Kyrill, is it OK to leave vmul_s* and vmul_n_s* undefined in case of
> overflow ?
> > > > The attached version fixes one fallout I missed earlier.
> > > > Is this OK to commit ?
> > > ping https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572037.html
> > ping * 2 https://gcc.gnu.org/pipermail/gcc-patches/2021-
> June/572037.html
> ping * 3 https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572037.html
I'm a bit wary of leaving them undefined for signed overflow. I see that aarch64 leaves them too so maybe it's not such a big deal, but I'd like to consider that separately.
Can you please split this into the unsigned and floating-point parts, followed by the signed intrinsics? The unsigned and FP parts are okay, lets' review the signed intrinsics separately.
Thanks,
Kyrill
>
> Thanks,
> Prathamesh
> >
> > Thanks,
> > Prathamesh
> > >
> > > Thanks,
> > > Prathamesh
> > > >
> > > > Thanks,
> > > > Prathamesh
> > > > > >
> > > > > > Thanks,
> > > > > > Prathamesh
> > > > > > >
> > > > > > > --
> > > > > > > Marc Glisse
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2021-06-30 8:27 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-05-26 7:11 [ARM] PR66791: Replace calls to builtin in vmul_n (a, b) intrinsics with __a * __b Prathamesh Kulkarni
2021-05-26 8:37 ` Marc Glisse
2021-05-31 9:52 ` Prathamesh Kulkarni
2021-05-31 10:31 ` Prathamesh Kulkarni
2021-06-07 7:15 ` Prathamesh Kulkarni
2021-06-14 7:57 ` Prathamesh Kulkarni
2021-06-21 8:34 ` Prathamesh Kulkarni
2021-06-29 7:21 ` Prathamesh Kulkarni
2021-06-30 8:27 ` Kyrylo Tkachov
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).