public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [ARM] PR66791: Replace calls to builtin in vmul_n (a, b) intrinsics with __a * __b
@ 2021-05-26  7:11 Prathamesh Kulkarni
  2021-05-26  8:37 ` Marc Glisse
  0 siblings, 1 reply; 9+ messages in thread
From: Prathamesh Kulkarni @ 2021-05-26  7:11 UTC (permalink / raw)
  To: Kyrill Tkachov, gcc Patches

[-- Attachment #1: Type: text/plain, Size: 2060 bytes --]

Hi,
The attached patch removes calls to builtins in vmul_n* (a, b) with __a * __b.
For float variants, it gates multiplication on __FAST_MATH__.
Since we are not removing all calls to builtins, I am not sure if we
should remove
entry for vmul_n from arm_neon_builtins.def ?

Testing the patch showed fallout for armv8_2-fp16-neon-2.c, because the patch
generates better code.
Code-gen diff:

--- armv8_2-fp16-neon-2.s       2021-05-26 11:34:30.870304900 +0530
+++ armv8_2-fp16-neon-2-after.s 2021-05-26 11:19:13.990304900 +0530
@@ -84,21 +84,9 @@
 test_vmul_n_16x4:
-       @ args = 0, pretend = 0, frame = 8
+       @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
-       sub     sp, sp, #8
-       vldr    d7, .L8
-       add     r3, sp, #6
-       vst1.16 {d1[0]}, [r3]
-       vld1.16 {d7[0]}, [r3]
-       vmul.f16        d0, d0, d7[0]
-       add     sp, sp, #8
-       @ sp needed
+       vmov.f16        r3, s2  @ __fp16
+       vdup.16 d16, r3
+       vmul.f16        d0, d16, d0
        bx      lr
-.L9:
-       .align  3
-.L8:
-       .short  0
-       .short  0
-       .short  0
-       .short  0
        .size   test_vmul_n_16x4, .-test_vmul_n_16x4
@@ -113,21 +101,9 @@
 test_vmul_n_16x8:
-       @ args = 0, pretend = 0, frame = 8
+       @ args = 0, pretend = 0, frame = 0
        @ frame_needed = 0, uses_anonymous_args = 0
        @ link register save eliminated.
-       sub     sp, sp, #8
-       vldr    d7, .L12
-       add     r3, sp, #6
-       vst1.16 {d2[0]}, [r3]
-       vld1.16 {d7[0]}, [r3]
-       vmul.f16        q0, q0, d7[0]
-       add     sp, sp, #8
-       @ sp needed
+       vmov.f16        r3, s4  @ __fp16
+       vdup.16 q8, r3
+       vmul.f16        q0, q8, q0
        bx      lr
-.L13:
-       .align  3
-.L12:
-       .short  0
-       .short  0
-       .short  0
-       .short  0
        .size   test_vmul_n_16x8, .-test_vmul_n_16x8

Adjusted the test, to fix the failing tests.
OK to commit if testing passes ?

Thanks,
Prathamesh

[-- Attachment #2: vmul_n-4.txt --]
[-- Type: text/plain, Size: 5723 bytes --]

2021-26-05  Prathamesh Kulkarni  <prathamesh.kulkarni@linaro.org>

	PR target/66791
	* config/arm/arm_neon.h (vmul_n_s16): Replace call to builtin with __a * __b.
	(vmul_n_s32): Likewise.
	(vmul_n_u16): Likewise.
	(vmul_n_u32): Likewise.
	(vmulq_n_s16): Likewise.
	(vmulq_n_s32): Likewise.
	(vmulq_n_u16): Likewise.
	(vmulq_n_u32): Likewise.
	(vmul_n_f32): Gate __a * __b conditionally on __FAST_MATH__.
	(vmulq_n_f32): Likewise.
	(vmul_n_f16): Likewise.
	(vmulq_n_f16): Likewise.

testsuite/
	* gcc.target/arm/armv8_2-fp16-neon-2.c: Adjust.

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index dcd533fd003..8ac00774e6c 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -8331,70 +8331,78 @@ __extension__ extern __inline int16x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmul_n_s16 (int16x4_t __a, int16_t __b)
 {
-  return (int16x4_t)__builtin_neon_vmul_nv4hi (__a, (__builtin_neon_hi) __b);
+  return __a * __b;
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmul_n_s32 (int32x2_t __a, int32_t __b)
 {
-  return (int32x2_t)__builtin_neon_vmul_nv2si (__a, (__builtin_neon_si) __b);
+  return __a * __b;
 }
 
 __extension__ extern __inline float32x2_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmul_n_f32 (float32x2_t __a, float32_t __b)
 {
+#ifdef __FAST_MATH__
+  return __a * __b;
+#else
   return (float32x2_t)__builtin_neon_vmul_nv2sf (__a, (__builtin_neon_sf) __b);
+#endif
 }
 
 __extension__ extern __inline uint16x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmul_n_u16 (uint16x4_t __a, uint16_t __b)
 {
-  return (uint16x4_t)__builtin_neon_vmul_nv4hi ((int16x4_t) __a, (__builtin_neon_hi) __b);
+  return __a * __b;
 }
 
 __extension__ extern __inline uint32x2_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmul_n_u32 (uint32x2_t __a, uint32_t __b)
 {
-  return (uint32x2_t)__builtin_neon_vmul_nv2si ((int32x2_t) __a, (__builtin_neon_si) __b);
+  return __a * __b;
 }
 
 __extension__ extern __inline int16x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmulq_n_s16 (int16x8_t __a, int16_t __b)
 {
-  return (int16x8_t)__builtin_neon_vmul_nv8hi (__a, (__builtin_neon_hi) __b);
+  return __a * __b;
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmulq_n_s32 (int32x4_t __a, int32_t __b)
 {
-  return (int32x4_t)__builtin_neon_vmul_nv4si (__a, (__builtin_neon_si) __b);
+  return __a * __b;
 }
 
 __extension__ extern __inline float32x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmulq_n_f32 (float32x4_t __a, float32_t __b)
 {
+#ifdef __FAST_MATH__
+  return __a * __b;
+#else
   return (float32x4_t)__builtin_neon_vmul_nv4sf (__a, (__builtin_neon_sf) __b);
+#endif
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
 {
-  return (uint16x8_t)__builtin_neon_vmul_nv8hi ((int16x8_t) __a, (__builtin_neon_hi) __b);
+  return __a * __b;
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
 {
-  return (uint32x4_t)__builtin_neon_vmul_nv4si ((int32x4_t) __a, (__builtin_neon_si) __b);
+  return __a * __b;
 }
 
 __extension__ extern __inline int32x4_t
@@ -17661,7 +17669,11 @@ __extension__ extern __inline float16x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmul_n_f16 (float16x4_t __a, float16_t __b)
 {
+#ifdef __FAST_MATH__
+  return __a * __b;
+#else
   return __builtin_neon_vmul_nv4hf (__a, __b);
+#endif
 }
 
 __extension__ extern __inline float16x8_t
@@ -17686,7 +17698,11 @@ __extension__ extern __inline float16x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmulq_n_f16 (float16x8_t __a, float16_t __b)
 {
+#ifdef __FAST_MATH__
+  return __a * __b;
+#else
   return __builtin_neon_vmul_nv8hf (__a, __b);
+#endif
 }
 
 __extension__ extern __inline float16x4_t
diff --git a/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c
index 50f689352ca..2d26bc0ac26 100644
--- a/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c
+++ b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c
@@ -327,13 +327,13 @@ BINOP_TEST (vminnm)
 
 BINOP_TEST (vmul)
 /* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 3 } }
-   { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } }  */
+   { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } }  */
 BINOP_LANE_TEST (vmul, 2)
 /* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+\[2\]} 1 } }
    { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, d[0-9]+\[2\]} 1 } }  */
 BINOP_N_TEST (vmul)
-/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+\[0\]} 1 } }
-   { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, d[0-9]+\[0\]} 1 } }*/
+/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 3 } }
+   { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } }*/
 
 float16x4_t
 test_vpadd_16x4 (float16x4_t a, float16x4_t b)
@@ -387,7 +387,7 @@ test_vdup_n_f16 (float16_t a)
 {
   return vdup_n_f16 (a);
 }
-/* { dg-final { scan-assembler-times {vdup\.16\td[0-9]+, r[0-9]+} 2 } }  */
+/* { dg-final { scan-assembler-times {vdup\.16\td[0-9]+, r[0-9]+} 3 } }  */
 
 float16x8_t
 test_vmovq_n_f16 (float16_t a)

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [ARM] PR66791: Replace calls to builtin in vmul_n (a, b) intrinsics with __a * __b
  2021-05-26  7:11 [ARM] PR66791: Replace calls to builtin in vmul_n (a, b) intrinsics with __a * __b Prathamesh Kulkarni
@ 2021-05-26  8:37 ` Marc Glisse
  2021-05-31  9:52   ` Prathamesh Kulkarni
  0 siblings, 1 reply; 9+ messages in thread
From: Marc Glisse @ 2021-05-26  8:37 UTC (permalink / raw)
  To: Prathamesh Kulkarni; +Cc: Kyrill Tkachov, gcc Patches

On Wed, 26 May 2021, Prathamesh Kulkarni via Gcc-patches wrote:

> The attached patch removes calls to builtins in vmul_n* (a, b) with __a * __b.

I am not familiar with neon, but are __a and __b unsigned here? Otherwise, 
is vmul_n already undefined in case of overflow?

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [ARM] PR66791: Replace calls to builtin in vmul_n (a, b) intrinsics with __a * __b
  2021-05-26  8:37 ` Marc Glisse
@ 2021-05-31  9:52   ` Prathamesh Kulkarni
  2021-05-31 10:31     ` Prathamesh Kulkarni
  0 siblings, 1 reply; 9+ messages in thread
From: Prathamesh Kulkarni @ 2021-05-31  9:52 UTC (permalink / raw)
  To: gcc Patches

On Wed, 26 May 2021 at 14:07, Marc Glisse <marc.glisse@inria.fr> wrote:
>
> On Wed, 26 May 2021, Prathamesh Kulkarni via Gcc-patches wrote:
>
> > The attached patch removes calls to builtins in vmul_n* (a, b) with __a * __b.
>
> I am not familiar with neon, but are __a and __b unsigned here? Otherwise,
> is vmul_n already undefined in case of overflow?
Hi Marc,
Sorry for late reply, for vmul_n_s*, I think they are signed
(int<width>x<width>_t).
I am not sure how should the intrinsic behave in case of signed overflow,
but I am assuming it's OK since vmul_s* intrinsics leave it undefined too.
Kyrill, is it OK to leave vmul_s* and vmul_n_s* undefined in case of overflow ?

Thanks,
Prathamesh
>
> --
> Marc Glisse

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [ARM] PR66791: Replace calls to builtin in vmul_n (a, b) intrinsics with __a * __b
  2021-05-31  9:52   ` Prathamesh Kulkarni
@ 2021-05-31 10:31     ` Prathamesh Kulkarni
  2021-06-07  7:15       ` Prathamesh Kulkarni
  0 siblings, 1 reply; 9+ messages in thread
From: Prathamesh Kulkarni @ 2021-05-31 10:31 UTC (permalink / raw)
  To: gcc Patches

On Mon, 31 May 2021 at 15:22, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
>
> On Wed, 26 May 2021 at 14:07, Marc Glisse <marc.glisse@inria.fr> wrote:
> >
> > On Wed, 26 May 2021, Prathamesh Kulkarni via Gcc-patches wrote:
> >
> > > The attached patch removes calls to builtins in vmul_n* (a, b) with __a * __b.
> >
> > I am not familiar with neon, but are __a and __b unsigned here? Otherwise,
> > is vmul_n already undefined in case of overflow?
> Hi Marc,
> Sorry for late reply, for vmul_n_s*, I think they are signed
> (int<width>x<width>_t).
Oops, I meant int<width>x<nelems>_t.
> I am not sure how should the intrinsic behave in case of signed overflow,
> but I am assuming it's OK since vmul_s* intrinsics leave it undefined too.
> Kyrill, is it OK to leave vmul_s* and vmul_n_s* undefined in case of overflow ?
>
> Thanks,
> Prathamesh
> >
> > --
> > Marc Glisse

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [ARM] PR66791: Replace calls to builtin in vmul_n (a, b) intrinsics with __a * __b
  2021-05-31 10:31     ` Prathamesh Kulkarni
@ 2021-06-07  7:15       ` Prathamesh Kulkarni
  2021-06-14  7:57         ` Prathamesh Kulkarni
  0 siblings, 1 reply; 9+ messages in thread
From: Prathamesh Kulkarni @ 2021-06-07  7:15 UTC (permalink / raw)
  To: gcc Patches

[-- Attachment #1: Type: text/plain, Size: 1127 bytes --]

On Mon, 31 May 2021 at 16:01, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
>
> On Mon, 31 May 2021 at 15:22, Prathamesh Kulkarni
> <prathamesh.kulkarni@linaro.org> wrote:
> >
> > On Wed, 26 May 2021 at 14:07, Marc Glisse <marc.glisse@inria.fr> wrote:
> > >
> > > On Wed, 26 May 2021, Prathamesh Kulkarni via Gcc-patches wrote:
> > >
> > > > The attached patch removes calls to builtins in vmul_n* (a, b) with __a * __b.
> > >
> > > I am not familiar with neon, but are __a and __b unsigned here? Otherwise,
> > > is vmul_n already undefined in case of overflow?
> > Hi Marc,
> > Sorry for late reply, for vmul_n_s*, I think they are signed
> > (int<width>x<width>_t).
> Oops, I meant int<width>x<nelems>_t.
> > I am not sure how should the intrinsic behave in case of signed overflow,
> > but I am assuming it's OK since vmul_s* intrinsics leave it undefined too.
> > Kyrill, is it OK to leave vmul_s* and vmul_n_s* undefined in case of overflow ?
The attached version fixes one fallout I missed earlier.
Is this OK to commit ?

Thanks,
Prathamesh
> >
> > Thanks,
> > Prathamesh
> > >
> > > --
> > > Marc Glisse

[-- Attachment #2: vmul_n-5.txt --]
[-- Type: text/plain, Size: 6111 bytes --]

2021-06-07  Prathamesh Kulkarni  <prathamesh.kulkarni@linaro.org>

        PR target/66791
        * config/arm/arm_neon.h (vmul_n_s16): Replace call to builtin
	with __a * __b.
        (vmul_n_s32): Likewise.
        (vmul_n_u16): Likewise.
        (vmul_n_u32): Likewise.
        (vmulq_n_s16): Likewise.
        (vmulq_n_s32): Likewise.
        (vmulq_n_u16): Likewise.
        (vmulq_n_u32): Likewise.
        (vmul_n_f32): Gate __a * __b conditionally on __FAST_MATH__.
        (vmulq_n_f32): Likewise.
        (vmul_n_f16): Likewise.
        (vmulq_n_f16): Likewise.

testsuite/
        * gcc.target/arm/armv8_2-fp16-neon-2.c: Adjust.

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index dcd533fd003..8ac00774e6c 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -8331,70 +8331,78 @@ __extension__ extern __inline int16x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmul_n_s16 (int16x4_t __a, int16_t __b)
 {
-  return (int16x4_t)__builtin_neon_vmul_nv4hi (__a, (__builtin_neon_hi) __b);
+  return __a * __b;
 }
 
 __extension__ extern __inline int32x2_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmul_n_s32 (int32x2_t __a, int32_t __b)
 {
-  return (int32x2_t)__builtin_neon_vmul_nv2si (__a, (__builtin_neon_si) __b);
+  return __a * __b;
 }
 
 __extension__ extern __inline float32x2_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmul_n_f32 (float32x2_t __a, float32_t __b)
 {
+#ifdef __FAST_MATH__
+  return __a * __b;
+#else
   return (float32x2_t)__builtin_neon_vmul_nv2sf (__a, (__builtin_neon_sf) __b);
+#endif
 }
 
 __extension__ extern __inline uint16x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmul_n_u16 (uint16x4_t __a, uint16_t __b)
 {
-  return (uint16x4_t)__builtin_neon_vmul_nv4hi ((int16x4_t) __a, (__builtin_neon_hi) __b);
+  return __a * __b;
 }
 
 __extension__ extern __inline uint32x2_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmul_n_u32 (uint32x2_t __a, uint32_t __b)
 {
-  return (uint32x2_t)__builtin_neon_vmul_nv2si ((int32x2_t) __a, (__builtin_neon_si) __b);
+  return __a * __b;
 }
 
 __extension__ extern __inline int16x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmulq_n_s16 (int16x8_t __a, int16_t __b)
 {
-  return (int16x8_t)__builtin_neon_vmul_nv8hi (__a, (__builtin_neon_hi) __b);
+  return __a * __b;
 }
 
 __extension__ extern __inline int32x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmulq_n_s32 (int32x4_t __a, int32_t __b)
 {
-  return (int32x4_t)__builtin_neon_vmul_nv4si (__a, (__builtin_neon_si) __b);
+  return __a * __b;
 }
 
 __extension__ extern __inline float32x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmulq_n_f32 (float32x4_t __a, float32_t __b)
 {
+#ifdef __FAST_MATH__
+  return __a * __b;
+#else
   return (float32x4_t)__builtin_neon_vmul_nv4sf (__a, (__builtin_neon_sf) __b);
+#endif
 }
 
 __extension__ extern __inline uint16x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
 {
-  return (uint16x8_t)__builtin_neon_vmul_nv8hi ((int16x8_t) __a, (__builtin_neon_hi) __b);
+  return __a * __b;
 }
 
 __extension__ extern __inline uint32x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
 {
-  return (uint32x4_t)__builtin_neon_vmul_nv4si ((int32x4_t) __a, (__builtin_neon_si) __b);
+  return __a * __b;
 }
 
 __extension__ extern __inline int32x4_t
@@ -17661,7 +17669,11 @@ __extension__ extern __inline float16x4_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmul_n_f16 (float16x4_t __a, float16_t __b)
 {
+#ifdef __FAST_MATH__
+  return __a * __b;
+#else
   return __builtin_neon_vmul_nv4hf (__a, __b);
+#endif
 }
 
 __extension__ extern __inline float16x8_t
@@ -17686,7 +17698,11 @@ __extension__ extern __inline float16x8_t
 __attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
 vmulq_n_f16 (float16x8_t __a, float16_t __b)
 {
+#ifdef __FAST_MATH__
+  return __a * __b;
+#else
   return __builtin_neon_vmul_nv8hf (__a, __b);
+#endif
 }
 
 __extension__ extern __inline float16x4_t
diff --git a/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c
index 50f689352ca..6808576ce59 100644
--- a/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c
+++ b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-neon-2.c
@@ -327,13 +327,13 @@ BINOP_TEST (vminnm)
 
 BINOP_TEST (vmul)
 /* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 3 } }
-   { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 1 } }  */
+   { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } }  */
 BINOP_LANE_TEST (vmul, 2)
 /* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+\[2\]} 1 } }
    { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, d[0-9]+\[2\]} 1 } }  */
 BINOP_N_TEST (vmul)
-/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+\[0\]} 1 } }
-   { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, d[0-9]+\[0\]} 1 } }*/
+/* { dg-final { scan-assembler-times {vmul\.f16\td[0-9]+, d[0-9]+, d[0-9]+} 3 } }
+   { dg-final { scan-assembler-times {vmul\.f16\tq[0-9]+, q[0-9]+, q[0-9]+} 2 } }*/
 
 float16x4_t
 test_vpadd_16x4 (float16x4_t a, float16x4_t b)
@@ -387,7 +387,7 @@ test_vdup_n_f16 (float16_t a)
 {
   return vdup_n_f16 (a);
 }
-/* { dg-final { scan-assembler-times {vdup\.16\td[0-9]+, r[0-9]+} 2 } }  */
+/* { dg-final { scan-assembler-times {vdup\.16\td[0-9]+, r[0-9]+} 3 } }  */
 
 float16x8_t
 test_vmovq_n_f16 (float16_t a)
@@ -400,7 +400,7 @@ test_vdupq_n_f16 (float16_t a)
 {
   return vdupq_n_f16 (a);
 }
-/* { dg-final { scan-assembler-times {vdup\.16\tq[0-9]+, r[0-9]+} 2 } }  */
+/* { dg-final { scan-assembler-times {vdup\.16\tq[0-9]+, r[0-9]+} 3 } }  */
 
 float16x4_t
 test_vdup_lane_f16 (float16x4_t a)

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [ARM] PR66791: Replace calls to builtin in vmul_n (a, b) intrinsics with __a * __b
  2021-06-07  7:15       ` Prathamesh Kulkarni
@ 2021-06-14  7:57         ` Prathamesh Kulkarni
  2021-06-21  8:34           ` Prathamesh Kulkarni
  0 siblings, 1 reply; 9+ messages in thread
From: Prathamesh Kulkarni @ 2021-06-14  7:57 UTC (permalink / raw)
  To: gcc Patches

On Mon, 7 Jun 2021 at 12:45, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
>
> On Mon, 31 May 2021 at 16:01, Prathamesh Kulkarni
> <prathamesh.kulkarni@linaro.org> wrote:
> >
> > On Mon, 31 May 2021 at 15:22, Prathamesh Kulkarni
> > <prathamesh.kulkarni@linaro.org> wrote:
> > >
> > > On Wed, 26 May 2021 at 14:07, Marc Glisse <marc.glisse@inria.fr> wrote:
> > > >
> > > > On Wed, 26 May 2021, Prathamesh Kulkarni via Gcc-patches wrote:
> > > >
> > > > > The attached patch removes calls to builtins in vmul_n* (a, b) with __a * __b.
> > > >
> > > > I am not familiar with neon, but are __a and __b unsigned here? Otherwise,
> > > > is vmul_n already undefined in case of overflow?
> > > Hi Marc,
> > > Sorry for late reply, for vmul_n_s*, I think they are signed
> > > (int<width>x<width>_t).
> > Oops, I meant int<width>x<nelems>_t.
> > > I am not sure how should the intrinsic behave in case of signed overflow,
> > > but I am assuming it's OK since vmul_s* intrinsics leave it undefined too.
> > > Kyrill, is it OK to leave vmul_s* and vmul_n_s* undefined in case of overflow ?
> The attached version fixes one fallout I missed earlier.
> Is this OK to commit ?
ping https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572037.html

Thanks,
Prathamesh
>
> Thanks,
> Prathamesh
> > >
> > > Thanks,
> > > Prathamesh
> > > >
> > > > --
> > > > Marc Glisse

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [ARM] PR66791: Replace calls to builtin in vmul_n (a, b) intrinsics with __a * __b
  2021-06-14  7:57         ` Prathamesh Kulkarni
@ 2021-06-21  8:34           ` Prathamesh Kulkarni
  2021-06-29  7:21             ` Prathamesh Kulkarni
  0 siblings, 1 reply; 9+ messages in thread
From: Prathamesh Kulkarni @ 2021-06-21  8:34 UTC (permalink / raw)
  To: gcc Patches

On Mon, 14 Jun 2021 at 13:27, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
>
> On Mon, 7 Jun 2021 at 12:45, Prathamesh Kulkarni
> <prathamesh.kulkarni@linaro.org> wrote:
> >
> > On Mon, 31 May 2021 at 16:01, Prathamesh Kulkarni
> > <prathamesh.kulkarni@linaro.org> wrote:
> > >
> > > On Mon, 31 May 2021 at 15:22, Prathamesh Kulkarni
> > > <prathamesh.kulkarni@linaro.org> wrote:
> > > >
> > > > On Wed, 26 May 2021 at 14:07, Marc Glisse <marc.glisse@inria.fr> wrote:
> > > > >
> > > > > On Wed, 26 May 2021, Prathamesh Kulkarni via Gcc-patches wrote:
> > > > >
> > > > > > The attached patch removes calls to builtins in vmul_n* (a, b) with __a * __b.
> > > > >
> > > > > I am not familiar with neon, but are __a and __b unsigned here? Otherwise,
> > > > > is vmul_n already undefined in case of overflow?
> > > > Hi Marc,
> > > > Sorry for late reply, for vmul_n_s*, I think they are signed
> > > > (int<width>x<width>_t).
> > > Oops, I meant int<width>x<nelems>_t.
> > > > I am not sure how should the intrinsic behave in case of signed overflow,
> > > > but I am assuming it's OK since vmul_s* intrinsics leave it undefined too.
> > > > Kyrill, is it OK to leave vmul_s* and vmul_n_s* undefined in case of overflow ?
> > The attached version fixes one fallout I missed earlier.
> > Is this OK to commit ?
> ping https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572037.html
ping * 2 https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572037.html

Thanks,
Prathamesh
>
> Thanks,
> Prathamesh
> >
> > Thanks,
> > Prathamesh
> > > >
> > > > Thanks,
> > > > Prathamesh
> > > > >
> > > > > --
> > > > > Marc Glisse

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [ARM] PR66791: Replace calls to builtin in vmul_n (a, b) intrinsics with __a * __b
  2021-06-21  8:34           ` Prathamesh Kulkarni
@ 2021-06-29  7:21             ` Prathamesh Kulkarni
  2021-06-30  8:27               ` Kyrylo Tkachov
  0 siblings, 1 reply; 9+ messages in thread
From: Prathamesh Kulkarni @ 2021-06-29  7:21 UTC (permalink / raw)
  To: gcc Patches

On Mon, 21 Jun 2021 at 14:04, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
>
> On Mon, 14 Jun 2021 at 13:27, Prathamesh Kulkarni
> <prathamesh.kulkarni@linaro.org> wrote:
> >
> > On Mon, 7 Jun 2021 at 12:45, Prathamesh Kulkarni
> > <prathamesh.kulkarni@linaro.org> wrote:
> > >
> > > On Mon, 31 May 2021 at 16:01, Prathamesh Kulkarni
> > > <prathamesh.kulkarni@linaro.org> wrote:
> > > >
> > > > On Mon, 31 May 2021 at 15:22, Prathamesh Kulkarni
> > > > <prathamesh.kulkarni@linaro.org> wrote:
> > > > >
> > > > > On Wed, 26 May 2021 at 14:07, Marc Glisse <marc.glisse@inria.fr> wrote:
> > > > > >
> > > > > > On Wed, 26 May 2021, Prathamesh Kulkarni via Gcc-patches wrote:
> > > > > >
> > > > > > > The attached patch removes calls to builtins in vmul_n* (a, b) with __a * __b.
> > > > > >
> > > > > > I am not familiar with neon, but are __a and __b unsigned here? Otherwise,
> > > > > > is vmul_n already undefined in case of overflow?
> > > > > Hi Marc,
> > > > > Sorry for late reply, for vmul_n_s*, I think they are signed
> > > > > (int<width>x<width>_t).
> > > > Oops, I meant int<width>x<nelems>_t.
> > > > > I am not sure how should the intrinsic behave in case of signed overflow,
> > > > > but I am assuming it's OK since vmul_s* intrinsics leave it undefined too.
> > > > > Kyrill, is it OK to leave vmul_s* and vmul_n_s* undefined in case of overflow ?
> > > The attached version fixes one fallout I missed earlier.
> > > Is this OK to commit ?
> > ping https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572037.html
> ping * 2 https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572037.html
ping * 3 https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572037.html

Thanks,
Prathamesh
>
> Thanks,
> Prathamesh
> >
> > Thanks,
> > Prathamesh
> > >
> > > Thanks,
> > > Prathamesh
> > > > >
> > > > > Thanks,
> > > > > Prathamesh
> > > > > >
> > > > > > --
> > > > > > Marc Glisse

^ permalink raw reply	[flat|nested] 9+ messages in thread

* RE: [ARM] PR66791: Replace calls to builtin in vmul_n (a, b) intrinsics with __a * __b
  2021-06-29  7:21             ` Prathamesh Kulkarni
@ 2021-06-30  8:27               ` Kyrylo Tkachov
  0 siblings, 0 replies; 9+ messages in thread
From: Kyrylo Tkachov @ 2021-06-30  8:27 UTC (permalink / raw)
  To: Prathamesh Kulkarni, gcc Patches

Hi Prathamesh,

> -----Original Message-----
> From: Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
> Sent: 29 June 2021 08:22
> To: gcc Patches <gcc-patches@gcc.gnu.org>
> Cc: Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: Re: [ARM] PR66791: Replace calls to builtin in vmul_n (a, b) intrinsics
> with __a * __b
> 
> On Mon, 21 Jun 2021 at 14:04, Prathamesh Kulkarni
> <prathamesh.kulkarni@linaro.org> wrote:
> >
> > On Mon, 14 Jun 2021 at 13:27, Prathamesh Kulkarni
> > <prathamesh.kulkarni@linaro.org> wrote:
> > >
> > > On Mon, 7 Jun 2021 at 12:45, Prathamesh Kulkarni
> > > <prathamesh.kulkarni@linaro.org> wrote:
> > > >
> > > > On Mon, 31 May 2021 at 16:01, Prathamesh Kulkarni
> > > > <prathamesh.kulkarni@linaro.org> wrote:
> > > > >
> > > > > On Mon, 31 May 2021 at 15:22, Prathamesh Kulkarni
> > > > > <prathamesh.kulkarni@linaro.org> wrote:
> > > > > >
> > > > > > On Wed, 26 May 2021 at 14:07, Marc Glisse <marc.glisse@inria.fr>
> wrote:
> > > > > > >
> > > > > > > On Wed, 26 May 2021, Prathamesh Kulkarni via Gcc-patches
> wrote:
> > > > > > >
> > > > > > > > The attached patch removes calls to builtins in vmul_n* (a, b)
> with __a * __b.
> > > > > > >
> > > > > > > I am not familiar with neon, but are __a and __b unsigned here?
> Otherwise,
> > > > > > > is vmul_n already undefined in case of overflow?
> > > > > > Hi Marc,
> > > > > > Sorry for late reply, for vmul_n_s*, I think they are signed
> > > > > > (int<width>x<width>_t).
> > > > > Oops, I meant int<width>x<nelems>_t.
> > > > > > I am not sure how should the intrinsic behave in case of signed
> overflow,
> > > > > > but I am assuming it's OK since vmul_s* intrinsics leave it undefined
> too.
> > > > > > Kyrill, is it OK to leave vmul_s* and vmul_n_s* undefined in case of
> overflow ?
> > > > The attached version fixes one fallout I missed earlier.
> > > > Is this OK to commit ?
> > > ping https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572037.html
> > ping * 2 https://gcc.gnu.org/pipermail/gcc-patches/2021-
> June/572037.html
> ping * 3 https://gcc.gnu.org/pipermail/gcc-patches/2021-June/572037.html

I'm a bit wary of leaving them undefined for signed overflow. I see that aarch64 leaves them too so maybe it's not such a big deal, but I'd like to consider that separately.
Can you please split this into the unsigned and floating-point parts, followed by the signed intrinsics? The unsigned and FP parts are okay, lets' review the signed intrinsics separately.

Thanks,
Kyrill

> 
> Thanks,
> Prathamesh
> >
> > Thanks,
> > Prathamesh
> > >
> > > Thanks,
> > > Prathamesh
> > > >
> > > > Thanks,
> > > > Prathamesh
> > > > > >
> > > > > > Thanks,
> > > > > > Prathamesh
> > > > > > >
> > > > > > > --
> > > > > > > Marc Glisse

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2021-06-30  8:27 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-05-26  7:11 [ARM] PR66791: Replace calls to builtin in vmul_n (a, b) intrinsics with __a * __b Prathamesh Kulkarni
2021-05-26  8:37 ` Marc Glisse
2021-05-31  9:52   ` Prathamesh Kulkarni
2021-05-31 10:31     ` Prathamesh Kulkarni
2021-06-07  7:15       ` Prathamesh Kulkarni
2021-06-14  7:57         ` Prathamesh Kulkarni
2021-06-21  8:34           ` Prathamesh Kulkarni
2021-06-29  7:21             ` Prathamesh Kulkarni
2021-06-30  8:27               ` Kyrylo Tkachov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).