public inbox for gcc-patches@gcc.gnu.org
* [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64 intrinsics
@ 2020-10-15 17:23 Christophe Lyon
  2020-10-15 18:10 ` Andrea Corallo
  2020-11-05  9:36 ` Kyrylo Tkachov
  0 siblings, 2 replies; 11+ messages in thread
From: Christophe Lyon @ 2020-10-15 17:23 UTC (permalink / raw)
  To: gcc-patches

This patch adds implementations for vceqq_p64, vceqz_p64 and
vceqzq_p64 intrinsics.

vceqq_p64 uses the existing vceq_p64 after splitting the input vectors
into their high and low halves.

vceqz and vceqzq simply call vceq and vceqq, respectively, with a
second argument equal to zero.

The added (executable) testcases make sure that the poly64x2_t
variants have results with one element of all zeroes (false) and the
other element with all bits set to one (true).
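
For illustration, a caller could exercise the new vceqq_p64 along these
lines (a minimal sketch, not part of the patch; poly64 types need the
crypto extension on 32-bit Arm, and the input values are arbitrary):

  #include <arm_neon.h>
  #include <stdio.h>

  int
  main (void)
  {
    uint64_t va[2] = { 0x12, 0x34 };
    uint64_t vb[2] = { 0x99, 0x34 };  /* Lane 0 differs, lane 1 matches.  */
    poly64x2_t a = vreinterpretq_p64_u64 (vld1q_u64 (va));
    poly64x2_t b = vreinterpretq_p64_u64 (vld1q_u64 (vb));
    uint64x2_t res = vceqq_p64 (a, b);
    /* Expect 0x0 (false) in lane 0 and all bits set (true) in lane 1.  */
    printf ("%016llx %016llx\n",
            (unsigned long long) vgetq_lane_u64 (res, 0),
            (unsigned long long) vgetq_lane_u64 (res, 1));
    return 0;
  }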

2020-10-15  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/arm_neon.h (vceqz_p64, vceqq_p64, vceqzq_p64): New.

	gcc/testsuite/
	* gcc.target/aarch64/advsimd-intrinsics/p64_p128.c: Add tests for
	vceqz_p64, vceqq_p64 and vceqzq_p64.
---
 gcc/config/arm/arm_neon.h                          | 31 +++++++++++++++
 .../aarch64/advsimd-intrinsics/p64_p128.c          | 46 +++++++++++++++++++++-
 2 files changed, 76 insertions(+), 1 deletion(-)

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index aa21730..f7eff37 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -16912,6 +16912,37 @@ vceq_p64 (poly64x1_t __a, poly64x1_t __b)
   return vreinterpret_u64_u32 (__m);
 }
 
+__extension__ extern __inline uint64x1_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vceqz_p64 (poly64x1_t __a)
+{
+  poly64x1_t __b = vreinterpret_p64_u32 (vdup_n_u32 (0));
+  return vceq_p64 (__a, __b);
+}
+
+/* For vceqq_p64, we rely on vceq_p64 for each of the two elements.  */
+__extension__ extern __inline uint64x2_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vceqq_p64 (poly64x2_t __a, poly64x2_t __b)
+{
+  poly64_t __high_a = vget_high_p64 (__a);
+  poly64_t __high_b = vget_high_p64 (__b);
+  uint64x1_t __high = vceq_p64(__high_a, __high_b);
+
+  poly64_t __low_a = vget_low_p64 (__a);
+  poly64_t __low_b = vget_low_p64 (__b);
+  uint64x1_t __low = vceq_p64(__low_a, __low_b);
+  return vcombine_u64 (__low, __high);
+}
+
+__extension__ extern __inline uint64x2_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vceqzq_p64 (poly64x2_t __a)
+{
+  poly64x2_t __b = vreinterpretq_p64_u32 (vdupq_n_u32 (0));
+  return vceqq_p64 (__a, __b);
+}
+
 /* The vtst_p64 intrinsic does not map to a single instruction.
    We emulate it in way similar to vceq_p64 above but here we do
    a reduction with max since if any two corresponding bits
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
index a3210a9..6aed096 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
@@ -16,6 +16,11 @@ VECT_VAR_DECL(vbsl_expected,poly,64,2) [] = { 0xfffffff1,
 
 /* Expected results: vceq.  */
 VECT_VAR_DECL(vceq_expected,uint,64,1) [] = { 0x0 };
+VECT_VAR_DECL(vceq_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
+
+/* Expected results: vceqz.  */
+VECT_VAR_DECL(vceqz_expected,uint,64,1) [] = { 0x0 };
+VECT_VAR_DECL(vceqz_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
 
 /* Expected results: vcombine.  */
 VECT_VAR_DECL(vcombine_expected,poly,64,2) [] = { 0xfffffffffffffff0, 0x88 };
@@ -213,7 +218,7 @@ int main (void)
 
   /* vceq_p64 tests. */
 #undef TEST_MSG
-#define TEST_MSG "VCEQ"
+#define TEST_MSG "VCEQ/VCEQQ"
 
 #define TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N)				\
   VECT_VAR(vceq_vector_res, T3, W, N) =					\
@@ -227,16 +232,55 @@ int main (void)
   DECL_VARIABLE(vceq_vector, poly, 64, 1);
   DECL_VARIABLE(vceq_vector2, poly, 64, 1);
   DECL_VARIABLE(vceq_vector_res, uint, 64, 1);
+  DECL_VARIABLE(vceq_vector, poly, 64, 2);
+  DECL_VARIABLE(vceq_vector2, poly, 64, 2);
+  DECL_VARIABLE(vceq_vector_res, uint, 64, 2);
 
   CLEAN(result, uint, 64, 1);
+  CLEAN(result, uint, 64, 2);
 
   VLOAD(vceq_vector, buffer, , poly, p, 64, 1);
+  VLOAD(vceq_vector, buffer, q, poly, p, 64, 2);
 
   VDUP(vceq_vector2, , poly, p, 64, 1, 0x88);
+  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 0, 0x88);
+  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 1, 0xFFFFFFFFFFFFFFF1);
 
   TEST_VCOMP(vceq, , poly, p, uint, 64, 1);
+  TEST_VCOMP(vceq, q, poly, p, uint, 64, 2);
 
   CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceq_expected, "");
+  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceq_expected, "");
+
+  /* vceqz_p64 tests. */
+#undef TEST_MSG
+#define TEST_MSG "VCEQZ/VCEQZQ"
+
+#define TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)				\
+  VECT_VAR(vceqz_vector_res, T3, W, N) =				\
+    INSN##Q##_##T2##W(VECT_VAR(vceqz_vector, T1, W, N));		\
+  vst1##Q##_u##W(VECT_VAR(result, T3, W, N), VECT_VAR(vceqz_vector_res, T3, W, N))
+
+#define TEST_VCOMPZ(INSN, Q, T1, T2, T3, W, N)				\
+  TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)
+
+  DECL_VARIABLE(vceqz_vector, poly, 64, 1);
+  DECL_VARIABLE(vceqz_vector_res, uint, 64, 1);
+  DECL_VARIABLE(vceqz_vector, poly, 64, 2);
+  DECL_VARIABLE(vceqz_vector_res, uint, 64, 2);
+
+  CLEAN(result, uint, 64, 1);
+  CLEAN(result, uint, 64, 2);
+
+  VLOAD(vceqz_vector, buffer, , poly, p, 64, 1);
+  VLOAD(vceqz_vector, buffer, q, poly, p, 64, 2);
+  VSET_LANE(vceqz_vector, q, poly, p, 64, 2, 1, 0);
+
+  TEST_VCOMPZ(vceqz, , poly, p, uint, 64, 1);
+  TEST_VCOMPZ(vceqz, q, poly, p, uint, 64, 2);
+
+  CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceqz_expected, "");
+  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceqz_expected, "");
 
   /* vcombine_p64 tests.  */
 #undef TEST_MSG
-- 
2.7.4



* Re: [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64 intrinsics
  2020-10-15 17:23 [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64 intrinsics Christophe Lyon
@ 2020-10-15 18:10 ` Andrea Corallo
  2020-10-16  8:40   ` Christophe Lyon
  2020-10-16  8:41   ` Christophe Lyon
  2020-11-05  9:36 ` Kyrylo Tkachov
  1 sibling, 2 replies; 11+ messages in thread
From: Andrea Corallo @ 2020-10-15 18:10 UTC (permalink / raw)
  To: Christophe Lyon via Gcc-patches

Hi Christophe,

I've spotted two very minor nits.

Christophe Lyon via Gcc-patches <gcc-patches@gcc.gnu.org> writes:

[...]

> +/* For vceqq_p64, we rely on vceq_p64 for each of the two elements.  */
> +__extension__ extern __inline uint64x2_t
> +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> +vceqq_p64 (poly64x2_t __a, poly64x2_t __b)
> +{
> +  poly64_t __high_a = vget_high_p64 (__a);
> +  poly64_t __high_b = vget_high_p64 (__b);
> +  uint64x1_t __high = vceq_p64(__high_a, __high_b);
                                ^^^
                               space
> +
> +  poly64_t __low_a = vget_low_p64 (__a);
> +  poly64_t __low_b = vget_low_p64 (__b);
> +  uint64x1_t __low = vceq_p64(__low_a, __low_b);

Same

> +  return vcombine_u64 (__low, __high);
> +}
> +
> +__extension__ extern __inline uint64x2_t
> +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> +vceqzq_p64 (poly64x2_t __a)
> +{
> +  poly64x2_t __b = vreinterpretq_p64_u32 (vdupq_n_u32 (0));
> +  return vceqq_p64 (__a, __b);
> +}

Thanks

  Andrea


* Re: [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64 intrinsics
  2020-10-15 18:10 ` Andrea Corallo
@ 2020-10-16  8:40   ` Christophe Lyon
  2020-10-16  8:41   ` Christophe Lyon
  1 sibling, 0 replies; 11+ messages in thread
From: Christophe Lyon @ 2020-10-16  8:40 UTC (permalink / raw)
  To: Andrea Corallo; +Cc: Christophe Lyon via Gcc-patches

On Thu, 15 Oct 2020 at 20:10, Andrea Corallo <andrea.corallo@arm.com> wrote:
>
> Hi Christophe,
>
> I've spotted two very minors.
>
> Christophe Lyon via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>
> [...]
>
> > +/* For vceqq_p64, we rely on vceq_p64 for each of the two elements.  */
> > +__extension__ extern __inline uint64x2_t
> > +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> > +vceqq_p64 (poly64x2_t __a, poly64x2_t __b)
> > +{
> > +  poly64_t __high_a = vget_high_p64 (__a);
> > +  poly64_t __high_b = vget_high_p64 (__b);
> > +  uint64x1_t __high = vceq_p64(__high_a, __high_b);
>                                 ^^^
>                                space
> > +
> > +  poly64_t __low_a = vget_low_p64 (__a);
> > +  poly64_t __low_b = vget_low_p64 (__b);
> > +  uint64x1_t __low = vceq_p64(__low_a, __low_b);
>
> Same
>
> > +  return vcombine_u64 (__low, __high);
> > +}
> > +
> > +__extension__ extern __inline uint64x2_t
> > +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> > +vceqzq_p64 (poly64x2_t __a)
> > +{
> > +  poly64x2_t __b = vreinterpretq_p64_u32 (vdupq_n_u32 (0));
> > +  return vceqq_p64 (__a, __b);
> > +}
>
> Thanks
>
>   Andrea


* Re: [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64 intrinsics
  2020-10-15 18:10 ` Andrea Corallo
  2020-10-16  8:40   ` Christophe Lyon
@ 2020-10-16  8:41   ` Christophe Lyon
  2020-10-23 17:20     ` Christophe Lyon
  1 sibling, 1 reply; 11+ messages in thread
From: Christophe Lyon @ 2020-10-16  8:41 UTC (permalink / raw)
  To: Andrea Corallo; +Cc: Christophe Lyon via Gcc-patches

On Thu, 15 Oct 2020 at 20:10, Andrea Corallo <andrea.corallo@arm.com> wrote:
>
> Hi Christophe,
>
> I've spotted two very minors.
>
> Christophe Lyon via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
>
> [...]
>
> > +/* For vceqq_p64, we rely on vceq_p64 for each of the two elements.  */
> > +__extension__ extern __inline uint64x2_t
> > +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> > +vceqq_p64 (poly64x2_t __a, poly64x2_t __b)
> > +{
> > +  poly64_t __high_a = vget_high_p64 (__a);
> > +  poly64_t __high_b = vget_high_p64 (__b);
> > +  uint64x1_t __high = vceq_p64(__high_a, __high_b);
>                                 ^^^
>                                space

Thanks for catching this, I'll fix it before committing if the rest is approved.

Christophe

> > +
> > +  poly64_t __low_a = vget_low_p64 (__a);
> > +  poly64_t __low_b = vget_low_p64 (__b);
> > +  uint64x1_t __low = vceq_p64(__low_a, __low_b);
>
> Same
>
> > +  return vcombine_u64 (__low, __high);
> > +}
> > +
> > +__extension__ extern __inline uint64x2_t
> > +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> > +vceqzq_p64 (poly64x2_t __a)
> > +{
> > +  poly64x2_t __b = vreinterpretq_p64_u32 (vdupq_n_u32 (0));
> > +  return vceqq_p64 (__a, __b);
> > +}
>
> Thanks
>
>   Andrea


* Re: [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64 intrinsics
  2020-10-16  8:41   ` Christophe Lyon
@ 2020-10-23 17:20     ` Christophe Lyon
  2020-11-04 20:17       ` Christophe Lyon
  0 siblings, 1 reply; 11+ messages in thread
From: Christophe Lyon @ 2020-10-23 17:20 UTC (permalink / raw)
  To: Andrea Corallo; +Cc: Christophe Lyon via Gcc-patches

ping?

On Fri, 16 Oct 2020 at 10:41, Christophe Lyon
<christophe.lyon@linaro.org> wrote:
>
> On Thu, 15 Oct 2020 at 20:10, Andrea Corallo <andrea.corallo@arm.com> wrote:
> >
> > Hi Christophe,
> >
> > I've spotted two very minors.
> >
> > Christophe Lyon via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> >
> > [...]
> >
> > > +/* For vceqq_p64, we rely on vceq_p64 for each of the two elements.  */
> > > +__extension__ extern __inline uint64x2_t
> > > +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> > > +vceqq_p64 (poly64x2_t __a, poly64x2_t __b)
> > > +{
> > > +  poly64_t __high_a = vget_high_p64 (__a);
> > > +  poly64_t __high_b = vget_high_p64 (__b);
> > > +  uint64x1_t __high = vceq_p64(__high_a, __high_b);
> >                                 ^^^
> >                                space
>
> Thanks for catching this, I'll fix it before committing if the rest is approved.
>
> Christophe
>
> > > +
> > > +  poly64_t __low_a = vget_low_p64 (__a);
> > > +  poly64_t __low_b = vget_low_p64 (__b);
> > > +  uint64x1_t __low = vceq_p64(__low_a, __low_b);
> >
> > Same
> >
> > > +  return vcombine_u64 (__low, __high);
> > > +}
> > > +
> > > +__extension__ extern __inline uint64x2_t
> > > +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> > > +vceqzq_p64 (poly64x2_t __a)
> > > +{
> > > +  poly64x2_t __b = vreinterpretq_p64_u32 (vdupq_n_u32 (0));
> > > +  return vceqq_p64 (__a, __b);
> > > +}
> >
> > Thanks
> >
> >   Andrea


* Re: [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64 intrinsics
  2020-10-23 17:20     ` Christophe Lyon
@ 2020-11-04 20:17       ` Christophe Lyon
  0 siblings, 0 replies; 11+ messages in thread
From: Christophe Lyon @ 2020-11-04 20:17 UTC (permalink / raw)
  To: Andrea Corallo; +Cc: Christophe Lyon via Gcc-patches

ping?
https://gcc.gnu.org/pipermail/gcc-patches/2020-October/556299.html

On Fri, 23 Oct 2020 at 19:20, Christophe Lyon
<christophe.lyon@linaro.org> wrote:
>
> ping?
>
> On Fri, 16 Oct 2020 at 10:41, Christophe Lyon
> <christophe.lyon@linaro.org> wrote:
> >
> > On Thu, 15 Oct 2020 at 20:10, Andrea Corallo <andrea.corallo@arm.com> wrote:
> > >
> > > Hi Christophe,
> > >
> > > I've spotted two very minors.
> > >
> > > Christophe Lyon via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> > >
> > > [...]
> > >
> > > > +/* For vceqq_p64, we rely on vceq_p64 for each of the two elements.  */
> > > > +__extension__ extern __inline uint64x2_t
> > > > +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> > > > +vceqq_p64 (poly64x2_t __a, poly64x2_t __b)
> > > > +{
> > > > +  poly64_t __high_a = vget_high_p64 (__a);
> > > > +  poly64_t __high_b = vget_high_p64 (__b);
> > > > +  uint64x1_t __high = vceq_p64(__high_a, __high_b);
> > >                                 ^^^
> > >                                space
> >
> > Thanks for catching this, I'll fix it before committing if the rest is approved.
> >
> > Christophe
> >
> > > > +
> > > > +  poly64_t __low_a = vget_low_p64 (__a);
> > > > +  poly64_t __low_b = vget_low_p64 (__b);
> > > > +  uint64x1_t __low = vceq_p64(__low_a, __low_b);
> > >
> > > Same
> > >
> > > > +  return vcombine_u64 (__low, __high);
> > > > +}
> > > > +
> > > > +__extension__ extern __inline uint64x2_t
> > > > +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> > > > +vceqzq_p64 (poly64x2_t __a)
> > > > +{
> > > > +  poly64x2_t __b = vreinterpretq_p64_u32 (vdupq_n_u32 (0));
> > > > +  return vceqq_p64 (__a, __b);
> > > > +}
> > >
> > > Thanks
> > >
> > >   Andrea


* RE: [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64 intrinsics
  2020-10-15 17:23 [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64 intrinsics Christophe Lyon
  2020-10-15 18:10 ` Andrea Corallo
@ 2020-11-05  9:36 ` Kyrylo Tkachov
  2020-11-05 11:55   ` Christophe Lyon
  1 sibling, 1 reply; 11+ messages in thread
From: Kyrylo Tkachov @ 2020-11-05  9:36 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc-patches

Hi, Christophe,

> -----Original Message-----
> From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of
> Christophe Lyon via Gcc-patches
> Sent: 15 October 2020 18:23
> To: gcc-patches@gcc.gnu.org
> Subject: [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64
> intrinsics
> 
> This patch adds implementations for vceqq_p64, vceqz_p64 and
> vceqzq_p64 intrinsics.
> 
> vceqq_p64 uses the existing vceq_p64 after splitting the input vectors
> into their high and low halves.
> 
> vceqz[q] simply call the vceq and vceqq with a second argument equal
> to zero.
> 
> The added (executable) testcases make sure that the poly64x2_t
> variants have results with one element of all zeroes (false) and the
> other element with all bits set to one (true).
> 
> 2020-10-15  Christophe Lyon  <christophe.lyon@linaro.org>
> 
> 	gcc/
> 	* config/arm/arm_neon.h (vceqz_p64, vceqq_p64, vceqzq_p64):
> New.
> 
> 	gcc/testsuite/
> 	* gcc.target/aarch64/advsimd-intrinsics/p64_p128.c: Add tests for
> 	vceqz_p64, vceqq_p64 and vceqzq_p64.
> ---
>  gcc/config/arm/arm_neon.h                          | 31 +++++++++++++++
>  .../aarch64/advsimd-intrinsics/p64_p128.c          | 46
> +++++++++++++++++++++-
>  2 files changed, 76 insertions(+), 1 deletion(-)
> 
> diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
> index aa21730..f7eff37 100644
> --- a/gcc/config/arm/arm_neon.h
> +++ b/gcc/config/arm/arm_neon.h
> @@ -16912,6 +16912,37 @@ vceq_p64 (poly64x1_t __a, poly64x1_t __b)
>    return vreinterpret_u64_u32 (__m);
>  }
> 
> +__extension__ extern __inline uint64x1_t
> +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> +vceqz_p64 (poly64x1_t __a)
> +{
> +  poly64x1_t __b = vreinterpret_p64_u32 (vdup_n_u32 (0));
> +  return vceq_p64 (__a, __b);
> +}

This approach is okay, but can we have some kind of test to confirm it generates the VCEQ instruction with immediate zero rather than having a separate DUP...
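
Something along these lines, perhaps (a sketch only; the dg directives
and the regex are assumptions modelled on the existing arm/simd tests,
and the real tests appear in the v2 patch later in this thread):

  /* { dg-do compile } */
  /* { dg-options "-O2" } */
  /* { dg-add-options arm_neon } */

  #include <arm_neon.h>

  uint64x1_t f (poly64x1_t a)
  {
    return vceqz_p64 (a);
  }

  /* Expect vceq with immediate zero, not a separate vdup/vmov of #0.  */
  /* { dg-final { scan-assembler "vceq\.i32\[^\n\]*#0" } } */
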
Thanks,
Kyrill

> +
> +/* For vceqq_p64, we rely on vceq_p64 for each of the two elements.  */
> +__extension__ extern __inline uint64x2_t
> +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> +vceqq_p64 (poly64x2_t __a, poly64x2_t __b)
> +{
> +  poly64_t __high_a = vget_high_p64 (__a);
> +  poly64_t __high_b = vget_high_p64 (__b);
> +  uint64x1_t __high = vceq_p64(__high_a, __high_b);
> +
> +  poly64_t __low_a = vget_low_p64 (__a);
> +  poly64_t __low_b = vget_low_p64 (__b);
> +  uint64x1_t __low = vceq_p64(__low_a, __low_b);
> +  return vcombine_u64 (__low, __high);
> +}
> +
> +__extension__ extern __inline uint64x2_t
> +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> +vceqzq_p64 (poly64x2_t __a)
> +{
> +  poly64x2_t __b = vreinterpretq_p64_u32 (vdupq_n_u32 (0));
> +  return vceqq_p64 (__a, __b);
> +}
> +
>  /* The vtst_p64 intrinsic does not map to a single instruction.
>     We emulate it in way similar to vceq_p64 above but here we do
>     a reduction with max since if any two corresponding bits
> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> index a3210a9..6aed096 100644
> --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> @@ -16,6 +16,11 @@ VECT_VAR_DECL(vbsl_expected,poly,64,2) [] =
> { 0xfffffff1,
> 
>  /* Expected results: vceq.  */
>  VECT_VAR_DECL(vceq_expected,uint,64,1) [] = { 0x0 };
> +VECT_VAR_DECL(vceq_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
> +
> +/* Expected results: vceqz.  */
> +VECT_VAR_DECL(vceqz_expected,uint,64,1) [] = { 0x0 };
> +VECT_VAR_DECL(vceqz_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
> 
>  /* Expected results: vcombine.  */
>  VECT_VAR_DECL(vcombine_expected,poly,64,2) [] = { 0xfffffffffffffff0,
> 0x88 };
> @@ -213,7 +218,7 @@ int main (void)
> 
>    /* vceq_p64 tests. */
>  #undef TEST_MSG
> -#define TEST_MSG "VCEQ"
> +#define TEST_MSG "VCEQ/VCEQQ"
> 
>  #define TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N)
> 	\
>    VECT_VAR(vceq_vector_res, T3, W, N) =
> 	\
> @@ -227,16 +232,55 @@ int main (void)
>    DECL_VARIABLE(vceq_vector, poly, 64, 1);
>    DECL_VARIABLE(vceq_vector2, poly, 64, 1);
>    DECL_VARIABLE(vceq_vector_res, uint, 64, 1);
> +  DECL_VARIABLE(vceq_vector, poly, 64, 2);
> +  DECL_VARIABLE(vceq_vector2, poly, 64, 2);
> +  DECL_VARIABLE(vceq_vector_res, uint, 64, 2);
> 
>    CLEAN(result, uint, 64, 1);
> +  CLEAN(result, uint, 64, 2);
> 
>    VLOAD(vceq_vector, buffer, , poly, p, 64, 1);
> +  VLOAD(vceq_vector, buffer, q, poly, p, 64, 2);
> 
>    VDUP(vceq_vector2, , poly, p, 64, 1, 0x88);
> +  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 0, 0x88);
> +  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 1, 0xFFFFFFFFFFFFFFF1);
> 
>    TEST_VCOMP(vceq, , poly, p, uint, 64, 1);
> +  TEST_VCOMP(vceq, q, poly, p, uint, 64, 2);
> 
>    CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceq_expected, "");
> +  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceq_expected, "");
> +
> +  /* vceqz_p64 tests. */
> +#undef TEST_MSG
> +#define TEST_MSG "VCEQZ/VCEQZQ"
> +
> +#define TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)
> 	\
> +  VECT_VAR(vceqz_vector_res, T3, W, N) =				\
> +    INSN##Q##_##T2##W(VECT_VAR(vceqz_vector, T1, W, N));		\
> +  vst1##Q##_u##W(VECT_VAR(result, T3, W, N),
> VECT_VAR(vceqz_vector_res, T3, W, N))
> +
> +#define TEST_VCOMPZ(INSN, Q, T1, T2, T3, W, N)
> 	\
> +  TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)
> +
> +  DECL_VARIABLE(vceqz_vector, poly, 64, 1);
> +  DECL_VARIABLE(vceqz_vector_res, uint, 64, 1);
> +  DECL_VARIABLE(vceqz_vector, poly, 64, 2);
> +  DECL_VARIABLE(vceqz_vector_res, uint, 64, 2);
> +
> +  CLEAN(result, uint, 64, 1);
> +  CLEAN(result, uint, 64, 2);
> +
> +  VLOAD(vceqz_vector, buffer, , poly, p, 64, 1);
> +  VLOAD(vceqz_vector, buffer, q, poly, p, 64, 2);
> +  VSET_LANE(vceqz_vector, q, poly, p, 64, 2, 1, 0);
> +
> +  TEST_VCOMPZ(vceqz, , poly, p, uint, 64, 1);
> +  TEST_VCOMPZ(vceqz, q, poly, p, uint, 64, 2);
> +
> +  CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceqz_expected, "");
> +  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceqz_expected, "");
> 
>    /* vcombine_p64 tests.  */
>  #undef TEST_MSG
> --
> 2.7.4



* Re: [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64 intrinsics
  2020-11-05  9:36 ` Kyrylo Tkachov
@ 2020-11-05 11:55   ` Christophe Lyon
  2020-11-06 15:22     ` Christophe Lyon
  0 siblings, 1 reply; 11+ messages in thread
From: Christophe Lyon @ 2020-11-05 11:55 UTC (permalink / raw)
  To: Kyrylo Tkachov; +Cc: gcc-patches

On Thu, 5 Nov 2020 at 10:36, Kyrylo Tkachov <Kyrylo.Tkachov@arm.com> wrote:
>
> H, Christophe,
>
> > -----Original Message-----
> > From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of
> > Christophe Lyon via Gcc-patches
> > Sent: 15 October 2020 18:23
> > To: gcc-patches@gcc.gnu.org
> > Subject: [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64
> > intrinsics
> >
> > This patch adds implementations for vceqq_p64, vceqz_p64 and
> > vceqzq_p64 intrinsics.
> >
> > vceqq_p64 uses the existing vceq_p64 after splitting the input vectors
> > into their high and low halves.
> >
> > vceqz[q] simply call the vceq and vceqq with a second argument equal
> > to zero.
> >
> > The added (executable) testcases make sure that the poly64x2_t
> > variants have results with one element of all zeroes (false) and the
> > other element with all bits set to one (true).
> >
> > 2020-10-15  Christophe Lyon  <christophe.lyon@linaro.org>
> >
> >       gcc/
> >       * config/arm/arm_neon.h (vceqz_p64, vceqq_p64, vceqzq_p64):
> > New.
> >
> >       gcc/testsuite/
> >       * gcc.target/aarch64/advsimd-intrinsics/p64_p128.c: Add tests for
> >       vceqz_p64, vceqq_p64 and vceqzq_p64.
> > ---
> >  gcc/config/arm/arm_neon.h                          | 31 +++++++++++++++
> >  .../aarch64/advsimd-intrinsics/p64_p128.c          | 46
> > +++++++++++++++++++++-
> >  2 files changed, 76 insertions(+), 1 deletion(-)
> >
> > diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
> > index aa21730..f7eff37 100644
> > --- a/gcc/config/arm/arm_neon.h
> > +++ b/gcc/config/arm/arm_neon.h
> > @@ -16912,6 +16912,37 @@ vceq_p64 (poly64x1_t __a, poly64x1_t __b)
> >    return vreinterpret_u64_u32 (__m);
> >  }
> >
> > +__extension__ extern __inline uint64x1_t
> > +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> > +vceqz_p64 (poly64x1_t __a)
> > +{
> > +  poly64x1_t __b = vreinterpret_p64_u32 (vdup_n_u32 (0));
> > +  return vceq_p64 (__a, __b);
> > +}
>
> This approach is okay, but can we have some kind of test to confirm it generates the VCEQ instruction with immediate zero rather than having a separate DUP...

I had checked that manually, but I'll add a test.
However, I have noticed that although vceqz_p64 uses vceq.i32 dX, dY, #0,
the vceqzq_p64 version below first sets
vmov dZ, #0
and then emits two
vceq.i32 dX, dY, dZ

I'm looking at why this happens.
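
For concreteness, this pair of functions shows the difference (a
sketch; the code noted in the comments paraphrases the actual output
described above):

  #include <arm_neon.h>

  uint64x1_t f1 (poly64x1_t a)
  {
    return vceqz_p64 (a);   /* vceq.i32 dX, dY, #0.  */
  }

  uint64x2_t f2 (poly64x2_t a)
  {
    return vceqzq_p64 (a);  /* vmov.i32 qZ, #0, then two vceq.i32
                               dX, dY, dZ instructions.  */
  }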

Thanks,

Christophe


> Thanks,
> Kyrill
>
> > +
> > +/* For vceqq_p64, we rely on vceq_p64 for each of the two elements.  */
> > +__extension__ extern __inline uint64x2_t
> > +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> > +vceqq_p64 (poly64x2_t __a, poly64x2_t __b)
> > +{
> > +  poly64_t __high_a = vget_high_p64 (__a);
> > +  poly64_t __high_b = vget_high_p64 (__b);
> > +  uint64x1_t __high = vceq_p64(__high_a, __high_b);
> > +
> > +  poly64_t __low_a = vget_low_p64 (__a);
> > +  poly64_t __low_b = vget_low_p64 (__b);
> > +  uint64x1_t __low = vceq_p64(__low_a, __low_b);
> > +  return vcombine_u64 (__low, __high);
> > +}
> > +
> > +__extension__ extern __inline uint64x2_t
> > +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> > +vceqzq_p64 (poly64x2_t __a)
> > +{
> > +  poly64x2_t __b = vreinterpretq_p64_u32 (vdupq_n_u32 (0));
> > +  return vceqq_p64 (__a, __b);
> > +}
> > +
> >  /* The vtst_p64 intrinsic does not map to a single instruction.
> >     We emulate it in way similar to vceq_p64 above but here we do
> >     a reduction with max since if any two corresponding bits
> > diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > index a3210a9..6aed096 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > @@ -16,6 +16,11 @@ VECT_VAR_DECL(vbsl_expected,poly,64,2) [] =
> > { 0xfffffff1,
> >
> >  /* Expected results: vceq.  */
> >  VECT_VAR_DECL(vceq_expected,uint,64,1) [] = { 0x0 };
> > +VECT_VAR_DECL(vceq_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
> > +
> > +/* Expected results: vceqz.  */
> > +VECT_VAR_DECL(vceqz_expected,uint,64,1) [] = { 0x0 };
> > +VECT_VAR_DECL(vceqz_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
> >
> >  /* Expected results: vcombine.  */
> >  VECT_VAR_DECL(vcombine_expected,poly,64,2) [] = { 0xfffffffffffffff0,
> > 0x88 };
> > @@ -213,7 +218,7 @@ int main (void)
> >
> >    /* vceq_p64 tests. */
> >  #undef TEST_MSG
> > -#define TEST_MSG "VCEQ"
> > +#define TEST_MSG "VCEQ/VCEQQ"
> >
> >  #define TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N)
> >       \
> >    VECT_VAR(vceq_vector_res, T3, W, N) =
> >       \
> > @@ -227,16 +232,55 @@ int main (void)
> >    DECL_VARIABLE(vceq_vector, poly, 64, 1);
> >    DECL_VARIABLE(vceq_vector2, poly, 64, 1);
> >    DECL_VARIABLE(vceq_vector_res, uint, 64, 1);
> > +  DECL_VARIABLE(vceq_vector, poly, 64, 2);
> > +  DECL_VARIABLE(vceq_vector2, poly, 64, 2);
> > +  DECL_VARIABLE(vceq_vector_res, uint, 64, 2);
> >
> >    CLEAN(result, uint, 64, 1);
> > +  CLEAN(result, uint, 64, 2);
> >
> >    VLOAD(vceq_vector, buffer, , poly, p, 64, 1);
> > +  VLOAD(vceq_vector, buffer, q, poly, p, 64, 2);
> >
> >    VDUP(vceq_vector2, , poly, p, 64, 1, 0x88);
> > +  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 0, 0x88);
> > +  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 1, 0xFFFFFFFFFFFFFFF1);
> >
> >    TEST_VCOMP(vceq, , poly, p, uint, 64, 1);
> > +  TEST_VCOMP(vceq, q, poly, p, uint, 64, 2);
> >
> >    CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceq_expected, "");
> > +  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceq_expected, "");
> > +
> > +  /* vceqz_p64 tests. */
> > +#undef TEST_MSG
> > +#define TEST_MSG "VCEQZ/VCEQZQ"
> > +
> > +#define TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)
> >       \
> > +  VECT_VAR(vceqz_vector_res, T3, W, N) =                             \
> > +    INSN##Q##_##T2##W(VECT_VAR(vceqz_vector, T1, W, N));             \
> > +  vst1##Q##_u##W(VECT_VAR(result, T3, W, N),
> > VECT_VAR(vceqz_vector_res, T3, W, N))
> > +
> > +#define TEST_VCOMPZ(INSN, Q, T1, T2, T3, W, N)
> >       \
> > +  TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)
> > +
> > +  DECL_VARIABLE(vceqz_vector, poly, 64, 1);
> > +  DECL_VARIABLE(vceqz_vector_res, uint, 64, 1);
> > +  DECL_VARIABLE(vceqz_vector, poly, 64, 2);
> > +  DECL_VARIABLE(vceqz_vector_res, uint, 64, 2);
> > +
> > +  CLEAN(result, uint, 64, 1);
> > +  CLEAN(result, uint, 64, 2);
> > +
> > +  VLOAD(vceqz_vector, buffer, , poly, p, 64, 1);
> > +  VLOAD(vceqz_vector, buffer, q, poly, p, 64, 2);
> > +  VSET_LANE(vceqz_vector, q, poly, p, 64, 2, 1, 0);
> > +
> > +  TEST_VCOMPZ(vceqz, , poly, p, uint, 64, 1);
> > +  TEST_VCOMPZ(vceqz, q, poly, p, uint, 64, 2);
> > +
> > +  CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceqz_expected, "");
> > +  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceqz_expected, "");
> >
> >    /* vcombine_p64 tests.  */
> >  #undef TEST_MSG
> > --
> > 2.7.4
>


* Re: [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64 intrinsics
  2020-11-05 11:55   ` Christophe Lyon
@ 2020-11-06 15:22     ` Christophe Lyon
  2021-01-15 10:47       ` Christophe Lyon
  2021-01-15 11:15       ` Kyrylo Tkachov
  0 siblings, 2 replies; 11+ messages in thread
From: Christophe Lyon @ 2020-11-06 15:22 UTC (permalink / raw)
  To: Kyrylo Tkachov; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 9280 bytes --]

On Thu, 5 Nov 2020 at 12:55, Christophe Lyon <christophe.lyon@linaro.org> wrote:
>
> On Thu, 5 Nov 2020 at 10:36, Kyrylo Tkachov <Kyrylo.Tkachov@arm.com> wrote:
> >
> > H, Christophe,
> >
> > > -----Original Message-----
> > > From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of
> > > Christophe Lyon via Gcc-patches
> > > Sent: 15 October 2020 18:23
> > > To: gcc-patches@gcc.gnu.org
> > > Subject: [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64
> > > intrinsics
> > >
> > > This patch adds implementations for vceqq_p64, vceqz_p64 and
> > > vceqzq_p64 intrinsics.
> > >
> > > vceqq_p64 uses the existing vceq_p64 after splitting the input vectors
> > > into their high and low halves.
> > >
> > > vceqz[q] simply call the vceq and vceqq with a second argument equal
> > > to zero.
> > >
> > > The added (executable) testcases make sure that the poly64x2_t
> > > variants have results with one element of all zeroes (false) and the
> > > other element with all bits set to one (true).
> > >
> > > 2020-10-15  Christophe Lyon  <christophe.lyon@linaro.org>
> > >
> > >       gcc/
> > >       * config/arm/arm_neon.h (vceqz_p64, vceqq_p64, vceqzq_p64):
> > > New.
> > >
> > >       gcc/testsuite/
> > >       * gcc.target/aarch64/advsimd-intrinsics/p64_p128.c: Add tests for
> > >       vceqz_p64, vceqq_p64 and vceqzq_p64.
> > > ---
> > >  gcc/config/arm/arm_neon.h                          | 31 +++++++++++++++
> > >  .../aarch64/advsimd-intrinsics/p64_p128.c          | 46
> > > +++++++++++++++++++++-
> > >  2 files changed, 76 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
> > > index aa21730..f7eff37 100644
> > > --- a/gcc/config/arm/arm_neon.h
> > > +++ b/gcc/config/arm/arm_neon.h
> > > @@ -16912,6 +16912,37 @@ vceq_p64 (poly64x1_t __a, poly64x1_t __b)
> > >    return vreinterpret_u64_u32 (__m);
> > >  }
> > >
> > > +__extension__ extern __inline uint64x1_t
> > > +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> > > +vceqz_p64 (poly64x1_t __a)
> > > +{
> > > +  poly64x1_t __b = vreinterpret_p64_u32 (vdup_n_u32 (0));
> > > +  return vceq_p64 (__a, __b);
> > > +}
> >
> > This approach is okay, but can we have some kind of test to confirm it generates the VCEQ instruction with immediate zero rather than having a separate DUP...
>
> I had checked that manually, but I'll add a test.
> However, I have noticed that although vceqz_p64 uses vceq.i32 dX, dY, #0,
> the vceqzq_64 version below first sets
> vmov dZ, #0
> and then emits two
> vmoz dX, dY, dZ
>
> I'm looking at why this happens.
>

Hi,

Here is an updated version, which adds two tests (arm/simd/vceqz_p64.c
and arm/simd/vceqzq_p64.c).

The vceqzq_p64 test does not currently expect instructions with
immediate zero, because we generate:
vmov.i32        q9, #0  @ v4si
[...]
vceq.i32        d16, d16, d19
vceq.i32        d17, d17, d19

Looking at the traces, I can see this in reload:
(insn 19 8 15 2 (set (reg:V2SI 48 d16 [orig:128 _18 ] [128])
        (neg:V2SI (eq:V2SI (reg:V2SI 48 d16 [orig:139 v1 ] [139])
                (reg:V2SI 54 d19 [ _5+8 ]))))
"/home/christophe.lyon/src/GCC/builds/gcc-fsf-git-neon-intrinsics/tools/lib/gcc/arm-none-linux-gnueabihf/11.0.0/include/arm_neon.h":2404:22
1650 {neon_vceqv2si_insn}
     (expr_list:REG_EQUAL (neg:V2SI (eq:V2SI (subreg:V2SI (reg:DI 48
d16 [orig:139 v1 ] [139]) 0)
                (const_vector:V2SI [
                        (const_int 0 [0]) repeated x2
                    ])))
        (nil)))
(insn 15 19 20 2 (set (reg:V2SI 50 d17 [orig:121 _11 ] [121])
        (neg:V2SI (eq:V2SI (reg:V2SI 50 d17 [orig:141 v2 ] [141])
                (reg:V2SI 54 d19 [ _5+8 ]))))
"/home/christophe.lyon/src/GCC/builds/gcc-fsf-git-neon-intrinsics/tools/lib/gcc/arm-none-linux-gnueabihf/11.0.0/include/arm_neon.h":2404:22
1650 {neon_vceqv2si_insn}
     (expr_list:REG_EQUAL (neg:V2SI (eq:V2SI (subreg:V2SI (reg:DI 50
d17 [orig:141 v2 ] [141]) 0)
                (const_vector:V2SI [
                        (const_int 0 [0]) repeated x2
                    ])))
        (nil)))

but it says:
         Choosing alt 0 in insn 19:  (0) =w  (1) w  (2) w {neon_vceqv2si_insn}
          alt=0,overall=0,losers=0,rld_nregs=0
         Choosing alt 0 in insn 15:  (0) =w  (1) w  (2) w {neon_vceqv2si_insn}
          alt=0,overall=0,losers=0,rld_nregs=0

Why isn't it picking alternative 1 with the Dz constraint?

Christophe


> Thanks,
>
> Christophe
>
>
> > Thanks,
> > Kyrill
> >
> > > +
> > > +/* For vceqq_p64, we rely on vceq_p64 for each of the two elements.  */
> > > +__extension__ extern __inline uint64x2_t
> > > +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> > > +vceqq_p64 (poly64x2_t __a, poly64x2_t __b)
> > > +{
> > > +  poly64_t __high_a = vget_high_p64 (__a);
> > > +  poly64_t __high_b = vget_high_p64 (__b);
> > > +  uint64x1_t __high = vceq_p64(__high_a, __high_b);
> > > +
> > > +  poly64_t __low_a = vget_low_p64 (__a);
> > > +  poly64_t __low_b = vget_low_p64 (__b);
> > > +  uint64x1_t __low = vceq_p64(__low_a, __low_b);
> > > +  return vcombine_u64 (__low, __high);
> > > +}
> > > +
> > > +__extension__ extern __inline uint64x2_t
> > > +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> > > +vceqzq_p64 (poly64x2_t __a)
> > > +{
> > > +  poly64x2_t __b = vreinterpretq_p64_u32 (vdupq_n_u32 (0));
> > > +  return vceqq_p64 (__a, __b);
> > > +}
> > > +
> > >  /* The vtst_p64 intrinsic does not map to a single instruction.
> > >     We emulate it in way similar to vceq_p64 above but here we do
> > >     a reduction with max since if any two corresponding bits
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > > b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > > index a3210a9..6aed096 100644
> > > --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > > @@ -16,6 +16,11 @@ VECT_VAR_DECL(vbsl_expected,poly,64,2) [] =
> > > { 0xfffffff1,
> > >
> > >  /* Expected results: vceq.  */
> > >  VECT_VAR_DECL(vceq_expected,uint,64,1) [] = { 0x0 };
> > > +VECT_VAR_DECL(vceq_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
> > > +
> > > +/* Expected results: vceqz.  */
> > > +VECT_VAR_DECL(vceqz_expected,uint,64,1) [] = { 0x0 };
> > > +VECT_VAR_DECL(vceqz_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
> > >
> > >  /* Expected results: vcombine.  */
> > >  VECT_VAR_DECL(vcombine_expected,poly,64,2) [] = { 0xfffffffffffffff0,
> > > 0x88 };
> > > @@ -213,7 +218,7 @@ int main (void)
> > >
> > >    /* vceq_p64 tests. */
> > >  #undef TEST_MSG
> > > -#define TEST_MSG "VCEQ"
> > > +#define TEST_MSG "VCEQ/VCEQQ"
> > >
> > >  #define TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N)
> > >       \
> > >    VECT_VAR(vceq_vector_res, T3, W, N) =
> > >       \
> > > @@ -227,16 +232,55 @@ int main (void)
> > >    DECL_VARIABLE(vceq_vector, poly, 64, 1);
> > >    DECL_VARIABLE(vceq_vector2, poly, 64, 1);
> > >    DECL_VARIABLE(vceq_vector_res, uint, 64, 1);
> > > +  DECL_VARIABLE(vceq_vector, poly, 64, 2);
> > > +  DECL_VARIABLE(vceq_vector2, poly, 64, 2);
> > > +  DECL_VARIABLE(vceq_vector_res, uint, 64, 2);
> > >
> > >    CLEAN(result, uint, 64, 1);
> > > +  CLEAN(result, uint, 64, 2);
> > >
> > >    VLOAD(vceq_vector, buffer, , poly, p, 64, 1);
> > > +  VLOAD(vceq_vector, buffer, q, poly, p, 64, 2);
> > >
> > >    VDUP(vceq_vector2, , poly, p, 64, 1, 0x88);
> > > +  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 0, 0x88);
> > > +  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 1, 0xFFFFFFFFFFFFFFF1);
> > >
> > >    TEST_VCOMP(vceq, , poly, p, uint, 64, 1);
> > > +  TEST_VCOMP(vceq, q, poly, p, uint, 64, 2);
> > >
> > >    CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceq_expected, "");
> > > +  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceq_expected, "");
> > > +
> > > +  /* vceqz_p64 tests. */
> > > +#undef TEST_MSG
> > > +#define TEST_MSG "VCEQZ/VCEQZQ"
> > > +
> > > +#define TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)
> > >       \
> > > +  VECT_VAR(vceqz_vector_res, T3, W, N) =                             \
> > > +    INSN##Q##_##T2##W(VECT_VAR(vceqz_vector, T1, W, N));             \
> > > +  vst1##Q##_u##W(VECT_VAR(result, T3, W, N),
> > > VECT_VAR(vceqz_vector_res, T3, W, N))
> > > +
> > > +#define TEST_VCOMPZ(INSN, Q, T1, T2, T3, W, N)
> > >       \
> > > +  TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)
> > > +
> > > +  DECL_VARIABLE(vceqz_vector, poly, 64, 1);
> > > +  DECL_VARIABLE(vceqz_vector_res, uint, 64, 1);
> > > +  DECL_VARIABLE(vceqz_vector, poly, 64, 2);
> > > +  DECL_VARIABLE(vceqz_vector_res, uint, 64, 2);
> > > +
> > > +  CLEAN(result, uint, 64, 1);
> > > +  CLEAN(result, uint, 64, 2);
> > > +
> > > +  VLOAD(vceqz_vector, buffer, , poly, p, 64, 1);
> > > +  VLOAD(vceqz_vector, buffer, q, poly, p, 64, 2);
> > > +  VSET_LANE(vceqz_vector, q, poly, p, 64, 2, 1, 0);
> > > +
> > > +  TEST_VCOMPZ(vceqz, , poly, p, uint, 64, 1);
> > > +  TEST_VCOMPZ(vceqz, q, poly, p, uint, 64, 2);
> > > +
> > > +  CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceqz_expected, "");
> > > +  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceqz_expected, "");
> > >
> > >    /* vcombine_p64 tests.  */
> > >  #undef TEST_MSG
> > > --
> > > 2.7.4
> >

[-- Attachment #2: v2-0001-arm-Implement-vceqq_p64-vceqz_p64-and-vceqzq_p64-.patch --]
[-- Type: text/x-patch, Size: 7313 bytes --]

From e0ca6975a559c445572ae6db30add4081c8207f6 Mon Sep 17 00:00:00 2001
From: Christophe Lyon <christophe.lyon@linaro.org>
Date: Thu, 15 Oct 2020 17:13:59 +0000
Subject: [PATCH v2] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64
 intrinsics

This patch adds implementations for vceqq_p64, vceqz_p64 and
vceqzq_p64 intrinsics.

vceqq_p64 uses the existing vceq_p64 after splitting the input vectors
into their high and low halves.

vceqz and vceqzq simply call vceq and vceqq, respectively, with a
second argument equal to zero.

The added (executable) testcases make sure that the poly64x2_t
variants have results with one element of all zeroes (false) and the
other element with all bits set to one (true).

2020-10-15  Christophe Lyon  <christophe.lyon@linaro.org>

	gcc/
	* config/arm/arm_neon.h (vceqz_p64, vceqq_p64, vceqzq_p64): New.

	gcc/testsuite/
	* gcc.target/aarch64/advsimd-intrinsics/p64_p128.c: Add tests for
	vceqz_p64, vceqq_p64 and vceqzq_p64.
	* gcc.target/arm/simd/vceqz_p64.c: New test.
	* gcc.target/arm/simd/vceqzq_p64.c: New test.
---
 gcc/config/arm/arm_neon.h                          | 31 +++++++++++++++
 .../aarch64/advsimd-intrinsics/p64_p128.c          | 46 +++++++++++++++++++++-
 gcc/testsuite/gcc.target/arm/simd/vceqz_p64.c      | 17 ++++++++
 gcc/testsuite/gcc.target/arm/simd/vceqzq_p64.c     | 17 ++++++++
 4 files changed, 110 insertions(+), 1 deletion(-)
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/vceqz_p64.c
 create mode 100644 gcc/testsuite/gcc.target/arm/simd/vceqzq_p64.c

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index aa21730..fd57ed5 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -16912,6 +16912,37 @@ vceq_p64 (poly64x1_t __a, poly64x1_t __b)
   return vreinterpret_u64_u32 (__m);
 }
 
+__extension__ extern __inline uint64x1_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vceqz_p64 (poly64x1_t __a)
+{
+  poly64x1_t __b = vreinterpret_p64_u32 (vdup_n_u32 (0));
+  return vceq_p64 (__a, __b);
+}
+
+/* For vceqq_p64, we rely on vceq_p64 for each of the two elements.  */
+__extension__ extern __inline uint64x2_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vceqq_p64 (poly64x2_t __a, poly64x2_t __b)
+{
+  poly64_t __high_a = vget_high_p64 (__a);
+  poly64_t __high_b = vget_high_p64 (__b);
+  uint64x1_t __high = vceq_p64 (__high_a, __high_b);
+
+  poly64_t __low_a = vget_low_p64 (__a);
+  poly64_t __low_b = vget_low_p64 (__b);
+  uint64x1_t __low = vceq_p64 (__low_a, __low_b);
+  return vcombine_u64 (__low, __high);
+}
+
+__extension__ extern __inline uint64x2_t
+__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
+vceqzq_p64 (poly64x2_t __a)
+{
+  poly64x2_t __b = vreinterpretq_p64_u32 (vdupq_n_u32 (0));
+  return vceqq_p64 (__a, __b);
+}
+
 /* The vtst_p64 intrinsic does not map to a single instruction.
    We emulate it in way similar to vceq_p64 above but here we do
    a reduction with max since if any two corresponding bits
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
index a3210a9..6aed096 100644
--- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
@@ -16,6 +16,11 @@ VECT_VAR_DECL(vbsl_expected,poly,64,2) [] = { 0xfffffff1,
 
 /* Expected results: vceq.  */
 VECT_VAR_DECL(vceq_expected,uint,64,1) [] = { 0x0 };
+VECT_VAR_DECL(vceq_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
+
+/* Expected results: vceqz.  */
+VECT_VAR_DECL(vceqz_expected,uint,64,1) [] = { 0x0 };
+VECT_VAR_DECL(vceqz_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
 
 /* Expected results: vcombine.  */
 VECT_VAR_DECL(vcombine_expected,poly,64,2) [] = { 0xfffffffffffffff0, 0x88 };
@@ -213,7 +218,7 @@ int main (void)
 
   /* vceq_p64 tests. */
 #undef TEST_MSG
-#define TEST_MSG "VCEQ"
+#define TEST_MSG "VCEQ/VCEQQ"
 
 #define TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N)				\
   VECT_VAR(vceq_vector_res, T3, W, N) =					\
@@ -227,16 +232,55 @@ int main (void)
   DECL_VARIABLE(vceq_vector, poly, 64, 1);
   DECL_VARIABLE(vceq_vector2, poly, 64, 1);
   DECL_VARIABLE(vceq_vector_res, uint, 64, 1);
+  DECL_VARIABLE(vceq_vector, poly, 64, 2);
+  DECL_VARIABLE(vceq_vector2, poly, 64, 2);
+  DECL_VARIABLE(vceq_vector_res, uint, 64, 2);
 
   CLEAN(result, uint, 64, 1);
+  CLEAN(result, uint, 64, 2);
 
   VLOAD(vceq_vector, buffer, , poly, p, 64, 1);
+  VLOAD(vceq_vector, buffer, q, poly, p, 64, 2);
 
   VDUP(vceq_vector2, , poly, p, 64, 1, 0x88);
+  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 0, 0x88);
+  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 1, 0xFFFFFFFFFFFFFFF1);
 
   TEST_VCOMP(vceq, , poly, p, uint, 64, 1);
+  TEST_VCOMP(vceq, q, poly, p, uint, 64, 2);
 
   CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceq_expected, "");
+  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceq_expected, "");
+
+  /* vceqz_p64 tests. */
+#undef TEST_MSG
+#define TEST_MSG "VCEQZ/VCEQZQ"
+
+#define TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)				\
+  VECT_VAR(vceqz_vector_res, T3, W, N) =				\
+    INSN##Q##_##T2##W(VECT_VAR(vceqz_vector, T1, W, N));		\
+  vst1##Q##_u##W(VECT_VAR(result, T3, W, N), VECT_VAR(vceqz_vector_res, T3, W, N))
+
+#define TEST_VCOMPZ(INSN, Q, T1, T2, T3, W, N)				\
+  TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)
+
+  DECL_VARIABLE(vceqz_vector, poly, 64, 1);
+  DECL_VARIABLE(vceqz_vector_res, uint, 64, 1);
+  DECL_VARIABLE(vceqz_vector, poly, 64, 2);
+  DECL_VARIABLE(vceqz_vector_res, uint, 64, 2);
+
+  CLEAN(result, uint, 64, 1);
+  CLEAN(result, uint, 64, 2);
+
+  VLOAD(vceqz_vector, buffer, , poly, p, 64, 1);
+  VLOAD(vceqz_vector, buffer, q, poly, p, 64, 2);
+  VSET_LANE(vceqz_vector, q, poly, p, 64, 2, 1, 0);
+
+  TEST_VCOMPZ(vceqz, , poly, p, uint, 64, 1);
+  TEST_VCOMPZ(vceqz, q, poly, p, uint, 64, 2);
+
+  CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceqz_expected, "");
+  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceqz_expected, "");
 
   /* vcombine_p64 tests.  */
 #undef TEST_MSG
diff --git a/gcc/testsuite/gcc.target/arm/simd/vceqz_p64.c b/gcc/testsuite/gcc.target/arm/simd/vceqz_p64.c
new file mode 100644
index 0000000..f26cbff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/vceqz_p64.c
@@ -0,0 +1,17 @@
+/* Test the `vceqz_p64' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-options "-save-temps -O2 -fno-inline" } */
+/* { dg-add-options arm_neon } */
+
+#include "arm_neon.h"
+
+poly64x1_t v1;
+uint64x1_t result1;
+
+void func()
+{
+  result1 = vceqz_p64 (v1);
+}
+
+/* { dg-final { scan-assembler-times "vceq\.i32\[ \t\]+\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+, #0\n" 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/vceqzq_p64.c b/gcc/testsuite/gcc.target/arm/simd/vceqzq_p64.c
new file mode 100644
index 0000000..355efd8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/vceqzq_p64.c
@@ -0,0 +1,17 @@
+/* Test the `vceqzq_p64' ARM Neon intrinsic.  */
+
+/* { dg-do compile } */
+/* { dg-options "-save-temps -O2 -fno-inline" } */
+/* { dg-add-options arm_neon } */
+
+#include "arm_neon.h"
+
+poly64x2_t v2;
+uint64x2_t result2;
+
+void func()
+{
+  result2 = vceqzq_p64 (v2);
+}
+
+/* { dg-final { scan-assembler-times "vceq\.i32\[ \t\]+\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+, ?\[dD\]\[0-9\]+\n" 2 } } */
-- 
2.7.4



* Re: [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64 intrinsics
  2020-11-06 15:22     ` Christophe Lyon
@ 2021-01-15 10:47       ` Christophe Lyon
  2021-01-15 11:15       ` Kyrylo Tkachov
  1 sibling, 0 replies; 11+ messages in thread
From: Christophe Lyon @ 2021-01-15 10:47 UTC (permalink / raw)
  To: Kyrylo Tkachov; +Cc: gcc-patches

ping?

On Fri, 6 Nov 2020 at 16:22, Christophe Lyon <christophe.lyon@linaro.org> wrote:
>
> On Thu, 5 Nov 2020 at 12:55, Christophe Lyon <christophe.lyon@linaro.org> wrote:
> >
> > On Thu, 5 Nov 2020 at 10:36, Kyrylo Tkachov <Kyrylo.Tkachov@arm.com> wrote:
> > >
> > > H, Christophe,
> > >
> > > > -----Original Message-----
> > > > From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of
> > > > Christophe Lyon via Gcc-patches
> > > > Sent: 15 October 2020 18:23
> > > > To: gcc-patches@gcc.gnu.org
> > > > Subject: [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64
> > > > intrinsics
> > > >
> > > > This patch adds implementations for vceqq_p64, vceqz_p64 and
> > > > vceqzq_p64 intrinsics.
> > > >
> > > > vceqq_p64 uses the existing vceq_p64 after splitting the input vectors
> > > > into their high and low halves.
> > > >
> > > > vceqz[q] simply call the vceq and vceqq with a second argument equal
> > > > to zero.
> > > >
> > > > The added (executable) testcases make sure that the poly64x2_t
> > > > variants have results with one element of all zeroes (false) and the
> > > > other element with all bits set to one (true).
> > > >
> > > > 2020-10-15  Christophe Lyon  <christophe.lyon@linaro.org>
> > > >
> > > >       gcc/
> > > >       * config/arm/arm_neon.h (vceqz_p64, vceqq_p64, vceqzq_p64):
> > > > New.
> > > >
> > > >       gcc/testsuite/
> > > >       * gcc.target/aarch64/advsimd-intrinsics/p64_p128.c: Add tests for
> > > >       vceqz_p64, vceqq_p64 and vceqzq_p64.
> > > > ---
> > > >  gcc/config/arm/arm_neon.h                          | 31 +++++++++++++++
> > > >  .../aarch64/advsimd-intrinsics/p64_p128.c          | 46
> > > > +++++++++++++++++++++-
> > > >  2 files changed, 76 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
> > > > index aa21730..f7eff37 100644
> > > > --- a/gcc/config/arm/arm_neon.h
> > > > +++ b/gcc/config/arm/arm_neon.h
> > > > @@ -16912,6 +16912,37 @@ vceq_p64 (poly64x1_t __a, poly64x1_t __b)
> > > >    return vreinterpret_u64_u32 (__m);
> > > >  }
> > > >
> > > > +__extension__ extern __inline uint64x1_t
> > > > +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> > > > +vceqz_p64 (poly64x1_t __a)
> > > > +{
> > > > +  poly64x1_t __b = vreinterpret_p64_u32 (vdup_n_u32 (0));
> > > > +  return vceq_p64 (__a, __b);
> > > > +}
> > >
> > > This approach is okay, but can we have some kind of test to confirm it generates the VCEQ instruction with immediate zero rather than having a separate DUP...
> >
> > I had checked that manually, but I'll add a test.
> > However, I have noticed that although vceqz_p64 uses vceq.i32 dX, dY, #0,
> > the vceqzq_64 version below first sets
> > vmov dZ, #0
> > and then emits two
> > vmoz dX, dY, dZ
> >
> > I'm looking at why this happens.
> >
>
> Hi,
>
> Here is an updated version, which adds two tests (arm/simd/vceqz_p64.c
> and arm/simd/vceqzq_p64.c).
>
> The vceqzq_64 test does not currently expect instructions with
> immediate zero, because we generate:
> vmov.i32        q9, #0  @ v4si
> [...]
> vceq.i32        d16, d16, d19
> vceq.i32        d17, d17, d19
>
> Looking at the traces, I can see this in reload:
> (insn 19 8 15 2 (set (reg:V2SI 48 d16 [orig:128 _18 ] [128])
>         (neg:V2SI (eq:V2SI (reg:V2SI 48 d16 [orig:139 v1 ] [139])
>                 (reg:V2SI 54 d19 [ _5+8 ]))))
> "/home/christophe.lyon/src/GCC/builds/gcc-fsf-git-neon-intrinsics/tools/lib/gcc/arm-none-linux-gnueabihf/11.0.0/include/arm_neon.h":2404:22
> 1650 {neon_vceqv2si_insn}
>      (expr_list:REG_EQUAL (neg:V2SI (eq:V2SI (subreg:V2SI (reg:DI 48
> d16 [orig:139 v1 ] [139]) 0)
>                 (const_vector:V2SI [
>                         (const_int 0 [0]) repeated x2
>                     ])))
>         (nil)))
> (insn 15 19 20 2 (set (reg:V2SI 50 d17 [orig:121 _11 ] [121])
>         (neg:V2SI (eq:V2SI (reg:V2SI 50 d17 [orig:141 v2 ] [141])
>                 (reg:V2SI 54 d19 [ _5+8 ]))))
> "/home/christophe.lyon/src/GCC/builds/gcc-fsf-git-neon-intrinsics/tools/lib/gcc/arm-none-linux-gnueabihf/11.0.0/include/arm_neon.h":2404:22
> 1650 {neon_vceqv2si_insn}
>      (expr_list:REG_EQUAL (neg:V2SI (eq:V2SI (subreg:V2SI (reg:DI 50
> d17 [orig:141 v2 ] [141]) 0)
>                 (const_vector:V2SI [
>                         (const_int 0 [0]) repeated x2
>                     ])))
>         (nil)))
>
> but it says:
>          Choosing alt 0 in insn 19:  (0) =w  (1) w  (2) w {neon_vceqv2si_insn}
>           alt=0,overall=0,losers=0,rld_nregs=0
>          Choosing alt 0 in insn 15:  (0) =w  (1) w  (2) w {neon_vceqv2si_insn}
>           alt=0,overall=0,losers=0,rld_nregs=0
>
> Why isn't it picking alternative 1 with the Dz constraint?
>
> Christophe
>
>
> > Thanks,
> >
> > Christophe
> >
> >
> > > Thanks,
> > > Kyrill
> > >
> > > > +
> > > > +/* For vceqq_p64, we rely on vceq_p64 for each of the two elements.  */
> > > > +__extension__ extern __inline uint64x2_t
> > > > +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> > > > +vceqq_p64 (poly64x2_t __a, poly64x2_t __b)
> > > > +{
> > > > +  poly64_t __high_a = vget_high_p64 (__a);
> > > > +  poly64_t __high_b = vget_high_p64 (__b);
> > > > +  uint64x1_t __high = vceq_p64(__high_a, __high_b);
> > > > +
> > > > +  poly64_t __low_a = vget_low_p64 (__a);
> > > > +  poly64_t __low_b = vget_low_p64 (__b);
> > > > +  uint64x1_t __low = vceq_p64(__low_a, __low_b);
> > > > +  return vcombine_u64 (__low, __high);
> > > > +}
> > > > +
> > > > +__extension__ extern __inline uint64x2_t
> > > > +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> > > > +vceqzq_p64 (poly64x2_t __a)
> > > > +{
> > > > +  poly64x2_t __b = vreinterpretq_p64_u32 (vdupq_n_u32 (0));
> > > > +  return vceqq_p64 (__a, __b);
> > > > +}
> > > > +
> > > >  /* The vtst_p64 intrinsic does not map to a single instruction.
> > > >     We emulate it in way similar to vceq_p64 above but here we do
> > > >     a reduction with max since if any two corresponding bits
> > > > diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > > > b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > > > index a3210a9..6aed096 100644
> > > > --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > > > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > > > @@ -16,6 +16,11 @@ VECT_VAR_DECL(vbsl_expected,poly,64,2) [] =
> > > > { 0xfffffff1,
> > > >
> > > >  /* Expected results: vceq.  */
> > > >  VECT_VAR_DECL(vceq_expected,uint,64,1) [] = { 0x0 };
> > > > +VECT_VAR_DECL(vceq_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
> > > > +
> > > > +/* Expected results: vceqz.  */
> > > > +VECT_VAR_DECL(vceqz_expected,uint,64,1) [] = { 0x0 };
> > > > +VECT_VAR_DECL(vceqz_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
> > > >
> > > >  /* Expected results: vcombine.  */
> > > >  VECT_VAR_DECL(vcombine_expected,poly,64,2) [] = { 0xfffffffffffffff0,
> > > > 0x88 };
> > > > @@ -213,7 +218,7 @@ int main (void)
> > > >
> > > >    /* vceq_p64 tests. */
> > > >  #undef TEST_MSG
> > > > -#define TEST_MSG "VCEQ"
> > > > +#define TEST_MSG "VCEQ/VCEQQ"
> > > >
> > > >  #define TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N)
> > > >       \
> > > >    VECT_VAR(vceq_vector_res, T3, W, N) =
> > > >       \
> > > > @@ -227,16 +232,55 @@ int main (void)
> > > >    DECL_VARIABLE(vceq_vector, poly, 64, 1);
> > > >    DECL_VARIABLE(vceq_vector2, poly, 64, 1);
> > > >    DECL_VARIABLE(vceq_vector_res, uint, 64, 1);
> > > > +  DECL_VARIABLE(vceq_vector, poly, 64, 2);
> > > > +  DECL_VARIABLE(vceq_vector2, poly, 64, 2);
> > > > +  DECL_VARIABLE(vceq_vector_res, uint, 64, 2);
> > > >
> > > >    CLEAN(result, uint, 64, 1);
> > > > +  CLEAN(result, uint, 64, 2);
> > > >
> > > >    VLOAD(vceq_vector, buffer, , poly, p, 64, 1);
> > > > +  VLOAD(vceq_vector, buffer, q, poly, p, 64, 2);
> > > >
> > > >    VDUP(vceq_vector2, , poly, p, 64, 1, 0x88);
> > > > +  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 0, 0x88);
> > > > +  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 1, 0xFFFFFFFFFFFFFFF1);
> > > >
> > > >    TEST_VCOMP(vceq, , poly, p, uint, 64, 1);
> > > > +  TEST_VCOMP(vceq, q, poly, p, uint, 64, 2);
> > > >
> > > >    CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceq_expected, "");
> > > > +  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceq_expected, "");
> > > > +
> > > > +  /* vceqz_p64 tests. */
> > > > +#undef TEST_MSG
> > > > +#define TEST_MSG "VCEQZ/VCEQZQ"
> > > > +
> > > > +#define TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)
> > > >       \
> > > > +  VECT_VAR(vceqz_vector_res, T3, W, N) =                             \
> > > > +    INSN##Q##_##T2##W(VECT_VAR(vceqz_vector, T1, W, N));             \
> > > > +  vst1##Q##_u##W(VECT_VAR(result, T3, W, N),
> > > > VECT_VAR(vceqz_vector_res, T3, W, N))
> > > > +
> > > > +#define TEST_VCOMPZ(INSN, Q, T1, T2, T3, W, N)
> > > >       \
> > > > +  TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)
> > > > +
> > > > +  DECL_VARIABLE(vceqz_vector, poly, 64, 1);
> > > > +  DECL_VARIABLE(vceqz_vector_res, uint, 64, 1);
> > > > +  DECL_VARIABLE(vceqz_vector, poly, 64, 2);
> > > > +  DECL_VARIABLE(vceqz_vector_res, uint, 64, 2);
> > > > +
> > > > +  CLEAN(result, uint, 64, 1);
> > > > +  CLEAN(result, uint, 64, 2);
> > > > +
> > > > +  VLOAD(vceqz_vector, buffer, , poly, p, 64, 1);
> > > > +  VLOAD(vceqz_vector, buffer, q, poly, p, 64, 2);
> > > > +  VSET_LANE(vceqz_vector, q, poly, p, 64, 2, 1, 0);
> > > > +
> > > > +  TEST_VCOMPZ(vceqz, , poly, p, uint, 64, 1);
> > > > +  TEST_VCOMPZ(vceqz, q, poly, p, uint, 64, 2);
> > > > +
> > > > +  CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceqz_expected, "");
> > > > +  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceqz_expected, "");
> > > >
> > > >    /* vcombine_p64 tests.  */
> > > >  #undef TEST_MSG
> > > > --
> > > > 2.7.4
> > >


* RE: [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64 intrinsics
  2020-11-06 15:22     ` Christophe Lyon
  2021-01-15 10:47       ` Christophe Lyon
@ 2021-01-15 11:15       ` Kyrylo Tkachov
  1 sibling, 0 replies; 11+ messages in thread
From: Kyrylo Tkachov @ 2021-01-15 11:15 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: gcc-patches



> -----Original Message-----
> From: Christophe Lyon <christophe.lyon@linaro.org>
> Sent: 06 November 2020 15:23
> To: Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Cc: gcc-patches@gcc.gnu.org
> Subject: Re: [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64
> intrinsics
> 
> On Thu, 5 Nov 2020 at 12:55, Christophe Lyon <christophe.lyon@linaro.org>
> wrote:
> >
> > On Thu, 5 Nov 2020 at 10:36, Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> wrote:
> > >
> > > Hi, Christophe,
> > >
> > > > -----Original Message-----
> > > > From: Gcc-patches <gcc-patches-bounces@gcc.gnu.org> On Behalf Of
> > > > Christophe Lyon via Gcc-patches
> > > > Sent: 15 October 2020 18:23
> > > > To: gcc-patches@gcc.gnu.org
> > > > Subject: [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64 intrinsics
> > > >
> > > > This patch adds implementations for vceqq_p64, vceqz_p64 and
> > > > vceqzq_p64 intrinsics.
> > > >
> > > > vceqq_p64 uses the existing vceq_p64 after splitting the input vectors
> > > > into their high and low halves.
> > > >
> > > > vceqz[q] simply call the vceq and vceqq with a second argument equal
> > > > to zero.
> > > >
> > > > The added (executable) testcases make sure that the poly64x2_t
> > > > variants have results with one element of all zeroes (false) and the
> > > > other element with all bits set to one (true).
> > > >
> > > > 2020-10-15  Christophe Lyon  <christophe.lyon@linaro.org>
> > > >
> > > >       gcc/
> > > >       * config/arm/arm_neon.h (vceqz_p64, vceqq_p64, vceqzq_p64):
> > > > New.
> > > >
> > > >       gcc/testsuite/
> > > >       * gcc.target/aarch64/advsimd-intrinsics/p64_p128.c: Add tests for
> > > >       vceqz_p64, vceqq_p64 and vceqzq_p64.
> > > > ---
> > > >  gcc/config/arm/arm_neon.h                          | 31 +++++++++++++++
> > > >  .../aarch64/advsimd-intrinsics/p64_p128.c          | 46
> > > > +++++++++++++++++++++-
> > > >  2 files changed, 76 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
> > > > index aa21730..f7eff37 100644
> > > > --- a/gcc/config/arm/arm_neon.h
> > > > +++ b/gcc/config/arm/arm_neon.h
> > > > @@ -16912,6 +16912,37 @@ vceq_p64 (poly64x1_t __a, poly64x1_t __b)
> > > >    return vreinterpret_u64_u32 (__m);
> > > >  }
> > > >
> > > > +__extension__ extern __inline uint64x1_t
> > > > +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> > > > +vceqz_p64 (poly64x1_t __a)
> > > > +{
> > > > +  poly64x1_t __b = vreinterpret_p64_u32 (vdup_n_u32 (0));
> > > > +  return vceq_p64 (__a, __b);
> > > > +}
> > >
> > > This approach is okay, but can we have some kind of test to confirm it
> > > generates the VCEQ instruction with immediate zero rather than having a
> > > separate DUP...
> >
> > I had checked that manually, but I'll add a test.
> > However, I have noticed that although vceqz_p64 uses vceq.i32 dX, dY, #0,
> > the vceqzq_p64 version below first sets
> > vmov dZ, #0
> > and then emits two
> > vceq dX, dY, dZ
> >
> > I'm looking at why this happens.
> >
> 
> Hi,
> 
> Here is an updated version, which adds two tests (arm/simd/vceqz_p64.c
> and arm/simd/vceqzq_p64.c).
> 
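> To give an idea of their shape, vceqz_p64.c boils down to something
> like the following sketch (the dg directives, options and scan
> pattern here are illustrative, not copied verbatim from the patch):
> 
> /* { dg-require-effective-target arm_crypto_ok } */
> /* { dg-add-options arm_crypto } */
> /* { dg-additional-options "-O2" } */
> #include <arm_neon.h>
> 
> uint64x1_t f (poly64x1_t a)
> {
>   /* We want a single vceq.i32 against immediate #0 here,
>      with no separate vmov/vdup of zero.  */
>   return vceqz_p64 (a);
> }
> 
> /* { dg-final { scan-assembler {vceq\.i32\t[^\n]*, #0} } } */
> 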
> The vceqzq_p64 test does not currently expect instructions with
> immediate zero, because we generate:
> vmov.i32        q9, #0  @ v4si
> [...]
> vceq.i32        d16, d16, d19
> vceq.i32        d17, d17, d19
> 
> Looking at the traces, I can see this in reload:
> (insn 19 8 15 2 (set (reg:V2SI 48 d16 [orig:128 _18 ] [128])
>         (neg:V2SI (eq:V2SI (reg:V2SI 48 d16 [orig:139 v1 ] [139])
>                 (reg:V2SI 54 d19 [ _5+8 ])))) "/home/christophe.lyon/src/GCC/builds/gcc-fsf-git-neon-intrinsics/tools/lib/gcc/arm-none-linux-gnueabihf/11.0.0/include/arm_neon.h":2404:22 1650 {neon_vceqv2si_insn}
>      (expr_list:REG_EQUAL (neg:V2SI (eq:V2SI (subreg:V2SI (reg:DI 48 d16 [orig:139 v1 ] [139]) 0)
>                 (const_vector:V2SI [
>                         (const_int 0 [0]) repeated x2
>                     ])))
>         (nil)))
> (insn 15 19 20 2 (set (reg:V2SI 50 d17 [orig:121 _11 ] [121])
>         (neg:V2SI (eq:V2SI (reg:V2SI 50 d17 [orig:141 v2 ] [141])
>                 (reg:V2SI 54 d19 [ _5+8 ])))) "/home/christophe.lyon/src/GCC/builds/gcc-fsf-git-neon-intrinsics/tools/lib/gcc/arm-none-linux-gnueabihf/11.0.0/include/arm_neon.h":2404:22 1650 {neon_vceqv2si_insn}
>      (expr_list:REG_EQUAL (neg:V2SI (eq:V2SI (subreg:V2SI (reg:DI 50 d17 [orig:141 v2 ] [141]) 0)
>                 (const_vector:V2SI [
>                         (const_int 0 [0]) repeated x2
>                     ])))
>         (nil)))
> 
> but it says:
>          Choosing alt 0 in insn 19:  (0) =w  (1) w  (2) w {neon_vceqv2si_insn}
>           alt=0,overall=0,losers=0,rld_nregs=0
>          Choosing alt 0 in insn 15:  (0) =w  (1) w  (2) w {neon_vceqv2si_insn}
>           alt=0,overall=0,losers=0,rld_nregs=0
> 
> Why isn't it picking alternative 1 with the Dz constraint?
> 
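> For reference, the pattern involved is neon_vceq<mode>_insn in
> config/arm/neon.md; from memory its operand constraints look roughly
> like this (a sketch, not the authoritative definition):
> 
>   [(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w,w")
>         (neg:<V_cmp_result>
>           (eq:<V_cmp_result>
>             (match_operand:VDQW 1 "s_register_operand" "w,w")
>             (match_operand:VDQW 2 "reg_or_zero_operand" "w,Dz"))))]
> 
> Alternative 1 is the one that accepts a vector of zeros (Dz) for
> operand 2 and emits the vceq.i32 dX, dY, #0 form directly, whereas
> alternative 0 forces the zero into a register first.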

Not sure, but the intrinsics implementation looks correct so let's go ahead with that and improve the codegen later.
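
(For the record, once that is fixed we would expect vceqzq_p64 to give
just the two compares against immediate zero, i.e. something like:

        vceq.i32        d16, d16, #0
        vceq.i32        d17, d17, #0

using the illustrative register numbers from the dump above, with no
vmov.i32 of zero into q9.)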
Thanks,
Kyrill

> Christophe
> 
> 
> > Thanks,
> >
> > Christophe
> >
> >
> > > Thanks,
> > > Kyrill
> > >
> > > > +
> > > > +/* For vceqq_p64, we rely on vceq_p64 for each of the two elements.  */
> > > > +__extension__ extern __inline uint64x2_t
> > > > +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> > > > +vceqq_p64 (poly64x2_t __a, poly64x2_t __b)
> > > > +{
> > > > +  poly64_t __high_a = vget_high_p64 (__a);
> > > > +  poly64_t __high_b = vget_high_p64 (__b);
> > > > +  uint64x1_t __high = vceq_p64(__high_a, __high_b);
> > > > +
> > > > +  poly64_t __low_a = vget_low_p64 (__a);
> > > > +  poly64_t __low_b = vget_low_p64 (__b);
> > > > +  uint64x1_t __low = vceq_p64(__low_a, __low_b);
> > > > +  return vcombine_u64 (__low, __high);
> > > > +}
> > > > +
> > > > +__extension__ extern __inline uint64x2_t
> > > > +__attribute__  ((__always_inline__, __gnu_inline__, __artificial__))
> > > > +vceqzq_p64 (poly64x2_t __a)
> > > > +{
> > > > +  poly64x2_t __b = vreinterpretq_p64_u32 (vdupq_n_u32 (0));
> > > > +  return vceqq_p64 (__a, __b);
> > > > +}
> > > > +
> > > >  /* The vtst_p64 intrinsic does not map to a single instruction.
> > > >     We emulate it in way similar to vceq_p64 above but here we do
> > > >     a reduction with max since if any two corresponding bits
> > > > diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > > > index a3210a9..6aed096 100644
> > > > --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > > > +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/p64_p128.c
> > > > @@ -16,6 +16,11 @@ VECT_VAR_DECL(vbsl_expected,poly,64,2) [] = { 0xfffffff1,
> > > >
> > > >  /* Expected results: vceq.  */
> > > >  VECT_VAR_DECL(vceq_expected,uint,64,1) [] = { 0x0 };
> > > > +VECT_VAR_DECL(vceq_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
> > > > +
> > > > +/* Expected results: vceqz.  */
> > > > +VECT_VAR_DECL(vceqz_expected,uint,64,1) [] = { 0x0 };
> > > > +VECT_VAR_DECL(vceqz_expected,uint,64,2) [] = { 0x0, 0xffffffffffffffff };
> > > >
> > > >  /* Expected results: vcombine.  */
> > > >  VECT_VAR_DECL(vcombine_expected,poly,64,2) [] = { 0xfffffffffffffff0, 0x88 };
> > > > @@ -213,7 +218,7 @@ int main (void)
> > > >
> > > >    /* vceq_p64 tests. */
> > > >  #undef TEST_MSG
> > > > -#define TEST_MSG "VCEQ"
> > > > +#define TEST_MSG "VCEQ/VCEQQ"
> > > >
> > > >  #define TEST_VCOMP1(INSN, Q, T1, T2, T3, W, N)                          \
> > > >    VECT_VAR(vceq_vector_res, T3, W, N) =                                 \
> > > > @@ -227,16 +232,55 @@ int main (void)
> > > >    DECL_VARIABLE(vceq_vector, poly, 64, 1);
> > > >    DECL_VARIABLE(vceq_vector2, poly, 64, 1);
> > > >    DECL_VARIABLE(vceq_vector_res, uint, 64, 1);
> > > > +  DECL_VARIABLE(vceq_vector, poly, 64, 2);
> > > > +  DECL_VARIABLE(vceq_vector2, poly, 64, 2);
> > > > +  DECL_VARIABLE(vceq_vector_res, uint, 64, 2);
> > > >
> > > >    CLEAN(result, uint, 64, 1);
> > > > +  CLEAN(result, uint, 64, 2);
> > > >
> > > >    VLOAD(vceq_vector, buffer, , poly, p, 64, 1);
> > > > +  VLOAD(vceq_vector, buffer, q, poly, p, 64, 2);
> > > >
> > > >    VDUP(vceq_vector2, , poly, p, 64, 1, 0x88);
> > > > +  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 0, 0x88);
> > > > +  VSET_LANE(vceq_vector2, q, poly, p, 64, 2, 1, 0xFFFFFFFFFFFFFFF1);
> > > >
> > > >    TEST_VCOMP(vceq, , poly, p, uint, 64, 1);
> > > > +  TEST_VCOMP(vceq, q, poly, p, uint, 64, 2);
> > > >
> > > >    CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceq_expected, "");
> > > > +  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceq_expected, "");
> > > > +
> > > > +  /* vceqz_p64 tests. */
> > > > +#undef TEST_MSG
> > > > +#define TEST_MSG "VCEQZ/VCEQZQ"
> > > > +
> > > > +#define TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)                         \
> > > > +  VECT_VAR(vceqz_vector_res, T3, W, N) =                             \
> > > > +    INSN##Q##_##T2##W(VECT_VAR(vceqz_vector, T1, W, N));             \
> > > > +  vst1##Q##_u##W(VECT_VAR(result, T3, W, N), VECT_VAR(vceqz_vector_res, T3, W, N))
> > > > +
> > > > +#define TEST_VCOMPZ(INSN, Q, T1, T2, T3, W, N)                          \
> > > > +  TEST_VCOMPZ1(INSN, Q, T1, T2, T3, W, N)
> > > > +
> > > > +  DECL_VARIABLE(vceqz_vector, poly, 64, 1);
> > > > +  DECL_VARIABLE(vceqz_vector_res, uint, 64, 1);
> > > > +  DECL_VARIABLE(vceqz_vector, poly, 64, 2);
> > > > +  DECL_VARIABLE(vceqz_vector_res, uint, 64, 2);
> > > > +
> > > > +  CLEAN(result, uint, 64, 1);
> > > > +  CLEAN(result, uint, 64, 2);
> > > > +
> > > > +  VLOAD(vceqz_vector, buffer, , poly, p, 64, 1);
> > > > +  VLOAD(vceqz_vector, buffer, q, poly, p, 64, 2);
> > > > +  VSET_LANE(vceqz_vector, q, poly, p, 64, 2, 1, 0);
> > > > +
> > > > +  TEST_VCOMPZ(vceqz, , poly, p, uint, 64, 1);
> > > > +  TEST_VCOMPZ(vceqz, q, poly, p, uint, 64, 2);
> > > > +
> > > > +  CHECK(TEST_MSG, uint, 64, 1, PRIx64, vceqz_expected, "");
> > > > +  CHECK(TEST_MSG, uint, 64, 2, PRIx64, vceqz_expected, "");
> > > >
> > > >    /* vcombine_p64 tests.  */
> > > >  #undef TEST_MSG
> > > > --
> > > > 2.7.4
> > >

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2021-01-15 11:15 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-10-15 17:23 [PATCH] arm: Implement vceqq_p64, vceqz_p64 and vceqzq_p64 intrinsics Christophe Lyon
2020-10-15 18:10 ` Andrea Corallo
2020-10-16  8:40   ` Christophe Lyon
2020-10-16  8:41   ` Christophe Lyon
2020-10-23 17:20     ` Christophe Lyon
2020-11-04 20:17       ` Christophe Lyon
2020-11-05  9:36 ` Kyrylo Tkachov
2020-11-05 11:55   ` Christophe Lyon
2020-11-06 15:22     ` Christophe Lyon
2021-01-15 10:47       ` Christophe Lyon
2021-01-15 11:15       ` Kyrylo Tkachov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).