* [PATCH] Add testcases for ffs/ctz vectorization.
@ 2023-04-23 3:02 liuhongt
0 siblings, 0 replies; only message in thread
From: liuhongt @ 2023-04-23 3:02 UTC (permalink / raw)
To: gcc-patches; +Cc: crazylht, hjl.tools
Ready to push to trunk.
gcc/testsuite/ChangeLog:
PR tree-optimization/109011
* gcc.target/i386/pr109011-b1.c: New test.
* gcc.target/i386/pr109011-b2.c: New test.
* gcc.target/i386/pr109011-d1.c: New test.
* gcc.target/i386/pr109011-d2.c: New test.
* gcc.target/i386/pr109011-dq1.c: New test.
* gcc.target/i386/pr109011-dq2.c: New test.
* gcc.target/i386/pr109011-q1.c: New test.
* gcc.target/i386/pr109011-q2.c: New test.
* gcc.target/i386/pr109011-w1.c: New test.
* gcc.target/i386/pr109011-w2.c: New test.
---
gcc/testsuite/gcc.target/i386/pr109011-b1.c | 53 +++++++++
gcc/testsuite/gcc.target/i386/pr109011-b2.c | 104 ++++++++++++++++
gcc/testsuite/gcc.target/i386/pr109011-d1.c | 46 ++++++++
gcc/testsuite/gcc.target/i386/pr109011-d2.c | 118 +++++++++++++++++++
gcc/testsuite/gcc.target/i386/pr109011-dq1.c | 46 ++++++++
gcc/testsuite/gcc.target/i386/pr109011-dq2.c | 104 ++++++++++++++++
gcc/testsuite/gcc.target/i386/pr109011-q1.c | 46 ++++++++
gcc/testsuite/gcc.target/i386/pr109011-q2.c | 118 +++++++++++++++++++
gcc/testsuite/gcc.target/i386/pr109011-w1.c | 47 ++++++++
gcc/testsuite/gcc.target/i386/pr109011-w2.c | 104 ++++++++++++++++
10 files changed, 786 insertions(+)
create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-b1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-b2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-d1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-d2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-dq1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-dq2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-q1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-q2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-w1.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr109011-w2.c
diff --git a/gcc/testsuite/gcc.target/i386/pr109011-b1.c b/gcc/testsuite/gcc.target/i386/pr109011-b1.c
new file mode 100644
index 00000000000..9833d3526f9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109011-b1.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-march=icelake-server -O3" } */
+/* { dg-final { scan-assembler-times "vpopcntb\[ \t\]+" 4 } } */
+/* 4 vplzcntd come from function clzb, the other 4 come from function clzb0. */
+/* { dg-final { scan-assembler-times "vplzcntd\[ \t\]+" 8 } } */
+
+void
+__attribute__((noipa))
+popcntb (unsigned char *p, unsigned char *q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_popcount (q[i]);
+}
+
+void
+__attribute__((noipa))
+clzb (unsigned char *p, unsigned char* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_clz (q[i]);
+}
+
+void
+__attribute__((noipa))
+ffsb (unsigned char *p, unsigned char* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_ffs (q[i]);
+}
+
+void
+__attribute__((noipa))
+ctzb (unsigned char *p, unsigned char* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_ctz (q[i]);
+}
+
+void
+__attribute__((noipa))
+clzb0 (unsigned char *p, unsigned char* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = q[i] ? __builtin_clz (q[i]) : 8;
+}
+
+void
+__attribute__((noipa))
+ctzb0 (unsigned char *p, unsigned char* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = q[i] ? __builtin_ctz (q[i]) : 8;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr109011-b2.c b/gcc/testsuite/gcc.target/i386/pr109011-b2.c
new file mode 100644
index 00000000000..7f2042645d7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109011-b2.c
@@ -0,0 +1,104 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -mbmi -mlzcnt -mavx512vl -mavx512cd -mavx512bitalg -mavx512vpopcntdq -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-require-effective-target avx512cd } */
+/* { dg-require-effective-target avx512bitalg } */
+/* { dg-require-effective-target avx512vpopcntdq } */
+
+#define AVX512F
+#define AVX512VL
+#define AVX512CD
+#define AVX512BITALG
+#define AVX512VPOPCNTDQ
+
+#include "avx512f-helper.h"
+#include "pr109011-b1.c"
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+popcntb_scalar (unsigned char *p, unsigned char *q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_popcount (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzb_scalar (unsigned char *p, unsigned char* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_clz (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ffsb_scalar (unsigned char *p, unsigned char* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_ffs (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzb0_scalar (unsigned char *p, unsigned char* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = q[i] ? __builtin_clz (q[i]) : 8;
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ctzb0_scalar (unsigned char *p, unsigned char* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = q[i] ? __builtin_ctz (q[i]) : 8;
+}
+
+void
+test_256 ()
+{
+ unsigned char src[2048];
+ unsigned char res[2048];
+ unsigned char exp[2048];
+ for (int i = 0; i != 2048; i++)
+ {
+ src[i] = i * i - 1;
+ res[i] = 0;
+ exp[i] = 1;
+ }
+
+ popcntb (&res[0], &src[0]);
+ popcntb_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048) != 0)
+ __builtin_abort ();
+
+ clzb (&res[0], &src[0]);
+ clzb_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048) != 0)
+ __builtin_abort ();
+
+ ffsb (&res[0], &src[0]);
+ ffsb_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048) != 0)
+ __builtin_abort ();
+
+ clzb0 (&res[0], &src[0]);
+ clzb0_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048) != 0)
+ __builtin_abort ();
+
+ ctzb0 (&res[0], &src[0]);
+ ctzb0_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048) != 0)
+ __builtin_abort ();
+}
+
+void
+test_128 ()
+{}
diff --git a/gcc/testsuite/gcc.target/i386/pr109011-d1.c b/gcc/testsuite/gcc.target/i386/pr109011-d1.c
new file mode 100644
index 00000000000..23eb2d57e07
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109011-d1.c
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-options "-march=icelake-server -O3" } */
+/* { dg-final { scan-assembler-times "vpopcntd\[ \t\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vplzcntd\[ \t\]+" 5 } } */
+
+void
+popcntd (unsigned int *p, unsigned int *q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_popcount (q[i]);
+}
+
+void
+clzd (unsigned int *p, unsigned int* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_clz (q[i]);
+}
+
+void
+ffsd (unsigned int *p, unsigned int* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_ffs (q[i]);
+}
+
+void
+ctzd (unsigned int *p, unsigned int* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_ctz (q[i]);
+}
+
+void
+clzd0 (unsigned int *p, unsigned int* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = q[i] ? __builtin_clz (q[i]) : 32;
+}
+
+void
+ctzd0 (unsigned int *p, unsigned int* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = q[i] ? __builtin_ctz (q[i]) : 32;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr109011-d2.c b/gcc/testsuite/gcc.target/i386/pr109011-d2.c
new file mode 100644
index 00000000000..f6fb78d1df0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109011-d2.c
@@ -0,0 +1,118 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -mbmi -mlzcnt -mavx512vl -mavx512cd -mavx512bitalg -mavx512vpopcntdq -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-require-effective-target avx512cd } */
+/* { dg-require-effective-target avx512bitalg } */
+/* { dg-require-effective-target avx512vpopcntdq } */
+
+#define AVX512F
+#define AVX512VL
+#define AVX512CD
+#define AVX512BITALG
+#define AVX512VPOPCNTDQ
+
+#include "avx512f-helper.h"
+#include "pr109011-d1.c"
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+popcntd_scalar (unsigned int *p, unsigned int *q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_popcount (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzd_scalar (unsigned int *p, unsigned int* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_clz (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ffsd_scalar (unsigned int *p, unsigned int* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_ffs (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ctzd_scalar (unsigned int *p, unsigned int* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_ctz (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzd0_scalar (unsigned int *p, unsigned int* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = q[i] ? __builtin_clz (q[i]) : 32;
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ctzd0_scalar (unsigned int *p, unsigned int* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = q[i] ? __builtin_ctz (q[i]) : 32;
+}
+
+void
+test_256 ()
+{
+ unsigned int src[2048];
+ unsigned int res[2048];
+ unsigned int exp[2048];
+ for (int i = 0; i != 2048; i++)
+ {
+ src[i] = i * i - 1;
+ res[i] = 0;
+ exp[i] = 1;
+ }
+
+ popcntd (&res[0], &src[0]);
+ popcntd_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+ __builtin_abort ();
+
+ clzd (&res[0], &src[0]);
+ clzd_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (&res[0], &exp[0], 2048 * 4) != 0)
+ __builtin_abort ();
+
+ ffsd (&res[0], &src[0]);
+ ffsd_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+ __builtin_abort ();
+
+ ctzd (&res[0], &src[0]);
+ ctzd_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+ __builtin_abort ();
+
+ clzd0 (&res[0], &src[0]);
+ clzd0_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+ __builtin_abort ();
+
+ ctzd0 (&res[0], &src[0]);
+ ctzd0_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+ __builtin_abort ();
+}
+
+void
+test_128 ()
+{}
diff --git a/gcc/testsuite/gcc.target/i386/pr109011-dq1.c b/gcc/testsuite/gcc.target/i386/pr109011-dq1.c
new file mode 100644
index 00000000000..876dce01946
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109011-dq1.c
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-options "-march=icelake-server -O3" } */
+/* { dg-final { scan-assembler-times "vpopcntd\[ \t\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vplzcntd\[ \t\]+" 5 } } */
+
+void
+popcntd (unsigned int *p, unsigned int *q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_popcountll (q[i]);
+}
+
+void
+clzd (unsigned int *p, unsigned int* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_clzll (q[i]);
+}
+
+void
+ffsd (unsigned int *p, unsigned int* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_ffsll (q[i]);
+}
+
+void
+ctzd (unsigned int *p, unsigned int* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_ctzll (q[i]);
+}
+
+void
+clzd0 (unsigned int *p, unsigned int* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = q[i] ? __builtin_clzll (q[i]) : 32;
+}
+
+void
+ctzd0 (unsigned int *p, unsigned int* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = q[i] ? __builtin_ctzll (q[i]) : 32;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr109011-dq2.c b/gcc/testsuite/gcc.target/i386/pr109011-dq2.c
new file mode 100644
index 00000000000..ceb6655a6d2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109011-dq2.c
@@ -0,0 +1,104 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -mbmi -mlzcnt -mavx512vl -mavx512cd -mavx512bitalg -mavx512vpopcntdq -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-require-effective-target avx512cd } */
+/* { dg-require-effective-target avx512bitalg } */
+/* { dg-require-effective-target avx512vpopcntdq } */
+
+#define AVX512F
+#define AVX512VL
+#define AVX512CD
+#define AVX512BITALG
+#define AVX512VPOPCNTDQ
+
+#include "avx512f-helper.h"
+#include "pr109011-dq1.c"
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+popcntd_scalar (unsigned int *p, unsigned int *q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_popcountll (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzd_scalar (unsigned int *p, unsigned int* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_clzll (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ffsd_scalar (unsigned int *p, unsigned int* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_ffsll (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzd0_scalar (unsigned int *p, unsigned int* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = q[i] ? __builtin_clzll (q[i]) : 32;
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ctzd0_scalar (unsigned int *p, unsigned int* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = q[i] ? __builtin_ctzll (q[i]) : 32;
+}
+
+void
+test_256 ()
+{
+ unsigned int src[2048];
+ unsigned int res[2048];
+ unsigned int exp[2048];
+ for (int i = 0; i != 2048; i++)
+ {
+ src[i] = i * i - 1;
+ res[i] = 0;
+ exp[i] = 1;
+ }
+
+ popcntd (&res[0], &src[0]);
+ popcntd_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+ __builtin_abort ();
+
+ clzd (&res[0], &src[0]);
+ clzd_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (&res[0], &exp[0], 2048 * 4) != 0)
+ __builtin_abort ();
+
+ ffsd (&res[0], &src[0]);
+ ffsd_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+ __builtin_abort ();
+
+ clzd0 (&res[0], &src[0]);
+ clzd0_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+ __builtin_abort ();
+
+ ctzd0 (&res[0], &src[0]);
+ ctzd0_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 4) != 0)
+ __builtin_abort ();
+}
+
+void
+test_128 ()
+{}
diff --git a/gcc/testsuite/gcc.target/i386/pr109011-q1.c b/gcc/testsuite/gcc.target/i386/pr109011-q1.c
new file mode 100644
index 00000000000..237381c796a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109011-q1.c
@@ -0,0 +1,46 @@
+/* { dg-do compile } */
+/* { dg-options "-march=icelake-server -O3" } */
+/* { dg-final { scan-assembler-times "vpopcntq\[ \t\]+" 1 } } */
+/* { dg-final { scan-assembler-times "vplzcntq\[ \t\]+" 5 } } */
+
+void
+popcntq (unsigned long long *p, unsigned long long *q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_popcountll (q[i]);
+}
+
+void
+clzq (unsigned long long *p, unsigned long long* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_clzll (q[i]);
+}
+
+void
+ffsq (unsigned long long *p, unsigned long long* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_ffsll (q[i]);
+}
+
+void
+ctzq (unsigned long long *p, unsigned long long* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_ctzll (q[i]);
+}
+
+void
+clzq0 (unsigned long long *p, unsigned long long* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = q[i] ? __builtin_clzll (q[i]) : 64;
+}
+
+void
+ctzq0 (unsigned long long *p, unsigned long long* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = q[i] ? __builtin_ctzll (q[i]) : 64;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr109011-q2.c b/gcc/testsuite/gcc.target/i386/pr109011-q2.c
new file mode 100644
index 00000000000..6f9654f0ef8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109011-q2.c
@@ -0,0 +1,118 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -mbmi -mlzcnt -mavx512vl -mavx512cd -mavx512bitalg -mavx512vpopcntdq -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-require-effective-target avx512cd } */
+/* { dg-require-effective-target avx512bitalg } */
+/* { dg-require-effective-target avx512vpopcntdq } */
+
+#define AVX512F
+#define AVX512VL
+#define AVX512CD
+#define AVX512BITALG
+#define AVX512VPOPCNTDQ
+
+#include "avx512f-helper.h"
+#include "pr109011-q1.c"
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+popcntq_scalar (unsigned long long *p, unsigned long long *q)
+{
+ for (unsigned long long i = 0; i < 2048; ++i)
+ p[i] = __builtin_popcountll (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzq_scalar (unsigned long long *p, unsigned long long* __restrict q)
+{
+ for (unsigned long long i = 0; i < 2048; ++i)
+ p[i] = __builtin_clzll (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ffsq_scalar (unsigned long long *p, unsigned long long* __restrict q)
+{
+ for (unsigned long long i = 0; i < 2048; ++i)
+ p[i] = __builtin_ffsll (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ctzq_scalar (unsigned long long *p, unsigned long long* __restrict q)
+{
+ for (unsigned long long i = 0; i < 2048; ++i)
+ p[i] = __builtin_ctzll (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzq0_scalar (unsigned long long *p, unsigned long long* __restrict q)
+{
+ for (unsigned long long i = 0; i < 2048; ++i)
+ p[i] = q[i] ? __builtin_clzll (q[i]) : 64;
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ctzq0_scalar (unsigned long long *p, unsigned long long* __restrict q)
+{
+ for (unsigned long long i = 0; i < 2048; ++i)
+ p[i] = q[i] ? __builtin_ctzll (q[i]) : 64;
+}
+
+void
+test_256 ()
+{
+ unsigned long long src[2048];
+ unsigned long long res[2048];
+ unsigned long long exp[2048];
+ for (unsigned long long i = 0; i != 2048ULL; i++)
+ {
+ src[i] = i * i - 1ULL;
+ res[i] = 0;
+ exp[i] = 1;
+ }
+
+ popcntq (&res[0], &src[0]);
+ popcntq_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 8) != 0)
+ __builtin_abort ();
+
+ clzq (&res[0], &src[0]);
+ clzq_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 8) != 0)
+ __builtin_abort ();
+
+ ffsq (&res[0], &src[0]);
+ ffsq_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 8) != 0)
+ __builtin_abort ();
+
+ ctzq (&res[0], &src[0]);
+ ctzq_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 8) != 0)
+ __builtin_abort ();
+
+ clzq0 (&res[0], &src[0]);
+ clzq0_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 8) != 0)
+ __builtin_abort ();
+
+ ctzq0 (&res[0], &src[0]);
+ ctzq0_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 8) != 0)
+ __builtin_abort ();
+}
+
+void
+test_128 ()
+{}
diff --git a/gcc/testsuite/gcc.target/i386/pr109011-w1.c b/gcc/testsuite/gcc.target/i386/pr109011-w1.c
new file mode 100644
index 00000000000..f6045abe8ac
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109011-w1.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-march=icelake-server -O3" } */
+/* { dg-final { scan-assembler-times "vpopcntw\[ \t\]+" 4 } } */
+/* 2 vplzcntd come from function clzw, the other 2 come from function clzw0. */
+/* { dg-final { scan-assembler-times "vplzcntd\[ \t\]+" 4 } } */
+
+void
+popcntw (unsigned short *p, unsigned short *q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_popcount (q[i]);
+}
+
+void
+clzw (unsigned short *p, unsigned short* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_clz (q[i]);
+}
+
+void
+ffsw (unsigned short *p, unsigned short* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_ffs (q[i]);
+}
+
+void
+ctzw (unsigned short *p, unsigned short* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_ctz (q[i]);
+}
+
+void
+clzw0 (unsigned short *p, unsigned short* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = q[i] ? __builtin_clz (q[i]) : 16;
+}
+
+void
+ctzw0 (unsigned short *p, unsigned short* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = q[i] ? __builtin_ctz (q[i]) : 16;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr109011-w2.c b/gcc/testsuite/gcc.target/i386/pr109011-w2.c
new file mode 100644
index 00000000000..15dd338eefa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109011-w2.c
@@ -0,0 +1,104 @@
+/* { dg-do run } */
+/* { dg-options "-O3 -mbmi -mlzcnt -mavx512vl -mavx512cd -mavx512bitalg -mavx512vpopcntdq -mprefer-vector-width=256" } */
+/* { dg-require-effective-target avx512f } */
+/* { dg-require-effective-target avx512vl } */
+/* { dg-require-effective-target avx512cd } */
+/* { dg-require-effective-target avx512bitalg } */
+/* { dg-require-effective-target avx512vpopcntdq } */
+
+#define AVX512F
+#define AVX512VL
+#define AVX512CD
+#define AVX512BITALG
+#define AVX512VPOPCNTDQ
+
+#include "avx512f-helper.h"
+#include "pr109011-w1.c"
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+popcntw_scalar (unsigned short *p, unsigned short *q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_popcount (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzw_scalar (unsigned short *p, unsigned short* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_clz (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ffsw_scalar (unsigned short *p, unsigned short* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = __builtin_ffs (q[i]);
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+clzw0_scalar (unsigned short *p, unsigned short* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = q[i] ? __builtin_clz (q[i]) : 16;
+}
+
+void
+__attribute__((noipa, optimize ("no-tree-vectorize")))
+ctzw0_scalar (unsigned short *p, unsigned short* __restrict q)
+{
+ for (unsigned int i = 0; i < 2048; ++i)
+ p[i] = q[i] ? __builtin_ctz (q[i]) : 16;
+}
+
+void
+test_256 ()
+{
+ unsigned short src[2048];
+ unsigned short res[2048];
+ unsigned short exp[2048];
+ for (int i = 0; i != 2048; i++)
+ {
+ src[i] = i * i - 1;
+ res[i] = 0;
+ exp[i] = 1;
+ }
+
+ popcntw (&res[0], &src[0]);
+ popcntw_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 2) != 0)
+ __builtin_abort ();
+
+ clzw (&res[0], &src[0]);
+ clzw_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 2) != 0)
+ __builtin_abort ();
+
+ ffsw (&res[0], &src[0]);
+ ffsw_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 2) != 0)
+ __builtin_abort ();
+
+ clzw0 (&res[0], &src[0]);
+ clzw0_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 2) != 0)
+ __builtin_abort ();
+
+ ctzw0 (&res[0], &src[0]);
+ ctzw0_scalar (&exp[0], &src[0]);
+
+ if (__builtin_memcmp (res, exp, 2048 * 2) != 0)
+ __builtin_abort ();
+}
+
+void
+test_128 ()
+{}
--
2.39.1.388.g2fc9e9ca3c
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2023-04-23 3:05 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-04-23 3:02 [PATCH] Add testcases for ffs/ctz vectorization liuhongt
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).