From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 2078) id 5164B3858D32; Mon, 24 Apr 2023 01:35:07 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 5164B3858D32 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1682300107; bh=sImPN3WZMMxpJSa349TTIr3/6v606UmF1nB32LyUUm4=; h=From:To:Subject:Date:From; b=WcQ5BFrb0zDHBMlS0mQ6rA9DXOTeOzeFpHkS7PeSKBSsWP0+uNZmG6bU04vgRihjE i3Fzqwi2lrRT+/ZcGePyZHJUEwPSON2IcpAtMsZ7fe3l93iluJc/ZHptlDpKdGbuOS 4EVdJOiYSi15acC45PxcY0tsAFZEx0BONeFxKgLk= MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset="utf-8" From: hongtao Liu To: gcc-cvs@gcc.gnu.org Subject: [gcc r14-182] Add testcases for ffs/ctz vectorization. X-Act-Checkin: gcc X-Git-Author: liuhongt X-Git-Refname: refs/heads/master X-Git-Oldrev: 6c06b9e4c3161a48a9ea0e406421cc21599ed3e7 X-Git-Newrev: 8311c26757657fe8ffa28ca1539d02d141bb8292 Message-Id: <20230424013507.5164B3858D32@sourceware.org> Date: Mon, 24 Apr 2023 01:35:07 +0000 (GMT) List-Id: https://gcc.gnu.org/g:8311c26757657fe8ffa28ca1539d02d141bb8292 commit r14-182-g8311c26757657fe8ffa28ca1539d02d141bb8292 Author: liuhongt Date: Wed Mar 15 13:41:06 2023 +0800 Add testcases for ffs/ctz vectorization. gcc/testsuite/ChangeLog: PR tree-optimization/109011 * gcc.target/i386/pr109011-b1.c: New test. * gcc.target/i386/pr109011-b2.c: New test. * gcc.target/i386/pr109011-d1.c: New test. * gcc.target/i386/pr109011-d2.c: New test. * gcc.target/i386/pr109011-q1.c: New test. * gcc.target/i386/pr109011-q2.c: New test. * gcc.target/i386/pr109011-w1.c: New test. * gcc.target/i386/pr109011-w2.c: New test. 
Diff: --- gcc/testsuite/gcc.target/i386/pr109011-b1.c | 53 ++++++++++++ gcc/testsuite/gcc.target/i386/pr109011-b2.c | 104 +++++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr109011-d1.c | 46 +++++++++++ gcc/testsuite/gcc.target/i386/pr109011-d2.c | 118 +++++++++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr109011-dq1.c | 46 +++++++++++ gcc/testsuite/gcc.target/i386/pr109011-dq2.c | 104 +++++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr109011-q1.c | 46 +++++++++++ gcc/testsuite/gcc.target/i386/pr109011-q2.c | 118 +++++++++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr109011-w1.c | 47 +++++++++++ gcc/testsuite/gcc.target/i386/pr109011-w2.c | 104 +++++++++++++++++++++++ 10 files changed, 786 insertions(+) diff --git a/gcc/testsuite/gcc.target/i386/pr109011-b1.c b/gcc/testsuite/gcc.target/i386/pr109011-b1.c new file mode 100644 index 00000000000..9833d3526f9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr109011-b1.c @@ -0,0 +1,53 @@ +/* { dg-do compile } */ +/* { dg-options "-march=icelake-server -O3" } */ +/* { dg-final { scan-assembler-times "vpopcntb\[ \t\]+" 4 } } */ +/* 4 vplzcntd come from function clzb, the other 4 come from function clzb0. 
*/ +/* { dg-final { scan-assembler-times "vplzcntd\[ \t\]+" 8 } } */ + +void +__attribute__((noipa)) +popcntb (unsigned char *p, unsigned char *q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_popcount (q[i]); +} + +void +__attribute__((noipa)) +clzb (unsigned char *p, unsigned char* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_clz (q[i]); +} + +void +__attribute__((noipa)) +ffsb (unsigned char *p, unsigned char* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_ffs (q[i]); +} + +void +__attribute__((noipa)) +ctzb (unsigned char *p, unsigned char* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_ctz (q[i]); +} + +void +__attribute__((noipa)) +clzb0 (unsigned char *p, unsigned char* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = q[i] ? __builtin_clz (q[i]) : 8; +} + +void +__attribute__((noipa)) +ctzb0 (unsigned char *p, unsigned char* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = q[i] ? 
__builtin_ctz (q[i]) : 8; +} diff --git a/gcc/testsuite/gcc.target/i386/pr109011-b2.c b/gcc/testsuite/gcc.target/i386/pr109011-b2.c new file mode 100644 index 00000000000..7f2042645d7 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr109011-b2.c @@ -0,0 +1,104 @@ +/* { dg-do run } */ +/* { dg-options "-O3 -mbmi -mlzcnt -mavx512vl -mavx512cd -mavx512bitalg -mavx512vpopcntdq -mprefer-vector-width=256" } */ +/* { dg-require-effective-target avx512f } */ +/* { dg-require-effective-target avx512vl } */ +/* { dg-require-effective-target avx512cd } */ +/* { dg-require-effective-target avx512bitalg } */ +/* { dg-require-effective-target avx512vpopcntdq } */ + +#define AVX512F +#define AVX512VL +#define AVX512CD +#define AVX512BITALG +#define AVX512VPOPCNTDQ + +#include "avx512f-helper.h" +#include "pr109011-b1.c" + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +popcntb_scalar (unsigned char *p, unsigned char *q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_popcount (q[i]); +} + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +clzb_scalar (unsigned char *p, unsigned char* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_clz (q[i]); +} + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +ffsb_scalar (unsigned char *p, unsigned char* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_ffs (q[i]); +} + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +clzb0_scalar (unsigned char *p, unsigned char* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = q[i] ? __builtin_clz (q[i]) : 8; +} + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +ctzb0_scalar (unsigned char *p, unsigned char* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = q[i] ? 
__builtin_ctz (q[i]) : 8; +} + +void +test_256 () +{ + unsigned char src[2048]; + unsigned char res[2048]; + unsigned char exp[2048]; + for (int i = 0; i != 2048; i++) + { + src[i] = i * i - 1; + res[i] = 0; + exp[i] = 1; + } + + popcntb (&res[0], &src[0]); + popcntb_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048) != 0) + __builtin_abort (); + + clzb (&res[0], &src[0]); + clzb_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048) != 0) + __builtin_abort (); + + ffsb (&res[0], &src[0]); + ffsb_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048) != 0) + __builtin_abort (); + + clzb0 (&res[0], &src[0]); + clzb0_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048) != 0) + __builtin_abort (); + + ctzb0 (&res[0], &src[0]); + ctzb0_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048) != 0) + __builtin_abort (); +} + +void +test_128 () +{} diff --git a/gcc/testsuite/gcc.target/i386/pr109011-d1.c b/gcc/testsuite/gcc.target/i386/pr109011-d1.c new file mode 100644 index 00000000000..23eb2d57e07 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr109011-d1.c @@ -0,0 +1,46 @@ +/* { dg-do compile } */ +/* { dg-options "-march=icelake-server -O3" } */ +/* { dg-final { scan-assembler-times "vpopcntd\[ \t\]+" 1 } } */ +/* { dg-final { scan-assembler-times "vplzcntd\[ \t\]+" 5 } } */ + +void +popcntd (unsigned int *p, unsigned int *q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_popcount (q[i]); +} + +void +clzd (unsigned int *p, unsigned int* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_clz (q[i]); +} + +void +ffsd (unsigned int *p, unsigned int* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_ffs (q[i]); +} + +void +ctzd (unsigned int *p, unsigned int* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_ctz (q[i]); +} + +void +clzd0 (unsigned int *p, unsigned int* __restrict q) +{ + for 
(unsigned int i = 0; i < 2048; ++i) + p[i] = q[i] ? __builtin_clz (q[i]) : 32; +} + +void +ctzd0 (unsigned int *p, unsigned int* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = q[i] ? __builtin_ctz (q[i]) : 32; +} diff --git a/gcc/testsuite/gcc.target/i386/pr109011-d2.c b/gcc/testsuite/gcc.target/i386/pr109011-d2.c new file mode 100644 index 00000000000..f6fb78d1df0 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr109011-d2.c @@ -0,0 +1,118 @@ +/* { dg-do run } */ +/* { dg-options "-O3 -mbmi -mlzcnt -mavx512vl -mavx512cd -mavx512bitalg -mavx512vpopcntdq -mprefer-vector-width=256" } */ +/* { dg-require-effective-target avx512f } */ +/* { dg-require-effective-target avx512vl } */ +/* { dg-require-effective-target avx512cd } */ +/* { dg-require-effective-target avx512bitalg } */ +/* { dg-require-effective-target avx512vpopcntdq } */ + +#define AVX512F +#define AVX512VL +#define AVX512CD +#define AVX512BITALG +#define AVX512VPOPCNTDQ + +#include "avx512f-helper.h" +#include "pr109011-d1.c" + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +popcntd_scalar (unsigned int *p, unsigned int *q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_popcount (q[i]); +} + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +clzd_scalar (unsigned int *p, unsigned int* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_clz (q[i]); +} + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +ffsd_scalar (unsigned int *p, unsigned int* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_ffs (q[i]); +} + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +ctzd_scalar (unsigned int *p, unsigned int* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_ctz (q[i]); +} + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +clzd0_scalar (unsigned int *p, unsigned int* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + 
p[i] = q[i] ? __builtin_clz (q[i]) : 32; +} + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +ctzd0_scalar (unsigned int *p, unsigned int* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = q[i] ? __builtin_ctz (q[i]) : 32; +} + +void +test_256 () +{ + unsigned int src[2048]; + unsigned int res[2048]; + unsigned int exp[2048]; + for (int i = 0; i != 2048; i++) + { + src[i] = i * i - 1; + res[i] = 0; + exp[i] = 1; + } + + popcntd (&res[0], &src[0]); + popcntd_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048 * 4) != 0) + __builtin_abort (); + + clzd (&res[0], &src[0]); + clzd_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (&res[0], &exp[0], 2048 * 4) != 0) + __builtin_abort (); + + ffsd (&res[0], &src[0]); + ffsd_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048 * 4) != 0) + __builtin_abort (); + + ctzd (&res[0], &src[0]); + ctzd_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048 * 4) != 0) + __builtin_abort (); + + clzd0 (&res[0], &src[0]); + clzd0_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048 * 4) != 0) + __builtin_abort (); + + ctzd0 (&res[0], &src[0]); + ctzd0_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048 * 4) != 0) + __builtin_abort (); +} + +void +test_128 () +{} diff --git a/gcc/testsuite/gcc.target/i386/pr109011-dq1.c b/gcc/testsuite/gcc.target/i386/pr109011-dq1.c new file mode 100644 index 00000000000..876dce01946 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr109011-dq1.c @@ -0,0 +1,46 @@ +/* { dg-do compile } */ +/* { dg-options "-march=icelake-server -O3" } */ +/* { dg-final { scan-assembler-times "vpopcntd\[ \t\]+" 1 } } */ +/* { dg-final { scan-assembler-times "vplzcntd\[ \t\]+" 5 } } */ + +void +popcntd (unsigned int *p, unsigned int *q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_popcountll (q[i]); +} + +void +clzd (unsigned int *p, unsigned int* __restrict q) +{ + for (unsigned int i = 0; i < 
2048; ++i) + p[i] = __builtin_clzll (q[i]); +} + +void +ffsd (unsigned int *p, unsigned int* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_ffsll (q[i]); +} + +void +ctzd (unsigned int *p, unsigned int* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_ctzll (q[i]); +} + +void +clzd0 (unsigned int *p, unsigned int* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = q[i] ? __builtin_clzll (q[i]) : 32; +} + +void +ctzd0 (unsigned int *p, unsigned int* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = q[i] ? __builtin_ctzll (q[i]) : 32; +} diff --git a/gcc/testsuite/gcc.target/i386/pr109011-dq2.c b/gcc/testsuite/gcc.target/i386/pr109011-dq2.c new file mode 100644 index 00000000000..ceb6655a6d2 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr109011-dq2.c @@ -0,0 +1,104 @@ +/* { dg-do run } */ +/* { dg-options "-O3 -mbmi -mlzcnt -mavx512vl -mavx512cd -mavx512bitalg -mavx512vpopcntdq -mprefer-vector-width=256" } */ +/* { dg-require-effective-target avx512f } */ +/* { dg-require-effective-target avx512vl } */ +/* { dg-require-effective-target avx512cd } */ +/* { dg-require-effective-target avx512bitalg } */ +/* { dg-require-effective-target avx512vpopcntdq } */ + +#define AVX512F +#define AVX512VL +#define AVX512CD +#define AVX512BITALG +#define AVX512VPOPCNTDQ + +#include "avx512f-helper.h" +#include "pr109011-dq1.c" + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +popcntd_scalar (unsigned int *p, unsigned int *q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_popcountll (q[i]); +} + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +clzd_scalar (unsigned int *p, unsigned int* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_clzll (q[i]); +} + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +ffsd_scalar (unsigned int *p, unsigned int* __restrict q) +{ + for (unsigned int i = 0; i < 2048; 
++i) + p[i] = __builtin_ffsll (q[i]); +} + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +clzd0_scalar (unsigned int *p, unsigned int* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = q[i] ? __builtin_clzll (q[i]) : 32; +} + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +ctzd0_scalar (unsigned int *p, unsigned int* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = q[i] ? __builtin_ctzll (q[i]) : 32; +} + +void +test_256 () +{ + unsigned int src[2048]; + unsigned int res[2048]; + unsigned int exp[2048]; + for (int i = 0; i != 2048; i++) + { + src[i] = i * i - 1; + res[i] = 0; + exp[i] = 1; + } + + popcntd (&res[0], &src[0]); + popcntd_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048 * 4) != 0) + __builtin_abort (); + + clzd (&res[0], &src[0]); + clzd_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (&res[0], &exp[0], 2048 * 4) != 0) + __builtin_abort (); + + ffsd (&res[0], &src[0]); + ffsd_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048 * 4) != 0) + __builtin_abort (); + + clzd0 (&res[0], &src[0]); + clzd0_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048 * 4) != 0) + __builtin_abort (); + + ctzd0 (&res[0], &src[0]); + ctzd0_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048 * 4) != 0) + __builtin_abort (); +} + +void +test_128 () +{} diff --git a/gcc/testsuite/gcc.target/i386/pr109011-q1.c b/gcc/testsuite/gcc.target/i386/pr109011-q1.c new file mode 100644 index 00000000000..237381c796a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr109011-q1.c @@ -0,0 +1,46 @@ +/* { dg-do compile } */ +/* { dg-options "-march=icelake-server -O3" } */ +/* { dg-final { scan-assembler-times "vpopcntq\[ \t\]+" 1 } } */ +/* { dg-final { scan-assembler-times "vplzcntq\[ \t\]+" 5 } } */ + +void +popcntq (unsigned long long *p, unsigned long long *q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_popcountll (q[i]); +} + 
+void +clzq (unsigned long long *p, unsigned long long* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_clzll (q[i]); +} + +void +ffsq (unsigned long long *p, unsigned long long* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_ffsll (q[i]); +} + +void +ctzq (unsigned long long *p, unsigned long long* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_ctzll (q[i]); +} + +void +clzq0 (unsigned long long *p, unsigned long long* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = q[i] ? __builtin_clzll (q[i]) : 64; +} + +void +ctzq0 (unsigned long long *p, unsigned long long* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = q[i] ? __builtin_ctzll (q[i]) : 64; +} diff --git a/gcc/testsuite/gcc.target/i386/pr109011-q2.c b/gcc/testsuite/gcc.target/i386/pr109011-q2.c new file mode 100644 index 00000000000..6f9654f0ef8 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr109011-q2.c @@ -0,0 +1,118 @@ +/* { dg-do run } */ +/* { dg-options "-O3 -mbmi -mlzcnt -mavx512vl -mavx512cd -mavx512bitalg -mavx512vpopcntdq -mprefer-vector-width=256" } */ +/* { dg-require-effective-target avx512f } */ +/* { dg-require-effective-target avx512vl } */ +/* { dg-require-effective-target avx512cd } */ +/* { dg-require-effective-target avx512bitalg } */ +/* { dg-require-effective-target avx512vpopcntdq } */ + +#define AVX512F +#define AVX512VL +#define AVX512CD +#define AVX512BITALG +#define AVX512VPOPCNTDQ + +#include "avx512f-helper.h" +#include "pr109011-q1.c" + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +popcntq_scalar (unsigned long long *p, unsigned long long *q) +{ + for (unsigned long long i = 0; i < 2048; ++i) + p[i] = __builtin_popcountll (q[i]); +} + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +clzq_scalar (unsigned long long *p, unsigned long long* __restrict q) +{ + for (unsigned long long i = 0; i < 2048; ++i) + p[i] = 
__builtin_clzll (q[i]); +} + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +ffsq_scalar (unsigned long long *p, unsigned long long* __restrict q) +{ + for (unsigned long long i = 0; i < 2048; ++i) + p[i] = __builtin_ffsll (q[i]); +} + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +ctzq_scalar (unsigned long long *p, unsigned long long* __restrict q) +{ + for (unsigned long long i = 0; i < 2048; ++i) + p[i] = __builtin_ctzll (q[i]); +} + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +clzq0_scalar (unsigned long long *p, unsigned long long* __restrict q) +{ + for (unsigned long long i = 0; i < 2048; ++i) + p[i] = q[i] ? __builtin_clzll (q[i]) : 64; +} + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +ctzq0_scalar (unsigned long long *p, unsigned long long* __restrict q) +{ + for (unsigned long long i = 0; i < 2048; ++i) + p[i] = q[i] ? __builtin_ctzll (q[i]) : 64; +} + +void +test_256 () +{ + unsigned long long src[2048]; + unsigned long long res[2048]; + unsigned long long exp[2048]; + for (unsigned long long i = 0; i != 2048ULL; i++) + { + src[i] = i * i - 1ULL; + res[i] = 0; + exp[i] = 1; + } + + popcntq (&res[0], &src[0]); + popcntq_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048 * 8) != 0) + __builtin_abort (); + + clzq (&res[0], &src[0]); + clzq_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048 * 8) != 0) + __builtin_abort (); + + ffsq (&res[0], &src[0]); + ffsq_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048 * 8) != 0) + __builtin_abort (); + + ctzq (&res[0], &src[0]); + ctzq_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048 * 8) != 0) + __builtin_abort (); + + clzq0 (&res[0], &src[0]); + clzq0_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048 * 8) != 0) + __builtin_abort (); + + ctzq0 (&res[0], &src[0]); + ctzq0_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048 * 8) != 0) + 
__builtin_abort (); +} + +void +test_128 () +{} diff --git a/gcc/testsuite/gcc.target/i386/pr109011-w1.c b/gcc/testsuite/gcc.target/i386/pr109011-w1.c new file mode 100644 index 00000000000..f6045abe8ac --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr109011-w1.c @@ -0,0 +1,47 @@ +/* { dg-do compile } */ +/* { dg-options "-march=icelake-server -O3" } */ +/* { dg-final { scan-assembler-times "vpopcntw\[ \t\]+" 4 } } */ +/* 2 vplzcntd come from function clzw, the other 2 come from function clzw0. */ +/* { dg-final { scan-assembler-times "vplzcntd\[ \t\]+" 4 } } */ + +void +popcntw (unsigned short *p, unsigned short *q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_popcount (q[i]); +} + +void +clzw (unsigned short *p, unsigned short* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_clz (q[i]); +} + +void +ffsw (unsigned short *p, unsigned short* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_ffs (q[i]); +} + +void +ctzw (unsigned short *p, unsigned short* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_ctz (q[i]); +} + +void +clzw0 (unsigned short *p, unsigned short* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = q[i] ? __builtin_clz (q[i]) : 16; +} + +void +ctzw0 (unsigned short *p, unsigned short* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = q[i] ? 
__builtin_ctz (q[i]) : 16; +} diff --git a/gcc/testsuite/gcc.target/i386/pr109011-w2.c b/gcc/testsuite/gcc.target/i386/pr109011-w2.c new file mode 100644 index 00000000000..15dd338eefa --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr109011-w2.c @@ -0,0 +1,104 @@ +/* { dg-do run } */ +/* { dg-options "-O3 -mbmi -mlzcnt -mavx512vl -mavx512cd -mavx512bitalg -mavx512vpopcntdq -mprefer-vector-width=256" } */ +/* { dg-require-effective-target avx512f } */ +/* { dg-require-effective-target avx512vl } */ +/* { dg-require-effective-target avx512cd } */ +/* { dg-require-effective-target avx512bitalg } */ +/* { dg-require-effective-target avx512vpopcntdq } */ + +#define AVX512F +#define AVX512VL +#define AVX512CD +#define AVX512BITALG +#define AVX512VPOPCNTDQ + +#include "avx512f-helper.h" +#include "pr109011-w1.c" + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +popcntw_scalar (unsigned short *p, unsigned short *q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_popcount (q[i]); +} + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +clzw_scalar (unsigned short *p, unsigned short* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_clz (q[i]); +} + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +ffsw_scalar (unsigned short *p, unsigned short* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = __builtin_ffs (q[i]); +} + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +clzw0_scalar (unsigned short *p, unsigned short* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = q[i] ? __builtin_clz (q[i]) : 16; +} + +void +__attribute__((noipa, optimize ("no-tree-vectorize"))) +ctzw0_scalar (unsigned short *p, unsigned short* __restrict q) +{ + for (unsigned int i = 0; i < 2048; ++i) + p[i] = q[i] ? 
__builtin_ctz (q[i]) : 16; +} + +void +test_256 () +{ + unsigned short src[2048]; + unsigned short res[2048]; + unsigned short exp[2048]; + for (int i = 0; i != 2048; i++) + { + src[i] = i * i - 1; + res[i] = 0; + exp[i] = 1; + } + + popcntw (&res[0], &src[0]); + popcntw_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048 * 2) != 0) + __builtin_abort (); + + clzw (&res[0], &src[0]); + clzw_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048 * 2) != 0) + __builtin_abort (); + + ffsw (&res[0], &src[0]); + ffsw_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048 * 2) != 0) + __builtin_abort (); + + clzw0 (&res[0], &src[0]); + clzw0_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048 * 2) != 0) + __builtin_abort (); + + ctzw0 (&res[0], &src[0]); + ctzw0_scalar (&exp[0], &src[0]); + + if (__builtin_memcmp (res, exp, 2048 * 2) != 0) + __builtin_abort (); +} + +void +test_128 () +{}