From: Robin Dapp <rdapp.gcc@gmail.com>
To: gcc-patches <gcc-patches@gcc.gnu.org>
Cc: rdapp.gcc@gmail.com
Subject: [PATCH] vect: Add a popcount fallback.
Date: Mon, 7 Aug 2023 22:20:07 +0200 [thread overview]
Message-ID: <4d0d53a0-20d2-5b98-c4f9-67b624a27269@gmail.com> (raw)
Hi,
This patch adds a fallback when the backend does not provide a popcount
implementation. The algorithm is the same one libgcc uses, as well as
match.pd for recognizing a popcount idiom. __builtin_ctz and __builtin_ffs
can also rely on popcount so I used the fallback for them as well.
Bootstrapped and regtested on x86, aarch64 and power10. Unfortunately
I don't have access to any architecture other than riscv that vectorizes
but does not have a vectorized popcount. I added all vect_int targets
to the selector where a cursory grep "expand.*popcount" would yield no
result.
Regards
Robin
gcc/ChangeLog:
* tree-vect-patterns.cc (vect_have_popcount_fallback): New
function.
(vect_generate_popcount_fallback): New function to emit
vectorized popcount fallback.
(vect_recog_ctz_ffs_pattern): Use fallback.
(vect_recog_popcount_clz_ctz_ffs_pattern): Ditto.
gcc/testsuite/ChangeLog:
* gcc.dg/vect/vect-popcount-fallback.c: New test.
---
.../gcc.dg/vect/vect-popcount-fallback.c | 106 +++++++++++
gcc/tree-vect-patterns.cc | 172 ++++++++++++++++--
2 files changed, 267 insertions(+), 11 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c
diff --git a/gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c b/gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c
new file mode 100644
index 00000000000..f6300f4ab35
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-popcount-fallback.c
@@ -0,0 +1,106 @@
+/* Check if we vectorize popcount when no expander is available. */
+/* { dg-do run { target { amdgcn-*-* sparc*-*-* alpha*-*-* ia64-*-* mips*-*-* riscv*-*-* } } } */
+/* { dg-additional-options { -O2 -fdump-tree-vect-details -fno-vect-cost-model } } */
+/* { dg-require-effective-target vect_int } */
+
+#include <stdlib.h>
+#include <assert.h>
+#include <stdint-gcc.h>
+
+__attribute__ ((noipa))
+void popc32 (uint32_t *restrict dst, uint32_t *restrict a, int n)
+{
+ for (int i = 0; i < n; i++)
+ dst[i] = __builtin_popcount (a[i]);
+}
+
+__attribute__ ((noipa))
+void ctz32 (uint32_t *restrict dst, uint32_t *restrict a, int n)
+{
+ for (int i = 0; i < n; i++)
+ dst[i] = __builtin_ctz (a[i]);
+}
+
+__attribute__ ((noipa))
+void ffs32 (uint32_t *restrict dst, uint32_t *restrict a, int n)
+{
+ for (int i = 0; i < n; i++)
+ dst[i] = __builtin_ffs (a[i]);
+}
+
+__attribute__ ((noipa))
+void popc64 (uint64_t *restrict dst, uint64_t *restrict a, int n)
+{
+ for (int i = 0; i < n; i++)
+ dst[i] = __builtin_popcountll (a[i]);
+}
+
+__attribute__ ((noipa))
+void ctz64 (uint64_t *restrict dst, uint64_t *restrict a, int n)
+{
+ for (int i = 0; i < n; i++)
+ dst[i] = __builtin_ctzll (a[i]);
+}
+
+__attribute__ ((noipa))
+void ffs64 (uint64_t *restrict dst, uint64_t *restrict a, int n)
+{
+ for (int i = 0; i < n; i++)
+ dst[i] = __builtin_ffsll (a[i]);
+}
+
+#define SZ 512
+
+__attribute__ ((optimize ("0")))
+int main ()
+{
+ uint32_t *a32pc = malloc (SZ * sizeof (*a32pc));
+ uint32_t *b32pc = malloc (SZ * sizeof (*b32pc));
+ uint32_t *a32ctz = malloc (SZ * sizeof (*a32ctz));
+ uint32_t *b32ctz = malloc (SZ * sizeof (*b32ctz));
+ uint32_t *a32ffs = malloc (SZ * sizeof (*a32ffs));
+ uint32_t *b32ffs = malloc (SZ * sizeof (*b32ffs));
+
+ uint64_t *a64pc = malloc (SZ * sizeof (*a64pc));
+ uint64_t *b64pc = malloc (SZ * sizeof (*b64pc));
+ uint64_t *a64ctz = malloc (SZ * sizeof (*a64ctz));
+ uint64_t *b64ctz = malloc (SZ * sizeof (*b64ctz));
+ uint64_t *a64ffs = malloc (SZ * sizeof (*a64ffs));
+ uint64_t *b64ffs = malloc (SZ * sizeof (*b64ffs));
+
+ for (int i = 0; i < SZ; i++)
+ {
+ int ia = i + 1;
+ a32pc[i] = ia * 1234567;
+ b32pc[i] = 0;
+ a32ctz[i] = ia * 1234567;
+ b32ctz[i] = 0;
+ a32ffs[i] = ia * 1234567;
+ b32ffs[i] = 0;
+ a64pc[i] = ia * 123456789ull;
+ b64pc[i] = 0;
+ a64ctz[i] = ia * 123456789ull;
+ b64ctz[i] = 0;
+ a64ffs[i] = ia * 123456789ull;
+ b64ffs[i] = 0;
+ }
+
+ popc32 (b32pc, a32pc, SZ);
+ ctz32 (b32ctz, a32ctz, SZ);
+ ffs32 (b32ffs, a32ffs, SZ);
+ popc64 (b64pc, a64pc, SZ);
+ ctz64 (b64ctz, a64ctz, SZ);
+ ffs64 (b64ffs, a64ffs, SZ);
+
+ for (int i = 0; i < SZ; i++)
+ {
+ assert (b32pc[i] == __builtin_popcount (a32pc[i]));
+ assert (b32ctz[i] == __builtin_ctz (a32ctz[i]));
+ assert (b32ffs[i] == __builtin_ffs (a32ffs[i]));
+ assert (b64pc[i] == __builtin_popcountll (a64pc[i]));
+ assert (b64ctz[i] == __builtin_ctzll (a64ctz[i]));
+ assert (b64ffs[i] == __builtin_ffsll (a64ffs[i]));
+ }
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 6 "vect" target { amdgcn-*-* sparc*-*-* alpha*-*-* ia64-*-* mips*-*-* riscv*-*-* } } } */
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index ef806e2346e..b812354b986 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -1782,6 +1782,122 @@ vect_recog_widen_abd_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
return widen_abd_stmt;
}
+/* Return TRUE if we have the necessary operations to create a vectorized
+ popcount for type VEC_TYPE. */
+
+static bool
+vect_have_popcount_fallback (tree vec_type)
+{
+ return (optab_for_tree_code (RSHIFT_EXPR, vec_type, optab_scalar)
+ && optab_for_tree_code (PLUS_EXPR, vec_type, optab_default)
+ && optab_for_tree_code (MINUS_EXPR, vec_type, optab_default)
+ && optab_for_tree_code (BIT_AND_EXPR, vec_type, optab_default)
+ && optab_for_tree_code (MULT_EXPR, vec_type, optab_default));
+}
+
+/* This generates a Wilkes-Wheeler-Gill popcount similar to what libgcc
+ does (and match.pd recognizes). There are only 32-bit and 64-bit
+ variants.
+ It returns the generated gimple sequence of vector instructions with
+ type VEC_TYPE which is being attached to STMT_VINFO.
+ RHS is the unpromoted input value and LHS_TYPE is the output type.
+ RET_VAR is the address of an SSA variable that holds the result
+ of the last operation. It needs to be created before calling
+ this function and must have LHS_TYPE.
+
+ int popcount64c (uint64_t x)
+ {
+ x -= (x >> 1) & 0x5555555555555555ULL;
+ x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);
+ x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0fULL;
+ return (x * 0x0101010101010101ULL) >> 56;
+ }
+
+ int popcount32c (uint32_t x)
+ {
+ x -= (x >> 1) & 0x55555555;
+ x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
+ x = (x + (x >> 4)) & 0x0f0f0f0f;
+ return (x * 0x01010101) >> 24;
+ }
+ */
+
+static gimple*
+vect_generate_popcount_fallback (vec_info *vinfo, stmt_vec_info stmt_vinfo,
+ vect_unpromoted_value rhs, tree lhs_type,
+ tree vec_type, tree *ret_var)
+{
+ int bitsize = GET_MODE_BITSIZE (TYPE_MODE (lhs_type)).to_constant ();
+ bool is64 = bitsize == 64;
+
+ tree nuop1 = vect_convert_input (vinfo, stmt_vinfo,
+ lhs_type, &rhs, vec_type);
+
+ tree one_cst = build_one_cst (lhs_type);
+ tree two_cst = build_int_cst (lhs_type, 2);
+ tree four_cst = build_int_cst (lhs_type, 4);
+ tree lcst = build_int_cst (lhs_type, bitsize - CHAR_BIT);
+
+ tree c1 = build_int_cst (lhs_type,
+ is64 ? 0x5555555555555555ull : 0x55555555);
+ tree c2 = build_int_cst (lhs_type,
+ is64 ? 0x3333333333333333ull : 0x33333333);
+ tree c4 = build_int_cst (lhs_type,
+ is64 ? 0x0F0F0F0F0F0F0F0Full : 0x0F0F0F0F);
+ tree cm = build_int_cst (lhs_type,
+ is64 ? 0x0101010101010101ull : 0x01010101);
+
+ gassign *g;
+
+ tree shift1 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (shift1, RSHIFT_EXPR, nuop1, one_cst);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree and1 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (and1, BIT_AND_EXPR, shift1, c1);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree x1 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (x1, MINUS_EXPR, nuop1, and1);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree shift2 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (shift2, RSHIFT_EXPR, x1, two_cst);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree and2 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (and2, BIT_AND_EXPR, shift2, c2);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree and22 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (and22, BIT_AND_EXPR, x1, c2);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree x2 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (x2, PLUS_EXPR, and2, and22);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree shift3 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (shift3, RSHIFT_EXPR, x2, four_cst);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree plus3 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (plus3, PLUS_EXPR, shift3, x2);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree x3 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (x3, BIT_AND_EXPR, plus3, c4);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ tree x4 = vect_recog_temp_ssa_var (lhs_type, NULL);
+ g = gimple_build_assign (x4, MULT_EXPR, x3, cm);
+ append_pattern_def_seq (vinfo, stmt_vinfo, g, vec_type);
+
+ g = gimple_build_assign (*ret_var, RSHIFT_EXPR, x4, lcst);
+
+ return g;
+}
+
/* Function vect_recog_ctz_ffs_pattern
Try to find the following pattern:
@@ -1894,8 +2010,9 @@ vect_recog_ctz_ffs_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
}
if ((ifnnew == IFN_LAST
|| (defined_at_zero && !defined_at_zero_new))
- && direct_internal_fn_supported_p (IFN_POPCOUNT, vec_rhs_type,
- OPTIMIZE_FOR_SPEED))
+ && (direct_internal_fn_supported_p (IFN_POPCOUNT, vec_rhs_type,
+ OPTIMIZE_FOR_SPEED)
+ || vect_have_popcount_fallback (vec_rhs_type)))
{
ifnnew = IFN_POPCOUNT;
defined_at_zero_new = true;
@@ -1996,9 +2113,22 @@ vect_recog_ctz_ffs_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
/* Create B = .IFNNEW (A). */
new_var = vect_recog_temp_ssa_var (lhs_type, NULL);
- pattern_stmt = gimple_build_call_internal (ifnnew, 1, rhs_oprnd);
- gimple_call_set_lhs (pattern_stmt, new_var);
- gimple_set_location (pattern_stmt, loc);
+ if (ifnnew == IFN_POPCOUNT
+ && !direct_internal_fn_supported_p (ifn, vec_type, OPTIMIZE_FOR_SPEED))
+ {
+ gcc_assert (vect_have_popcount_fallback (vec_type));
+ vect_unpromoted_value un_rhs;
+ vect_look_through_possible_promotion (vinfo, rhs_oprnd, &un_rhs);
+ pattern_stmt = vect_generate_popcount_fallback (vinfo, stmt_vinfo, un_rhs,
+ lhs_type, vec_type,
+ &new_var);
+ }
+ else
+ {
+ pattern_stmt = gimple_build_call_internal (ifnnew, 1, rhs_oprnd);
+ gimple_call_set_lhs (pattern_stmt, new_var);
+ gimple_set_location (pattern_stmt, loc);
+ }
*type_out = vec_type;
if (sub)
@@ -2042,6 +2172,7 @@ vect_recog_ctz_ffs_pattern (vec_info *vinfo, stmt_vec_info stmt_vinfo,
return pattern_stmt;
}
+
/* Function vect_recog_popcount_clz_ctz_ffs_pattern
Try to find the following pattern:
@@ -2226,12 +2357,17 @@ vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo,
if (!vec_type)
return NULL;
+ bool popcount_fallback_p = false;
+
bool supported
= direct_internal_fn_supported_p (ifn, vec_type, OPTIMIZE_FOR_SPEED);
if (!supported)
switch (ifn)
{
case IFN_POPCOUNT:
+ if (vect_have_popcount_fallback (vec_type))
+ popcount_fallback_p = true;
+ break;
case IFN_CLZ:
return NULL;
case IFN_FFS:
@@ -2247,7 +2383,8 @@ vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo,
OPTIMIZE_FOR_SPEED))
break;
if (direct_internal_fn_supported_p (IFN_POPCOUNT, vec_type,
- OPTIMIZE_FOR_SPEED))
+ OPTIMIZE_FOR_SPEED)
+ || vect_have_popcount_fallback (vec_type))
break;
return NULL;
default:
@@ -2257,12 +2394,25 @@ vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo,
vect_pattern_detected ("vec_recog_popcount_clz_ctz_ffs_pattern",
call_stmt);
- /* Create B = .POPCOUNT (A). */
new_var = vect_recog_temp_ssa_var (lhs_type, NULL);
- pattern_stmt = gimple_build_call_internal (ifn, 1, unprom_diff.op);
- gimple_call_set_lhs (pattern_stmt, new_var);
- gimple_set_location (pattern_stmt, gimple_location (last_stmt));
- *type_out = vec_type;
+ if (!popcount_fallback_p)
+ {
+ /* Create B = .POPCOUNT (A). */
+ pattern_stmt = gimple_build_call_internal (ifn, 1, unprom_diff.op);
+ gimple_call_set_lhs (pattern_stmt, new_var);
+ gimple_set_location (pattern_stmt, gimple_location (last_stmt));
+ *type_out = vec_type;
+ }
+ else
+ {
+ pattern_stmt = vect_generate_popcount_fallback (vinfo, stmt_vinfo,
+ unprom_diff,
+ lhs_type, vec_type,
+ &new_var);
+ *type_out = vec_type;
+ /* For popcount we're done here. */
+ return pattern_stmt;
+ }
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
--
2.41.0
next reply other threads:[~2023-08-07 20:20 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-08-07 20:20 Robin Dapp [this message]
2023-08-08 6:21 ` Richard Biener
2023-08-08 8:55 ` Robin Dapp
2023-08-08 10:02 ` Richard Biener
2023-08-08 11:37 ` Robin Dapp
2023-08-08 12:49 ` Richard Biener
2023-08-08 13:06 ` Robin Dapp
2023-08-08 13:28 ` Richard Biener
2023-08-09 10:23 ` Robin Dapp
2023-08-09 11:58 ` Richard Biener
2023-08-08 14:14 ` Jeff Law
2023-08-08 14:21 ` Robin Dapp
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4d0d53a0-20d2-5b98-c4f9-67b624a27269@gmail.com \
--to=rdapp.gcc@gmail.com \
--cc=gcc-patches@gcc.gnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).