From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path:
Received: by sourceware.org (Postfix, from userid 2153)
	id 74C1E3858D39; Wed, 19 Apr 2023 09:15:14 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 74C1E3858D39
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1681895714;
	bh=M63Erst0ZiNHlNnppbK/SwNMqhbYTrV7wPNps2EDKA4=;
	h=From:To:Subject:Date:From;
	b=KgX87FYpifK/ifcy1s23WOSHbmhGL7rhOP40Inch/VcNxFHKAFz7KJiRYbygtAaum
	 6YLcnCPAshB4jc5ouut9hVCsuach4S25iE4US6YUVXRL3iVYUJqL2R3/yQ4tFl51o2
	 tg26efzAUTO2DtgrZ0lJNn6K/h6SJ0oMumig42C0=
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset="utf-8"
From: Jakub Jelinek
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r14-65] tree-vect-patterns: Improve __builtin_{clz,ctz,ffs}ll vectorization [PR109011]
X-Act-Checkin: gcc
X-Git-Author: Jakub Jelinek
X-Git-Refname: refs/heads/master
X-Git-Oldrev: 76f44fbfea1f11e53d4b7e83f0debd029c94a1b3
X-Git-Newrev: ade0a1ee5c6707b950ba284adcfed0514866c12d
Message-Id: <20230419091514.74C1E3858D39@sourceware.org>
Date: Wed, 19 Apr 2023 09:15:14 +0000 (GMT)
List-Id:

https://gcc.gnu.org/g:ade0a1ee5c6707b950ba284adcfed0514866c12d

commit r14-65-gade0a1ee5c6707b950ba284adcfed0514866c12d
Author: Jakub Jelinek
Date:   Wed Apr 19 11:14:23 2023 +0200

    tree-vect-patterns: Improve __builtin_{clz,ctz,ffs}ll vectorization [PR109011]

    For __builtin_popcountll tree-vect-patterns.cc has
    vect_recog_popcount_pattern, which improves the vectorized code.
    Without that, the vectorization is always multi-type vectorization
    in the loop (at least int and long long types), where we emit two
    .POPCOUNT calls with long long arguments and int return value and then
    widen to long long, so effectively after vectorization we do the
    V?DImode -> V?DImode popcount twice, then pack the result into V?SImode
    and immediately unpack.

    The following patch extends that handling to the __builtin_{clz,ctz,ffs}ll
    builtins as well (as long as there is an optab for them; more to come
    later).

    x86 can do __builtin_popcountll with -mavx512vpopcntdq and __builtin_clzll
    with -mavx512cd; ppc can do __builtin_popcountll and __builtin_clzll with
    -mpower8-vector and __builtin_ctzll with -mpower9-vector; s390 can do
    __builtin_{popcount,clz,ctz}ll with -march=z13 -mzarch (i.e. VX).

    2023-04-19  Jakub Jelinek

            PR tree-optimization/109011
            * tree-vect-patterns.cc (vect_recog_popcount_pattern): Rename to ...
            (vect_recog_popcount_clz_ctz_ffs_pattern): ... this.  Handle also
            CLZ, CTZ and FFS.  Remove vargs variable, use
            gimple_build_call_internal rather than gimple_build_call_internal_vec.
            (vect_vect_recog_func_ptrs): Adjust popcount entry.
            * gcc.dg/vect/pr109011-1.c: New test.
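As a rough illustration (a sketch mirroring the new test case below, not text from the commit itself; the function name clz_loop is arbitrary): a loop of the following shape, which stores the int result of a 64-bit builtin into a long long array, is the kind of code the extended pattern targets, and on a target with the corresponding vector optab (e.g. -mavx512cd on x86_64 for clz) it should now be recognized as a single .CLZ internal call on the wide vector type instead of going through the multi-type pack/unpack described above.

    void
    clz_loop (long long *p, long long *q)
    {
      for (int i = 0; i < 2048; ++i)
        p[i] = __builtin_clzll (q[i]);   /* int result widened back to long long */
    }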
Diff:
---
 gcc/testsuite/gcc.dg/vect/pr109011-1.c |  48 +++++++++++
 gcc/tree-vect-patterns.cc              | 148 +++++++++++++++++++++++++++------
 2 files changed, 171 insertions(+), 25 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr109011-1.c b/gcc/testsuite/gcc.dg/vect/pr109011-1.c
new file mode 100644
index 00000000000..707a82aaf43
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr109011-1.c
@@ -0,0 +1,48 @@
+/* PR tree-optimization/109011 */
+/* { dg-do compile } */
+/* { dg-options "-O3 -fno-unroll-loops --param=vect-epilogues-nomask=0 -fdump-tree-optimized" } */
+/* { dg-additional-options "-mavx512cd" { target { { i?86-*-* x86_64-*-* } && avx512cd } } } */
+/* { dg-additional-options "-mavx512vpopcntdq" { target { { i?86-*-* x86_64-*-* } && avx512vpopcntdq } } } */
+/* { dg-additional-options "-mpower8-vector" { target powerpc_p8vector_ok } } */
+/* { dg-additional-options "-mpower9-vector" { target powerpc_p9vector_ok } } */
+/* { dg-additional-options "-march=z13 -mzarch" { target s390_vx } } */
+
+void
+foo (long long *p, long long *q)
+{
+#pragma omp simd
+  for (int i = 0; i < 2048; ++i)
+    p[i] = __builtin_popcountll (q[i]);
+}
+
+/* { dg-final { scan-tree-dump-times " = \.POPCOUNT \\\(" 1 "optimized" { target { { i?86-*-* x86_64-*-* } && avx512vpopcntdq } } } } */
+/* { dg-final { scan-tree-dump-times " = \.POPCOUNT \\\(" 1 "optimized" { target { powerpc_p8vector_ok || s390_vx } } } } */
+
+void
+bar (long long *p, long long *q)
+{
+#pragma omp simd
+  for (int i = 0; i < 2048; ++i)
+    p[i] = __builtin_clzll (q[i]);
+}
+
+/* { dg-final { scan-tree-dump-times " = \.CLZ \\\(" 1 "optimized" { target { { i?86-*-* x86_64-*-* } && avx512cd } } } } */
+/* { dg-final { scan-tree-dump-times " = \.CLZ \\\(" 1 "optimized" { target { powerpc_p8vector_ok || s390_vx } } } } */
+
+void
+baz (long long *p, long long *q)
+{
+#pragma omp simd
+  for (int i = 0; i < 2048; ++i)
+    p[i] = __builtin_ctzll (q[i]);
+}
+
+/* { dg-final { scan-tree-dump-times " = \.CTZ \\\(" 1 "optimized" { target { powerpc_p9vector_ok || s390_vx } } } } */
+
+void
+qux (long long *p, long long *q)
+{
+#pragma omp simd
+  for (int i = 0; i < 2048; ++i)
+    p[i] = __builtin_ffsll (q[i]);
+}
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 8802141cd6e..633998e8e3a 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -1501,7 +1501,7 @@ vect_recog_widen_minus_pattern (vec_info *vinfo, stmt_vec_info last_stmt_info,
 			     "vect_recog_widen_minus_pattern");
 }
 
-/* Function vect_recog_popcount_pattern
+/* Function vect_recog_popcount_clz_ctz_ffs_pattern
 
    Try to find the following pattern:
 
@@ -1530,16 +1530,20 @@ vect_recog_widen_minus_pattern (vec_info *vinfo, stmt_vec_info last_stmt_info,
  * Return value: A new stmt that will be used to replace the sequence of
    stmts that constitute the pattern. In this case it will be:
    B = .POPCOUNT (A);
+
+   Similarly for clz, ctz and ffs.
 */
 
 static gimple *
-vect_recog_popcount_pattern (vec_info *vinfo,
-			     stmt_vec_info stmt_vinfo, tree *type_out)
+vect_recog_popcount_clz_ctz_ffs_pattern (vec_info *vinfo,
+					 stmt_vec_info stmt_vinfo,
+					 tree *type_out)
 {
   gassign *last_stmt = dyn_cast <gassign *> (stmt_vinfo->stmt);
-  gimple *popcount_stmt, *pattern_stmt;
+  gimple *call_stmt, *pattern_stmt;
   tree rhs_oprnd, rhs_origin, lhs_oprnd, lhs_type, vec_type, new_var;
-  auto_vec<tree> vargs;
+  internal_fn ifn = IFN_LAST;
+  int addend = 0;
   /* Find B = (TYPE1) temp_out.  */
   if (!last_stmt)
     return NULL;
@@ -1557,51 +1561,137 @@ vect_recog_popcount_pattern (vec_info *vinfo,
   if (TREE_CODE (rhs_oprnd) != SSA_NAME
       || !has_single_use (rhs_oprnd))
     return NULL;
-  popcount_stmt = SSA_NAME_DEF_STMT (rhs_oprnd);
+  call_stmt = SSA_NAME_DEF_STMT (rhs_oprnd);
 
   /* Find temp_out = __builtin_popcount{,l,ll} (temp_in);  */
-  if (!is_gimple_call (popcount_stmt))
+  if (!is_gimple_call (call_stmt))
     return NULL;
-  switch (gimple_call_combined_fn (popcount_stmt))
+  switch (gimple_call_combined_fn (call_stmt))
     {
+    int val;
     CASE_CFN_POPCOUNT:
+      ifn = IFN_POPCOUNT;
+      break;
+    CASE_CFN_CLZ:
+      ifn = IFN_CLZ;
+      /* Punt if call result is unsigned and defined value at zero
+	 is negative, as the negative value doesn't extend correctly.  */
+      if (TYPE_UNSIGNED (TREE_TYPE (rhs_oprnd))
+	  && gimple_call_internal_p (call_stmt)
+	  && CLZ_DEFINED_VALUE_AT_ZERO
+	       (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val) == 2
+	  && val < 0)
+	return NULL;
+      break;
+    CASE_CFN_CTZ:
+      ifn = IFN_CTZ;
+      /* Punt if call result is unsigned and defined value at zero
+	 is negative, as the negative value doesn't extend correctly.  */
+      if (TYPE_UNSIGNED (TREE_TYPE (rhs_oprnd))
+	  && gimple_call_internal_p (call_stmt)
+	  && CTZ_DEFINED_VALUE_AT_ZERO
+	       (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val) == 2
+	  && val < 0)
+	return NULL;
+      break;
+    CASE_CFN_FFS:
+      ifn = IFN_FFS;
       break;
     default:
       return NULL;
     }
 
-  if (gimple_call_num_args (popcount_stmt) != 1)
+  if (gimple_call_num_args (call_stmt) != 1)
     return NULL;
 
-  rhs_oprnd = gimple_call_arg (popcount_stmt, 0);
+  rhs_oprnd = gimple_call_arg (call_stmt, 0);
   vect_unpromoted_value unprom_diff;
-  rhs_origin = vect_look_through_possible_promotion (vinfo, rhs_oprnd,
-						      &unprom_diff);
+  rhs_origin
+    = vect_look_through_possible_promotion (vinfo, rhs_oprnd, &unprom_diff);
 
   if (!rhs_origin)
     return NULL;
 
-  /* Input and output of .POPCOUNT should be same-precision integer.
-     Also A should be unsigned or same precision as temp_in,
-     otherwise there would be sign_extend from A to temp_in.  */
-  if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (lhs_type)
-      || (!TYPE_UNSIGNED (unprom_diff.type)
-	  && (TYPE_PRECISION (unprom_diff.type)
-	      != TYPE_PRECISION (TREE_TYPE (rhs_oprnd)))))
+  /* Input and output of .POPCOUNT should be same-precision integer.  */
+  if (TYPE_PRECISION (unprom_diff.type) != TYPE_PRECISION (lhs_type))
     return NULL;
 
-  vargs.safe_push (unprom_diff.op);
-  vect_pattern_detected ("vec_regcog_popcount_pattern", popcount_stmt);
+  /* Also A should be unsigned or same precision as temp_in, otherwise
+     different builtins/internal functions have different behaviors.  */
+  if (TYPE_PRECISION (unprom_diff.type)
+      != TYPE_PRECISION (TREE_TYPE (rhs_oprnd)))
+    switch (ifn)
+      {
+      case IFN_POPCOUNT:
+	/* For popcount require zero extension, which doesn't add any
+	   further bits to the count.  */
+	if (!TYPE_UNSIGNED (unprom_diff.type))
+	  return NULL;
+	break;
+      case IFN_CLZ:
+	/* clzll (x) == clz (x) + 32 for unsigned x != 0, so ok
+	   if it is undefined at zero or if it matches also for the
+	   defined value there.  */
+	if (!TYPE_UNSIGNED (unprom_diff.type))
+	  return NULL;
+	if (!type_has_mode_precision_p (lhs_type)
+	    || !type_has_mode_precision_p (TREE_TYPE (rhs_oprnd)))
+	  return NULL;
+	addend = (TYPE_PRECISION (TREE_TYPE (rhs_oprnd))
+		  - TYPE_PRECISION (lhs_type));
+	if (gimple_call_internal_p (call_stmt))
+	  {
+	    int val1, val2;
+	    int d1
+	      = CLZ_DEFINED_VALUE_AT_ZERO
+		  (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val1);
+	    int d2
+	      = CLZ_DEFINED_VALUE_AT_ZERO (SCALAR_INT_TYPE_MODE (lhs_type),
+					   val2);
+	    if (d1 != 2)
+	      break;
+	    if (d2 != 2 || val1 != val2 + addend)
+	      return NULL;
+	  }
+	break;
+      case IFN_CTZ:
+	/* ctzll (x) == ctz (x) for unsigned or signed x != 0, so ok
+	   if it is undefined at zero or if it matches also for the
+	   defined value there.  */
+	if (gimple_call_internal_p (call_stmt))
+	  {
+	    int val1, val2;
+	    int d1
+	      = CTZ_DEFINED_VALUE_AT_ZERO
+		  (SCALAR_INT_TYPE_MODE (TREE_TYPE (rhs_oprnd)), val1);
+	    int d2
+	      = CTZ_DEFINED_VALUE_AT_ZERO (SCALAR_INT_TYPE_MODE (lhs_type),
+					   val2);
+	    if (d1 != 2)
+	      break;
+	    if (d2 != 2 || val1 != val2)
+	      return NULL;
+	  }
+	break;
+      case IFN_FFS:
+	/* ffsll (x) == ffs (x) for unsigned or signed x.  */
+	break;
+      default:
+	gcc_unreachable ();
+      }
+
+  vect_pattern_detected ("vec_recog_popcount_clz_ctz_ffs_pattern",
+			 call_stmt);
   vec_type = get_vectype_for_scalar_type (vinfo, lhs_type);
-  /* Do it only if the backend has popcount<vector_mode>2 pattern.  */
+  /* Do it only if the backend has popcount<vector_mode>2 etc. pattern.  */
   if (!vec_type
-      || !direct_internal_fn_supported_p (IFN_POPCOUNT, vec_type,
+      || !direct_internal_fn_supported_p (ifn, vec_type,
 					  OPTIMIZE_FOR_SPEED))
     return NULL;
 
   /* Create B = .POPCOUNT (A).  */
   new_var = vect_recog_temp_ssa_var (lhs_type, NULL);
-  pattern_stmt = gimple_build_call_internal_vec (IFN_POPCOUNT, vargs);
+  pattern_stmt = gimple_build_call_internal (ifn, 1, unprom_diff.op);
   gimple_call_set_lhs (pattern_stmt, new_var);
   gimple_set_location (pattern_stmt, gimple_location (last_stmt));
   *type_out = vec_type;
@@ -1609,6 +1699,14 @@ vect_recog_popcount_pattern (vec_info *vinfo,
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
 		     "created pattern stmt: %G", pattern_stmt);
+
+  if (addend)
+    {
+      append_pattern_def_seq (vinfo, stmt_vinfo, pattern_stmt, vec_type);
+      tree ret_var = vect_recog_temp_ssa_var (lhs_type, NULL);
+      pattern_stmt = gimple_build_assign (ret_var, PLUS_EXPR, new_var,
+					  build_int_cst (lhs_type, addend));
+    }
   return pattern_stmt;
 }
 
@@ -6051,7 +6149,7 @@ static vect_recog_func vect_vect_recog_func_ptrs[] = {
   { vect_recog_sad_pattern, "sad" },
   { vect_recog_widen_sum_pattern, "widen_sum" },
   { vect_recog_pow_pattern, "pow" },
-  { vect_recog_popcount_pattern, "popcount" },
+  { vect_recog_popcount_clz_ctz_ffs_pattern, "popcount_clz_ctz_ffs" },
  { vect_recog_widen_shift_pattern, "widen_shift" },
  { vect_recog_rotate_pattern, "rotate" },
  { vect_recog_vector_vector_shift_pattern, "vector_vector_shift" },
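The PLUS_EXPR with "addend" appended at the end of the pattern relies on the identity noted in the IFN_CLZ comment: for a nonzero 32-bit value zero-extended to 64 bits, clzll simply sees 32 extra leading zeros. A minimal scalar sketch of that identity (illustrative C only, not part of the patch):

    #include <assert.h>

    int
    main (void)
    {
      /* For each nonzero 32-bit power of two, the 64-bit count of leading
         zeros equals the 32-bit count plus 32, which is the constant the
         pattern adds after the vector .CLZ.  */
      for (unsigned int x = 1; x != 0; x <<= 1)
        assert (__builtin_clzll ((unsigned long long) x)
                == __builtin_clz (x) + 32);
      return 0;
    }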