From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 1666) id A91BC3858D1E; Fri, 13 Oct 2023 12:30:18 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org A91BC3858D1E DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1697200218; bh=rDHs/xOMntYVmh4p7ZDOq9Lrm3gP/YlD1xXcI/3GWJg=; h=From:To:Subject:Date:From; b=KDDKq4ZTi4vfKM7AfqtOrO7Ovmf3MmQMYDsMH6xA2oa6i/y8mBJDBgoaPsirbjTAt z77MrghndHmGaeogBzU0o3n60sxPwMdJiJj3N7IhLwIrk+8bs3NhfFCjQrMxMKSJ9Q HAk67ezNFQCJQPUj4zxqiNC6ZOJg1OzudkwvbFPo= MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset="utf-8" From: Richard Biener To: gcc-cvs@gcc.gnu.org Subject: [gcc r14-4629] OMP SIMD inbranch call vectorization for AVX512 style masks X-Act-Checkin: gcc X-Git-Author: Richard Biener X-Git-Refname: refs/heads/master X-Git-Oldrev: 63eaccd114393f4692976bb78b30148e6d77a89e X-Git-Newrev: 3179ad72f67f31824c444ef30ef171ad7495d274 Message-Id: <20231013123018.A91BC3858D1E@sourceware.org> Date: Fri, 13 Oct 2023 12:30:18 +0000 (GMT) List-Id: https://gcc.gnu.org/g:3179ad72f67f31824c444ef30ef171ad7495d274 commit r14-4629-g3179ad72f67f31824c444ef30ef171ad7495d274 Author: Richard Biener Date: Fri Oct 13 12:32:51 2023 +0200 OMP SIMD inbranch call vectorization for AVX512 style masks The following teaches vectorizable_simd_clone_call to handle integer mode masks. The tricky bit is to second-guess the number of lanes represented by a single mask argument - the following uses simdlen and the number of mask arguments to calculate that, assuming ABIs have them uniform. Similar to the VOIDmode handling there's a restriction on not supporting splitting/merging of incoming vector masks to more/less SIMD call arguments. PR tree-optimization/111795 * tree-vect-stmts.cc (vectorizable_simd_clone_call): Handle integer mode mask arguments. * gcc.target/i386/vect-simd-clone-avx512-1.c: New testcase. * gcc.target/i386/vect-simd-clone-avx512-2.c: Likewise. * gcc.target/i386/vect-simd-clone-avx512-3.c: Likewise. Diff: --- .../gcc.target/i386/vect-simd-clone-avx512-1.c | 43 ++++++ .../gcc.target/i386/vect-simd-clone-avx512-2.c | 6 + .../gcc.target/i386/vect-simd-clone-avx512-3.c | 6 + gcc/tree-vect-stmts.cc | 150 ++++++++++++++++----- 4 files changed, 175 insertions(+), 30 deletions(-) diff --git a/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-1.c b/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-1.c new file mode 100644 index 00000000000..e350996439e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-1.c @@ -0,0 +1,43 @@ +/* { dg-do run } */ +/* { dg-require-effective-target avx512vl } */ +/* { dg-options "-O2 -fopenmp-simd -mavx512vl" } */ + +#include "avx512vl-check.h" + +#ifndef SIMDLEN +#define SIMDLEN 4 +#endif + +int x[1024]; + +#pragma omp declare simd simdlen(SIMDLEN) +__attribute__((noinline)) int +foo (int a, int b) +{ + return a + b; +} + +void __attribute__((noipa)) +bar (void) +{ +#pragma omp simd + for (int i = 0; i < 1024; i++) + if (x[i] < 20) + x[i] = foo (x[i], x[i]); +} + +void avx512vl_test () +{ + int i; +#pragma GCC novector + for (i = 0; i < 1024; i++) + x[i] = i; + + bar (); + +#pragma GCC novector + for (i = 0; i < 1024; i++) + if ((i < 20 && x[i] != i + i) + || (i >= 20 && x[i] != i)) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-2.c b/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-2.c new file mode 100644 index 00000000000..d9968ae30f5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-2.c @@ -0,0 +1,6 @@ +/* { dg-do run } */ +/* { dg-require-effective-target avx512vl } */ +/* { dg-options "-O2 -fopenmp-simd -mavx512vl" } */ + +#define SIMDLEN 8 +#include "vect-simd-clone-avx512-1.c" diff --git a/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-3.c b/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-3.c new file mode 100644 index 00000000000..c05f6c8ce91 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect-simd-clone-avx512-3.c @@ -0,0 +1,6 @@ +/* { dg-do run } */ +/* { dg-require-effective-target avx512vl } */ +/* { dg-options "-O2 -fopenmp-simd -mavx512vl" } */ + +#define SIMDLEN 16 +#include "vect-simd-clone-avx512-1.c" diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 33b557c2a49..b3a56498595 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -4385,6 +4385,9 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, i = -1; break; case SIMD_CLONE_ARG_TYPE_MASK: + if (SCALAR_INT_MODE_P (n->simdclone->mask_mode) + != SCALAR_INT_MODE_P (TYPE_MODE (arginfo[i].vectype))) + i = -1; break; } if (i == (size_t) -1) @@ -4410,6 +4413,12 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, if (bestn == NULL) return false; + unsigned int num_mask_args = 0; + if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode)) + for (i = 0; i < nargs; i++) + if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_MASK) + num_mask_args++; + for (i = 0; i < nargs; i++) { if ((arginfo[i].dt == vect_constant_def @@ -4434,30 +4443,50 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, return false; } - if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_MASK - && bestn->simdclone->mask_mode == VOIDmode - && (simd_clone_subparts (bestn->simdclone->args[i].vector_type) - != simd_clone_subparts (arginfo[i].vectype))) + if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_MASK) { - /* FORNOW we only have partial support for vector-type masks that - can't hold all of simdlen. */ - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, - vect_location, - "in-branch vector clones are not yet" - " supported for mismatched vector sizes.\n"); - return false; - } - if (bestn->simdclone->args[i].arg_type == SIMD_CLONE_ARG_TYPE_MASK - && bestn->simdclone->mask_mode != VOIDmode) - { - /* FORNOW don't support integer-type masks. */ - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, - vect_location, - "in-branch vector clones are not yet" - " supported for integer mask modes.\n"); - return false; + if (bestn->simdclone->mask_mode == VOIDmode) + { + if (simd_clone_subparts (bestn->simdclone->args[i].vector_type) + != simd_clone_subparts (arginfo[i].vectype)) + { + /* FORNOW we only have partial support for vector-type masks + that can't hold all of simdlen. */ + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, + vect_location, + "in-branch vector clones are not yet" + " supported for mismatched vector sizes.\n"); + return false; + } + } + else if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode)) + { + if (!SCALAR_INT_MODE_P (TYPE_MODE (arginfo[i].vectype)) + || maybe_ne (exact_div (bestn->simdclone->simdlen, + num_mask_args), + simd_clone_subparts (arginfo[i].vectype))) + { + /* FORNOW we only have partial support for integer-type masks + that represent the same number of lanes as the + vectorized mask inputs. */ + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, + vect_location, + "in-branch vector clones are not yet " + "supported for mismatched vector sizes.\n"); + return false; + } + } + else + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, + vect_location, + "in-branch vector clones not supported" + " on this target.\n"); + return false; + } } } @@ -4674,14 +4703,9 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, } break; case SIMD_CLONE_ARG_TYPE_MASK: - atype = bestn->simdclone->args[i].vector_type; - if (bestn->simdclone->mask_mode != VOIDmode) - { - /* FORNOW: this is disabled above. */ - gcc_unreachable (); - } - else + if (bestn->simdclone->mask_mode == VOIDmode) { + atype = bestn->simdclone->args[i].vector_type; tree elt_type = TREE_TYPE (atype); tree one = fold_convert (elt_type, integer_one_node); tree zero = fold_convert (elt_type, integer_zero_node); @@ -4732,6 +4756,72 @@ vectorizable_simd_clone_call (vec_info *vinfo, stmt_vec_info stmt_info, } } } + else if (SCALAR_INT_MODE_P (bestn->simdclone->mask_mode)) + { + atype = bestn->simdclone->args[i].vector_type; + /* Guess the number of lanes represented by atype. */ + unsigned HOST_WIDE_INT atype_subparts + = exact_div (bestn->simdclone->simdlen, + num_mask_args).to_constant (); + o = vector_unroll_factor (nunits, atype_subparts); + for (m = j * o; m < (j + 1) * o; m++) + { + if (m == 0) + { + if (!slp_node) + vect_get_vec_defs_for_operand (vinfo, stmt_info, + o * ncopies, + op, + &vec_oprnds[i]); + vec_oprnds_i[i] = 0; + } + if (atype_subparts + < simd_clone_subparts (arginfo[i].vectype)) + { + /* The mask argument has fewer elements than the + input vector. */ + /* FORNOW */ + gcc_unreachable (); + } + else if (atype_subparts + == simd_clone_subparts (arginfo[i].vectype)) + { + /* The vector mask argument matches the input + in the number of lanes, but not necessarily + in the mode. */ + vec_oprnd0 = vec_oprnds[i][vec_oprnds_i[i]++]; + tree st = lang_hooks.types.type_for_mode + (TYPE_MODE (TREE_TYPE (vec_oprnd0)), 1); + vec_oprnd0 = build1 (VIEW_CONVERT_EXPR, st, + vec_oprnd0); + gassign *new_stmt + = gimple_build_assign (make_ssa_name (st), + vec_oprnd0); + vect_finish_stmt_generation (vinfo, stmt_info, + new_stmt, gsi); + if (!types_compatible_p (atype, st)) + { + new_stmt + = gimple_build_assign (make_ssa_name (atype), + NOP_EXPR, + gimple_assign_lhs + (new_stmt)); + vect_finish_stmt_generation (vinfo, stmt_info, + new_stmt, gsi); + } + vargs.safe_push (gimple_assign_lhs (new_stmt)); + } + else + { + /* The mask argument has more elements than the + input vector. */ + /* FORNOW */ + gcc_unreachable (); + } + } + } + else + gcc_unreachable (); break; case SIMD_CLONE_ARG_TYPE_UNIFORM: vargs.safe_push (op);