* [PATCH v2 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR @ 2020-09-18 6:17 Xiong Hu Luo 2020-09-18 6:17 ` [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251] Xiong Hu Luo ` (2 more replies) 0 siblings, 3 replies; 24+ messages in thread From: Xiong Hu Luo @ 2020-09-18 6:17 UTC (permalink / raw) To: gcc-patches Cc: segher, dje.gcc, wschmidt, guojiufu, linkw, richard.guenther, richard.sandiford, Xiong Hu Luo This patch enables transformation from ARRAY_REF(VIEW_CONVERT_EXPR) to VEC_SET internal function in gimple-isel pass if target supports vec_set with variable index by checking can_vec_set_var_idx_p. gcc/ChangeLog: 2020-09-18 Xionghu Luo <luoxhu@linux.ibm.com> * gimple-isel.cc (gimple_expand_vec_set_expr): New function. (gimple_expand_vec_cond_exprs): Call gimple_expand_vec_set_expr. * internal-fn.c (vec_set_direct): New define. (expand_vec_set_optab_fn): New function. (direct_vec_set_optab_supported_p): New define. * internal-fn.def (VEC_SET): New DEF_INTERNAL_OPTAB_FN. * optabs.c (can_vec_set_var_idx_p): New function. * optabs.h (can_vec_set_var_idx_p): New declare. --- gcc/gimple-isel.cc | 116 +++++++++++++++++++++++++++++++++++++++++++- gcc/internal-fn.c | 36 ++++++++++++++ gcc/internal-fn.def | 2 + gcc/optabs.c | 17 +++++++ gcc/optabs.h | 3 ++ 5 files changed, 172 insertions(+), 2 deletions(-) diff --git a/gcc/gimple-isel.cc b/gcc/gimple-isel.cc index b330cf4c20e..bc61e2895be 100644 --- a/gcc/gimple-isel.cc +++ b/gcc/gimple-isel.cc @@ -35,6 +35,80 @@ along with GCC; see the file COPYING3. If not see #include "tree-cfg.h" #include "bitmap.h" #include "tree-ssa-dce.h" +#include "fold-const.h" +#include "gimple-fold.h" +#include "memmodel.h" +#include "optabs.h" + +/* Expand all ARRAY_REF(VIEW_CONVERT_EXPR) gimple assignments into calls to + internal function based on vector type of selected expansion. + i.e.: + VIEW_CONVERT_EXPR<int[4]>(u)[_1] = = i_4(D); + => + _7 = u; + _8 = .VEC_SET (_7, i_4(D), _1); + u = _8; */ + +static gimple * +gimple_expand_vec_set_expr (gimple_stmt_iterator *gsi) +{ + enum tree_code code; + gcall *new_stmt = NULL; + gassign *ass_stmt = NULL; + + /* Only consider code == GIMPLE_ASSIGN. */ + gassign *stmt = dyn_cast<gassign *> (gsi_stmt (*gsi)); + if (!stmt) + return NULL; + + code = TREE_CODE (gimple_assign_lhs (stmt)); + if (code != ARRAY_REF) + return NULL; + + tree lhs = gimple_assign_lhs (stmt); + tree val = gimple_assign_rhs1 (stmt); + + tree type = TREE_TYPE (lhs); + tree op0 = TREE_OPERAND (lhs, 0); + if (TREE_CODE (op0) == VIEW_CONVERT_EXPR + && tree_fits_uhwi_p (TYPE_SIZE (type))) + { + tree pos = TREE_OPERAND (lhs, 1); + tree view_op0 = TREE_OPERAND (op0, 0); + machine_mode outermode = TYPE_MODE (TREE_TYPE (view_op0)); + scalar_mode innermode = GET_MODE_INNER (outermode); + tree_code code = TREE_CODE (TREE_TYPE(view_op0)); + if (!is_global_var (view_op0) && code == VECTOR_TYPE + && tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (view_op0))) + && can_vec_set_var_idx_p (code, outermode, innermode, + TYPE_MODE (TREE_TYPE (pos)))) + { + location_t loc = gimple_location (stmt); + tree var_src = make_ssa_name (TREE_TYPE (view_op0)); + tree var_dst = make_ssa_name (TREE_TYPE (view_op0)); + + ass_stmt = gimple_build_assign (var_src, view_op0); + gimple_set_vuse (ass_stmt, gimple_vuse (stmt)); + gimple_set_location (ass_stmt, loc); + gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT); + + new_stmt + = gimple_build_call_internal (IFN_VEC_SET, 3, var_src, val, pos); + gimple_call_set_lhs (new_stmt, var_dst); + gimple_set_location (new_stmt, loc); + gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); + + ass_stmt = gimple_build_assign (view_op0, var_dst); + gimple_set_location (ass_stmt, loc); + gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT); + + gimple_move_vops (ass_stmt, stmt); + gsi_remove (gsi, true); + } + } + + return ass_stmt; +} /* Expand all VEC_COND_EXPR gimple assignments into calls to internal function based on type of selected expansion. */ @@ -187,8 +261,25 @@ gimple_expand_vec_cond_exprs (void) { for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) { - gimple *g = gimple_expand_vec_cond_expr (&gsi, - &vec_cond_ssa_name_uses); + gassign *stmt = dyn_cast<gassign *> (gsi_stmt (gsi)); + if (!stmt) + continue; + + enum tree_code code; + gimple *g = NULL; + code = gimple_assign_rhs_code (stmt); + switch (code) + { + case VEC_COND_EXPR: + g = gimple_expand_vec_cond_expr (&gsi, &vec_cond_ssa_name_uses); + break; + case ARRAY_REF: + /* TODO: generate IFN for vec_extract with variable index. */ + break; + default: + break; + } + if (g != NULL) { tree lhs = gimple_assign_lhs (gsi_stmt (gsi)); @@ -204,6 +295,27 @@ gimple_expand_vec_cond_exprs (void) simple_dce_from_worklist (dce_ssa_names); + FOR_EACH_BB_FN (bb, cfun) + { + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gassign *stmt = dyn_cast<gassign *> (gsi_stmt (gsi)); + if (!stmt) + continue; + + enum tree_code code; + code = TREE_CODE (gimple_assign_lhs (stmt)); + switch (code) + { + case ARRAY_REF: + gimple_expand_vec_set_expr (&gsi); + break; + default: + break; + } + } + } + return 0; } diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c index 8efc77d986b..36837381c04 100644 --- a/gcc/internal-fn.c +++ b/gcc/internal-fn.c @@ -115,6 +115,7 @@ init_internal_fns () #define vec_condeq_direct { 0, 0, false } #define scatter_store_direct { 3, 1, false } #define len_store_direct { 3, 3, false } +#define vec_set_direct { 3, 3, false } #define unary_direct { 0, 0, true } #define binary_direct { 0, 0, true } #define ternary_direct { 0, 0, true } @@ -2658,6 +2659,40 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab) #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn +static void +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab) +{ + tree lhs = gimple_call_lhs (stmt); + tree op0 = gimple_call_arg (stmt, 0); + tree op1 = gimple_call_arg (stmt, 1); + tree op2 = gimple_call_arg (stmt, 2); + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); + rtx src = expand_expr (op0, NULL_RTX, VOIDmode, EXPAND_WRITE); + + machine_mode outermode = TYPE_MODE (TREE_TYPE (op0)); + scalar_mode innermode = GET_MODE_INNER (outermode); + + rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL); + rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL); + + class expand_operand ops[3]; + enum insn_code icode = optab_handler (optab, outermode); + + if (icode != CODE_FOR_nothing) + { + pos = convert_to_mode (E_SImode, pos, 0); + + create_fixed_operand (&ops[0], src); + create_input_operand (&ops[1], value, innermode); + create_input_operand (&ops[2], pos, GET_MODE (pos)); + if (maybe_expand_insn (icode, 3, ops)) + { + emit_move_insn (target, src); + return; + } + } +} + static void expand_ABNORMAL_DISPATCHER (internal_fn, gcall *) { @@ -3253,6 +3288,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types, #define direct_fold_left_optab_supported_p direct_optab_supported_p #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p #define direct_check_ptrs_optab_supported_p direct_optab_supported_p +#define direct_vec_set_optab_supported_p direct_optab_supported_p /* Return the optab used by internal function FN. */ diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index 13e60828fcf..e6cfe1b6159 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -145,6 +145,8 @@ DEF_INTERNAL_OPTAB_FN (VCONDU, 0, vcondu, vec_condu) DEF_INTERNAL_OPTAB_FN (VCONDEQ, 0, vcondeq, vec_condeq) DEF_INTERNAL_OPTAB_FN (VCOND_MASK, 0, vcond_mask, vec_cond_mask) +DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set) + DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store) DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while) diff --git a/gcc/optabs.c b/gcc/optabs.c index 184827fdf4e..c8125670d2d 100644 --- a/gcc/optabs.c +++ b/gcc/optabs.c @@ -3841,6 +3841,23 @@ can_vcond_compare_p (enum rtx_code code, machine_mode value_mode, && insn_operand_matches (icode, 3, test); } +bool +can_vec_set_var_idx_p (enum tree_code code, machine_mode vec_mode, + machine_mode value_mode, machine_mode idx_mode) +{ + gcc_assert (code == VECTOR_TYPE); + + rtx reg1 = alloca_raw_REG (vec_mode, LAST_VIRTUAL_REGISTER + 1); + rtx reg2 = alloca_raw_REG (value_mode, LAST_VIRTUAL_REGISTER + 2); + rtx reg3 = alloca_raw_REG (idx_mode, LAST_VIRTUAL_REGISTER + 3); + + enum insn_code icode = optab_handler (vec_set_optab, vec_mode); + + return icode != CODE_FOR_nothing && insn_operand_matches (icode, 0, reg1) + && insn_operand_matches (icode, 1, reg2) + && insn_operand_matches (icode, 2, reg3); +} + /* This function is called when we are going to emit a compare instruction that compares the values found in X and Y, using the rtl operator COMPARISON. diff --git a/gcc/optabs.h b/gcc/optabs.h index 7c2ec257cb0..c018d127756 100644 --- a/gcc/optabs.h +++ b/gcc/optabs.h @@ -249,6 +249,9 @@ extern int can_compare_p (enum rtx_code, machine_mode, VALUE_MODE. */ extern bool can_vcond_compare_p (enum rtx_code, machine_mode, machine_mode); +extern bool can_vec_set_var_idx_p (enum tree_code, machine_mode, machine_mode, + machine_mode); + extern rtx prepare_operand (enum insn_code, rtx, int, machine_mode, machine_mode, int); /* Emit a pair of rtl insns to compare two rtx's and to jump -- 2.27.0.90.geebb51ba8c ^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251] 2020-09-18 6:17 [PATCH v2 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR Xiong Hu Luo @ 2020-09-18 6:17 ` Xiong Hu Luo 2020-09-18 20:19 ` Segher Boessenkool 2020-09-18 18:20 ` [PATCH v2 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR Segher Boessenkool 2020-09-21 8:31 ` Richard Biener 2 siblings, 1 reply; 24+ messages in thread From: Xiong Hu Luo @ 2020-09-18 6:17 UTC (permalink / raw) To: gcc-patches Cc: segher, dje.gcc, wschmidt, guojiufu, linkw, richard.guenther, richard.sandiford, Xiong Hu Luo vec_insert accepts 3 arguments, arg0 is input vector, arg1 is the value to be insert, arg2 is the place to insert arg1 to arg0. Current expander generates stxv+stwx+lxv if arg2 is variable instead of constant, which causes serious store hit load performance issue on Power. This patch tries 1) Build VIEW_CONVERT_EXPR for vec_insert (i, v, n) like v[n&3] = i to unify the gimple code, then expander could use vec_set_optab to expand. 2) Expand the IFN VEC_SET to fast instructions: lvsl+xxperm+xxsel. In this way, "vec_insert (i, v, n)" and "v[n&3] = i" won't be expanded too early in gimple stage if arg2 is variable, avoid generating store hit load instructions. For Power9 V4SI: addi 9,1,-16 rldic 6,6,2,60 stxv 34,-16(1) stwx 5,9,6 lxv 34,-16(1) => addis 9,2,.LC0@toc@ha addi 9,9,.LC0@toc@l mtvsrwz 33,5 lxv 32,0(9) sradi 9,6,2 addze 9,9 sldi 9,9,2 subf 9,9,6 subfic 9,9,3 sldi 9,9,2 subfic 9,9,20 lvsl 13,0,9 xxperm 33,33,45 xxperm 32,32,45 xxsel 34,34,33,32 Though instructions increase from 5 to 15, the performance is improved 60% in typical cases. Tested with V2DI, V2DF V4SI, V4SF, V8HI, V16QI on Power9-LE and Power8-BE, bootstrap tested pass. gcc/ChangeLog: 2020-09-18 Xionghu Luo <luoxhu@linux.ibm.com> * config/rs6000/altivec.md (altivec_lvsl_reg_<mode>2): Rename to (altivec_lvsl_reg_<mode>2) and extend to SDI mode. * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin): Ajdust variable index vec_insert to VIEW_CONVERT_EXPR. * config/rs6000/rs6000-protos.h (rs6000_expand_vector_set_var): New declare. * config/rs6000/rs6000.c (rs6000_expand_vector_set_var): New function. * config/rs6000/rs6000.md (FQHS): New mode iterator. (FD): New mode iterator. p8_mtvsrwz_v16qi<mode>2: New define_insn. p8_mtvsrd_v16qi<mode>2: New define_insn. * config/rs6000/vector.md: Add register operand2 match for vec_set index. * config/rs6000/vsx.md: Call gen_altivec_lvsl_reg_di2. gcc/testsuite/ChangeLog: 2020-09-18 Xionghu Luo <luoxhu@linux.ibm.com> * gcc.target/powerpc/pr79251.c: New test. * gcc.target/powerpc/pr79251-run.c: New test. * gcc.target/powerpc/pr79251.h: New header. --- gcc/config/rs6000/altivec.md | 4 +- gcc/config/rs6000/rs6000-c.c | 22 ++- gcc/config/rs6000/rs6000-protos.h | 1 + gcc/config/rs6000/rs6000.c | 146 ++++++++++++++++++ gcc/config/rs6000/rs6000.md | 19 +++ gcc/config/rs6000/vector.md | 19 ++- gcc/config/rs6000/vsx.md | 2 +- .../gcc.target/powerpc/pr79251-run.c | 29 ++++ gcc/testsuite/gcc.target/powerpc/pr79251.c | 15 ++ gcc/testsuite/gcc.target/powerpc/pr79251.h | 19 +++ 10 files changed, 257 insertions(+), 19 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251-run.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.h diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 0a2e634d6b0..66b636059a6 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -2772,10 +2772,10 @@ (define_expand "altivec_lvsl" DONE; }) -(define_insn "altivec_lvsl_reg" +(define_insn "altivec_lvsl_reg_<mode>2" [(set (match_operand:V16QI 0 "altivec_register_operand" "=v") (unspec:V16QI - [(match_operand:DI 1 "gpc_reg_operand" "b")] + [(match_operand:SDI 1 "gpc_reg_operand" "b")] UNSPEC_LVSL_REG))] "TARGET_ALTIVEC" "lvsl %0,0,%1" diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c index 2fad3d94706..78abe49c833 100644 --- a/gcc/config/rs6000/rs6000-c.c +++ b/gcc/config/rs6000/rs6000-c.c @@ -1509,9 +1509,7 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, tree arg1; tree arg2; tree arg1_type; - tree arg1_inner_type; tree decl, stmt; - tree innerptrtype; machine_mode mode; /* No second or third arguments. */ @@ -1563,8 +1561,13 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, return build_call_expr (call, 3, arg1, arg0, arg2); } - /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0. */ - arg1_inner_type = TREE_TYPE (arg1_type); + /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0 with + VIEW_CONVERT_EXPR. i.e.: + D.3192 = v1; + _1 = n & 3; + VIEW_CONVERT_EXPR<int[4]>(D.3192)[_1] = i; + v1 = D.3192; + D.3194 = v1; */ if (TYPE_VECTOR_SUBPARTS (arg1_type) == 1) arg2 = build_int_cst (TREE_TYPE (arg2), 0); else @@ -1593,15 +1596,8 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, SET_EXPR_LOCATION (stmt, loc); stmt = build1 (COMPOUND_LITERAL_EXPR, arg1_type, stmt); } - - innerptrtype = build_pointer_type (arg1_inner_type); - - stmt = build_unary_op (loc, ADDR_EXPR, stmt, 0); - stmt = convert (innerptrtype, stmt); - stmt = build_binary_op (loc, PLUS_EXPR, stmt, arg2, 1); - stmt = build_indirect_ref (loc, stmt, RO_NULL); - stmt = build2 (MODIFY_EXPR, TREE_TYPE (stmt), stmt, - convert (TREE_TYPE (stmt), arg0)); + stmt = build_array_ref (loc, stmt, arg2); + stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt, arg0); stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl); return stmt; } diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 28e859f4381..f6f8bd65c2f 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -58,6 +58,7 @@ extern bool rs6000_split_128bit_ok_p (rtx []); extern void rs6000_expand_float128_convert (rtx, rtx, bool); extern void rs6000_expand_vector_init (rtx, rtx); extern void rs6000_expand_vector_set (rtx, rtx, int); +extern void rs6000_expand_vector_set_var (rtx, rtx, rtx, rtx); extern void rs6000_expand_vector_extract (rtx, rtx, rtx); extern void rs6000_split_vec_extract_var (rtx, rtx, rtx, rtx, rtx); extern rtx rs6000_adjust_vec_address (rtx, rtx, rtx, rtx, machine_mode); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index fe93cf6ff2b..d22d7999a61 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -6788,6 +6788,152 @@ rs6000_expand_vector_set (rtx target, rtx val, int elt) emit_insn (gen_rtx_SET (target, x)); } +/* Insert value from VEC into idx of TARGET. */ + +void +rs6000_expand_vector_set_var (rtx target, rtx vec, rtx val, rtx idx) +{ + machine_mode mode = GET_MODE (vec); + + if (VECTOR_MEM_VSX_P (mode) && CONST_INT_P (idx)) + gcc_unreachable (); + else if (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx) + && TARGET_DIRECT_MOVE_64BIT) + { + gcc_assert (GET_MODE (idx) == E_SImode); + machine_mode inner_mode = GET_MODE (val); + HOST_WIDE_INT mode_mask = GET_MODE_MASK (inner_mode); + + rtx tmp = gen_reg_rtx (GET_MODE (idx)); + if (GET_MODE_SIZE (inner_mode) == 8) + { + if (!BYTES_BIG_ENDIAN) + { + /* idx = 1 - idx. */ + emit_insn (gen_subsi3 (tmp, GEN_INT (1), idx)); + /* idx = idx * 8. */ + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (3))); + /* idx = 16 - idx. */ + emit_insn (gen_subsi3 (tmp, GEN_INT (16), tmp)); + } + else + { + emit_insn (gen_ashlsi3 (tmp, idx, GEN_INT (3))); + emit_insn (gen_subsi3 (tmp, GEN_INT (16), tmp)); + } + } + else if (GET_MODE_SIZE (inner_mode) == 4) + { + if (!BYTES_BIG_ENDIAN) + { + /* idx = 3 - idx. */ + emit_insn (gen_subsi3 (tmp, GEN_INT (3), idx)); + /* idx = idx * 4. */ + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (2))); + /* idx = 20 - idx. */ + emit_insn (gen_subsi3 (tmp, GEN_INT (20), tmp)); + } + else + { + emit_insn (gen_ashlsi3 (tmp, idx, GEN_INT (2))); + emit_insn (gen_subsi3 (tmp, GEN_INT (20), tmp)); + } + } + else if (GET_MODE_SIZE (inner_mode) == 2) + { + if (!BYTES_BIG_ENDIAN) + { + /* idx = 7 - idx. */ + emit_insn (gen_subsi3 (tmp, GEN_INT (7), idx)); + /* idx = idx * 2. */ + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (1))); + /* idx = 22 - idx. */ + emit_insn (gen_subsi3 (tmp, GEN_INT (22), tmp)); + } + else + { + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (1))); + emit_insn (gen_subsi3 (tmp, GEN_INT (22), idx)); + } + } + else if (GET_MODE_SIZE (inner_mode) == 1) + if (!BYTES_BIG_ENDIAN) + emit_insn (gen_addsi3 (tmp, idx, GEN_INT (8))); + else + emit_insn (gen_subsi3 (tmp, GEN_INT (23), idx)); + else + gcc_unreachable (); + + /* lxv vs32, mask. + DImode: 0xffffffffffffffff0000000000000000 + SImode: 0x00000000ffffffff0000000000000000 + HImode: 0x000000000000ffff0000000000000000. + QImode: 0x00000000000000ff0000000000000000. */ + rtx mask = gen_reg_rtx (V16QImode); + rtx mask_v2di = gen_reg_rtx (V2DImode); + rtvec v = rtvec_alloc (2); + if (!BYTES_BIG_ENDIAN) + { + RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (DImode, 0); + RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (DImode, mode_mask); + } + else + { + RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (DImode, mode_mask); + RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (DImode, 0); + } + emit_insn ( + gen_vec_initv2didi (mask_v2di, gen_rtx_PARALLEL (V2DImode, v))); + rtx sub_mask = simplify_gen_subreg (V16QImode, mask_v2di, V2DImode, 0); + emit_insn (gen_rtx_SET (mask, sub_mask)); + + /* mtvsrd[wz] f0,val. */ + rtx val_v16qi = gen_reg_rtx (V16QImode); + switch (inner_mode) + { + default: + gcc_unreachable (); + break; + case E_QImode: + emit_insn (gen_p8_mtvsrwz_v16qiqi2 (val_v16qi, val)); + break; + case E_HImode: + emit_insn (gen_p8_mtvsrwz_v16qihi2 (val_v16qi, val)); + break; + case E_SImode: + emit_insn (gen_p8_mtvsrwz_v16qisi2 (val_v16qi, val)); + break; + case E_SFmode: + emit_insn (gen_p8_mtvsrwz_v16qisf2 (val_v16qi, val)); + break; + case E_DImode: + emit_insn (gen_p8_mtvsrd_v16qidi2 (val_v16qi, val)); + break; + case E_DFmode: + emit_insn (gen_p8_mtvsrd_v16qidf2 (val_v16qi, val)); + break; + } + + /* lvsl v1,0,idx. */ + rtx pcv = gen_reg_rtx (V16QImode); + emit_insn (gen_altivec_lvsl_reg_si2 (pcv, tmp)); + + /* xxperm vs0,vs0,vs33. */ + /* xxperm vs32,vs32,vs33. */ + rtx val_perm = gen_reg_rtx (V16QImode); + rtx mask_perm = gen_reg_rtx (V16QImode); + emit_insn ( + gen_altivec_vperm_v8hiv16qi (val_perm, val_v16qi, val_v16qi, pcv)); + emit_insn (gen_altivec_vperm_v8hiv16qi (mask_perm, mask, mask, pcv)); + + rtx sub_target = simplify_gen_subreg (V16QImode, vec, mode, 0); + emit_insn (gen_rtx_SET (target, sub_target)); + + /* xxsel vs34,vs34,vs0,vs32. */ + emit_insn (gen_vector_select_v16qi (target, target, val_perm, mask_perm)); + } +} + /* Extract field ELT from VEC into TARGET. */ void diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index 43b620ae1c0..b02fda836d4 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -8713,6 +8713,25 @@ (define_insn "p8_mtvsrwz" "mtvsrwz %x0,%1" [(set_attr "type" "mftgpr")]) +(define_mode_iterator FQHS [SF QI HI SI]) +(define_mode_iterator FD [DF DI]) + +(define_insn "p8_mtvsrwz_v16qi<mode>2" + [(set (match_operand:V16QI 0 "register_operand" "=wa") + (unspec:V16QI [(match_operand:FQHS 1 "register_operand" "r")] + UNSPEC_P8V_MTVSRWZ))] + "TARGET_POWERPC64 && TARGET_DIRECT_MOVE" + "mtvsrwz %x0,%1" + [(set_attr "type" "mftgpr")]) + +(define_insn "p8_mtvsrd_v16qi<mode>2" + [(set (match_operand:V16QI 0 "register_operand" "=wa") + (unspec:V16QI [(match_operand:FD 1 "register_operand" "r")] + UNSPEC_P8V_MTVSRD))] + "TARGET_POWERPC64 && TARGET_DIRECT_MOVE" + "mtvsrd %x0,%1" + [(set_attr "type" "mftgpr")]) + (define_insn_and_split "reload_fpr_from_gpr<mode>" [(set (match_operand:FMOVE64X 0 "register_operand" "=d") (unspec:FMOVE64X [(match_operand:FMOVE64X 1 "register_operand" "r")] diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md index 796345c80d3..28e59c1c995 100644 --- a/gcc/config/rs6000/vector.md +++ b/gcc/config/rs6000/vector.md @@ -1227,11 +1227,24 @@ (define_expand "vec_init<mode><VEC_base_l>" (define_expand "vec_set<mode>" [(match_operand:VEC_E 0 "vlogical_operand") (match_operand:<VEC_base> 1 "register_operand") - (match_operand 2 "const_int_operand")] + (match_operand 2 "reg_or_cint_operand")] "VECTOR_MEM_ALTIVEC_OR_VSX_P (<MODE>mode)" { - rs6000_expand_vector_set (operands[0], operands[1], INTVAL (operands[2])); - DONE; + if (CONST_INT_P (operands[2])) + { + rs6000_expand_vector_set (operands[0], operands[1], INTVAL (operands[2])); + DONE; + } + else + { + rtx target = gen_reg_rtx (V16QImode); + rs6000_expand_vector_set_var (target, operands[0], operands[1], + operands[2]); + rtx sub_target + = simplify_gen_subreg (GET_MODE (operands[0]), target, V16QImode, 0); + emit_insn (gen_rtx_SET (operands[0], sub_target)); + DONE; + } }) (define_expand "vec_extract<mode><VEC_base_l>" diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index dd750210758..7e82690d12d 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -5349,7 +5349,7 @@ (define_expand "xl_len_r" rtx rtx_vtmp = gen_reg_rtx (V16QImode); rtx tmp = gen_reg_rtx (DImode); - emit_insn (gen_altivec_lvsl_reg (shift_mask, operands[2])); + emit_insn (gen_altivec_lvsl_reg_di2 (shift_mask, operands[2])); emit_insn (gen_ashldi3 (tmp, operands[2], GEN_INT (56))); emit_insn (gen_lxvll (rtx_vtmp, operands[1], tmp)); emit_insn (gen_altivec_vperm_v8hiv16qi (operands[0], rtx_vtmp, rtx_vtmp, diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251-run.c b/gcc/testsuite/gcc.target/powerpc/pr79251-run.c new file mode 100644 index 00000000000..840f6712ad2 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr79251-run.c @@ -0,0 +1,29 @@ +/* { dg-do run { target { lp64 && p9vector_hw } } } */ +/* { dg-options "-O2 -mdejagnu-cpu=power9 -maltivec" } */ + +#include <stddef.h> +#include <altivec.h> +#include "pr79251.h" + +TEST_VEC_INSERT_ALL (test) + +#define run_test(TYPE, num) \ + { \ + vector TYPE v; \ + vector TYPE u = {0x0}; \ + for (long k = 0; k < 16 / sizeof (TYPE); k++) \ + v[k] = 0xaa; \ + for (long k = 0; k < 16 / sizeof (TYPE); k++) \ + { \ + u = test##num (v, 254, k); \ + if (u[k] != (TYPE) 254) \ + __builtin_abort (); \ + } \ + } + +int +main (void) +{ + TEST_VEC_INSERT_ALL (run_test) + return 0; +} diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251.c b/gcc/testsuite/gcc.target/powerpc/pr79251.c new file mode 100644 index 00000000000..8124f503df9 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr79251.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target powerpc_p9vector_ok } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-options "-O2 -mdejagnu-cpu=power9 -maltivec" } */ + +#include <stddef.h> +#include <altivec.h> +#include "pr79251.h" + +TEST_VEC_INSERT_ALL (test) + +/* { dg-final { scan-assembler-not {\mstxw\M} } } */ +/* { dg-final { scan-assembler-times {\mlvsl\M} 10 } } */ +/* { dg-final { scan-assembler-times {\mxxperm\M} 20 } } */ +/* { dg-final { scan-assembler-times {\mxxsel\M} 10 } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251.h b/gcc/testsuite/gcc.target/powerpc/pr79251.h new file mode 100644 index 00000000000..609371c96cd --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr79251.h @@ -0,0 +1,19 @@ + +#define test(TYPE, num) \ + __attribute__ ((noinline, noclone)) \ + vector TYPE test##num (vector TYPE v, TYPE i, unsigned int n) \ + { \ + return vec_insert (i, v, n); \ + } + +#define TEST_VEC_INSERT_ALL(T) \ + T (char, 0) \ + T (unsigned char, 1) \ + T (short, 2) \ + T (unsigned short, 3) \ + T (int, 4) \ + T (unsigned int, 5) \ + T (long long, 6) \ + T (unsigned long long, 7) \ + T (float, 8) \ + T (double, 9) -- 2.27.0.90.geebb51ba8c ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251] 2020-09-18 6:17 ` [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251] Xiong Hu Luo @ 2020-09-18 20:19 ` Segher Boessenkool 2020-09-24 8:21 ` xionghu luo 0 siblings, 1 reply; 24+ messages in thread From: Segher Boessenkool @ 2020-09-18 20:19 UTC (permalink / raw) To: Xiong Hu Luo Cc: gcc-patches, dje.gcc, wschmidt, guojiufu, linkw, richard.guenther, richard.sandiford Hi! On Fri, Sep 18, 2020 at 01:17:41AM -0500, Xiong Hu Luo wrote: > vec_insert accepts 3 arguments, arg0 is input vector, arg1 is the value > to be insert, arg2 is the place to insert arg1 to arg0. Current expander > generates stxv+stwx+lxv if arg2 is variable instead of constant, which > causes serious store hit load performance issue on Power. This patch tries > 1) Build VIEW_CONVERT_EXPR for vec_insert (i, v, n) like v[n&3] = i to > unify the gimple code, then expander could use vec_set_optab to expand. > 2) Expand the IFN VEC_SET to fast instructions: lvsl+xxperm+xxsel. > In this way, "vec_insert (i, v, n)" and "v[n&3] = i" won't be expanded too > early in gimple stage if arg2 is variable, avoid generating store hit load > instructions. Sounds good. > For Power9 V4SI: > addi 9,1,-16 > rldic 6,6,2,60 > stxv 34,-16(1) > stwx 5,9,6 > lxv 34,-16(1) > => > addis 9,2,.LC0@toc@ha > addi 9,9,.LC0@toc@l > mtvsrwz 33,5 > lxv 32,0(9) > sradi 9,6,2 > addze 9,9 These last two insns are a signed modulo. That seems wrong. > sldi 9,9,2 > subf 9,9,6 > subfic 9,9,3 > sldi 9,9,2 > subfic 9,9,20 This should be optimised quite a bit. Why isn't it? Is this -O0 output? That isn't interesting, please show -O2 instead. > lvsl 13,0,9 > xxperm 33,33,45 > xxperm 32,32,45 > xxsel 34,34,33,32 It feels like this can be done in fewer insns. Hrm. Since you already rotate one of the data and the mask, does it work better if you rotate to some known position and then use vinsertw (and then rotate again)? > Though instructions increase from 5 to 15, the performance is improved > 60% in typical cases. Yeah, great :-) > Tested with V2DI, V2DF V4SI, V4SF, V8HI, V16QI That is what the new testcases do, right? > * config/rs6000/altivec.md (altivec_lvsl_reg_<mode>2): Rename to > (altivec_lvsl_reg_<mode>2) and extend to SDI mode. (wrong leading space) You say the same thing is renamed to the same thing (and not how we normally do that). "extend to SDI mode"... Please rephrase, "extend" usually means sign- or zero-extension in RTL. But, see below. > * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin): > Ajdust variable index vec_insert to VIEW_CONVERT_EXPR. "Adjust", but this changelog entry does not tell me what the change is. > * config/rs6000/rs6000-protos.h (rs6000_expand_vector_set_var): > New declare. (declaration) > * config/rs6000/vsx.md: Call gen_altivec_lvsl_reg_di2. From some specific define_something, surely? > -(define_insn "altivec_lvsl_reg" > +(define_insn "altivec_lvsl_reg_<mode>2" > [(set (match_operand:V16QI 0 "altivec_register_operand" "=v") > (unspec:V16QI > - [(match_operand:DI 1 "gpc_reg_operand" "b")] > + [(match_operand:SDI 1 "gpc_reg_operand" "b")] > UNSPEC_LVSL_REG))] > "TARGET_ALTIVEC" > "lvsl %0,0,%1" "DI" is wrong, yes; but so is SDI. It should just be P here, afaics? SI on -m32, DI on -m64. > +/* Insert value from VEC into idx of TARGET. */ What does idx count? Bytes? Things the same size as "value"? What does "value" mean, anyway? Do you mean "VAL" (we use upercase for all parameter names; "IDX" as well). > +void > +rs6000_expand_vector_set_var (rtx target, rtx vec, rtx val, rtx idx) > +{ > + machine_mode mode = GET_MODE (vec); > + > + if (VECTOR_MEM_VSX_P (mode) && CONST_INT_P (idx)) > + gcc_unreachable (); > + else if (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx) > + && TARGET_DIRECT_MOVE_64BIT) You already handled the CONST_INT_P case a line before, CONST_INT_P is always true here. This actually does *nothing* if this if() isn't true, which cannot be right. Maybe you want to replace these four lines with just gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)); and handle !TARGET_DIRECT_MOVE_64BIT some way as well (maybe this cannot be called then; assert it as well, then, or at least document it is required). > + if (!BYTES_BIG_ENDIAN) > + { > + /* idx = 1 - idx. */ > + emit_insn (gen_subsi3 (tmp, GEN_INT (1), idx)); > + /* idx = idx * 8. */ > + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (3))); > + /* idx = 16 - idx. */ > + emit_insn (gen_subsi3 (tmp, GEN_INT (16), tmp)); > + } Maybe combine this here already? 16 - 8*(1 - idx) is the same as 8 + 8*idx. Which might not be correct at all? (GEN_INT (1) is const1_rtx, btw.) > + else if (GET_MODE_SIZE (inner_mode) == 4) > + { > + if (!BYTES_BIG_ENDIAN) > + { > + /* idx = 3 - idx. */ > + emit_insn (gen_subsi3 (tmp, GEN_INT (3), idx)); > + /* idx = idx * 4. */ > + emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (2))); > + /* idx = 20 - idx. */ > + emit_insn (gen_subsi3 (tmp, GEN_INT (20), tmp)); > + } > + else > + { > + emit_insn (gen_ashlsi3 (tmp, idx, GEN_INT (2))); > + emit_insn (gen_subsi3 (tmp, GEN_INT (20), tmp)); > + } > + } Please have all the different sizes in just one block, and compute the contants you need there. > + /* lxv vs32, mask. > + DImode: 0xffffffffffffffff0000000000000000 > + SImode: 0x00000000ffffffff0000000000000000 > + HImode: 0x000000000000ffff0000000000000000. > + QImode: 0x00000000000000ff0000000000000000. */ > + rtx mask = gen_reg_rtx (V16QImode); > + rtx mask_v2di = gen_reg_rtx (V2DImode); > + rtvec v = rtvec_alloc (2); > + if (!BYTES_BIG_ENDIAN) > + { > + RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (DImode, 0); > + RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (DImode, mode_mask); > + } > + else > + { > + RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (DImode, mode_mask); > + RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (DImode, 0); > + } > + emit_insn ( > + gen_vec_initv2didi (mask_v2di, gen_rtx_PARALLEL (V2DImode, v))); Please use two (or more) statements if things like this do not fit. If it is hard to layout code, you almost always should factor the code a bit, and it will be better for it. > + /* mtvsrd[wz] f0,val. */ > + rtx val_v16qi = gen_reg_rtx (V16QImode); > + switch (inner_mode) So it looks like we want a parameterized name for p8_mtvsrwz, and for p8_mtvsrd as well. And then you can do the switch as just an if() on the size of inner_mode (whether it is <= 4 or not). > + /* lvsl v1,0,idx. */ > + rtx pcv = gen_reg_rtx (V16QImode); > + emit_insn (gen_altivec_lvsl_reg_si2 (pcv, tmp)); > + > + /* xxperm vs0,vs0,vs33. */ > + /* xxperm vs32,vs32,vs33. */ But you generate vperm, instead! vperm has a parameter more, and is much easier to understand because of that. > +(define_mode_iterator FQHS [SF QI HI SI]) > +(define_mode_iterator FD [DF DI]) Those are pretty bad names. The concept does not make much sense either? SF is normally stored in registers as DF, so that makes it even weirder. Why have floating point modes here at all? > +(define_insn "p8_mtvsrwz_v16qi<mode>2" > + [(set (match_operand:V16QI 0 "register_operand" "=wa") > + (unspec:V16QI [(match_operand:FQHS 1 "register_operand" "r")] > + UNSPEC_P8V_MTVSRWZ))] > + "TARGET_POWERPC64 && TARGET_DIRECT_MOVE" > + "mtvsrwz %x0,%1" > + [(set_attr "type" "mftgpr")]) This insn does not require TARGET_POWERPC64, it only looks at the low 32 bits of a GPR after all. > +(define_insn "p8_mtvsrd_v16qi<mode>2" > + [(set (match_operand:V16QI 0 "register_operand" "=wa") > + (unspec:V16QI [(match_operand:FD 1 "register_operand" "r")] > + UNSPEC_P8V_MTVSRD))] > + "TARGET_POWERPC64 && TARGET_DIRECT_MOVE" > + "mtvsrd %x0,%1" > + [(set_attr "type" "mftgpr")]) All this can be written without unspecs (and it should bet like all existing patterns that use it!) Unspec is the great optimisation inhibitor. What do you need new patterns for, anyway? > (define_expand "vec_set<mode>" [...] > + if (CONST_INT_P (operands[2])) > + { > + rs6000_expand_vector_set (operands[0], operands[1], INTVAL (operands[2])); > + DONE; > + } > + else > + { > + rtx target = gen_reg_rtx (V16QImode); > + rs6000_expand_vector_set_var (target, operands[0], operands[1], > + operands[2]); > + rtx sub_target > + = simplify_gen_subreg (GET_MODE (operands[0]), target, V16QImode, 0); > + emit_insn (gen_rtx_SET (operands[0], sub_target)); > + DONE; > + } Please handle this in rs6000_expand_vector_set, instead. It is fine to call rs6000_vector_set_var early in that, for example. > diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md > index dd750210758..7e82690d12d 100644 > --- a/gcc/config/rs6000/vsx.md > +++ b/gcc/config/rs6000/vsx.md > @@ -5349,7 +5349,7 @@ (define_expand "xl_len_r" > rtx rtx_vtmp = gen_reg_rtx (V16QImode); > rtx tmp = gen_reg_rtx (DImode); > > - emit_insn (gen_altivec_lvsl_reg (shift_mask, operands[2])); > + emit_insn (gen_altivec_lvsl_reg_di2 (shift_mask, operands[2])); So this becomes emit_insn (gen_altivec_lvsl_reg (DImode, shift_mask, operands[2])); if you use parameterized names. Tests... You don't need lp64 anywhere, but then you probably need to disallow the 64-bit tests. That may be hard? When you resend, please split it into separate patches for separate things. Small patches are fine, no, *good* even! Segher ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251] 2020-09-18 20:19 ` Segher Boessenkool @ 2020-09-24 8:21 ` xionghu luo 2020-09-24 13:27 ` Richard Biener 0 siblings, 1 reply; 24+ messages in thread From: xionghu luo @ 2020-09-24 8:21 UTC (permalink / raw) To: Segher Boessenkool Cc: gcc-patches, dje.gcc, wschmidt, guojiufu, linkw, richard.guenther, richard.sandiford [-- Attachment #1: Type: text/plain, Size: 586 bytes --] Hi Segher, The attached two patches are updated and split from "[PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]" as your comments. [PATCH v3 2/3] rs6000: Fix lvsl&lvsr mode and change rs6000_expand_vector_set param This one is preparation work of fix lvsl&lvsr arg mode and rs6000_expand_vector_set parameter support for both constant and variable index input. [PATCH v3 2/3] rs6000: Support variable insert and Expand vec_insert in expander [PR79251] This one is Building VIEW_CONVERT_EXPR and expand the IFN VEC_SET to fast. Thanks, Xionghu [-- Attachment #2: v3-0002-rs6000-Fix-lvsl-lvsr-mode-and-change-rs6000_expan.patch --] [-- Type: text/plain, Size: 6880 bytes --] From 9d74c488ad3c7cad8c276cc49749ec05158d1e96 Mon Sep 17 00:00:00 2001 From: Xiong Hu Luo <luoxhu@linux.ibm.com> Date: Thu, 24 Sep 2020 00:52:35 -0500 Subject: [PATCH v3 2/3] rs6000: Fix lvsl&lvsr mode and change rs6000_expand_vector_set param lvsl and lvsr looks only at the low 4 bits, use SI for index param. rs6000_expand_vector_set could accept insert either to constant position or variable position, so change the operand to reg_or_cint_operand. gcc/ChangeLog: 2020-09-24 Xionghu Luo <luoxhu@linux.ibm.com> * config/rs6000/altivec.md (altivec_lvsl_reg): Change to SImode. (altivec_lvsr_reg): Likewise. * config/rs6000/rs6000-call.c (altivec_expand_vec_set_builtin): Change call param 2 from type int to rtx. * config/rs6000/rs6000-protos.h (rs6000_expand_vector_set): Likewise. * config/rs6000/rs6000.c (rs6000_expand_vector_init): Change call param 2 from type int to rtx. (rs6000_expand_vector_set): Likewise. * config/rs6000/vector.md (vec_set<mode>): Support both constant and variable index vec_set. * config/rs6000/vsx.md: Call gen_altivec_lvsl_reg with SImode. --- gcc/config/rs6000/altivec.md | 4 ++-- gcc/config/rs6000/rs6000-call.c | 2 +- gcc/config/rs6000/rs6000-protos.h | 2 +- gcc/config/rs6000/rs6000.c | 16 +++++++++------- gcc/config/rs6000/vector.md | 4 ++-- gcc/config/rs6000/vsx.md | 3 ++- 6 files changed, 17 insertions(+), 14 deletions(-) diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md index 0a2e634d6b0..a1c06c9ab8c 100644 --- a/gcc/config/rs6000/altivec.md +++ b/gcc/config/rs6000/altivec.md @@ -2775,7 +2775,7 @@ (define_expand "altivec_lvsl" (define_insn "altivec_lvsl_reg" [(set (match_operand:V16QI 0 "altivec_register_operand" "=v") (unspec:V16QI - [(match_operand:DI 1 "gpc_reg_operand" "b")] + [(match_operand:SI 1 "gpc_reg_operand" "b")] UNSPEC_LVSL_REG))] "TARGET_ALTIVEC" "lvsl %0,0,%1" @@ -2813,7 +2813,7 @@ (define_expand "altivec_lvsr" (define_insn "altivec_lvsr_reg" [(set (match_operand:V16QI 0 "altivec_register_operand" "=v") (unspec:V16QI - [(match_operand:DI 1 "gpc_reg_operand" "b")] + [(match_operand:SI 1 "gpc_reg_operand" "b")] UNSPEC_LVSR_REG))] "TARGET_ALTIVEC" "lvsr %0,0,%1" diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c index e39cfcf672b..51f278933bd 100644 --- a/gcc/config/rs6000/rs6000-call.c +++ b/gcc/config/rs6000/rs6000-call.c @@ -10655,7 +10655,7 @@ altivec_expand_vec_set_builtin (tree exp) op0 = force_reg (tmode, op0); op1 = force_reg (mode1, op1); - rs6000_expand_vector_set (op0, op1, elt); + rs6000_expand_vector_set (op0, op1, GEN_INT (elt)); return op0; } diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 28e859f4381..6a0fbc3ba2e 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -57,7 +57,7 @@ extern bool rs6000_move_128bit_ok_p (rtx []); extern bool rs6000_split_128bit_ok_p (rtx []); extern void rs6000_expand_float128_convert (rtx, rtx, bool); extern void rs6000_expand_vector_init (rtx, rtx); -extern void rs6000_expand_vector_set (rtx, rtx, int); +extern void rs6000_expand_vector_set (rtx, rtx, rtx); extern void rs6000_expand_vector_extract (rtx, rtx, rtx); extern void rs6000_split_vec_extract_var (rtx, rtx, rtx, rtx, rtx); extern rtx rs6000_adjust_vec_address (rtx, rtx, rtx, rtx, machine_mode); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index fe93cf6ff2b..c46ec14f060 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -6669,7 +6669,8 @@ rs6000_expand_vector_init (rtx target, rtx vals) rs6000_expand_vector_init (target, copy); /* Insert variable. */ - rs6000_expand_vector_set (target, XVECEXP (vals, 0, one_var), one_var); + rs6000_expand_vector_set (target, XVECEXP (vals, 0, one_var), + GEN_INT (one_var)); return; } @@ -6683,10 +6684,10 @@ rs6000_expand_vector_init (rtx target, rtx vals) emit_move_insn (target, mem); } -/* Set field ELT of TARGET to VAL. */ +/* Set field ELT_RTX of TARGET to VAL. */ void -rs6000_expand_vector_set (rtx target, rtx val, int elt) +rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx) { machine_mode mode = GET_MODE (target); machine_mode inner_mode = GET_MODE_INNER (mode); @@ -6700,7 +6701,6 @@ rs6000_expand_vector_set (rtx target, rtx val, int elt) if (VECTOR_MEM_VSX_P (mode)) { rtx insn = NULL_RTX; - rtx elt_rtx = GEN_INT (elt); if (mode == V2DFmode) insn = gen_vsx_set_v2df (target, target, val, elt_rtx); @@ -6727,8 +6727,11 @@ rs6000_expand_vector_set (rtx target, rtx val, int elt) } } + gcc_assert (CONST_INT_P (elt_rtx)); + /* Simplify setting single element vectors like V1TImode. */ - if (GET_MODE_SIZE (mode) == GET_MODE_SIZE (inner_mode) && elt == 0) + if (GET_MODE_SIZE (mode) == GET_MODE_SIZE (inner_mode) + && INTVAL (elt_rtx) == 0) { emit_move_insn (target, gen_lowpart (mode, val)); return; @@ -6751,8 +6754,7 @@ rs6000_expand_vector_set (rtx target, rtx val, int elt) /* Set permute mask to insert element into target. */ for (i = 0; i < width; ++i) - XVECEXP (mask, 0, elt*width + i) - = GEN_INT (i + 0x10); + XVECEXP (mask, 0, INTVAL (elt_rtx) * width + i) = GEN_INT (i + 0x10); x = gen_rtx_CONST_VECTOR (V16QImode, XVEC (mask, 0)); if (BYTES_BIG_ENDIAN) diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md index 796345c80d3..7aab1887cf5 100644 --- a/gcc/config/rs6000/vector.md +++ b/gcc/config/rs6000/vector.md @@ -1227,10 +1227,10 @@ (define_expand "vec_init<mode><VEC_base_l>" (define_expand "vec_set<mode>" [(match_operand:VEC_E 0 "vlogical_operand") (match_operand:<VEC_base> 1 "register_operand") - (match_operand 2 "const_int_operand")] + (match_operand 2 "reg_or_cint_operand")] "VECTOR_MEM_ALTIVEC_OR_VSX_P (<MODE>mode)" { - rs6000_expand_vector_set (operands[0], operands[1], INTVAL (operands[2])); + rs6000_expand_vector_set (operands[0], operands[1], operands[2]); DONE; }) diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md index dd750210758..96bfeb63e0d 100644 --- a/gcc/config/rs6000/vsx.md +++ b/gcc/config/rs6000/vsx.md @@ -5349,7 +5349,8 @@ (define_expand "xl_len_r" rtx rtx_vtmp = gen_reg_rtx (V16QImode); rtx tmp = gen_reg_rtx (DImode); - emit_insn (gen_altivec_lvsl_reg (shift_mask, operands[2])); + rtx op2_si = gen_rtx_REG (SImode, reg_or_subregno (operands[2])); + emit_insn (gen_altivec_lvsl_reg (shift_mask, op2_si)); emit_insn (gen_ashldi3 (tmp, operands[2], GEN_INT (56))); emit_insn (gen_lxvll (rtx_vtmp, operands[1], tmp)); emit_insn (gen_altivec_vperm_v8hiv16qi (operands[0], rtx_vtmp, rtx_vtmp, -- 2.27.0.90.geebb51ba8c [-- Attachment #3: v3-0003-rs6000-Support-variable-insert-and-Expand-vec_ins.patch --] [-- Type: text/plain, Size: 11110 bytes --] From 0946aae4af8286a002a5625034fa7195628f2004 Mon Sep 17 00:00:00 2001 From: Xiong Hu Luo <luoxhu@linux.ibm.com> Date: Thu, 24 Sep 2020 01:17:20 -0500 Subject: [PATCH v3 3/3] rs6000: Support variable insert and Expand vec_insert in expander [PR79251] vec_insert accepts 3 arguments, arg0 is input vector, arg1 is the value to be insert, arg2 is the place to insert arg1 to arg0. Current expander generates stxv+stwx+lxv if arg2 is variable instead of constant, which causes serious store hit load performance issue on Power. This patch tries 1) Build VIEW_CONVERT_EXPR for vec_insert (i, v, n) like v[n&3] = i to unify the gimple code, then expander could use vec_set_optab to expand. 2) Expand the IFN VEC_SET to fast instructions: lvsr+insert+lvsl. In this way, "vec_insert (i, v, n)" and "v[n&3] = i" won't be expanded too early in gimple stage if arg2 is variable, avoid generating store hit load instructions. For Power9 V4SI: addi 9,1,-16 rldic 6,6,2,60 stxv 34,-16(1) stwx 5,9,6 lxv 34,-16(1) => rlwinm 6,6,2,28,29 mtvsrwz 0,5 lvsr 1,0,6 lvsl 0,0,6 xxperm 34,34,33 xxinsertw 34,0,12 xxperm 34,34,32 Though instructions increase from 5 to 7, the performance is improved 60% in typical cases. Tested with V2DI, V2DF V4SI, V4SF, V8HI, V16QI on Power9-LE. gcc/ChangeLog: 2020-09-24 Xionghu Luo <luoxhu@linux.ibm.com> * config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin): Ajdust variable index vec_insert from address dereference to ARRAY_REF(VIEW_CONVERT_EXPR) tree expression. * config/rs6000/rs6000-protos.h (rs6000_expand_vector_set_var): New declaration. * config/rs6000/rs6000.c (rs6000_expand_vector_set_var): New function. * config/rs6000/vector.md (vec_set<mode>): Support both constant and variable index vec_set. gcc/testsuite/ChangeLog: 2020-09-24 Xionghu Luo <luoxhu@linux.ibm.com> * gcc.target/powerpc/pr79251.c: New test. * gcc.target/powerpc/pr79251-run.c: New test. * gcc.target/powerpc/pr79251.h: New header. --- gcc/config/rs6000/rs6000-c.c | 22 ++++---- gcc/config/rs6000/rs6000-protos.h | 1 + gcc/config/rs6000/rs6000.c | 50 +++++++++++++++++++ .../gcc.target/powerpc/pr79251-run.c | 29 +++++++++++ gcc/testsuite/gcc.target/powerpc/pr79251.c | 18 +++++++ gcc/testsuite/gcc.target/powerpc/pr79251.h | 19 +++++++ 6 files changed, 126 insertions(+), 13 deletions(-) create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251-run.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.c create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.h diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c index 2fad3d94706..78abe49c833 100644 --- a/gcc/config/rs6000/rs6000-c.c +++ b/gcc/config/rs6000/rs6000-c.c @@ -1509,9 +1509,7 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, tree arg1; tree arg2; tree arg1_type; - tree arg1_inner_type; tree decl, stmt; - tree innerptrtype; machine_mode mode; /* No second or third arguments. */ @@ -1563,8 +1561,13 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, return build_call_expr (call, 3, arg1, arg0, arg2); } - /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0. */ - arg1_inner_type = TREE_TYPE (arg1_type); + /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0 with + VIEW_CONVERT_EXPR. i.e.: + D.3192 = v1; + _1 = n & 3; + VIEW_CONVERT_EXPR<int[4]>(D.3192)[_1] = i; + v1 = D.3192; + D.3194 = v1; */ if (TYPE_VECTOR_SUBPARTS (arg1_type) == 1) arg2 = build_int_cst (TREE_TYPE (arg2), 0); else @@ -1593,15 +1596,8 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl, SET_EXPR_LOCATION (stmt, loc); stmt = build1 (COMPOUND_LITERAL_EXPR, arg1_type, stmt); } - - innerptrtype = build_pointer_type (arg1_inner_type); - - stmt = build_unary_op (loc, ADDR_EXPR, stmt, 0); - stmt = convert (innerptrtype, stmt); - stmt = build_binary_op (loc, PLUS_EXPR, stmt, arg2, 1); - stmt = build_indirect_ref (loc, stmt, RO_NULL); - stmt = build2 (MODIFY_EXPR, TREE_TYPE (stmt), stmt, - convert (TREE_TYPE (stmt), arg0)); + stmt = build_array_ref (loc, stmt, arg2); + stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt, arg0); stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl); return stmt; } diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h index 6a0fbc3ba2e..9687767ec16 100644 --- a/gcc/config/rs6000/rs6000-protos.h +++ b/gcc/config/rs6000/rs6000-protos.h @@ -58,6 +58,7 @@ extern bool rs6000_split_128bit_ok_p (rtx []); extern void rs6000_expand_float128_convert (rtx, rtx, bool); extern void rs6000_expand_vector_init (rtx, rtx); extern void rs6000_expand_vector_set (rtx, rtx, rtx); +extern void rs6000_expand_vector_set_var (rtx, rtx, rtx); extern void rs6000_expand_vector_extract (rtx, rtx, rtx); extern void rs6000_split_vec_extract_var (rtx, rtx, rtx, rtx, rtx); extern rtx rs6000_adjust_vec_address (rtx, rtx, rtx, rtx, machine_mode); diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index c46ec14f060..b53f946063b 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -6700,6 +6700,12 @@ rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx) if (VECTOR_MEM_VSX_P (mode)) { + if (!CONST_INT_P (elt_rtx)) + { + rs6000_expand_vector_set_var (target, val, elt_rtx); + return; + } + rtx insn = NULL_RTX; if (mode == V2DFmode) @@ -6790,6 +6796,50 @@ rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx) emit_insn (gen_rtx_SET (target, x)); } +/* Insert VAL into IDX of TARGET, VAL size is same of the vector element, IDX + is variable and also counts by vector element size. */ + +void +rs6000_expand_vector_set_var (rtx target, rtx val, rtx idx) +{ + machine_mode mode = GET_MODE (target); + + gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx) + && TARGET_DIRECT_MOVE_64BIT); + + gcc_assert (GET_MODE (idx) == E_SImode); + + machine_mode inner_mode = GET_MODE (val); + + rtx tmp = gen_reg_rtx (GET_MODE (idx)); + int width = GET_MODE_SIZE (inner_mode); + + gcc_assert (width >= 1 && width <= 8); + + /* Generate the IDX for permute shift, width is the vector element size. + idx = idx * width. */ + emit_insn (gen_mulsi3 (tmp, idx, GEN_INT (width))); + + /* lvsr v1,0,idx. */ + rtx pcvr = gen_reg_rtx (V16QImode); + emit_insn (gen_altivec_lvsr_reg (pcvr, tmp)); + + /* lvsl v2,0,idx. */ + rtx pcvl = gen_reg_rtx (V16QImode); + emit_insn (gen_altivec_lvsl_reg (pcvl, tmp)); + + rtx sub_target = simplify_gen_subreg (V16QImode, target, mode, 0); + + rtx perm; + perm = gen_altivec_vperm_v8hiv16qi (sub_target, sub_target, sub_target, pcvr); + emit_insn (perm); + + rs6000_expand_vector_set (target, val, const0_rtx); + + perm = gen_altivec_vperm_v8hiv16qi (sub_target, sub_target, sub_target, pcvl); + emit_insn (perm); +} + /* Extract field ELT from VEC into TARGET. */ void diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251-run.c b/gcc/testsuite/gcc.target/powerpc/pr79251-run.c new file mode 100644 index 00000000000..8f555d96622 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr79251-run.c @@ -0,0 +1,29 @@ +/* { dg-do run { target { p9vector_hw } } } */ +/* { dg-options "-O2 -mdejagnu-cpu=power9 -maltivec" } */ + +#include <stddef.h> +#include <altivec.h> +#include "pr79251.h" + +TEST_VEC_INSERT_ALL (test) + +#define run_test(TYPE, num) \ + { \ + vector TYPE v; \ + vector TYPE u = {0x0}; \ + for (long k = 0; k < 16 / sizeof (TYPE); k++) \ + v[k] = 0xaa; \ + for (long k = 0; k < 16 / sizeof (TYPE); k++) \ + { \ + u = test##num (v, 254, k); \ + if (u[k] != (TYPE) 254) \ + __builtin_abort (); \ + } \ + } + +int +main (void) +{ + TEST_VEC_INSERT_ALL (run_test) + return 0; +} diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251.c b/gcc/testsuite/gcc.target/powerpc/pr79251.c new file mode 100644 index 00000000000..ec1cb255888 --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr79251.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target powerpc_p9vector_ok } */ +/* { dg-options "-O2 -mdejagnu-cpu=power9 -maltivec" } */ + +#include <stddef.h> +#include <altivec.h> +#include "pr79251.h" + +TEST_VEC_INSERT_ALL (test) + +/* { dg-final { scan-assembler-not {\mstxw\M} } } */ +/* { dg-final { scan-assembler-times {\mlvsl\M} 10 } } */ +/* { dg-final { scan-assembler-times {\mlvsr\M} 10 } } */ +/* { dg-final { scan-assembler-times {\mxxperm\M} 20 } } */ +/* { dg-final { scan-assembler-times {\mxxinsertw\M} 3 } } */ +/* { dg-final { scan-assembler-times {\mvinserth\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mvinsertb\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mxxpermdi\M} 3 } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251.h b/gcc/testsuite/gcc.target/powerpc/pr79251.h new file mode 100644 index 00000000000..addb067f9ed --- /dev/null +++ b/gcc/testsuite/gcc.target/powerpc/pr79251.h @@ -0,0 +1,19 @@ + +#define test(TYPE, num) \ + __attribute__ ((noinline, noclone)) \ + vector TYPE test##num (vector TYPE v, TYPE i, signed int n) \ + { \ + return vec_insert (i, v, n); \ + } + +#define TEST_VEC_INSERT_ALL(T) \ + T (char, 0) \ + T (unsigned char, 1) \ + T (short, 2) \ + T (unsigned short, 3) \ + T (int, 4) \ + T (unsigned int, 5) \ + T (long long, 6) \ + T (unsigned long long, 7) \ + T (float, 8) \ + T (double, 9) -- 2.27.0.90.geebb51ba8c ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251] 2020-09-24 8:21 ` xionghu luo @ 2020-09-24 13:27 ` Richard Biener 2020-09-24 14:55 ` Richard Biener ` (2 more replies) 0 siblings, 3 replies; 24+ messages in thread From: Richard Biener @ 2020-09-24 13:27 UTC (permalink / raw) To: xionghu luo Cc: Segher Boessenkool, GCC Patches, David Edelsohn, Bill Schmidt, Jiufu Guo, linkw, Richard Sandiford On Thu, Sep 24, 2020 at 10:21 AM xionghu luo <luoxhu@linux.ibm.com> wrote: > > Hi Segher, > > The attached two patches are updated and split from > "[PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]" > as your comments. > > > [PATCH v3 2/3] rs6000: Fix lvsl&lvsr mode and change rs6000_expand_vector_set param > > This one is preparation work of fix lvsl&lvsr arg mode and rs6000_expand_vector_set > parameter support for both constant and variable index input. > > > [PATCH v3 2/3] rs6000: Support variable insert and Expand vec_insert in expander [PR79251] > > This one is Building VIEW_CONVERT_EXPR and expand the IFN VEC_SET to fast. I'll just comment that xxperm 34,34,33 xxinsertw 34,0,12 xxperm 34,34,32 doesn't look like a variable-position insert instruction but this is a variable whole-vector rotate plus an insert at index zero followed by a variable whole-vector rotate. I'm not fluend in ppc assembly but rlwinm 6,6,2,28,29 mtvsrwz 0,5 lvsr 1,0,6 lvsl 0,0,6 possibly computes the shift masks for r33/r32? though I do not see those registers mentioned... This might be a generic viable expansion strathegy btw, which is why I asked before whether the CPU supports inserts at a variable position ... the building blocks are already there with vec_set at constant zero position plus vec_perm_const for the rotates. But well, I did ask this question. Multiple times. ppc does _not_ have a VSX instruction like xxinsertw r34, r8, r12 where r8 denotes the vector element (or byte position or whatever). So I don't think vec_set with a variable index is the best approach. Xionghu - you said even without the patch the stack storage is eventually elided but addi 9,1,-16 rldic 6,6,2,60 stxv 34,-16(1) stwx 5,9,6 lxv 34,-16(1) still shows stack(?) store/load with a bad STLF penalty. Richard. > > Thanks, > Xionghu ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251] 2020-09-24 13:27 ` Richard Biener @ 2020-09-24 14:55 ` Richard Biener 2020-09-24 19:36 ` Segher Boessenkool 2020-09-24 19:02 ` Segher Boessenkool 2020-09-25 3:50 ` xionghu luo 2 siblings, 1 reply; 24+ messages in thread From: Richard Biener @ 2020-09-24 14:55 UTC (permalink / raw) To: xionghu luo Cc: Segher Boessenkool, GCC Patches, David Edelsohn, Bill Schmidt, Jiufu Guo, linkw, Richard Sandiford On Thu, Sep 24, 2020 at 3:27 PM Richard Biener <richard.guenther@gmail.com> wrote: > > On Thu, Sep 24, 2020 at 10:21 AM xionghu luo <luoxhu@linux.ibm.com> wrote: > > > > Hi Segher, > > > > The attached two patches are updated and split from > > "[PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]" > > as your comments. > > > > > > [PATCH v3 2/3] rs6000: Fix lvsl&lvsr mode and change rs6000_expand_vector_set param > > > > This one is preparation work of fix lvsl&lvsr arg mode and rs6000_expand_vector_set > > parameter support for both constant and variable index input. > > > > > > [PATCH v3 2/3] rs6000: Support variable insert and Expand vec_insert in expander [PR79251] > > > > This one is Building VIEW_CONVERT_EXPR and expand the IFN VEC_SET to fast. > > I'll just comment that > > xxperm 34,34,33 > xxinsertw 34,0,12 > xxperm 34,34,32 Btw, on x86_64 the following produces sth reasonable: #define N 32 typedef int T; typedef T V __attribute__((vector_size(N))); V setg (V v, int idx, T val) { V valv = (V){idx, idx, idx, idx, idx, idx, idx, idx}; V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == valv); v = (v & ~mask) | (valv & mask); return v; } vmovd %edi, %xmm1 vpbroadcastd %xmm1, %ymm1 vpcmpeqd .LC0(%rip), %ymm1, %ymm2 vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ret I'm quite sure you could do sth similar on power? > doesn't look like a variable-position insert instruction but > this is a variable whole-vector rotate plus an insert at index zero > followed by a variable whole-vector rotate. I'm not fluend in > ppc assembly but > > rlwinm 6,6,2,28,29 > mtvsrwz 0,5 > lvsr 1,0,6 > lvsl 0,0,6 > > possibly computes the shift masks for r33/r32? though > I do not see those registers mentioned... > > This might be a generic viable expansion strathegy btw, > which is why I asked before whether the CPU supports > inserts at a variable position ... the building blocks are > already there with vec_set at constant zero position > plus vec_perm_const for the rotates. > > But well, I did ask this question. Multiple times. > > ppc does _not_ have a VSX instruction > like xxinsertw r34, r8, r12 where r8 denotes > the vector element (or byte position or whatever). > > So I don't think vec_set with a variable index is the > best approach. > Xionghu - you said even without the patch the stack > storage is eventually elided but > > addi 9,1,-16 > rldic 6,6,2,60 > stxv 34,-16(1) > stwx 5,9,6 > lxv 34,-16(1) > > still shows stack(?) store/load with a bad STLF penalty. > > Richard. > > > > > Thanks, > > Xionghu ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251] 2020-09-24 14:55 ` Richard Biener @ 2020-09-24 19:36 ` Segher Boessenkool 2020-09-25 6:58 ` Richard Biener 0 siblings, 1 reply; 24+ messages in thread From: Segher Boessenkool @ 2020-09-24 19:36 UTC (permalink / raw) To: Richard Biener Cc: xionghu luo, GCC Patches, David Edelsohn, Bill Schmidt, Jiufu Guo, linkw, Richard Sandiford Hi! On Thu, Sep 24, 2020 at 04:55:21PM +0200, Richard Biener wrote: > Btw, on x86_64 the following produces sth reasonable: > > #define N 32 > typedef int T; > typedef T V __attribute__((vector_size(N))); > V setg (V v, int idx, T val) > { > V valv = (V){idx, idx, idx, idx, idx, idx, idx, idx}; > V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == valv); > v = (v & ~mask) | (valv & mask); > return v; > } > > vmovd %edi, %xmm1 > vpbroadcastd %xmm1, %ymm1 > vpcmpeqd .LC0(%rip), %ymm1, %ymm2 > vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 > ret > > I'm quite sure you could do sth similar on power? This only allows inserting aligned elements. Which is probably fine of course (we don't allow elements that straddle vector boundaries either, anyway). And yes, we can do that :-) That should be #define N 32 typedef int T; typedef T V __attribute__((vector_size(N))); V setg (V v, int idx, T val) { V valv = (V){val, val, val, val, val, val, val, val}; V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx}; V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv); v = (v & ~mask) | (valv & mask); return v; } after which I get (-march=znver2) setg: vmovd %edi, %xmm1 vmovd %esi, %xmm2 vpbroadcastd %xmm1, %ymm1 vpbroadcastd %xmm2, %ymm2 vpcmpeqd .LC0(%rip), %ymm1, %ymm1 vpandn %ymm0, %ymm1, %ymm0 vpand %ymm2, %ymm1, %ymm1 vpor %ymm0, %ymm1, %ymm0 ret .LC0: .long 0 .long 1 .long 2 .long 3 .long 4 .long 5 .long 6 .long 7 and for powerpc (changing it to 16B vectors, -mcpu=power9) it is setg: addis 9,2,.LC0@toc@ha mtvsrws 32,5 mtvsrws 33,6 addi 9,9,.LC0@toc@l lxv 45,0(9) vcmpequw 0,0,13 xxsel 34,34,33,32 blr .LC0: .long 0 .long 1 .long 2 .long 3 (We can generate that 0..3 vector without doing loads; I guess x86 can do that as well? But it takes more than one insn to do (of course we have to set up the memory address first *with* the load, heh).) For power8 it becomes (we need to splat in separate insns): setg: addis 9,2,.LC0@toc@ha mtvsrwz 32,5 mtvsrwz 33,6 addi 9,9,.LC0@toc@l lxvw4x 45,0,9 xxspltw 32,32,1 xxspltw 33,33,1 vcmpequw 0,0,13 xxsel 34,34,33,32 blr Segher ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251] 2020-09-24 19:36 ` Segher Boessenkool @ 2020-09-25 6:58 ` Richard Biener 2020-09-25 12:21 ` Richard Sandiford 2020-09-25 22:39 ` Segher Boessenkool 0 siblings, 2 replies; 24+ messages in thread From: Richard Biener @ 2020-09-25 6:58 UTC (permalink / raw) To: Segher Boessenkool Cc: xionghu luo, GCC Patches, David Edelsohn, Bill Schmidt, Jiufu Guo, linkw, Richard Sandiford On Thu, Sep 24, 2020 at 9:38 PM Segher Boessenkool <segher@kernel.crashing.org> wrote: > > Hi! > > On Thu, Sep 24, 2020 at 04:55:21PM +0200, Richard Biener wrote: > > Btw, on x86_64 the following produces sth reasonable: > > > > #define N 32 > > typedef int T; > > typedef T V __attribute__((vector_size(N))); > > V setg (V v, int idx, T val) > > { > > V valv = (V){idx, idx, idx, idx, idx, idx, idx, idx}; > > V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == valv); > > v = (v & ~mask) | (valv & mask); > > return v; > > } > > > > vmovd %edi, %xmm1 > > vpbroadcastd %xmm1, %ymm1 > > vpcmpeqd .LC0(%rip), %ymm1, %ymm2 > > vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 > > ret > > > > I'm quite sure you could do sth similar on power? > > This only allows inserting aligned elements. Which is probably fine > of course (we don't allow elements that straddle vector boundaries > either, anyway). > > And yes, we can do that :-) > > That should be > #define N 32 > typedef int T; > typedef T V __attribute__((vector_size(N))); > V setg (V v, int idx, T val) > { > V valv = (V){val, val, val, val, val, val, val, val}; > V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx}; > V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv); > v = (v & ~mask) | (valv & mask); > return v; > } Whoops yeah, simplified it a bit too much ;) > after which I get (-march=znver2) > > setg: > vmovd %edi, %xmm1 > vmovd %esi, %xmm2 > vpbroadcastd %xmm1, %ymm1 > vpbroadcastd %xmm2, %ymm2 > vpcmpeqd .LC0(%rip), %ymm1, %ymm1 > vpandn %ymm0, %ymm1, %ymm0 > vpand %ymm2, %ymm1, %ymm1 > vpor %ymm0, %ymm1, %ymm0 > ret I get with -march=znver2 -O2 vmovd %edi, %xmm1 vmovd %esi, %xmm2 vpbroadcastd %xmm1, %ymm1 vpbroadcastd %xmm2, %ymm2 vpcmpeqd .LC0(%rip), %ymm1, %ymm1 vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 and with -mavx512vl vpbroadcastd %edi, %ymm1 vpcmpd $0, .LC0(%rip), %ymm1, %k1 vpbroadcastd %esi, %ymm0{%k1} broadcast-with-mask - heh, would be interesting if we manage to combine v[idx1] = val; v[idx2] = val; ;) Now, with SSE4.2 the 16byte case compiles to setg: .LFB0: .cfi_startproc movd %edi, %xmm3 movdqa %xmm0, %xmm1 movd %esi, %xmm4 pshufd $0, %xmm3, %xmm0 pcmpeqd .LC0(%rip), %xmm0 movdqa %xmm0, %xmm2 pandn %xmm1, %xmm2 pshufd $0, %xmm4, %xmm1 pand %xmm1, %xmm0 por %xmm2, %xmm0 ret since there's no blend with a variable mask IIRC. with aarch64 and SVE it doesn't handle the 32byte case at all, the 16byte case compiles to setg: .LFB0: .cfi_startproc adrp x2, .LC0 dup v1.4s, w0 dup v2.4s, w1 ldr q3, [x2, #:lo12:.LC0] cmeq v1.4s, v1.4s, v3.4s bit v0.16b, v2.16b, v1.16b which looks equivalent to the AVX2 code. For all of those varying the vector element type may also cause "issues" I guess. > .LC0: > .long 0 > .long 1 > .long 2 > .long 3 > .long 4 > .long 5 > .long 6 > .long 7 > > and for powerpc (changing it to 16B vectors, -mcpu=power9) it is > > setg: > addis 9,2,.LC0@toc@ha > mtvsrws 32,5 > mtvsrws 33,6 > addi 9,9,.LC0@toc@l > lxv 45,0(9) > vcmpequw 0,0,13 > xxsel 34,34,33,32 > blr > > .LC0: > .long 0 > .long 1 > .long 2 > .long 3 > > (We can generate that 0..3 vector without doing loads; I guess x86 can > do that as well? But it takes more than one insn to do (of course we > have to set up the memory address first *with* the load, heh).) > > For power8 it becomes (we need to splat in separate insns): > > setg: > addis 9,2,.LC0@toc@ha > mtvsrwz 32,5 > mtvsrwz 33,6 > addi 9,9,.LC0@toc@l > lxvw4x 45,0,9 > xxspltw 32,32,1 > xxspltw 33,33,1 > vcmpequw 0,0,13 > xxsel 34,34,33,32 > blr > > > Segher ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251] 2020-09-25 6:58 ` Richard Biener @ 2020-09-25 12:21 ` Richard Sandiford 2020-09-25 22:39 ` Segher Boessenkool 1 sibling, 0 replies; 24+ messages in thread From: Richard Sandiford @ 2020-09-25 12:21 UTC (permalink / raw) To: Richard Biener Cc: Segher Boessenkool, xionghu luo, GCC Patches, David Edelsohn, Bill Schmidt, Jiufu Guo, linkw Richard Biener <richard.guenther@gmail.com> writes: > On Thu, Sep 24, 2020 at 9:38 PM Segher Boessenkool > <segher@kernel.crashing.org> wrote: >> >> Hi! >> >> On Thu, Sep 24, 2020 at 04:55:21PM +0200, Richard Biener wrote: >> > Btw, on x86_64 the following produces sth reasonable: >> > >> > #define N 32 >> > typedef int T; >> > typedef T V __attribute__((vector_size(N))); >> > V setg (V v, int idx, T val) >> > { >> > V valv = (V){idx, idx, idx, idx, idx, idx, idx, idx}; >> > V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == valv); >> > v = (v & ~mask) | (valv & mask); >> > return v; >> > } >> > >> > vmovd %edi, %xmm1 >> > vpbroadcastd %xmm1, %ymm1 >> > vpcmpeqd .LC0(%rip), %ymm1, %ymm2 >> > vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 >> > ret >> > >> > I'm quite sure you could do sth similar on power? >> >> This only allows inserting aligned elements. Which is probably fine >> of course (we don't allow elements that straddle vector boundaries >> either, anyway). >> >> And yes, we can do that :-) >> >> That should be >> #define N 32 >> typedef int T; >> typedef T V __attribute__((vector_size(N))); >> V setg (V v, int idx, T val) >> { >> V valv = (V){val, val, val, val, val, val, val, val}; >> V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx}; >> V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv); >> v = (v & ~mask) | (valv & mask); >> return v; >> } > > Whoops yeah, simplified it a bit too much ;) > >> after which I get (-march=znver2) >> >> setg: >> vmovd %edi, %xmm1 >> vmovd %esi, %xmm2 >> vpbroadcastd %xmm1, %ymm1 >> vpbroadcastd %xmm2, %ymm2 >> vpcmpeqd .LC0(%rip), %ymm1, %ymm1 >> vpandn %ymm0, %ymm1, %ymm0 >> vpand %ymm2, %ymm1, %ymm1 >> vpor %ymm0, %ymm1, %ymm0 >> ret > > I get with -march=znver2 -O2 > > vmovd %edi, %xmm1 > vmovd %esi, %xmm2 > vpbroadcastd %xmm1, %ymm1 > vpbroadcastd %xmm2, %ymm2 > vpcmpeqd .LC0(%rip), %ymm1, %ymm1 > vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 > > and with -mavx512vl > > vpbroadcastd %edi, %ymm1 > vpcmpd $0, .LC0(%rip), %ymm1, %k1 > vpbroadcastd %esi, %ymm0{%k1} > > broadcast-with-mask - heh, would be interesting if we manage > to combine v[idx1] = val; v[idx2] = val; ;) > > Now, with SSE4.2 the 16byte case compiles to > > setg: > .LFB0: > .cfi_startproc > movd %edi, %xmm3 > movdqa %xmm0, %xmm1 > movd %esi, %xmm4 > pshufd $0, %xmm3, %xmm0 > pcmpeqd .LC0(%rip), %xmm0 > movdqa %xmm0, %xmm2 > pandn %xmm1, %xmm2 > pshufd $0, %xmm4, %xmm1 > pand %xmm1, %xmm0 > por %xmm2, %xmm0 > ret > > since there's no blend with a variable mask IIRC. > > with aarch64 and SVE it doesn't handle the 32byte case at all, FWIW, the SVE version with -msve-vector-bits=256 is: ptrue p0.b, vl32 mov z1.s, w1 index z2.s, #0, #1 ld1w z0.s, p0/z, [x0] cmpeq p1.s, p0/z, z1.s, z2.s mov z0.s, p1/m, w2 st1w z0.s, p0, [x8] where the ptrue, ld1w and st1w are just because generic 256-bit vectors are passed in memory; the real operation is: mov z1.s, w1 index z2.s, #0, #1 cmpeq p1.s, p0/z, z1.s, z2.s mov z0.s, p1/m, w2 Thanks, Richard ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251] 2020-09-25 6:58 ` Richard Biener 2020-09-25 12:21 ` Richard Sandiford @ 2020-09-25 22:39 ` Segher Boessenkool 2020-09-28 8:00 ` Richard Biener 1 sibling, 1 reply; 24+ messages in thread From: Segher Boessenkool @ 2020-09-25 22:39 UTC (permalink / raw) To: Richard Biener Cc: xionghu luo, GCC Patches, David Edelsohn, Bill Schmidt, Jiufu Guo, linkw, Richard Sandiford On Fri, Sep 25, 2020 at 08:58:35AM +0200, Richard Biener wrote: > On Thu, Sep 24, 2020 at 9:38 PM Segher Boessenkool > <segher@kernel.crashing.org> wrote: > > after which I get (-march=znver2) > > > > setg: > > vmovd %edi, %xmm1 > > vmovd %esi, %xmm2 > > vpbroadcastd %xmm1, %ymm1 > > vpbroadcastd %xmm2, %ymm2 > > vpcmpeqd .LC0(%rip), %ymm1, %ymm1 > > vpandn %ymm0, %ymm1, %ymm0 > > vpand %ymm2, %ymm1, %ymm1 > > vpor %ymm0, %ymm1, %ymm0 > > ret > > I get with -march=znver2 -O2 > > vmovd %edi, %xmm1 > vmovd %esi, %xmm2 > vpbroadcastd %xmm1, %ymm1 > vpbroadcastd %xmm2, %ymm2 > vpcmpeqd .LC0(%rip), %ymm1, %ymm1 > vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 Ah, maybe my x86 compiler it too old... x86_64-linux-gcc (GCC) 10.0.0 20190919 (experimental) not exactly old, huh. I wonder what I do wrong then. > Now, with SSE4.2 the 16byte case compiles to > > setg: > .LFB0: > .cfi_startproc > movd %edi, %xmm3 > movdqa %xmm0, %xmm1 > movd %esi, %xmm4 > pshufd $0, %xmm3, %xmm0 > pcmpeqd .LC0(%rip), %xmm0 > movdqa %xmm0, %xmm2 > pandn %xmm1, %xmm2 > pshufd $0, %xmm4, %xmm1 > pand %xmm1, %xmm0 > por %xmm2, %xmm0 > ret > > since there's no blend with a variable mask IIRC. PowerPC got at least *that* right since time immemorial :-) > with aarch64 and SVE it doesn't handle the 32byte case at all, > the 16byte case compiles to > > setg: > .LFB0: > .cfi_startproc > adrp x2, .LC0 > dup v1.4s, w0 > dup v2.4s, w1 > ldr q3, [x2, #:lo12:.LC0] > cmeq v1.4s, v1.4s, v3.4s > bit v0.16b, v2.16b, v1.16b > > which looks equivalent to the AVX2 code. Yes, and we can do pretty much the same on Power, too. > For all of those varying the vector element type may also > cause "issues" I guess. For us, as long as it stays 16B vectors, all should be fine. There may be issues in the compiler, but at least the hardware has no problem with it ;-) > > and for powerpc (changing it to 16B vectors, -mcpu=power9) it is > > > > setg: > > addis 9,2,.LC0@toc@ha > > mtvsrws 32,5 > > mtvsrws 33,6 > > addi 9,9,.LC0@toc@l > > lxv 45,0(9) > > vcmpequw 0,0,13 > > xxsel 34,34,33,32 > > blr The -mcpu=power10 code right now is just plxv 45,.LC0@pcrel mtvsrws 32,5 mtvsrws 33,6 vcmpequw 0,0,13 xxsel 34,34,33,32 blr (exactly the same, but less memory address setup cost), so doing something like this as a generic version would work quite well pretty much everywhere I think! Segher ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251] 2020-09-25 22:39 ` Segher Boessenkool @ 2020-09-28 8:00 ` Richard Biener 0 siblings, 0 replies; 24+ messages in thread From: Richard Biener @ 2020-09-28 8:00 UTC (permalink / raw) To: Segher Boessenkool Cc: xionghu luo, GCC Patches, David Edelsohn, Bill Schmidt, Jiufu Guo, linkw, Richard Sandiford On Sat, Sep 26, 2020 at 12:41 AM Segher Boessenkool <segher@kernel.crashing.org> wrote: > > On Fri, Sep 25, 2020 at 08:58:35AM +0200, Richard Biener wrote: > > On Thu, Sep 24, 2020 at 9:38 PM Segher Boessenkool > > <segher@kernel.crashing.org> wrote: > > > after which I get (-march=znver2) > > > > > > setg: > > > vmovd %edi, %xmm1 > > > vmovd %esi, %xmm2 > > > vpbroadcastd %xmm1, %ymm1 > > > vpbroadcastd %xmm2, %ymm2 > > > vpcmpeqd .LC0(%rip), %ymm1, %ymm1 > > > vpandn %ymm0, %ymm1, %ymm0 > > > vpand %ymm2, %ymm1, %ymm1 > > > vpor %ymm0, %ymm1, %ymm0 > > > ret > > > > I get with -march=znver2 -O2 > > > > vmovd %edi, %xmm1 > > vmovd %esi, %xmm2 > > vpbroadcastd %xmm1, %ymm1 > > vpbroadcastd %xmm2, %ymm2 > > vpcmpeqd .LC0(%rip), %ymm1, %ymm1 > > vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 > > Ah, maybe my x86 compiler it too old... > x86_64-linux-gcc (GCC) 10.0.0 20190919 (experimental) > not exactly old, huh. I wonder what I do wrong then. > > > Now, with SSE4.2 the 16byte case compiles to > > > > setg: > > .LFB0: > > .cfi_startproc > > movd %edi, %xmm3 > > movdqa %xmm0, %xmm1 > > movd %esi, %xmm4 > > pshufd $0, %xmm3, %xmm0 > > pcmpeqd .LC0(%rip), %xmm0 > > movdqa %xmm0, %xmm2 > > pandn %xmm1, %xmm2 > > pshufd $0, %xmm4, %xmm1 > > pand %xmm1, %xmm0 > > por %xmm2, %xmm0 > > ret > > > > since there's no blend with a variable mask IIRC. > > PowerPC got at least *that* right since time immemorial :-) > > > with aarch64 and SVE it doesn't handle the 32byte case at all, > > the 16byte case compiles to > > > > setg: > > .LFB0: > > .cfi_startproc > > adrp x2, .LC0 > > dup v1.4s, w0 > > dup v2.4s, w1 > > ldr q3, [x2, #:lo12:.LC0] > > cmeq v1.4s, v1.4s, v3.4s > > bit v0.16b, v2.16b, v1.16b > > > > which looks equivalent to the AVX2 code. > > Yes, and we can do pretty much the same on Power, too. > > > For all of those varying the vector element type may also > > cause "issues" I guess. > > For us, as long as it stays 16B vectors, all should be fine. There may > be issues in the compiler, but at least the hardware has no problem with > it ;-) > > > > and for powerpc (changing it to 16B vectors, -mcpu=power9) it is > > > > > > setg: > > > addis 9,2,.LC0@toc@ha > > > mtvsrws 32,5 > > > mtvsrws 33,6 > > > addi 9,9,.LC0@toc@l > > > lxv 45,0(9) > > > vcmpequw 0,0,13 > > > xxsel 34,34,33,32 > > > blr > > The -mcpu=power10 code right now is just > > plxv 45,.LC0@pcrel > mtvsrws 32,5 > mtvsrws 33,6 > vcmpequw 0,0,13 > xxsel 34,34,33,32 > blr > > (exactly the same, but less memory address setup cost), so doing > something like this as a generic version would work quite well pretty > much everywhere I think! Given we don't have a good way to query for variable blend support the only change would be to use the bit and/or way which probably would be fine (and eventually combined). I guess that could be done in ISEL as well (given support for generating the mask, of course). Of course that makes it difficult for targets to opt-out (generate the in-memory variant)... Richard. > > Segher ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251] 2020-09-24 13:27 ` Richard Biener 2020-09-24 14:55 ` Richard Biener @ 2020-09-24 19:02 ` Segher Boessenkool 2020-09-25 3:50 ` xionghu luo 2 siblings, 0 replies; 24+ messages in thread From: Segher Boessenkool @ 2020-09-24 19:02 UTC (permalink / raw) To: Richard Biener Cc: xionghu luo, GCC Patches, David Edelsohn, Bill Schmidt, Jiufu Guo, linkw, Richard Sandiford Hi! On Thu, Sep 24, 2020 at 03:27:48PM +0200, Richard Biener wrote: > On Thu, Sep 24, 2020 at 10:21 AM xionghu luo <luoxhu@linux.ibm.com> wrote: > I'll just comment that > > xxperm 34,34,33 > xxinsertw 34,0,12 > xxperm 34,34,32 > > doesn't look like a variable-position insert instruction but > this is a variable whole-vector rotate plus an insert at index zero > followed by a variable whole-vector rotate. I'm not fluend in > ppc assembly but > > rlwinm 6,6,2,28,29 > mtvsrwz 0,5 > lvsr 1,0,6 > lvsl 0,0,6 > > possibly computes the shift masks for r33/r32? though > I do not see those registers mentioned... v0/v1 (what the lvs[lr] write to) are the same as vs32/vs33. The low half of the VSRs (vector-scalar registers) are the FP registers (expanded to 16B each), and the high half are the original VRs (vector registers). AltiVec insns (like lvsl, lvsr) naturally only work on VRs, as do some newer insns for which there wasn't enough budget in the opcode space to have for VSRs (which take 6 bits each, while VRs take only 5, just like FPRs and GPRs). > This might be a generic viable expansion strathegy btw, > which is why I asked before whether the CPU supports > inserts at a variable position ... ISA 3.1 (Power10) supports variable position inserts. Power9 supports fixed position inserts. Older CPUs can of course construct it some other way. > ppc does _not_ have a VSX instruction > like xxinsertw r34, r8, r12 where r8 denotes > the vector element (or byte position or whatever). vins[bhwd][v][lr]x does this. Those are Power10 instructions. Segher ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251] 2020-09-24 13:27 ` Richard Biener 2020-09-24 14:55 ` Richard Biener 2020-09-24 19:02 ` Segher Boessenkool @ 2020-09-25 3:50 ` xionghu luo 2020-09-25 5:00 ` Richard Biener 2 siblings, 1 reply; 24+ messages in thread From: xionghu luo @ 2020-09-25 3:50 UTC (permalink / raw) To: Richard Biener Cc: Segher Boessenkool, GCC Patches, David Edelsohn, Bill Schmidt, Jiufu Guo, linkw, Richard Sandiford Hi, On 2020/9/24 21:27, Richard Biener wrote: > On Thu, Sep 24, 2020 at 10:21 AM xionghu luo <luoxhu@linux.ibm.com> wrote: > > I'll just comment that > > xxperm 34,34,33 > xxinsertw 34,0,12 > xxperm 34,34,32 > > doesn't look like a variable-position insert instruction but > this is a variable whole-vector rotate plus an insert at index zero > followed by a variable whole-vector rotate. I'm not fluend in > ppc assembly but > > rlwinm 6,6,2,28,29 > mtvsrwz 0,5 > lvsr 1,0,6 > lvsl 0,0,6 > > possibly computes the shift masks for r33/r32? though > I do not see those registers mentioned... For V4SI: rlwinm 6,6,2,28,29 // r6*4 mtvsrwz 0,5 // vs0 <- r5 (0xfe) lvsr 1,0,6 // vs33 <- lvsr[r6] lvsl 0,0,6 // vs32 <- lvsl[r6] xxperm 34,34,33 xxinsertw 34,0,12 xxperm 34,34,32 blr idx = idx * 4; 0 0 0x4000000030000000200000001 xxperm:0x4000000030000000200000001 vs33:0x101112131415161718191a1b1c1d1e1f vs32:0x102030405060708090a0b0c0d0e0f 1 4 0x4000000030000000200000001 xxperm:0x1000000040000000300000002 vs33:0xc0d0e0f101112131415161718191a1b vs32:0x405060708090a0b0c0d0e0f10111213 2 8 0x4000000030000000200000001 xxperm:0x2000000010000000400000003 vs33:0x8090a0b0c0d0e0f1011121314151617 vs32:0x8090a0b0c0d0e0f1011121314151617 3 12 0x4000000030000000200000001 xxperm:0x3000000020000000100000004 vs33:0x405060708090a0b0c0d0e0f10111213 vs32:0xc0d0e0f101112131415161718191a1b vs34: 0x40000000300000002000000fe 0x400000003000000fe00000001 0x4000000fe0000000200000001 0xfe000000030000000200000001 "xxinsertw 34,0,12" will always insert vs0[32:63] content to the forth word of target vector, bits[96:127]. Then the second xxperm rotate the modified vector back. All the instructions are register based operation, as Segher replied, power9 supports only fixed position inserts, so we need do some trick here to support it instead of generate short store wide load instructions. > > This might be a generic viable expansion strathegy btw, > which is why I asked before whether the CPU supports > inserts at a variable position ... the building blocks are > already there with vec_set at constant zero position > plus vec_perm_const for the rotates. > > But well, I did ask this question. Multiple times. > > ppc does _not_ have a VSX instruction > like xxinsertw r34, r8, r12 where r8 denotes > the vector element (or byte position or whatever). > > So I don't think vec_set with a variable index is the > best approach. > Xionghu - you said even without the patch the stack > storage is eventually elided but > > addi 9,1,-16 > rldic 6,6,2,60 > stxv 34,-16(1) > stwx 5,9,6 > lxv 34,-16(1) > > still shows stack(?) store/load with a bad STLF penalty. Sorry that if I didn't describe clearly and misunderstood you, I mean if insert many instructions(tested with a loop inserted) between "stwx 5,9,6" and "lxv 34,-16(1)", the store hit load performance issue could be elided, but this is not the solution we want. I also changed your test as below and build for X86, seems it also generates inefficient code? What my patch does maybe different usage from your pasted case? #define N 32 typedef int T; typedef T V __attribute__((vector_size(N))); V setg3 (V v, int idx, T val) { v[idx&31] = val; return v; } -O2 -S -mavx -march=znver2: setg3: push rbp and edi, 31 mov rbp, rsp and rsp, -32 vmovdqa YMMWORD PTR [rsp-32], ymm0 mov DWORD PTR [rsp-32+rdi*4], esi vmovdqa ymm0, YMMWORD PTR [rsp-32] leave ret While idx is constant: setg3: vpinsrd xmm1, xmm0, esi, 3 vinserti128 ymm0, ymm0, xmm1, 0x0 ret And ARM with -O2 -S -march=armv8.2-a+sve (N change to 16): setg3: sub sp, sp, #16 and x0, x0, 15 str q0, [sp] str w1, [sp, x0, lsl 2] ldr q0, [sp] add sp, sp, 16 ret While idx is constant: setg3: ins v0.s[3], w1 ret Though I've no idea how to optimize this on X86 and ARM with vector instructions to avoid short store with wide load followed on stack. Thanks, Xionghu ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251] 2020-09-25 3:50 ` xionghu luo @ 2020-09-25 5:00 ` Richard Biener 0 siblings, 0 replies; 24+ messages in thread From: Richard Biener @ 2020-09-25 5:00 UTC (permalink / raw) To: xionghu luo Cc: Segher Boessenkool, GCC Patches, David Edelsohn, Bill Schmidt, Jiufu Guo, linkw, Richard Sandiford On September 25, 2020 5:50:40 AM GMT+02:00, xionghu luo <luoxhu@linux.ibm.com> wrote: >Hi, > >On 2020/9/24 21:27, Richard Biener wrote: >> On Thu, Sep 24, 2020 at 10:21 AM xionghu luo <luoxhu@linux.ibm.com> >wrote: >> >> I'll just comment that >> >> xxperm 34,34,33 >> xxinsertw 34,0,12 >> xxperm 34,34,32 >> >> doesn't look like a variable-position insert instruction but >> this is a variable whole-vector rotate plus an insert at index zero >> followed by a variable whole-vector rotate. I'm not fluend in >> ppc assembly but >> >> rlwinm 6,6,2,28,29 >> mtvsrwz 0,5 >> lvsr 1,0,6 >> lvsl 0,0,6 >> >> possibly computes the shift masks for r33/r32? though >> I do not see those registers mentioned... > >For V4SI: > rlwinm 6,6,2,28,29 // r6*4 > mtvsrwz 0,5 // vs0 <- r5 (0xfe) > lvsr 1,0,6 // vs33 <- lvsr[r6] > lvsl 0,0,6 // vs32 <- lvsl[r6] > xxperm 34,34,33 > xxinsertw 34,0,12 > xxperm 34,34,32 > blr > > >idx = idx * 4; >0 0 0x4000000030000000200000001 >xxperm:0x4000000030000000200000001 >vs33:0x101112131415161718191a1b1c1d1e1f >vs32:0x102030405060708090a0b0c0d0e0f >1 4 0x4000000030000000200000001 >xxperm:0x1000000040000000300000002 >vs33:0xc0d0e0f101112131415161718191a1b >vs32:0x405060708090a0b0c0d0e0f10111213 >2 8 0x4000000030000000200000001 >xxperm:0x2000000010000000400000003 >vs33:0x8090a0b0c0d0e0f1011121314151617 >vs32:0x8090a0b0c0d0e0f1011121314151617 >3 12 0x4000000030000000200000001 >xxperm:0x3000000020000000100000004 >vs33:0x405060708090a0b0c0d0e0f10111213 >vs32:0xc0d0e0f101112131415161718191a1b > >vs34: > 0x40000000300000002000000fe > 0x400000003000000fe00000001 > 0x4000000fe0000000200000001 >0xfe000000030000000200000001 > > >"xxinsertw 34,0,12" will always insert vs0[32:63] content to the forth >word of >target vector, bits[96:127]. Then the second xxperm rotate the >modified vector >back. > >All the instructions are register based operation, as Segher replied, >power9 >supports only fixed position inserts, so we need do some trick here to >support >it instead of generate short store wide load instructions. OK, I fair enough - I've heard power 10 does support those inserts. > >> >> This might be a generic viable expansion strathegy btw, >> which is why I asked before whether the CPU supports >> inserts at a variable position ... the building blocks are >> already there with vec_set at constant zero position >> plus vec_perm_const for the rotates. >> >> But well, I did ask this question. Multiple times. >> >> ppc does _not_ have a VSX instruction >> like xxinsertw r34, r8, r12 where r8 denotes >> the vector element (or byte position or whatever). >> >> So I don't think vec_set with a variable index is the >> best approach. >> Xionghu - you said even without the patch the stack >> storage is eventually elided but >> >> addi 9,1,-16 >> rldic 6,6,2,60 >> stxv 34,-16(1) >> stwx 5,9,6 >> lxv 34,-16(1) >> >> still shows stack(?) store/load with a bad STLF penalty. > > >Sorry that if I didn't describe clearly and misunderstood you, I mean >if insert many >instructions(tested with a loop inserted) between "stwx 5,9,6" and "lxv >34,-16(1)", >the store hit load performance issue could be elided, but this is not >the solution >we want. > >I also changed your test as below and build for X86, seems it also >generates >inefficient code? What my patch does maybe different usage from your >pasted >case? > >#define N 32 >typedef int T; >typedef T V __attribute__((vector_size(N))); > V setg3 (V v, int idx, T val) >{ > v[idx&31] = val; > return v; >} > >-O2 -S -mavx -march=znver2: > >setg3: > push rbp > and edi, 31 > mov rbp, rsp > and rsp, -32 > vmovdqa YMMWORD PTR [rsp-32], ymm0 > mov DWORD PTR [rsp-32+rdi*4], esi > vmovdqa ymm0, YMMWORD PTR [rsp-32] > leave > ret > > >While idx is constant: > >setg3: > vpinsrd xmm1, xmm0, esi, 3 > vinserti128 ymm0, ymm0, xmm1, 0x0 > ret > >And ARM with -O2 -S -march=armv8.2-a+sve (N change to 16): > >setg3: > sub sp, sp, #16 > and x0, x0, 15 > str q0, [sp] > str w1, [sp, x0, lsl 2] > ldr q0, [sp] > add sp, sp, 16 > ret > >While idx is constant: > >setg3: > ins v0.s[3], w1 > ret > > >Though I've no idea how to optimize this on X86 and ARM with vector >instructions >to avoid short store with wide load followed on stack. Yes, all targets suffer from this. On x86 you can play similar tricks as on power9, See my attempt in the other mail (and Seghers fix). Richard. > >Thanks, >Xionghu ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v2 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR 2020-09-18 6:17 [PATCH v2 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR Xiong Hu Luo 2020-09-18 6:17 ` [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251] Xiong Hu Luo @ 2020-09-18 18:20 ` Segher Boessenkool 2020-09-21 8:31 ` Richard Biener 2 siblings, 0 replies; 24+ messages in thread From: Segher Boessenkool @ 2020-09-18 18:20 UTC (permalink / raw) To: Xiong Hu Luo Cc: gcc-patches, dje.gcc, wschmidt, guojiufu, linkw, richard.guenther, richard.sandiford On Fri, Sep 18, 2020 at 01:17:40AM -0500, Xiong Hu Luo wrote: > This patch enables transformation from ARRAY_REF(VIEW_CONVERT_EXPR) to > VEC_SET internal function in gimple-isel pass if target supports > vec_set with variable index by checking can_vec_set_var_idx_p. > + tree_code code = TREE_CODE (TREE_TYPE(view_op0)); ^ Missing space here ---------------------------/ Thanks, Segher ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v2 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR 2020-09-18 6:17 [PATCH v2 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR Xiong Hu Luo 2020-09-18 6:17 ` [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251] Xiong Hu Luo 2020-09-18 18:20 ` [PATCH v2 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR Segher Boessenkool @ 2020-09-21 8:31 ` Richard Biener 2020-09-22 3:55 ` [PATCH v3 " xionghu luo 2 siblings, 1 reply; 24+ messages in thread From: Richard Biener @ 2020-09-21 8:31 UTC (permalink / raw) To: Xiong Hu Luo Cc: GCC Patches, Segher Boessenkool, David Edelsohn, Bill Schmidt, Jiufu Guo, linkw, Richard Sandiford On Fri, Sep 18, 2020 at 8:18 AM Xiong Hu Luo <luoxhu@linux.ibm.com> wrote: > > This patch enables transformation from ARRAY_REF(VIEW_CONVERT_EXPR) to > VEC_SET internal function in gimple-isel pass if target supports > vec_set with variable index by checking can_vec_set_var_idx_p. > > gcc/ChangeLog: > > 2020-09-18 Xionghu Luo <luoxhu@linux.ibm.com> > > * gimple-isel.cc (gimple_expand_vec_set_expr): New function. > (gimple_expand_vec_cond_exprs): Call gimple_expand_vec_set_expr. > * internal-fn.c (vec_set_direct): New define. > (expand_vec_set_optab_fn): New function. > (direct_vec_set_optab_supported_p): New define. > * internal-fn.def (VEC_SET): New DEF_INTERNAL_OPTAB_FN. > * optabs.c (can_vec_set_var_idx_p): New function. > * optabs.h (can_vec_set_var_idx_p): New declare. > --- > gcc/gimple-isel.cc | 116 +++++++++++++++++++++++++++++++++++++++++++- > gcc/internal-fn.c | 36 ++++++++++++++ > gcc/internal-fn.def | 2 + > gcc/optabs.c | 17 +++++++ > gcc/optabs.h | 3 ++ > 5 files changed, 172 insertions(+), 2 deletions(-) > > diff --git a/gcc/gimple-isel.cc b/gcc/gimple-isel.cc > index b330cf4c20e..bc61e2895be 100644 > --- a/gcc/gimple-isel.cc > +++ b/gcc/gimple-isel.cc > @@ -35,6 +35,80 @@ along with GCC; see the file COPYING3. If not see > #include "tree-cfg.h" > #include "bitmap.h" > #include "tree-ssa-dce.h" > +#include "fold-const.h" > +#include "gimple-fold.h" > +#include "memmodel.h" > +#include "optabs.h" > + > +/* Expand all ARRAY_REF(VIEW_CONVERT_EXPR) gimple assignments into calls to > + internal function based on vector type of selected expansion. > + i.e.: > + VIEW_CONVERT_EXPR<int[4]>(u)[_1] = = i_4(D); > + => > + _7 = u; > + _8 = .VEC_SET (_7, i_4(D), _1); > + u = _8; */ > + > +static gimple * > +gimple_expand_vec_set_expr (gimple_stmt_iterator *gsi) > +{ > + enum tree_code code; > + gcall *new_stmt = NULL; > + gassign *ass_stmt = NULL; > + > + /* Only consider code == GIMPLE_ASSIGN. */ > + gassign *stmt = dyn_cast<gassign *> (gsi_stmt (*gsi)); > + if (!stmt) > + return NULL; > + > + code = TREE_CODE (gimple_assign_lhs (stmt)); do the lhs = gimple_assign_lhs (stmt) before and elide cond, putting the TREE_CODE into the if below. > + if (code != ARRAY_REF) > + return NULL; > + > + tree lhs = gimple_assign_lhs (stmt); > + tree val = gimple_assign_rhs1 (stmt); > + > + tree type = TREE_TYPE (lhs); > + tree op0 = TREE_OPERAND (lhs, 0); > + if (TREE_CODE (op0) == VIEW_CONVERT_EXPR So I think we want to have an exact structural match first here, so if (TREE_CODE (op0) == VIEW_CONVERT_EXPR && DECL_P (TREE_OPERAND (op0, 0)) && VECTOR_TYPE_P (TREE_TYPE (TREE_OPERAND (op0, 0))) && TYPE_MODE (TREE_TYPE (lhs)) == TYPE_MODE (TREE_TYPE (TREE_TYPE (TREE_OPERAND (op0, 0)))) which means we're sure to do an element extract from a vector type (and we know all vector types have sane element types). > + && tree_fits_uhwi_p (TYPE_SIZE (type))) > + { > + tree pos = TREE_OPERAND (lhs, 1); > + tree view_op0 = TREE_OPERAND (op0, 0); > + machine_mode outermode = TYPE_MODE (TREE_TYPE (view_op0)); > + scalar_mode innermode = GET_MODE_INNER (outermode); > + tree_code code = TREE_CODE (TREE_TYPE(view_op0)); > + if (!is_global_var (view_op0) && code == VECTOR_TYPE > + && tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (view_op0))) why did you need those TYPE_SIZE checks? As said earlier you want !TREE_ADDRESSABLE (view_op0) and eventually the stronger auto_var_in_fn_p (view_op0, cfun) rather than !is_global_var. > + && can_vec_set_var_idx_p (code, outermode, innermode, > + TYPE_MODE (TREE_TYPE (pos)))) > + { > + location_t loc = gimple_location (stmt); > + tree var_src = make_ssa_name (TREE_TYPE (view_op0)); > + tree var_dst = make_ssa_name (TREE_TYPE (view_op0)); > + > + ass_stmt = gimple_build_assign (var_src, view_op0); > + gimple_set_vuse (ass_stmt, gimple_vuse (stmt)); > + gimple_set_location (ass_stmt, loc); > + gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT); > + > + new_stmt > + = gimple_build_call_internal (IFN_VEC_SET, 3, var_src, val, pos); > + gimple_call_set_lhs (new_stmt, var_dst); > + gimple_set_location (new_stmt, loc); > + gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); > + > + ass_stmt = gimple_build_assign (view_op0, var_dst); > + gimple_set_location (ass_stmt, loc); > + gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT); > + > + gimple_move_vops (ass_stmt, stmt); > + gsi_remove (gsi, true); > + } > + } > + > + return ass_stmt; > +} > > /* Expand all VEC_COND_EXPR gimple assignments into calls to internal > function based on type of selected expansion. */ > @@ -187,8 +261,25 @@ gimple_expand_vec_cond_exprs (void) > { > for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) > { > - gimple *g = gimple_expand_vec_cond_expr (&gsi, > - &vec_cond_ssa_name_uses); > + gassign *stmt = dyn_cast<gassign *> (gsi_stmt (gsi)); > + if (!stmt) > + continue; > + > + enum tree_code code; > + gimple *g = NULL; > + code = gimple_assign_rhs_code (stmt); > + switch (code) > + { > + case VEC_COND_EXPR: > + g = gimple_expand_vec_cond_expr (&gsi, &vec_cond_ssa_name_uses); > + break; > + case ARRAY_REF: > + /* TODO: generate IFN for vec_extract with variable index. */ so why not do this here? > + break; > + default: > + break; > + } > + > if (g != NULL) > { > tree lhs = gimple_assign_lhs (gsi_stmt (gsi)); > @@ -204,6 +295,27 @@ gimple_expand_vec_cond_exprs (void) > > simple_dce_from_worklist (dce_ssa_names); > > + FOR_EACH_BB_FN (bb, cfun) but in a separate loop? > + { > + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) > + { > + gassign *stmt = dyn_cast<gassign *> (gsi_stmt (gsi)); > + if (!stmt) > + continue; > + > + enum tree_code code; > + code = TREE_CODE (gimple_assign_lhs (stmt)); > + switch (code) > + { > + case ARRAY_REF: > + gimple_expand_vec_set_expr (&gsi); > + break; > + default: > + break; > + } > + } > + } > + > return 0; > } > > diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c > index 8efc77d986b..36837381c04 100644 > --- a/gcc/internal-fn.c > +++ b/gcc/internal-fn.c > @@ -115,6 +115,7 @@ init_internal_fns () > #define vec_condeq_direct { 0, 0, false } > #define scatter_store_direct { 3, 1, false } > #define len_store_direct { 3, 3, false } > +#define vec_set_direct { 3, 3, false } > #define unary_direct { 0, 0, true } > #define binary_direct { 0, 0, true } > #define ternary_direct { 0, 0, true } > @@ -2658,6 +2659,40 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab) > > #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn > > +static void > +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab) all new functions require a function level comment > +{ > + tree lhs = gimple_call_lhs (stmt); > + tree op0 = gimple_call_arg (stmt, 0); > + tree op1 = gimple_call_arg (stmt, 1); > + tree op2 = gimple_call_arg (stmt, 2); > + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); > + rtx src = expand_expr (op0, NULL_RTX, VOIDmode, EXPAND_WRITE); > + > + machine_mode outermode = TYPE_MODE (TREE_TYPE (op0)); > + scalar_mode innermode = GET_MODE_INNER (outermode); > + > + rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL); > + rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL); > + > + class expand_operand ops[3]; > + enum insn_code icode = optab_handler (optab, outermode); > + > + if (icode != CODE_FOR_nothing) > + { > + pos = convert_to_mode (E_SImode, pos, 0); > + > + create_fixed_operand (&ops[0], src); > + create_input_operand (&ops[1], value, innermode); > + create_input_operand (&ops[2], pos, GET_MODE (pos)); > + if (maybe_expand_insn (icode, 3, ops)) > + { > + emit_move_insn (target, src); I think you need to assert that we end up here. > + return; > + } > + } > +} > + > static void > expand_ABNORMAL_DISPATCHER (internal_fn, gcall *) > { > @@ -3253,6 +3288,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types, > #define direct_fold_left_optab_supported_p direct_optab_supported_p > #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p > #define direct_check_ptrs_optab_supported_p direct_optab_supported_p > +#define direct_vec_set_optab_supported_p direct_optab_supported_p > > /* Return the optab used by internal function FN. */ > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def > index 13e60828fcf..e6cfe1b6159 100644 > --- a/gcc/internal-fn.def > +++ b/gcc/internal-fn.def > @@ -145,6 +145,8 @@ DEF_INTERNAL_OPTAB_FN (VCONDU, 0, vcondu, vec_condu) > DEF_INTERNAL_OPTAB_FN (VCONDEQ, 0, vcondeq, vec_condeq) > DEF_INTERNAL_OPTAB_FN (VCOND_MASK, 0, vcond_mask, vec_cond_mask) > > +DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set) > + > DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store) > > DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while) > diff --git a/gcc/optabs.c b/gcc/optabs.c > index 184827fdf4e..c8125670d2d 100644 > --- a/gcc/optabs.c > +++ b/gcc/optabs.c > @@ -3841,6 +3841,23 @@ can_vcond_compare_p (enum rtx_code code, machine_mode value_mode, > && insn_operand_matches (icode, 3, test); > } > > +bool > +can_vec_set_var_idx_p (enum tree_code code, machine_mode vec_mode, > + machine_mode value_mode, machine_mode idx_mode) toplevel comment missing > +{ > + gcc_assert (code == VECTOR_TYPE); what's the point of pasing 'code' here then? Since the optab only has a single mode, the vector mode, the value_mode is redundant as well. And I guess we might want to handle "arbitrary" index modes? That is, the .md expanders should not restrict its mode - I guess it simply uses VOIDmode at the moment (for integer constants). Not sure how to best do this without an explicit mode in the optab ... > + > + rtx reg1 = alloca_raw_REG (vec_mode, LAST_VIRTUAL_REGISTER + 1); > + rtx reg2 = alloca_raw_REG (value_mode, LAST_VIRTUAL_REGISTER + 2); > + rtx reg3 = alloca_raw_REG (idx_mode, LAST_VIRTUAL_REGISTER + 3); > + > + enum insn_code icode = optab_handler (vec_set_optab, vec_mode); > + > + return icode != CODE_FOR_nothing && insn_operand_matches (icode, 0, reg1) > + && insn_operand_matches (icode, 1, reg2) > + && insn_operand_matches (icode, 2, reg3); > +} > + > /* This function is called when we are going to emit a compare instruction that > compares the values found in X and Y, using the rtl operator COMPARISON. > > diff --git a/gcc/optabs.h b/gcc/optabs.h > index 7c2ec257cb0..c018d127756 100644 > --- a/gcc/optabs.h > +++ b/gcc/optabs.h > @@ -249,6 +249,9 @@ extern int can_compare_p (enum rtx_code, machine_mode, > VALUE_MODE. */ > extern bool can_vcond_compare_p (enum rtx_code, machine_mode, machine_mode); > > +extern bool can_vec_set_var_idx_p (enum tree_code, machine_mode, machine_mode, > + machine_mode); > + > extern rtx prepare_operand (enum insn_code, rtx, int, machine_mode, > machine_mode, int); > /* Emit a pair of rtl insns to compare two rtx's and to jump > -- > 2.27.0.90.geebb51ba8c > ^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v3 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR 2020-09-21 8:31 ` Richard Biener @ 2020-09-22 3:55 ` xionghu luo 2020-09-23 11:33 ` Richard Biener 0 siblings, 1 reply; 24+ messages in thread From: xionghu luo @ 2020-09-22 3:55 UTC (permalink / raw) To: Richard Biener Cc: GCC Patches, Segher Boessenkool, David Edelsohn, Bill Schmidt, Jiufu Guo, linkw, Richard Sandiford [-- Attachment #1: Type: text/plain, Size: 9867 bytes --] Thanks for the review, On 2020/9/21 16:31, Richard Biener wrote: >> + >> +static gimple * >> +gimple_expand_vec_set_expr (gimple_stmt_iterator *gsi) >> +{ >> + enum tree_code code; >> + gcall *new_stmt = NULL; >> + gassign *ass_stmt = NULL; >> + >> + /* Only consider code == GIMPLE_ASSIGN. */ >> + gassign *stmt = dyn_cast<gassign *> (gsi_stmt (*gsi)); >> + if (!stmt) >> + return NULL; >> + >> + code = TREE_CODE (gimple_assign_lhs (stmt)); > > do the lhs = gimple_assign_lhs (stmt) before and elide cond, > putting the TREE_CODE into the if below. Done. > >> + if (code != ARRAY_REF) >> + return NULL; >> + >> + tree lhs = gimple_assign_lhs (stmt); >> + tree val = gimple_assign_rhs1 (stmt); >> + >> + tree type = TREE_TYPE (lhs); >> + tree op0 = TREE_OPERAND (lhs, 0); >> + if (TREE_CODE (op0) == VIEW_CONVERT_EXPR > > So I think we want to have an exact structural match first here, so > > if (TREE_CODE (op0) == VIEW_CONVERT_EXPR > && DECL_P (TREE_OPERAND (op0, 0)) > && VECTOR_TYPE_P (TREE_TYPE (TREE_OPERAND (op0, 0))) > && TYPE_MODE (TREE_TYPE (lhs)) == TYPE_MODE (TREE_TYPE > (TREE_TYPE (TREE_OPERAND (op0, 0)))) > > which means we're sure to do an element extract from a vector type > (and we know all vector types have sane element types). Done. > > >> + && tree_fits_uhwi_p (TYPE_SIZE (type))) >> + { >> + tree pos = TREE_OPERAND (lhs, 1); >> + tree view_op0 = TREE_OPERAND (op0, 0); >> + machine_mode outermode = TYPE_MODE (TREE_TYPE (view_op0)); >> + scalar_mode innermode = GET_MODE_INNER (outermode); >> + tree_code code = TREE_CODE (TREE_TYPE(view_op0)); >> + if (!is_global_var (view_op0) && code == VECTOR_TYPE >> + && tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (view_op0))) > > why did you need those TYPE_SIZE checks? As said earlier > you want !TREE_ADDRESSABLE (view_op0) and eventually > the stronger auto_var_in_fn_p (view_op0, cfun) rather than !is_global_var. Done. > >> + && can_vec_set_var_idx_p (code, outermode, innermode, >> + TYPE_MODE (TREE_TYPE (pos)))) >> + { >> + location_t loc = gimple_location (stmt); >> + tree var_src = make_ssa_name (TREE_TYPE (view_op0)); >> + tree var_dst = make_ssa_name (TREE_TYPE (view_op0)); >> + >> + ass_stmt = gimple_build_assign (var_src, view_op0); >> + gimple_set_vuse (ass_stmt, gimple_vuse (stmt)); >> + gimple_set_location (ass_stmt, loc); >> + gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT); >> + >> + new_stmt >> + = gimple_build_call_internal (IFN_VEC_SET, 3, var_src, val, pos); >> + gimple_call_set_lhs (new_stmt, var_dst); >> + gimple_set_location (new_stmt, loc); >> + gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); >> + >> + ass_stmt = gimple_build_assign (view_op0, var_dst); >> + gimple_set_location (ass_stmt, loc); >> + gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT); >> + >> + gimple_move_vops (ass_stmt, stmt); >> + gsi_remove (gsi, true); >> + } >> + } >> + >> + return ass_stmt; >> +} >> >> /* Expand all VEC_COND_EXPR gimple assignments into calls to internal >> function based on type of selected expansion. */ >> @@ -187,8 +261,25 @@ gimple_expand_vec_cond_exprs (void) >> { >> for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) >> { >> - gimple *g = gimple_expand_vec_cond_expr (&gsi, >> - &vec_cond_ssa_name_uses); >> + gassign *stmt = dyn_cast<gassign *> (gsi_stmt (gsi)); >> + if (!stmt) >> + continue; >> + >> + enum tree_code code; >> + gimple *g = NULL; >> + code = gimple_assign_rhs_code (stmt); >> + switch (code) >> + { >> + case VEC_COND_EXPR: >> + g = gimple_expand_vec_cond_expr (&gsi, &vec_cond_ssa_name_uses); >> + break; >> + case ARRAY_REF: >> + /* TODO: generate IFN for vec_extract with variable index. */ > > so why not do this here? > >> + break; >> + default: >> + break; >> + } >> + >> if (g != NULL) >> { >> tree lhs = gimple_assign_lhs (gsi_stmt (gsi)); >> @@ -204,6 +295,27 @@ gimple_expand_vec_cond_exprs (void) >> >> simple_dce_from_worklist (dce_ssa_names); >> >> + FOR_EACH_BB_FN (bb, cfun) > > but in a separate loop? The first loop is for rhs stmt process, this loop is for lhs stmt process. I thought vec_extract also need to generate IFN before, but seems not necessary now? And that the first loop needs to update the lhs stmt while then second doesn't. > >> + { >> + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) >> + { >> + gassign *stmt = dyn_cast<gassign *> (gsi_stmt (gsi)); >> + if (!stmt) >> + continue; >> + >> + enum tree_code code; >> + code = TREE_CODE (gimple_assign_lhs (stmt)); >> + switch (code) >> + { >> + case ARRAY_REF: >> + gimple_expand_vec_set_expr (&gsi); >> + break; >> + default: >> + break; >> + } >> + } >> + } >> + >> return 0; >> } >> >> diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c >> index 8efc77d986b..36837381c04 100644 >> --- a/gcc/internal-fn.c >> +++ b/gcc/internal-fn.c >> @@ -115,6 +115,7 @@ init_internal_fns () >> #define vec_condeq_direct { 0, 0, false } >> #define scatter_store_direct { 3, 1, false } >> #define len_store_direct { 3, 3, false } >> +#define vec_set_direct { 3, 3, false } >> #define unary_direct { 0, 0, true } >> #define binary_direct { 0, 0, true } >> #define ternary_direct { 0, 0, true } >> @@ -2658,6 +2659,40 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab) >> >> #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn >> >> +static void >> +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab) > > all new functions require a function level comment Done. > >> +{ >> + tree lhs = gimple_call_lhs (stmt); >> + tree op0 = gimple_call_arg (stmt, 0); >> + tree op1 = gimple_call_arg (stmt, 1); >> + tree op2 = gimple_call_arg (stmt, 2); >> + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); >> + rtx src = expand_expr (op0, NULL_RTX, VOIDmode, EXPAND_WRITE); >> + >> + machine_mode outermode = TYPE_MODE (TREE_TYPE (op0)); >> + scalar_mode innermode = GET_MODE_INNER (outermode); >> + >> + rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL); >> + rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL); >> + >> + class expand_operand ops[3]; >> + enum insn_code icode = optab_handler (optab, outermode); >> + >> + if (icode != CODE_FOR_nothing) >> + { >> + pos = convert_to_mode (E_SImode, pos, 0); >> + >> + create_fixed_operand (&ops[0], src); >> + create_input_operand (&ops[1], value, innermode); >> + create_input_operand (&ops[2], pos, GET_MODE (pos)); >> + if (maybe_expand_insn (icode, 3, ops)) >> + { >> + emit_move_insn (target, src); > > I think you need to assert that we end up here. Added gcc_unreachable at the end of this function. > >> + return; >> + } >> + } >> +} >> + >> static void >> expand_ABNORMAL_DISPATCHER (internal_fn, gcall *) >> { >> @@ -3253,6 +3288,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types, >> #define direct_fold_left_optab_supported_p direct_optab_supported_p >> #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p >> #define direct_check_ptrs_optab_supported_p direct_optab_supported_p >> +#define direct_vec_set_optab_supported_p direct_optab_supported_p >> >> /* Return the optab used by internal function FN. */ >> >> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def >> index 13e60828fcf..e6cfe1b6159 100644 >> --- a/gcc/internal-fn.def >> +++ b/gcc/internal-fn.def >> @@ -145,6 +145,8 @@ DEF_INTERNAL_OPTAB_FN (VCONDU, 0, vcondu, vec_condu) >> DEF_INTERNAL_OPTAB_FN (VCONDEQ, 0, vcondeq, vec_condeq) >> DEF_INTERNAL_OPTAB_FN (VCOND_MASK, 0, vcond_mask, vec_cond_mask) >> >> +DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set) >> + >> DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store) >> >> DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while) >> diff --git a/gcc/optabs.c b/gcc/optabs.c >> index 184827fdf4e..c8125670d2d 100644 >> --- a/gcc/optabs.c >> +++ b/gcc/optabs.c >> @@ -3841,6 +3841,23 @@ can_vcond_compare_p (enum rtx_code code, machine_mode value_mode, >> && insn_operand_matches (icode, 3, test); >> } >> >> +bool >> +can_vec_set_var_idx_p (enum tree_code code, machine_mode vec_mode, >> + machine_mode value_mode, machine_mode idx_mode) > > toplevel comment missing > >> +{ >> + gcc_assert (code == VECTOR_TYPE); > > what's the point of pasing 'code' here then? Since the optab only has a single > mode, the vector mode, the value_mode is redundant as well. And I guess > we might want to handle "arbitrary" index modes? That is, the .md expanders > should not restrict its mode - I guess it simply uses VOIDmode at the moment > (for integer constants). Not sure how to best do this without an explicit mode > in the optab ... Yes, removed 'code' and value_mode by checking VECTOR_MODE_P and use GET_MODE_INNER for value_mode. ".md expanders" shall support for integer constants index mode, but I guess they shouldn't be expanded by IFN as this function is for variable index insert only? Anyway, the v3 patch used VOIDmode check... Thanks, Xionghu [-- Attachment #2: v3-0001-IFN-Implement-IFN_VEC_SET-for-ARRAY_REF-with-VIEW.patch --] [-- Type: text/plain, Size: 10373 bytes --] From 571717aea126380d3e36fdb4504f9a6337eed206 Mon Sep 17 00:00:00 2001 From: Xiong Hu Luo <luoxhu@linux.ibm.com> Date: Mon, 14 Sep 2020 21:08:11 -0500 Subject: [PATCH v3 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR This patch enables transformation from ARRAY_REF(VIEW_CONVERT_EXPR) to VEC_SET internal function in gimple-isel pass if target supports vec_set with variable index by checking can_vec_set_var_idx_p. gcc/ChangeLog: 2020-09-22 Xionghu Luo <luoxhu@linux.ibm.com> * gimple-isel.cc (gimple_expand_vec_set_expr): New function. (gimple_expand_vec_cond_exprs): Rename to ... (gimple_expand_vec_exprs): ... this and call gimple_expand_vec_set_expr. * internal-fn.c (vec_set_direct): New define. (expand_vec_set_optab_fn): New function. (direct_vec_set_optab_supported_p): New define. * internal-fn.def (VEC_SET): New DEF_INTERNAL_OPTAB_FN. * optabs.c (can_vec_set_var_idx_p): New function. * optabs.h (can_vec_set_var_idx_p): New declaration. --- gcc/gimple-isel.cc | 115 ++++++++++++++++++++++++++++++++++++++++++-- gcc/internal-fn.c | 39 +++++++++++++++ gcc/internal-fn.def | 2 + gcc/optabs.c | 21 ++++++++ gcc/optabs.h | 4 ++ 5 files changed, 177 insertions(+), 4 deletions(-) diff --git a/gcc/gimple-isel.cc b/gcc/gimple-isel.cc index b330cf4c20e..d16d67b1a21 100644 --- a/gcc/gimple-isel.cc +++ b/gcc/gimple-isel.cc @@ -35,6 +35,78 @@ along with GCC; see the file COPYING3. If not see #include "tree-cfg.h" #include "bitmap.h" #include "tree-ssa-dce.h" +#include "fold-const.h" +#include "gimple-fold.h" +#include "memmodel.h" +#include "optabs.h" + +/* Expand all ARRAY_REF(VIEW_CONVERT_EXPR) gimple assignments into calls to + internal function based on vector type of selected expansion. + i.e.: + VIEW_CONVERT_EXPR<int[4]>(u)[_1] = = i_4(D); + => + _7 = u; + _8 = .VEC_SET (_7, i_4(D), _1); + u = _8; */ + +static gimple * +gimple_expand_vec_set_expr (gimple_stmt_iterator *gsi) +{ + enum tree_code code; + gcall *new_stmt = NULL; + gassign *ass_stmt = NULL; + + /* Only consider code == GIMPLE_ASSIGN. */ + gassign *stmt = dyn_cast<gassign *> (gsi_stmt (*gsi)); + if (!stmt) + return NULL; + + tree lhs = gimple_assign_lhs (stmt); + code = TREE_CODE (lhs); + if (code != ARRAY_REF) + return NULL; + + tree val = gimple_assign_rhs1 (stmt); + tree type = TREE_TYPE (lhs); + tree op0 = TREE_OPERAND (lhs, 0); + if (TREE_CODE (op0) == VIEW_CONVERT_EXPR && DECL_P (TREE_OPERAND (op0, 0)) + && VECTOR_TYPE_P (TREE_TYPE (TREE_OPERAND (op0, 0))) + && TYPE_MODE (TREE_TYPE (lhs)) + == TYPE_MODE (TREE_TYPE (TREE_TYPE (TREE_OPERAND (op0, 0))))) + { + tree pos = TREE_OPERAND (lhs, 1); + tree view_op0 = TREE_OPERAND (op0, 0); + machine_mode outermode = TYPE_MODE (TREE_TYPE (view_op0)); + scalar_mode innermode = GET_MODE_INNER (outermode); + if (auto_var_in_fn_p (view_op0, cfun->decl) + && !TREE_ADDRESSABLE (view_op0) && can_vec_set_var_idx_p (outermode)) + { + location_t loc = gimple_location (stmt); + tree var_src = make_ssa_name (TREE_TYPE (view_op0)); + tree var_dst = make_ssa_name (TREE_TYPE (view_op0)); + + ass_stmt = gimple_build_assign (var_src, view_op0); + gimple_set_vuse (ass_stmt, gimple_vuse (stmt)); + gimple_set_location (ass_stmt, loc); + gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT); + + new_stmt + = gimple_build_call_internal (IFN_VEC_SET, 3, var_src, val, pos); + gimple_call_set_lhs (new_stmt, var_dst); + gimple_set_location (new_stmt, loc); + gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); + + ass_stmt = gimple_build_assign (view_op0, var_dst); + gimple_set_location (ass_stmt, loc); + gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT); + + gimple_move_vops (ass_stmt, stmt); + gsi_remove (gsi, true); + } + } + + return ass_stmt; +} /* Expand all VEC_COND_EXPR gimple assignments into calls to internal function based on type of selected expansion. */ @@ -176,7 +248,7 @@ gimple_expand_vec_cond_expr (gimple_stmt_iterator *gsi, VEC_COND_EXPR assignments. */ static unsigned int -gimple_expand_vec_cond_exprs (void) +gimple_expand_vec_exprs (void) { gimple_stmt_iterator gsi; basic_block bb; @@ -187,8 +259,22 @@ gimple_expand_vec_cond_exprs (void) { for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) { - gimple *g = gimple_expand_vec_cond_expr (&gsi, - &vec_cond_ssa_name_uses); + gassign *stmt = dyn_cast<gassign *> (gsi_stmt (gsi)); + if (!stmt) + continue; + + enum tree_code code; + gimple *g = NULL; + code = gimple_assign_rhs_code (stmt); + switch (code) + { + case VEC_COND_EXPR: + g = gimple_expand_vec_cond_expr (&gsi, &vec_cond_ssa_name_uses); + break; + default: + break; + } + if (g != NULL) { tree lhs = gimple_assign_lhs (gsi_stmt (gsi)); @@ -204,6 +290,27 @@ gimple_expand_vec_cond_exprs (void) simple_dce_from_worklist (dce_ssa_names); + FOR_EACH_BB_FN (bb, cfun) + { + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gassign *stmt = dyn_cast<gassign *> (gsi_stmt (gsi)); + if (!stmt) + continue; + + enum tree_code code; + code = TREE_CODE (gimple_assign_lhs (stmt)); + switch (code) + { + case ARRAY_REF: + gimple_expand_vec_set_expr (&gsi); + break; + default: + break; + } + } + } + return 0; } @@ -237,7 +344,7 @@ public: virtual unsigned int execute (function *) { - return gimple_expand_vec_cond_exprs (); + return gimple_expand_vec_exprs (); } }; // class pass_gimple_isel diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c index 8efc77d986b..f97aea44253 100644 --- a/gcc/internal-fn.c +++ b/gcc/internal-fn.c @@ -115,6 +115,7 @@ init_internal_fns () #define vec_condeq_direct { 0, 0, false } #define scatter_store_direct { 3, 1, false } #define len_store_direct { 3, 3, false } +#define vec_set_direct { 3, 3, false } #define unary_direct { 0, 0, true } #define binary_direct { 0, 0, true } #define ternary_direct { 0, 0, true } @@ -2658,6 +2659,43 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab) #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn +/* Expand VEC_SET internal functions. */ + +static void +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab) +{ + tree lhs = gimple_call_lhs (stmt); + tree op0 = gimple_call_arg (stmt, 0); + tree op1 = gimple_call_arg (stmt, 1); + tree op2 = gimple_call_arg (stmt, 2); + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); + rtx src = expand_expr (op0, NULL_RTX, VOIDmode, EXPAND_WRITE); + + machine_mode outermode = TYPE_MODE (TREE_TYPE (op0)); + scalar_mode innermode = GET_MODE_INNER (outermode); + + rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL); + rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL); + + class expand_operand ops[3]; + enum insn_code icode = optab_handler (optab, outermode); + + if (icode != CODE_FOR_nothing) + { + pos = convert_to_mode (E_SImode, pos, 0); + + create_fixed_operand (&ops[0], src); + create_input_operand (&ops[1], value, innermode); + create_input_operand (&ops[2], pos, GET_MODE (pos)); + if (maybe_expand_insn (icode, 3, ops)) + { + emit_move_insn (target, src); + return; + } + } + gcc_unreachable (); +} + static void expand_ABNORMAL_DISPATCHER (internal_fn, gcall *) { @@ -3253,6 +3291,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types, #define direct_fold_left_optab_supported_p direct_optab_supported_p #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p #define direct_check_ptrs_optab_supported_p direct_optab_supported_p +#define direct_vec_set_optab_supported_p direct_optab_supported_p /* Return the optab used by internal function FN. */ diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index 13e60828fcf..e6cfe1b6159 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -145,6 +145,8 @@ DEF_INTERNAL_OPTAB_FN (VCONDU, 0, vcondu, vec_condu) DEF_INTERNAL_OPTAB_FN (VCONDEQ, 0, vcondeq, vec_condeq) DEF_INTERNAL_OPTAB_FN (VCOND_MASK, 0, vcond_mask, vec_cond_mask) +DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set) + DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store) DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while) diff --git a/gcc/optabs.c b/gcc/optabs.c index 184827fdf4e..8e844028d92 100644 --- a/gcc/optabs.c +++ b/gcc/optabs.c @@ -3841,6 +3841,27 @@ can_vcond_compare_p (enum rtx_code code, machine_mode value_mode, && insn_operand_matches (icode, 3, test); } +/* Return whether the backend can emit vector set instructions for inserting + element into vector at variable index position. */ + +bool +can_vec_set_var_idx_p (machine_mode vec_mode) +{ + if (!VECTOR_MODE_P (vec_mode)) + return false; + + machine_mode inner_mode = GET_MODE_INNER (vec_mode); + rtx reg1 = alloca_raw_REG (vec_mode, LAST_VIRTUAL_REGISTER + 1); + rtx reg2 = alloca_raw_REG (inner_mode, LAST_VIRTUAL_REGISTER + 2); + rtx reg3 = alloca_raw_REG (VOIDmode, LAST_VIRTUAL_REGISTER + 3); + + enum insn_code icode = optab_handler (vec_set_optab, vec_mode); + + return icode != CODE_FOR_nothing && insn_operand_matches (icode, 0, reg1) + && insn_operand_matches (icode, 1, reg2) + && insn_operand_matches (icode, 2, reg3); +} + /* This function is called when we are going to emit a compare instruction that compares the values found in X and Y, using the rtl operator COMPARISON. diff --git a/gcc/optabs.h b/gcc/optabs.h index 7c2ec257cb0..0b14700ab3d 100644 --- a/gcc/optabs.h +++ b/gcc/optabs.h @@ -249,6 +249,10 @@ extern int can_compare_p (enum rtx_code, machine_mode, VALUE_MODE. */ extern bool can_vcond_compare_p (enum rtx_code, machine_mode, machine_mode); +/* Return whether the backend can emit vector set instructions for inserting + element into vector at variable index position. */ +extern bool can_vec_set_var_idx_p (machine_mode); + extern rtx prepare_operand (enum insn_code, rtx, int, machine_mode, machine_mode, int); /* Emit a pair of rtl insns to compare two rtx's and to jump -- 2.27.0.90.geebb51ba8c ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v3 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR 2020-09-22 3:55 ` [PATCH v3 " xionghu luo @ 2020-09-23 11:33 ` Richard Biener 2020-09-24 3:24 ` xionghu luo 0 siblings, 1 reply; 24+ messages in thread From: Richard Biener @ 2020-09-23 11:33 UTC (permalink / raw) To: xionghu luo Cc: GCC Patches, Segher Boessenkool, David Edelsohn, Bill Schmidt, Jiufu Guo, linkw, Richard Sandiford On Tue, Sep 22, 2020 at 5:55 AM xionghu luo <luoxhu@linux.ibm.com> wrote: > > Thanks for the review, > > > On 2020/9/21 16:31, Richard Biener wrote: > >> + > >> +static gimple * > >> +gimple_expand_vec_set_expr (gimple_stmt_iterator *gsi) > >> +{ > >> + enum tree_code code; > >> + gcall *new_stmt = NULL; > >> + gassign *ass_stmt = NULL; > >> + > >> + /* Only consider code == GIMPLE_ASSIGN. */ > >> + gassign *stmt = dyn_cast<gassign *> (gsi_stmt (*gsi)); > >> + if (!stmt) > >> + return NULL; > >> + > >> + code = TREE_CODE (gimple_assign_lhs (stmt)); > > > > do the lhs = gimple_assign_lhs (stmt) before and elide cond, > > putting the TREE_CODE into the if below. > > Done. > > > > >> + if (code != ARRAY_REF) > >> + return NULL; > >> + > >> + tree lhs = gimple_assign_lhs (stmt); > >> + tree val = gimple_assign_rhs1 (stmt); > >> + > >> + tree type = TREE_TYPE (lhs); > >> + tree op0 = TREE_OPERAND (lhs, 0); > >> + if (TREE_CODE (op0) == VIEW_CONVERT_EXPR > > > > So I think we want to have an exact structural match first here, so > > > > if (TREE_CODE (op0) == VIEW_CONVERT_EXPR > > && DECL_P (TREE_OPERAND (op0, 0)) > > && VECTOR_TYPE_P (TREE_TYPE (TREE_OPERAND (op0, 0))) > > && TYPE_MODE (TREE_TYPE (lhs)) == TYPE_MODE (TREE_TYPE > > (TREE_TYPE (TREE_OPERAND (op0, 0)))) > > > > which means we're sure to do an element extract from a vector type > > (and we know all vector types have sane element types). > > Done. > > > > > > >> + && tree_fits_uhwi_p (TYPE_SIZE (type))) > >> + { > >> + tree pos = TREE_OPERAND (lhs, 1); > >> + tree view_op0 = TREE_OPERAND (op0, 0); > >> + machine_mode outermode = TYPE_MODE (TREE_TYPE (view_op0)); > >> + scalar_mode innermode = GET_MODE_INNER (outermode); > >> + tree_code code = TREE_CODE (TREE_TYPE(view_op0)); > >> + if (!is_global_var (view_op0) && code == VECTOR_TYPE > >> + && tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (view_op0))) > > > > why did you need those TYPE_SIZE checks? As said earlier > > you want !TREE_ADDRESSABLE (view_op0) and eventually > > the stronger auto_var_in_fn_p (view_op0, cfun) rather than !is_global_var. > > Done. > > > > >> + && can_vec_set_var_idx_p (code, outermode, innermode, > >> + TYPE_MODE (TREE_TYPE (pos)))) > >> + { > >> + location_t loc = gimple_location (stmt); > >> + tree var_src = make_ssa_name (TREE_TYPE (view_op0)); > >> + tree var_dst = make_ssa_name (TREE_TYPE (view_op0)); > >> + > >> + ass_stmt = gimple_build_assign (var_src, view_op0); > >> + gimple_set_vuse (ass_stmt, gimple_vuse (stmt)); > >> + gimple_set_location (ass_stmt, loc); > >> + gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT); > >> + > >> + new_stmt > >> + = gimple_build_call_internal (IFN_VEC_SET, 3, var_src, val, pos); > >> + gimple_call_set_lhs (new_stmt, var_dst); > >> + gimple_set_location (new_stmt, loc); > >> + gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); > >> + > >> + ass_stmt = gimple_build_assign (view_op0, var_dst); > >> + gimple_set_location (ass_stmt, loc); > >> + gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT); > >> + > >> + gimple_move_vops (ass_stmt, stmt); > >> + gsi_remove (gsi, true); > >> + } > >> + } > >> + > >> + return ass_stmt; > >> +} > >> > >> /* Expand all VEC_COND_EXPR gimple assignments into calls to internal > >> function based on type of selected expansion. */ > >> @@ -187,8 +261,25 @@ gimple_expand_vec_cond_exprs (void) > >> { > >> for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) > >> { > >> - gimple *g = gimple_expand_vec_cond_expr (&gsi, > >> - &vec_cond_ssa_name_uses); > >> + gassign *stmt = dyn_cast<gassign *> (gsi_stmt (gsi)); > >> + if (!stmt) > >> + continue; > >> + > >> + enum tree_code code; > >> + gimple *g = NULL; > >> + code = gimple_assign_rhs_code (stmt); > >> + switch (code) > >> + { > >> + case VEC_COND_EXPR: > >> + g = gimple_expand_vec_cond_expr (&gsi, &vec_cond_ssa_name_uses); > >> + break; > >> + case ARRAY_REF: > >> + /* TODO: generate IFN for vec_extract with variable index. */ > > > > so why not do this here? > > > >> + break; > >> + default: > >> + break; > >> + } > >> + > >> if (g != NULL) > >> { > >> tree lhs = gimple_assign_lhs (gsi_stmt (gsi)); > >> @@ -204,6 +295,27 @@ gimple_expand_vec_cond_exprs (void) > >> > >> simple_dce_from_worklist (dce_ssa_names); > >> > >> + FOR_EACH_BB_FN (bb, cfun) > > > > but in a separate loop? > > The first loop is for rhs stmt process, this loop is for lhs stmt process. > I thought vec_extract also need to generate IFN before, but seems not > necessary now? And that the first loop needs to update the lhs stmt while > then second doesn't. That's not good reasons to separate them, please move all the processing into one loop. + gassign *stmt = dyn_cast<gassign *> (gsi_stmt (gsi)); + if (!stmt) + continue; + + enum tree_code code; + code = TREE_CODE (gimple_assign_lhs (stmt)); + switch (code) + { + case ARRAY_REF: + gimple_expand_vec_set_expr (&gsi); you also do the assign and ARRAY_REF checking duplicate. The patch likely wasn't bootstrapped because I've seen unused and set-but-not-used variables. Otherwise the patch looks good to me - I guess you want to add the vec_extract bits as well so you can overall assess the affect of the patch on altivec code? That said, the patch misses a testcase where we verify we properly expand the vector to a pseudo now. Richard. > > > >> + { > >> + for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) > >> + { > >> + gassign *stmt = dyn_cast<gassign *> (gsi_stmt (gsi)); > >> + if (!stmt) > >> + continue; > >> + > >> + enum tree_code code; > >> + code = TREE_CODE (gimple_assign_lhs (stmt)); > >> + switch (code) > >> + { > >> + case ARRAY_REF: > >> + gimple_expand_vec_set_expr (&gsi); > >> + break; > >> + default: > >> + break; > >> + } > >> + } > >> + } > >> + > >> return 0; > >> } > >> > >> diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c > >> index 8efc77d986b..36837381c04 100644 > >> --- a/gcc/internal-fn.c > >> +++ b/gcc/internal-fn.c > >> @@ -115,6 +115,7 @@ init_internal_fns () > >> #define vec_condeq_direct { 0, 0, false } > >> #define scatter_store_direct { 3, 1, false } > >> #define len_store_direct { 3, 3, false } > >> +#define vec_set_direct { 3, 3, false } > >> #define unary_direct { 0, 0, true } > >> #define binary_direct { 0, 0, true } > >> #define ternary_direct { 0, 0, true } > >> @@ -2658,6 +2659,40 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab) > >> > >> #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn > >> > >> +static void > >> +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab) > > > > all new functions require a function level comment > > Done. > > > > >> +{ > >> + tree lhs = gimple_call_lhs (stmt); > >> + tree op0 = gimple_call_arg (stmt, 0); > >> + tree op1 = gimple_call_arg (stmt, 1); > >> + tree op2 = gimple_call_arg (stmt, 2); > >> + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); > >> + rtx src = expand_expr (op0, NULL_RTX, VOIDmode, EXPAND_WRITE); > >> + > >> + machine_mode outermode = TYPE_MODE (TREE_TYPE (op0)); > >> + scalar_mode innermode = GET_MODE_INNER (outermode); > >> + > >> + rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL); > >> + rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL); > >> + > >> + class expand_operand ops[3]; > >> + enum insn_code icode = optab_handler (optab, outermode); > >> + > >> + if (icode != CODE_FOR_nothing) > >> + { > >> + pos = convert_to_mode (E_SImode, pos, 0); > >> + > >> + create_fixed_operand (&ops[0], src); > >> + create_input_operand (&ops[1], value, innermode); > >> + create_input_operand (&ops[2], pos, GET_MODE (pos)); > >> + if (maybe_expand_insn (icode, 3, ops)) > >> + { > >> + emit_move_insn (target, src); > > > > I think you need to assert that we end up here. > > Added gcc_unreachable at the end of this function. > > > > >> + return; > >> + } > >> + } > >> +} > >> + > >> static void > >> expand_ABNORMAL_DISPATCHER (internal_fn, gcall *) > >> { > >> @@ -3253,6 +3288,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types, > >> #define direct_fold_left_optab_supported_p direct_optab_supported_p > >> #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p > >> #define direct_check_ptrs_optab_supported_p direct_optab_supported_p > >> +#define direct_vec_set_optab_supported_p direct_optab_supported_p > >> > >> /* Return the optab used by internal function FN. */ > >> > >> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def > >> index 13e60828fcf..e6cfe1b6159 100644 > >> --- a/gcc/internal-fn.def > >> +++ b/gcc/internal-fn.def > >> @@ -145,6 +145,8 @@ DEF_INTERNAL_OPTAB_FN (VCONDU, 0, vcondu, vec_condu) > >> DEF_INTERNAL_OPTAB_FN (VCONDEQ, 0, vcondeq, vec_condeq) > >> DEF_INTERNAL_OPTAB_FN (VCOND_MASK, 0, vcond_mask, vec_cond_mask) > >> > >> +DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set) > >> + > >> DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store) > >> > >> DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while) > >> diff --git a/gcc/optabs.c b/gcc/optabs.c > >> index 184827fdf4e..c8125670d2d 100644 > >> --- a/gcc/optabs.c > >> +++ b/gcc/optabs.c > >> @@ -3841,6 +3841,23 @@ can_vcond_compare_p (enum rtx_code code, machine_mode value_mode, > >> && insn_operand_matches (icode, 3, test); > >> } > >> > >> +bool > >> +can_vec_set_var_idx_p (enum tree_code code, machine_mode vec_mode, > >> + machine_mode value_mode, machine_mode idx_mode) > > > > toplevel comment missing > > > >> +{ > >> + gcc_assert (code == VECTOR_TYPE); > > > > what's the point of pasing 'code' here then? Since the optab only has a single > > mode, the vector mode, the value_mode is redundant as well. And I guess > > we might want to handle "arbitrary" index modes? That is, the .md expanders > > should not restrict its mode - I guess it simply uses VOIDmode at the moment > > (for integer constants). Not sure how to best do this without an explicit mode > > in the optab ... > > Yes, removed 'code' and value_mode by checking VECTOR_MODE_P and use GET_MODE_INNER > for value_mode. ".md expanders" shall support for integer constants index mode, but > I guess they shouldn't be expanded by IFN as this function is for variable index > insert only? Anyway, the v3 patch used VOIDmode check... > > > Thanks, > Xionghu > ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v3 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR 2020-09-23 11:33 ` Richard Biener @ 2020-09-24 3:24 ` xionghu luo 2020-09-24 12:39 ` Richard Sandiford 0 siblings, 1 reply; 24+ messages in thread From: xionghu luo @ 2020-09-24 3:24 UTC (permalink / raw) To: Richard Biener Cc: GCC Patches, Segher Boessenkool, David Edelsohn, Bill Schmidt, Jiufu Guo, linkw, Richard Sandiford Hi, On 2020/9/23 19:33, Richard Biener wrote: >> The first loop is for rhs stmt process, this loop is for lhs stmt process. >> I thought vec_extract also need to generate IFN before, but seems not >> necessary now? And that the first loop needs to update the lhs stmt while >> then second doesn't. > That's not good reasons to separate them, please move all the processing > into one loop. > > + gassign *stmt = dyn_cast<gassign *> (gsi_stmt (gsi)); > + if (!stmt) > + continue; > + > + enum tree_code code; > + code = TREE_CODE (gimple_assign_lhs (stmt)); > + switch (code) > + { > + case ARRAY_REF: > + gimple_expand_vec_set_expr (&gsi); > > you also do the assign and ARRAY_REF checking duplicate. > > The patch likely wasn't bootstrapped because I've seen unused and > set-but-not-used > variables. > > Otherwise the patch looks good to me - I guess you want to add the > vec_extract bits as well so you can overall assess the affect of the patch > on altivec code? That said, the patch misses a testcase where we verify > we properly expand the vector to a pseudo now. Thanks, fixed the bootstrap error. Actually the "[PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]" includes typed vec_insert tests for V4SI/V4SF/V8HI/V16QI/V2DI/V2DF of expanding the IFN VEC_SET and instruction count check, but I am discussing and refining with Segher's comments, will split and send it later once we reached agreement. Not sure whether this is the testcase you mentioned? (As you said *vec_extract*, but this patch series target for vec_insert only.) FYI, We are trying below or even better code generations: rlwinm 6,6,2,28,29 mtvsrwz 0,5 lvsr 1,0,6 lvsl 0,0,6 xxperm 34,34,33 xxinsertw 34,0,12 xxperm 34,34,32 Second thing is I removed the second loop and move the "gimple_expand_vec_set_expr (&gsi);" up as your comments. Thanks again. IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR This patch enables transformation from ARRAY_REF(VIEW_CONVERT_EXPR) to VEC_SET internal function in gimple-isel pass if target supports vec_set with variable index by checking can_vec_set_var_idx_p. gcc/ChangeLog: 2020-09-24 Xionghu Luo <luoxhu@linux.ibm.com> * gimple-isel.cc (gimple_expand_vec_set_expr): New function. (gimple_expand_vec_cond_exprs): Rename to ... (gimple_expand_vec_exprs): ... this and call gimple_expand_vec_set_expr. * internal-fn.c (vec_set_direct): New define. (expand_vec_set_optab_fn): New function. (direct_vec_set_optab_supported_p): New define. * internal-fn.def (VEC_SET): New DEF_INTERNAL_OPTAB_FN. * optabs.c (can_vec_set_var_idx_p): New function. * optabs.h (can_vec_set_var_idx_p): New declaration. --- gcc/gimple-isel.cc | 75 +++++++++++++++++++++++++++++++++++++++++++-- gcc/internal-fn.c | 39 +++++++++++++++++++++++ gcc/internal-fn.def | 2 ++ gcc/optabs.c | 21 +++++++++++++ gcc/optabs.h | 4 +++ 5 files changed, 139 insertions(+), 2 deletions(-) diff --git a/gcc/gimple-isel.cc b/gcc/gimple-isel.cc index b330cf4c20e..02513e04900 100644 --- a/gcc/gimple-isel.cc +++ b/gcc/gimple-isel.cc @@ -35,6 +35,74 @@ along with GCC; see the file COPYING3. If not see #include "tree-cfg.h" #include "bitmap.h" #include "tree-ssa-dce.h" +#include "memmodel.h" +#include "optabs.h" + +/* Expand all ARRAY_REF(VIEW_CONVERT_EXPR) gimple assignments into calls to + internal function based on vector type of selected expansion. + i.e.: + VIEW_CONVERT_EXPR<int[4]>(u)[_1] = = i_4(D); + => + _7 = u; + _8 = .VEC_SET (_7, i_4(D), _1); + u = _8; */ + +static gimple * +gimple_expand_vec_set_expr (gimple_stmt_iterator *gsi) +{ + enum tree_code code; + gcall *new_stmt = NULL; + gassign *ass_stmt = NULL; + + /* Only consider code == GIMPLE_ASSIGN. */ + gassign *stmt = dyn_cast<gassign *> (gsi_stmt (*gsi)); + if (!stmt) + return NULL; + + tree lhs = gimple_assign_lhs (stmt); + code = TREE_CODE (lhs); + if (code != ARRAY_REF) + return NULL; + + tree val = gimple_assign_rhs1 (stmt); + tree op0 = TREE_OPERAND (lhs, 0); + if (TREE_CODE (op0) == VIEW_CONVERT_EXPR && DECL_P (TREE_OPERAND (op0, 0)) + && VECTOR_TYPE_P (TREE_TYPE (TREE_OPERAND (op0, 0))) + && TYPE_MODE (TREE_TYPE (lhs)) + == TYPE_MODE (TREE_TYPE (TREE_TYPE (TREE_OPERAND (op0, 0))))) + { + tree pos = TREE_OPERAND (lhs, 1); + tree view_op0 = TREE_OPERAND (op0, 0); + machine_mode outermode = TYPE_MODE (TREE_TYPE (view_op0)); + if (auto_var_in_fn_p (view_op0, cfun->decl) + && !TREE_ADDRESSABLE (view_op0) && can_vec_set_var_idx_p (outermode)) + { + location_t loc = gimple_location (stmt); + tree var_src = make_ssa_name (TREE_TYPE (view_op0)); + tree var_dst = make_ssa_name (TREE_TYPE (view_op0)); + + ass_stmt = gimple_build_assign (var_src, view_op0); + gimple_set_vuse (ass_stmt, gimple_vuse (stmt)); + gimple_set_location (ass_stmt, loc); + gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT); + + new_stmt + = gimple_build_call_internal (IFN_VEC_SET, 3, var_src, val, pos); + gimple_call_set_lhs (new_stmt, var_dst); + gimple_set_location (new_stmt, loc); + gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); + + ass_stmt = gimple_build_assign (view_op0, var_dst); + gimple_set_location (ass_stmt, loc); + gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT); + + gimple_move_vops (ass_stmt, stmt); + gsi_remove (gsi, true); + } + } + + return ass_stmt; +} /* Expand all VEC_COND_EXPR gimple assignments into calls to internal function based on type of selected expansion. */ @@ -176,7 +244,7 @@ gimple_expand_vec_cond_expr (gimple_stmt_iterator *gsi, VEC_COND_EXPR assignments. */ static unsigned int -gimple_expand_vec_cond_exprs (void) +gimple_expand_vec_exprs (void) { gimple_stmt_iterator gsi; basic_block bb; @@ -189,12 +257,15 @@ gimple_expand_vec_cond_exprs (void) { gimple *g = gimple_expand_vec_cond_expr (&gsi, &vec_cond_ssa_name_uses); + if (g != NULL) { tree lhs = gimple_assign_lhs (gsi_stmt (gsi)); gimple_set_lhs (g, lhs); gsi_replace (&gsi, g, false); } + + gimple_expand_vec_set_expr (&gsi); } } @@ -237,7 +308,7 @@ public: virtual unsigned int execute (function *) { - return gimple_expand_vec_cond_exprs (); + return gimple_expand_vec_exprs (); } }; // class pass_gimple_isel diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c index 8efc77d986b..f97aea44253 100644 --- a/gcc/internal-fn.c +++ b/gcc/internal-fn.c @@ -115,6 +115,7 @@ init_internal_fns () #define vec_condeq_direct { 0, 0, false } #define scatter_store_direct { 3, 1, false } #define len_store_direct { 3, 3, false } +#define vec_set_direct { 3, 3, false } #define unary_direct { 0, 0, true } #define binary_direct { 0, 0, true } #define ternary_direct { 0, 0, true } @@ -2658,6 +2659,43 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab) #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn +/* Expand VEC_SET internal functions. */ + +static void +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab) +{ + tree lhs = gimple_call_lhs (stmt); + tree op0 = gimple_call_arg (stmt, 0); + tree op1 = gimple_call_arg (stmt, 1); + tree op2 = gimple_call_arg (stmt, 2); + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); + rtx src = expand_expr (op0, NULL_RTX, VOIDmode, EXPAND_WRITE); + + machine_mode outermode = TYPE_MODE (TREE_TYPE (op0)); + scalar_mode innermode = GET_MODE_INNER (outermode); + + rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL); + rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL); + + class expand_operand ops[3]; + enum insn_code icode = optab_handler (optab, outermode); + + if (icode != CODE_FOR_nothing) + { + pos = convert_to_mode (E_SImode, pos, 0); + + create_fixed_operand (&ops[0], src); + create_input_operand (&ops[1], value, innermode); + create_input_operand (&ops[2], pos, GET_MODE (pos)); + if (maybe_expand_insn (icode, 3, ops)) + { + emit_move_insn (target, src); + return; + } + } + gcc_unreachable (); +} + static void expand_ABNORMAL_DISPATCHER (internal_fn, gcall *) { @@ -3253,6 +3291,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types, #define direct_fold_left_optab_supported_p direct_optab_supported_p #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p #define direct_check_ptrs_optab_supported_p direct_optab_supported_p +#define direct_vec_set_optab_supported_p direct_optab_supported_p /* Return the optab used by internal function FN. */ diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index 13e60828fcf..e6cfe1b6159 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -145,6 +145,8 @@ DEF_INTERNAL_OPTAB_FN (VCONDU, 0, vcondu, vec_condu) DEF_INTERNAL_OPTAB_FN (VCONDEQ, 0, vcondeq, vec_condeq) DEF_INTERNAL_OPTAB_FN (VCOND_MASK, 0, vcond_mask, vec_cond_mask) +DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set) + DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store) DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while) diff --git a/gcc/optabs.c b/gcc/optabs.c index 184827fdf4e..8e844028d92 100644 --- a/gcc/optabs.c +++ b/gcc/optabs.c @@ -3841,6 +3841,27 @@ can_vcond_compare_p (enum rtx_code code, machine_mode value_mode, && insn_operand_matches (icode, 3, test); } +/* Return whether the backend can emit vector set instructions for inserting + element into vector at variable index position. */ + +bool +can_vec_set_var_idx_p (machine_mode vec_mode) +{ + if (!VECTOR_MODE_P (vec_mode)) + return false; + + machine_mode inner_mode = GET_MODE_INNER (vec_mode); + rtx reg1 = alloca_raw_REG (vec_mode, LAST_VIRTUAL_REGISTER + 1); + rtx reg2 = alloca_raw_REG (inner_mode, LAST_VIRTUAL_REGISTER + 2); + rtx reg3 = alloca_raw_REG (VOIDmode, LAST_VIRTUAL_REGISTER + 3); + + enum insn_code icode = optab_handler (vec_set_optab, vec_mode); + + return icode != CODE_FOR_nothing && insn_operand_matches (icode, 0, reg1) + && insn_operand_matches (icode, 1, reg2) + && insn_operand_matches (icode, 2, reg3); +} + /* This function is called when we are going to emit a compare instruction that compares the values found in X and Y, using the rtl operator COMPARISON. diff --git a/gcc/optabs.h b/gcc/optabs.h index 7c2ec257cb0..0b14700ab3d 100644 --- a/gcc/optabs.h +++ b/gcc/optabs.h @@ -249,6 +249,10 @@ extern int can_compare_p (enum rtx_code, machine_mode, VALUE_MODE. */ extern bool can_vcond_compare_p (enum rtx_code, machine_mode, machine_mode); +/* Return whether the backend can emit vector set instructions for inserting + element into vector at variable index position. */ +extern bool can_vec_set_var_idx_p (machine_mode); + extern rtx prepare_operand (enum insn_code, rtx, int, machine_mode, machine_mode, int); /* Emit a pair of rtl insns to compare two rtx's and to jump -- 2.27.0.90.geebb51ba8c ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v3 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR 2020-09-24 3:24 ` xionghu luo @ 2020-09-24 12:39 ` Richard Sandiford 2020-09-25 6:51 ` [PATCH v4 1/3] " xionghu luo 0 siblings, 1 reply; 24+ messages in thread From: Richard Sandiford @ 2020-09-24 12:39 UTC (permalink / raw) To: xionghu luo Cc: Richard Biener, GCC Patches, Segher Boessenkool, David Edelsohn, Bill Schmidt, Jiufu Guo, linkw xionghu luo <luoxhu@linux.ibm.com> writes: > @@ -2658,6 +2659,43 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab) > > #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn > > +/* Expand VEC_SET internal functions. */ > + > +static void > +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab) > +{ > + tree lhs = gimple_call_lhs (stmt); > + tree op0 = gimple_call_arg (stmt, 0); > + tree op1 = gimple_call_arg (stmt, 1); > + tree op2 = gimple_call_arg (stmt, 2); > + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); > + rtx src = expand_expr (op0, NULL_RTX, VOIDmode, EXPAND_WRITE); I'm not sure about the expand_expr here. ISTM that op0 is a normal input and so should be expanded by expand_normal rather than EXPAND_WRITE. Also: > + > + machine_mode outermode = TYPE_MODE (TREE_TYPE (op0)); > + scalar_mode innermode = GET_MODE_INNER (outermode); > + > + rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL); > + rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL); > + > + class expand_operand ops[3]; > + enum insn_code icode = optab_handler (optab, outermode); > + > + if (icode != CODE_FOR_nothing) > + { > + pos = convert_to_mode (E_SImode, pos, 0); > + > + create_fixed_operand (&ops[0], src); ...this would mean that if SRC happens to be a MEM, the pattern must also accept a MEM. ISTM that we're making more work for ourselves by not “fixing” the optab to have a natural pure-input + pure-output interface. :-) But if we stick with the current optab interface, I think we need to: - create a temporary register - move SRC into the temporary register before the insn - use create_fixed_operand with the temporary register for operand 0 - move the temporary register into TARGET after the insn > + create_input_operand (&ops[1], value, innermode); > + create_input_operand (&ops[2], pos, GET_MODE (pos)); For this I think we should use convert_operand_from on the original “pos”, so that the target gets to choose what the mode of the operand is. Thanks, Richard ^ permalink raw reply [flat|nested] 24+ messages in thread
* [PATCH v4 1/3] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR 2020-09-24 12:39 ` Richard Sandiford @ 2020-09-25 6:51 ` xionghu luo 2020-09-25 11:32 ` Richard Biener 2020-09-25 13:28 ` Richard Sandiford 0 siblings, 2 replies; 24+ messages in thread From: xionghu luo @ 2020-09-25 6:51 UTC (permalink / raw) To: Richard Biener, GCC Patches, Segher Boessenkool, David Edelsohn, Bill Schmidt, Jiufu Guo, linkw, richard.sandiford Hi, On 2020/9/24 20:39, Richard Sandiford wrote: > xionghu luo <luoxhu@linux.ibm.com> writes: >> @@ -2658,6 +2659,43 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab) >> >> #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn >> >> +/* Expand VEC_SET internal functions. */ >> + >> +static void >> +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab) >> +{ >> + tree lhs = gimple_call_lhs (stmt); >> + tree op0 = gimple_call_arg (stmt, 0); >> + tree op1 = gimple_call_arg (stmt, 1); >> + tree op2 = gimple_call_arg (stmt, 2); >> + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); >> + rtx src = expand_expr (op0, NULL_RTX, VOIDmode, EXPAND_WRITE); > > I'm not sure about the expand_expr here. ISTM that op0 is a normal > input and so should be expanded by expand_normal rather than > EXPAND_WRITE. Also: > >> + >> + machine_mode outermode = TYPE_MODE (TREE_TYPE (op0)); >> + scalar_mode innermode = GET_MODE_INNER (outermode); >> + >> + rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL); >> + rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL); >> + >> + class expand_operand ops[3]; >> + enum insn_code icode = optab_handler (optab, outermode); >> + >> + if (icode != CODE_FOR_nothing) >> + { >> + pos = convert_to_mode (E_SImode, pos, 0); >> + >> + create_fixed_operand (&ops[0], src); > > ...this would mean that if SRC happens to be a MEM, the pattern > must also accept a MEM. > > ISTM that we're making more work for ourselves by not “fixing” the optab > to have a natural pure-input + pure-output interface. :-) But if we > stick with the current optab interface, I think we need to: > > - create a temporary register > - move SRC into the temporary register before the insn > - use create_fixed_operand with the temporary register for operand 0 > - move the temporary register into TARGET after the insn > >> + create_input_operand (&ops[1], value, innermode); >> + create_input_operand (&ops[2], pos, GET_MODE (pos)); > > For this I think we should use convert_operand_from on the original “pos”, > so that the target gets to choose what the mode of the operand is. > Thanks a lot for the nice suggestions, fixed them all and updated the patch as below. [PATCH v4 1/3] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR This patch enables transformation from ARRAY_REF(VIEW_CONVERT_EXPR) to VEC_SET internal function in gimple-isel pass if target supports vec_set with variable index by checking can_vec_set_var_idx_p. gcc/ChangeLog: 2020-09-25 Xionghu Luo <luoxhu@linux.ibm.com> * gimple-isel.cc (gimple_expand_vec_set_expr): New function. (gimple_expand_vec_cond_exprs): Rename to ... (gimple_expand_vec_exprs): ... this and call gimple_expand_vec_set_expr. * internal-fn.c (vec_set_direct): New define. (expand_vec_set_optab_fn): New function. (direct_vec_set_optab_supported_p): New define. * internal-fn.def (VEC_SET): New DEF_INTERNAL_OPTAB_FN. * optabs.c (can_vec_set_var_idx_p): New function. * optabs.h (can_vec_set_var_idx_p): New declaration. --- gcc/gimple-isel.cc | 75 +++++++++++++++++++++++++++++++++++++++++++-- gcc/internal-fn.c | 41 +++++++++++++++++++++++++ gcc/internal-fn.def | 2 ++ gcc/optabs.c | 21 +++++++++++++ gcc/optabs.h | 4 +++ 5 files changed, 141 insertions(+), 2 deletions(-) diff --git a/gcc/gimple-isel.cc b/gcc/gimple-isel.cc index b330cf4c20e..02513e04900 100644 --- a/gcc/gimple-isel.cc +++ b/gcc/gimple-isel.cc @@ -35,6 +35,74 @@ along with GCC; see the file COPYING3. If not see #include "tree-cfg.h" #include "bitmap.h" #include "tree-ssa-dce.h" +#include "memmodel.h" +#include "optabs.h" + +/* Expand all ARRAY_REF(VIEW_CONVERT_EXPR) gimple assignments into calls to + internal function based on vector type of selected expansion. + i.e.: + VIEW_CONVERT_EXPR<int[4]>(u)[_1] = = i_4(D); + => + _7 = u; + _8 = .VEC_SET (_7, i_4(D), _1); + u = _8; */ + +static gimple * +gimple_expand_vec_set_expr (gimple_stmt_iterator *gsi) +{ + enum tree_code code; + gcall *new_stmt = NULL; + gassign *ass_stmt = NULL; + + /* Only consider code == GIMPLE_ASSIGN. */ + gassign *stmt = dyn_cast<gassign *> (gsi_stmt (*gsi)); + if (!stmt) + return NULL; + + tree lhs = gimple_assign_lhs (stmt); + code = TREE_CODE (lhs); + if (code != ARRAY_REF) + return NULL; + + tree val = gimple_assign_rhs1 (stmt); + tree op0 = TREE_OPERAND (lhs, 0); + if (TREE_CODE (op0) == VIEW_CONVERT_EXPR && DECL_P (TREE_OPERAND (op0, 0)) + && VECTOR_TYPE_P (TREE_TYPE (TREE_OPERAND (op0, 0))) + && TYPE_MODE (TREE_TYPE (lhs)) + == TYPE_MODE (TREE_TYPE (TREE_TYPE (TREE_OPERAND (op0, 0))))) + { + tree pos = TREE_OPERAND (lhs, 1); + tree view_op0 = TREE_OPERAND (op0, 0); + machine_mode outermode = TYPE_MODE (TREE_TYPE (view_op0)); + if (auto_var_in_fn_p (view_op0, cfun->decl) + && !TREE_ADDRESSABLE (view_op0) && can_vec_set_var_idx_p (outermode)) + { + location_t loc = gimple_location (stmt); + tree var_src = make_ssa_name (TREE_TYPE (view_op0)); + tree var_dst = make_ssa_name (TREE_TYPE (view_op0)); + + ass_stmt = gimple_build_assign (var_src, view_op0); + gimple_set_vuse (ass_stmt, gimple_vuse (stmt)); + gimple_set_location (ass_stmt, loc); + gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT); + + new_stmt + = gimple_build_call_internal (IFN_VEC_SET, 3, var_src, val, pos); + gimple_call_set_lhs (new_stmt, var_dst); + gimple_set_location (new_stmt, loc); + gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); + + ass_stmt = gimple_build_assign (view_op0, var_dst); + gimple_set_location (ass_stmt, loc); + gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT); + + gimple_move_vops (ass_stmt, stmt); + gsi_remove (gsi, true); + } + } + + return ass_stmt; +} /* Expand all VEC_COND_EXPR gimple assignments into calls to internal function based on type of selected expansion. */ @@ -176,7 +244,7 @@ gimple_expand_vec_cond_expr (gimple_stmt_iterator *gsi, VEC_COND_EXPR assignments. */ static unsigned int -gimple_expand_vec_cond_exprs (void) +gimple_expand_vec_exprs (void) { gimple_stmt_iterator gsi; basic_block bb; @@ -189,12 +257,15 @@ gimple_expand_vec_cond_exprs (void) { gimple *g = gimple_expand_vec_cond_expr (&gsi, &vec_cond_ssa_name_uses); + if (g != NULL) { tree lhs = gimple_assign_lhs (gsi_stmt (gsi)); gimple_set_lhs (g, lhs); gsi_replace (&gsi, g, false); } + + gimple_expand_vec_set_expr (&gsi); } } @@ -237,7 +308,7 @@ public: virtual unsigned int execute (function *) { - return gimple_expand_vec_cond_exprs (); + return gimple_expand_vec_exprs (); } }; // class pass_gimple_isel diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c index 8efc77d986b..20c1d31fb73 100644 --- a/gcc/internal-fn.c +++ b/gcc/internal-fn.c @@ -115,6 +115,7 @@ init_internal_fns () #define vec_condeq_direct { 0, 0, false } #define scatter_store_direct { 3, 1, false } #define len_store_direct { 3, 3, false } +#define vec_set_direct { 3, 3, false } #define unary_direct { 0, 0, true } #define binary_direct { 0, 0, true } #define ternary_direct { 0, 0, true } @@ -2658,6 +2659,45 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab) #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn +/* Expand VEC_SET internal functions. */ + +static void +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab) +{ + tree lhs = gimple_call_lhs (stmt); + tree op0 = gimple_call_arg (stmt, 0); + tree op1 = gimple_call_arg (stmt, 1); + tree op2 = gimple_call_arg (stmt, 2); + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); + rtx src = expand_normal (op0); + + machine_mode outermode = TYPE_MODE (TREE_TYPE (op0)); + scalar_mode innermode = GET_MODE_INNER (outermode); + + rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL); + rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL); + + class expand_operand ops[3]; + enum insn_code icode = optab_handler (optab, outermode); + + if (icode != CODE_FOR_nothing) + { + rtx temp = gen_reg_rtx (outermode); + emit_move_insn (temp, src); + + create_fixed_operand (&ops[0], temp); + create_input_operand (&ops[1], value, innermode); + create_convert_operand_from (&ops[2], pos, TYPE_MODE (TREE_TYPE (op2)), + true); + if (maybe_expand_insn (icode, 3, ops)) + { + emit_move_insn (target, temp); + return; + } + } + gcc_unreachable (); +} + static void expand_ABNORMAL_DISPATCHER (internal_fn, gcall *) { @@ -3253,6 +3293,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types, #define direct_fold_left_optab_supported_p direct_optab_supported_p #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p #define direct_check_ptrs_optab_supported_p direct_optab_supported_p +#define direct_vec_set_optab_supported_p direct_optab_supported_p /* Return the optab used by internal function FN. */ diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index 13e60828fcf..e6cfe1b6159 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -145,6 +145,8 @@ DEF_INTERNAL_OPTAB_FN (VCONDU, 0, vcondu, vec_condu) DEF_INTERNAL_OPTAB_FN (VCONDEQ, 0, vcondeq, vec_condeq) DEF_INTERNAL_OPTAB_FN (VCOND_MASK, 0, vcond_mask, vec_cond_mask) +DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set) + DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store) DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while) diff --git a/gcc/optabs.c b/gcc/optabs.c index 184827fdf4e..8e844028d92 100644 --- a/gcc/optabs.c +++ b/gcc/optabs.c @@ -3841,6 +3841,27 @@ can_vcond_compare_p (enum rtx_code code, machine_mode value_mode, && insn_operand_matches (icode, 3, test); } +/* Return whether the backend can emit vector set instructions for inserting + element into vector at variable index position. */ + +bool +can_vec_set_var_idx_p (machine_mode vec_mode) +{ + if (!VECTOR_MODE_P (vec_mode)) + return false; + + machine_mode inner_mode = GET_MODE_INNER (vec_mode); + rtx reg1 = alloca_raw_REG (vec_mode, LAST_VIRTUAL_REGISTER + 1); + rtx reg2 = alloca_raw_REG (inner_mode, LAST_VIRTUAL_REGISTER + 2); + rtx reg3 = alloca_raw_REG (VOIDmode, LAST_VIRTUAL_REGISTER + 3); + + enum insn_code icode = optab_handler (vec_set_optab, vec_mode); + + return icode != CODE_FOR_nothing && insn_operand_matches (icode, 0, reg1) + && insn_operand_matches (icode, 1, reg2) + && insn_operand_matches (icode, 2, reg3); +} + /* This function is called when we are going to emit a compare instruction that compares the values found in X and Y, using the rtl operator COMPARISON. diff --git a/gcc/optabs.h b/gcc/optabs.h index 7c2ec257cb0..0b14700ab3d 100644 --- a/gcc/optabs.h +++ b/gcc/optabs.h @@ -249,6 +249,10 @@ extern int can_compare_p (enum rtx_code, machine_mode, VALUE_MODE. */ extern bool can_vcond_compare_p (enum rtx_code, machine_mode, machine_mode); +/* Return whether the backend can emit vector set instructions for inserting + element into vector at variable index position. */ +extern bool can_vec_set_var_idx_p (machine_mode); + extern rtx prepare_operand (enum insn_code, rtx, int, machine_mode, machine_mode, int); /* Emit a pair of rtl insns to compare two rtx's and to jump -- 2.27.0.90.geebb51ba8c ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v4 1/3] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR 2020-09-25 6:51 ` [PATCH v4 1/3] " xionghu luo @ 2020-09-25 11:32 ` Richard Biener 2020-09-25 13:28 ` Richard Sandiford 1 sibling, 0 replies; 24+ messages in thread From: Richard Biener @ 2020-09-25 11:32 UTC (permalink / raw) To: xionghu luo Cc: GCC Patches, Segher Boessenkool, David Edelsohn, Bill Schmidt, Jiufu Guo, linkw, Richard Sandiford On Fri, Sep 25, 2020 at 8:51 AM xionghu luo <luoxhu@linux.ibm.com> wrote: > > Hi, > > On 2020/9/24 20:39, Richard Sandiford wrote: > > xionghu luo <luoxhu@linux.ibm.com> writes: > >> @@ -2658,6 +2659,43 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab) > >> > >> #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn > >> > >> +/* Expand VEC_SET internal functions. */ > >> + > >> +static void > >> +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab) > >> +{ > >> + tree lhs = gimple_call_lhs (stmt); > >> + tree op0 = gimple_call_arg (stmt, 0); > >> + tree op1 = gimple_call_arg (stmt, 1); > >> + tree op2 = gimple_call_arg (stmt, 2); > >> + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); > >> + rtx src = expand_expr (op0, NULL_RTX, VOIDmode, EXPAND_WRITE); > > > > I'm not sure about the expand_expr here. ISTM that op0 is a normal > > input and so should be expanded by expand_normal rather than > > EXPAND_WRITE. Also: > > > >> + > >> + machine_mode outermode = TYPE_MODE (TREE_TYPE (op0)); > >> + scalar_mode innermode = GET_MODE_INNER (outermode); > >> + > >> + rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL); > >> + rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL); > >> + > >> + class expand_operand ops[3]; > >> + enum insn_code icode = optab_handler (optab, outermode); > >> + > >> + if (icode != CODE_FOR_nothing) > >> + { > >> + pos = convert_to_mode (E_SImode, pos, 0); > >> + > >> + create_fixed_operand (&ops[0], src); > > > > ...this would mean that if SRC happens to be a MEM, the pattern > > must also accept a MEM. > > > > ISTM that we're making more work for ourselves by not “fixing” the optab > > to have a natural pure-input + pure-output interface. :-) But if we > > stick with the current optab interface, I think we need to: > > > > - create a temporary register > > - move SRC into the temporary register before the insn > > - use create_fixed_operand with the temporary register for operand 0 > > - move the temporary register into TARGET after the insn > > > >> + create_input_operand (&ops[1], value, innermode); > >> + create_input_operand (&ops[2], pos, GET_MODE (pos)); > > > > For this I think we should use convert_operand_from on the original “pos”, > > so that the target gets to choose what the mode of the operand is. > > > > Thanks a lot for the nice suggestions, fixed them all and updated the patch as below. > > > [PATCH v4 1/3] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR > > This patch enables transformation from ARRAY_REF(VIEW_CONVERT_EXPR) to > VEC_SET internal function in gimple-isel pass if target supports > vec_set with variable index by checking can_vec_set_var_idx_p. OK with me if Richard is happy with the updated patch. Thanks, Richard. > gcc/ChangeLog: > > 2020-09-25 Xionghu Luo <luoxhu@linux.ibm.com> > > * gimple-isel.cc (gimple_expand_vec_set_expr): New function. > (gimple_expand_vec_cond_exprs): Rename to ... > (gimple_expand_vec_exprs): ... this and call > gimple_expand_vec_set_expr. > * internal-fn.c (vec_set_direct): New define. > (expand_vec_set_optab_fn): New function. > (direct_vec_set_optab_supported_p): New define. > * internal-fn.def (VEC_SET): New DEF_INTERNAL_OPTAB_FN. > * optabs.c (can_vec_set_var_idx_p): New function. > * optabs.h (can_vec_set_var_idx_p): New declaration. > --- > gcc/gimple-isel.cc | 75 +++++++++++++++++++++++++++++++++++++++++++-- > gcc/internal-fn.c | 41 +++++++++++++++++++++++++ > gcc/internal-fn.def | 2 ++ > gcc/optabs.c | 21 +++++++++++++ > gcc/optabs.h | 4 +++ > 5 files changed, 141 insertions(+), 2 deletions(-) > > diff --git a/gcc/gimple-isel.cc b/gcc/gimple-isel.cc > index b330cf4c20e..02513e04900 100644 > --- a/gcc/gimple-isel.cc > +++ b/gcc/gimple-isel.cc > @@ -35,6 +35,74 @@ along with GCC; see the file COPYING3. If not see > #include "tree-cfg.h" > #include "bitmap.h" > #include "tree-ssa-dce.h" > +#include "memmodel.h" > +#include "optabs.h" > + > +/* Expand all ARRAY_REF(VIEW_CONVERT_EXPR) gimple assignments into calls to > + internal function based on vector type of selected expansion. > + i.e.: > + VIEW_CONVERT_EXPR<int[4]>(u)[_1] = = i_4(D); > + => > + _7 = u; > + _8 = .VEC_SET (_7, i_4(D), _1); > + u = _8; */ > + > +static gimple * > +gimple_expand_vec_set_expr (gimple_stmt_iterator *gsi) > +{ > + enum tree_code code; > + gcall *new_stmt = NULL; > + gassign *ass_stmt = NULL; > + > + /* Only consider code == GIMPLE_ASSIGN. */ > + gassign *stmt = dyn_cast<gassign *> (gsi_stmt (*gsi)); > + if (!stmt) > + return NULL; > + > + tree lhs = gimple_assign_lhs (stmt); > + code = TREE_CODE (lhs); > + if (code != ARRAY_REF) > + return NULL; > + > + tree val = gimple_assign_rhs1 (stmt); > + tree op0 = TREE_OPERAND (lhs, 0); > + if (TREE_CODE (op0) == VIEW_CONVERT_EXPR && DECL_P (TREE_OPERAND (op0, 0)) > + && VECTOR_TYPE_P (TREE_TYPE (TREE_OPERAND (op0, 0))) > + && TYPE_MODE (TREE_TYPE (lhs)) > + == TYPE_MODE (TREE_TYPE (TREE_TYPE (TREE_OPERAND (op0, 0))))) > + { > + tree pos = TREE_OPERAND (lhs, 1); > + tree view_op0 = TREE_OPERAND (op0, 0); > + machine_mode outermode = TYPE_MODE (TREE_TYPE (view_op0)); > + if (auto_var_in_fn_p (view_op0, cfun->decl) > + && !TREE_ADDRESSABLE (view_op0) && can_vec_set_var_idx_p (outermode)) > + { > + location_t loc = gimple_location (stmt); > + tree var_src = make_ssa_name (TREE_TYPE (view_op0)); > + tree var_dst = make_ssa_name (TREE_TYPE (view_op0)); > + > + ass_stmt = gimple_build_assign (var_src, view_op0); > + gimple_set_vuse (ass_stmt, gimple_vuse (stmt)); > + gimple_set_location (ass_stmt, loc); > + gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT); > + > + new_stmt > + = gimple_build_call_internal (IFN_VEC_SET, 3, var_src, val, pos); > + gimple_call_set_lhs (new_stmt, var_dst); > + gimple_set_location (new_stmt, loc); > + gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); > + > + ass_stmt = gimple_build_assign (view_op0, var_dst); > + gimple_set_location (ass_stmt, loc); > + gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT); > + > + gimple_move_vops (ass_stmt, stmt); > + gsi_remove (gsi, true); > + } > + } > + > + return ass_stmt; > +} > > /* Expand all VEC_COND_EXPR gimple assignments into calls to internal > function based on type of selected expansion. */ > @@ -176,7 +244,7 @@ gimple_expand_vec_cond_expr (gimple_stmt_iterator *gsi, > VEC_COND_EXPR assignments. */ > > static unsigned int > -gimple_expand_vec_cond_exprs (void) > +gimple_expand_vec_exprs (void) > { > gimple_stmt_iterator gsi; > basic_block bb; > @@ -189,12 +257,15 @@ gimple_expand_vec_cond_exprs (void) > { > gimple *g = gimple_expand_vec_cond_expr (&gsi, > &vec_cond_ssa_name_uses); > + > if (g != NULL) > { > tree lhs = gimple_assign_lhs (gsi_stmt (gsi)); > gimple_set_lhs (g, lhs); > gsi_replace (&gsi, g, false); > } > + > + gimple_expand_vec_set_expr (&gsi); > } > } > > @@ -237,7 +308,7 @@ public: > > virtual unsigned int execute (function *) > { > - return gimple_expand_vec_cond_exprs (); > + return gimple_expand_vec_exprs (); > } > > }; // class pass_gimple_isel > diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c > index 8efc77d986b..20c1d31fb73 100644 > --- a/gcc/internal-fn.c > +++ b/gcc/internal-fn.c > @@ -115,6 +115,7 @@ init_internal_fns () > #define vec_condeq_direct { 0, 0, false } > #define scatter_store_direct { 3, 1, false } > #define len_store_direct { 3, 3, false } > +#define vec_set_direct { 3, 3, false } > #define unary_direct { 0, 0, true } > #define binary_direct { 0, 0, true } > #define ternary_direct { 0, 0, true } > @@ -2658,6 +2659,45 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab) > > #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn > > +/* Expand VEC_SET internal functions. */ > + > +static void > +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab) > +{ > + tree lhs = gimple_call_lhs (stmt); > + tree op0 = gimple_call_arg (stmt, 0); > + tree op1 = gimple_call_arg (stmt, 1); > + tree op2 = gimple_call_arg (stmt, 2); > + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); > + rtx src = expand_normal (op0); > + > + machine_mode outermode = TYPE_MODE (TREE_TYPE (op0)); > + scalar_mode innermode = GET_MODE_INNER (outermode); > + > + rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL); > + rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL); > + > + class expand_operand ops[3]; > + enum insn_code icode = optab_handler (optab, outermode); > + > + if (icode != CODE_FOR_nothing) > + { > + rtx temp = gen_reg_rtx (outermode); > + emit_move_insn (temp, src); > + > + create_fixed_operand (&ops[0], temp); > + create_input_operand (&ops[1], value, innermode); > + create_convert_operand_from (&ops[2], pos, TYPE_MODE (TREE_TYPE (op2)), > + true); > + if (maybe_expand_insn (icode, 3, ops)) > + { > + emit_move_insn (target, temp); > + return; > + } > + } > + gcc_unreachable (); > +} > + > static void > expand_ABNORMAL_DISPATCHER (internal_fn, gcall *) > { > @@ -3253,6 +3293,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types, > #define direct_fold_left_optab_supported_p direct_optab_supported_p > #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p > #define direct_check_ptrs_optab_supported_p direct_optab_supported_p > +#define direct_vec_set_optab_supported_p direct_optab_supported_p > > /* Return the optab used by internal function FN. */ > > diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def > index 13e60828fcf..e6cfe1b6159 100644 > --- a/gcc/internal-fn.def > +++ b/gcc/internal-fn.def > @@ -145,6 +145,8 @@ DEF_INTERNAL_OPTAB_FN (VCONDU, 0, vcondu, vec_condu) > DEF_INTERNAL_OPTAB_FN (VCONDEQ, 0, vcondeq, vec_condeq) > DEF_INTERNAL_OPTAB_FN (VCOND_MASK, 0, vcond_mask, vec_cond_mask) > > +DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set) > + > DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store) > > DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while) > diff --git a/gcc/optabs.c b/gcc/optabs.c > index 184827fdf4e..8e844028d92 100644 > --- a/gcc/optabs.c > +++ b/gcc/optabs.c > @@ -3841,6 +3841,27 @@ can_vcond_compare_p (enum rtx_code code, machine_mode value_mode, > && insn_operand_matches (icode, 3, test); > } > > +/* Return whether the backend can emit vector set instructions for inserting > + element into vector at variable index position. */ > + > +bool > +can_vec_set_var_idx_p (machine_mode vec_mode) > +{ > + if (!VECTOR_MODE_P (vec_mode)) > + return false; > + > + machine_mode inner_mode = GET_MODE_INNER (vec_mode); > + rtx reg1 = alloca_raw_REG (vec_mode, LAST_VIRTUAL_REGISTER + 1); > + rtx reg2 = alloca_raw_REG (inner_mode, LAST_VIRTUAL_REGISTER + 2); > + rtx reg3 = alloca_raw_REG (VOIDmode, LAST_VIRTUAL_REGISTER + 3); > + > + enum insn_code icode = optab_handler (vec_set_optab, vec_mode); > + > + return icode != CODE_FOR_nothing && insn_operand_matches (icode, 0, reg1) > + && insn_operand_matches (icode, 1, reg2) > + && insn_operand_matches (icode, 2, reg3); > +} > + > /* This function is called when we are going to emit a compare instruction that > compares the values found in X and Y, using the rtl operator COMPARISON. > > diff --git a/gcc/optabs.h b/gcc/optabs.h > index 7c2ec257cb0..0b14700ab3d 100644 > --- a/gcc/optabs.h > +++ b/gcc/optabs.h > @@ -249,6 +249,10 @@ extern int can_compare_p (enum rtx_code, machine_mode, > VALUE_MODE. */ > extern bool can_vcond_compare_p (enum rtx_code, machine_mode, machine_mode); > > +/* Return whether the backend can emit vector set instructions for inserting > + element into vector at variable index position. */ > +extern bool can_vec_set_var_idx_p (machine_mode); > + > extern rtx prepare_operand (enum insn_code, rtx, int, machine_mode, > machine_mode, int); > /* Emit a pair of rtl insns to compare two rtx's and to jump > -- > 2.27.0.90.geebb51ba8c > ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v4 1/3] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR 2020-09-25 6:51 ` [PATCH v4 1/3] " xionghu luo 2020-09-25 11:32 ` Richard Biener @ 2020-09-25 13:28 ` Richard Sandiford 2020-09-27 5:45 ` xionghu luo 1 sibling, 1 reply; 24+ messages in thread From: Richard Sandiford @ 2020-09-25 13:28 UTC (permalink / raw) To: xionghu luo Cc: Richard Biener, GCC Patches, Segher Boessenkool, David Edelsohn, Bill Schmidt, Jiufu Guo, linkw xionghu luo <luoxhu@linux.ibm.com> writes: > @@ -2658,6 +2659,45 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab) > > #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn > > +/* Expand VEC_SET internal functions. */ > + > +static void > +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab) > +{ > + tree lhs = gimple_call_lhs (stmt); > + tree op0 = gimple_call_arg (stmt, 0); > + tree op1 = gimple_call_arg (stmt, 1); > + tree op2 = gimple_call_arg (stmt, 2); > + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); > + rtx src = expand_normal (op0); > + > + machine_mode outermode = TYPE_MODE (TREE_TYPE (op0)); > + scalar_mode innermode = GET_MODE_INNER (outermode); > + > + rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL); > + rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL); These two can just use expand_normal. Might be easier to read if they come immediately after the expand_normal (op0). LGTM with that change for the internal-fn.c stuff, thanks. Richard ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH v4 1/3] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR 2020-09-25 13:28 ` Richard Sandiford @ 2020-09-27 5:45 ` xionghu luo 0 siblings, 0 replies; 24+ messages in thread From: xionghu luo @ 2020-09-27 5:45 UTC (permalink / raw) To: Richard Biener, GCC Patches, Segher Boessenkool, David Edelsohn, Bill Schmidt, Jiufu Guo, linkw, richard.sandiford On 2020/9/25 21:28, Richard Sandiford wrote: > xionghu luo <luoxhu@linux.ibm.com> writes: >> @@ -2658,6 +2659,45 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab) >> >> #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn >> >> +/* Expand VEC_SET internal functions. */ >> + >> +static void >> +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab) >> +{ >> + tree lhs = gimple_call_lhs (stmt); >> + tree op0 = gimple_call_arg (stmt, 0); >> + tree op1 = gimple_call_arg (stmt, 1); >> + tree op2 = gimple_call_arg (stmt, 2); >> + rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); >> + rtx src = expand_normal (op0); >> + >> + machine_mode outermode = TYPE_MODE (TREE_TYPE (op0)); >> + scalar_mode innermode = GET_MODE_INNER (outermode); >> + >> + rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL); >> + rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL); > > These two can just use expand_normal. Might be easier to read if > they come immediately after the expand_normal (op0). > > LGTM with that change for the internal-fn.c stuff, thanks. > Thank you, updated and committed as r11-3486. Tested and confirmed Power/X86/ARM still not supporting vec_set with register index, so there are no ICE regressions caused by generating IFN VEC_SET but not properly expanded. Thanks, Xionghu ^ permalink raw reply [flat|nested] 24+ messages in thread
end of thread, other threads:[~2020-09-28 8:01 UTC | newest] Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2020-09-18 6:17 [PATCH v2 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR Xiong Hu Luo 2020-09-18 6:17 ` [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251] Xiong Hu Luo 2020-09-18 20:19 ` Segher Boessenkool 2020-09-24 8:21 ` xionghu luo 2020-09-24 13:27 ` Richard Biener 2020-09-24 14:55 ` Richard Biener 2020-09-24 19:36 ` Segher Boessenkool 2020-09-25 6:58 ` Richard Biener 2020-09-25 12:21 ` Richard Sandiford 2020-09-25 22:39 ` Segher Boessenkool 2020-09-28 8:00 ` Richard Biener 2020-09-24 19:02 ` Segher Boessenkool 2020-09-25 3:50 ` xionghu luo 2020-09-25 5:00 ` Richard Biener 2020-09-18 18:20 ` [PATCH v2 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR Segher Boessenkool 2020-09-21 8:31 ` Richard Biener 2020-09-22 3:55 ` [PATCH v3 " xionghu luo 2020-09-23 11:33 ` Richard Biener 2020-09-24 3:24 ` xionghu luo 2020-09-24 12:39 ` Richard Sandiford 2020-09-25 6:51 ` [PATCH v4 1/3] " xionghu luo 2020-09-25 11:32 ` Richard Biener 2020-09-25 13:28 ` Richard Sandiford 2020-09-27 5:45 ` xionghu luo
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).