public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH v2 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR
@ 2020-09-18  6:17 Xiong Hu Luo
  2020-09-18  6:17 ` [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251] Xiong Hu Luo
                   ` (2 more replies)
  0 siblings, 3 replies; 24+ messages in thread
From: Xiong Hu Luo @ 2020-09-18  6:17 UTC (permalink / raw)
  To: gcc-patches
  Cc: segher, dje.gcc, wschmidt, guojiufu, linkw, richard.guenther,
	richard.sandiford, Xiong Hu Luo

This patch enables transformation from ARRAY_REF(VIEW_CONVERT_EXPR) to
VEC_SET internal function in gimple-isel pass if target supports
vec_set with variable index by checking can_vec_set_var_idx_p.

gcc/ChangeLog:

2020-09-18  Xionghu Luo  <luoxhu@linux.ibm.com>

	* gimple-isel.cc (gimple_expand_vec_set_expr): New function.
	(gimple_expand_vec_cond_exprs): Call gimple_expand_vec_set_expr.
	* internal-fn.c (vec_set_direct): New define.
	(expand_vec_set_optab_fn): New function.
	(direct_vec_set_optab_supported_p): New define.
	* internal-fn.def (VEC_SET): New DEF_INTERNAL_OPTAB_FN.
	* optabs.c (can_vec_set_var_idx_p): New function.
	* optabs.h (can_vec_set_var_idx_p): New declare.
---
 gcc/gimple-isel.cc  | 116 +++++++++++++++++++++++++++++++++++++++++++-
 gcc/internal-fn.c   |  36 ++++++++++++++
 gcc/internal-fn.def |   2 +
 gcc/optabs.c        |  17 +++++++
 gcc/optabs.h        |   3 ++
 5 files changed, 172 insertions(+), 2 deletions(-)

diff --git a/gcc/gimple-isel.cc b/gcc/gimple-isel.cc
index b330cf4c20e..bc61e2895be 100644
--- a/gcc/gimple-isel.cc
+++ b/gcc/gimple-isel.cc
@@ -35,6 +35,80 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-cfg.h"
 #include "bitmap.h"
 #include "tree-ssa-dce.h"
+#include "fold-const.h"
+#include "gimple-fold.h"
+#include "memmodel.h"
+#include "optabs.h"
+
+/* Expand all ARRAY_REF(VIEW_CONVERT_EXPR) gimple assignments into calls to
+   internal function based on vector type of selected expansion.
+   i.e.:
+     VIEW_CONVERT_EXPR<int[4]>(u)[_1] =  = i_4(D);
+   =>
+     _7 = u;
+     _8 = .VEC_SET (_7, i_4(D), _1);
+     u = _8;  */
+
+static gimple *
+gimple_expand_vec_set_expr (gimple_stmt_iterator *gsi)
+{
+  enum tree_code code;
+  gcall *new_stmt = NULL;
+  gassign *ass_stmt = NULL;
+
+  /* Only consider code == GIMPLE_ASSIGN.  */
+  gassign *stmt = dyn_cast<gassign *> (gsi_stmt (*gsi));
+  if (!stmt)
+    return NULL;
+
+  code = TREE_CODE (gimple_assign_lhs (stmt));
+  if (code != ARRAY_REF)
+    return NULL;
+
+  tree lhs = gimple_assign_lhs (stmt);
+  tree val = gimple_assign_rhs1 (stmt);
+
+  tree type = TREE_TYPE (lhs);
+  tree op0 = TREE_OPERAND (lhs, 0);
+  if (TREE_CODE (op0) == VIEW_CONVERT_EXPR
+      && tree_fits_uhwi_p (TYPE_SIZE (type)))
+    {
+      tree pos = TREE_OPERAND (lhs, 1);
+      tree view_op0 = TREE_OPERAND (op0, 0);
+      machine_mode outermode = TYPE_MODE (TREE_TYPE (view_op0));
+      scalar_mode innermode = GET_MODE_INNER (outermode);
+      tree_code code = TREE_CODE (TREE_TYPE(view_op0));
+      if (!is_global_var (view_op0) && code == VECTOR_TYPE
+	  && tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (view_op0)))
+	  && can_vec_set_var_idx_p (code, outermode, innermode,
+				    TYPE_MODE (TREE_TYPE (pos))))
+	{
+	  location_t loc = gimple_location (stmt);
+	  tree var_src = make_ssa_name (TREE_TYPE (view_op0));
+	  tree var_dst = make_ssa_name (TREE_TYPE (view_op0));
+
+	  ass_stmt = gimple_build_assign (var_src, view_op0);
+	  gimple_set_vuse (ass_stmt, gimple_vuse (stmt));
+	  gimple_set_location (ass_stmt, loc);
+	  gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT);
+
+	  new_stmt
+	    = gimple_build_call_internal (IFN_VEC_SET, 3, var_src, val, pos);
+	  gimple_call_set_lhs (new_stmt, var_dst);
+	  gimple_set_location (new_stmt, loc);
+	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
+
+	  ass_stmt = gimple_build_assign (view_op0, var_dst);
+	  gimple_set_location (ass_stmt, loc);
+	  gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT);
+
+	  gimple_move_vops (ass_stmt, stmt);
+	  gsi_remove (gsi, true);
+	}
+    }
+
+  return ass_stmt;
+}
 
 /* Expand all VEC_COND_EXPR gimple assignments into calls to internal
    function based on type of selected expansion.  */
@@ -187,8 +261,25 @@ gimple_expand_vec_cond_exprs (void)
     {
       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 	{
-	  gimple *g = gimple_expand_vec_cond_expr (&gsi,
-						   &vec_cond_ssa_name_uses);
+	  gassign *stmt = dyn_cast<gassign *> (gsi_stmt (gsi));
+	  if (!stmt)
+	    continue;
+
+	  enum tree_code code;
+	  gimple *g = NULL;
+	  code = gimple_assign_rhs_code (stmt);
+	  switch (code)
+	    {
+	    case VEC_COND_EXPR:
+	      g = gimple_expand_vec_cond_expr (&gsi, &vec_cond_ssa_name_uses);
+	      break;
+	    case ARRAY_REF:
+	      /*  TODO: generate IFN for vec_extract with variable index.  */
+	      break;
+	    default:
+	      break;
+	    }
+
 	  if (g != NULL)
 	    {
 	      tree lhs = gimple_assign_lhs (gsi_stmt (gsi));
@@ -204,6 +295,27 @@ gimple_expand_vec_cond_exprs (void)
 
   simple_dce_from_worklist (dce_ssa_names);
 
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+	{
+	  gassign *stmt = dyn_cast<gassign *> (gsi_stmt (gsi));
+	  if (!stmt)
+	    continue;
+
+	  enum tree_code code;
+	  code = TREE_CODE (gimple_assign_lhs (stmt));
+	  switch (code)
+	    {
+	    case ARRAY_REF:
+	      gimple_expand_vec_set_expr (&gsi);
+	      break;
+	    default:
+	      break;
+	    }
+	}
+    }
+
   return 0;
 }
 
diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index 8efc77d986b..36837381c04 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -115,6 +115,7 @@ init_internal_fns ()
 #define vec_condeq_direct { 0, 0, false }
 #define scatter_store_direct { 3, 1, false }
 #define len_store_direct { 3, 3, false }
+#define vec_set_direct { 3, 3, false }
 #define unary_direct { 0, 0, true }
 #define binary_direct { 0, 0, true }
 #define ternary_direct { 0, 0, true }
@@ -2658,6 +2659,40 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 
 #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn
 
+static void
+expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
+{
+  tree lhs = gimple_call_lhs (stmt);
+  tree op0 = gimple_call_arg (stmt, 0);
+  tree op1 = gimple_call_arg (stmt, 1);
+  tree op2 = gimple_call_arg (stmt, 2);
+  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+  rtx src = expand_expr (op0, NULL_RTX, VOIDmode, EXPAND_WRITE);
+
+  machine_mode outermode = TYPE_MODE (TREE_TYPE (op0));
+  scalar_mode innermode = GET_MODE_INNER (outermode);
+
+  rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+  rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+
+  class expand_operand ops[3];
+  enum insn_code icode = optab_handler (optab, outermode);
+
+  if (icode != CODE_FOR_nothing)
+    {
+      pos = convert_to_mode (E_SImode, pos, 0);
+
+      create_fixed_operand (&ops[0], src);
+      create_input_operand (&ops[1], value, innermode);
+      create_input_operand (&ops[2], pos, GET_MODE (pos));
+      if (maybe_expand_insn (icode, 3, ops))
+	{
+	  emit_move_insn (target, src);
+	  return;
+	}
+    }
+}
+
 static void
 expand_ABNORMAL_DISPATCHER (internal_fn, gcall *)
 {
@@ -3253,6 +3288,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
 #define direct_fold_left_optab_supported_p direct_optab_supported_p
 #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
 #define direct_check_ptrs_optab_supported_p direct_optab_supported_p
+#define direct_vec_set_optab_supported_p direct_optab_supported_p
 
 /* Return the optab used by internal function FN.  */
 
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 13e60828fcf..e6cfe1b6159 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -145,6 +145,8 @@ DEF_INTERNAL_OPTAB_FN (VCONDU, 0, vcondu, vec_condu)
 DEF_INTERNAL_OPTAB_FN (VCONDEQ, 0, vcondeq, vec_condeq)
 DEF_INTERNAL_OPTAB_FN (VCOND_MASK, 0, vcond_mask, vec_cond_mask)
 
+DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
+
 DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
 
 DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
diff --git a/gcc/optabs.c b/gcc/optabs.c
index 184827fdf4e..c8125670d2d 100644
--- a/gcc/optabs.c
+++ b/gcc/optabs.c
@@ -3841,6 +3841,23 @@ can_vcond_compare_p (enum rtx_code code, machine_mode value_mode,
 	 && insn_operand_matches (icode, 3, test);
 }
 
+bool
+can_vec_set_var_idx_p (enum tree_code code, machine_mode vec_mode,
+		       machine_mode value_mode, machine_mode idx_mode)
+{
+  gcc_assert (code == VECTOR_TYPE);
+
+  rtx reg1 = alloca_raw_REG (vec_mode, LAST_VIRTUAL_REGISTER + 1);
+  rtx reg2 = alloca_raw_REG (value_mode, LAST_VIRTUAL_REGISTER + 2);
+  rtx reg3 = alloca_raw_REG (idx_mode, LAST_VIRTUAL_REGISTER + 3);
+
+  enum insn_code icode = optab_handler (vec_set_optab, vec_mode);
+
+  return icode != CODE_FOR_nothing && insn_operand_matches (icode, 0, reg1)
+	 && insn_operand_matches (icode, 1, reg2)
+	 && insn_operand_matches (icode, 2, reg3);
+}
+
 /* This function is called when we are going to emit a compare instruction that
    compares the values found in X and Y, using the rtl operator COMPARISON.
 
diff --git a/gcc/optabs.h b/gcc/optabs.h
index 7c2ec257cb0..c018d127756 100644
--- a/gcc/optabs.h
+++ b/gcc/optabs.h
@@ -249,6 +249,9 @@ extern int can_compare_p (enum rtx_code, machine_mode,
    VALUE_MODE.  */
 extern bool can_vcond_compare_p (enum rtx_code, machine_mode, machine_mode);
 
+extern bool can_vec_set_var_idx_p (enum tree_code, machine_mode, machine_mode,
+				   machine_mode);
+
 extern rtx prepare_operand (enum insn_code, rtx, int, machine_mode,
 			    machine_mode, int);
 /* Emit a pair of rtl insns to compare two rtx's and to jump
-- 
2.27.0.90.geebb51ba8c


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]
  2020-09-18  6:17 [PATCH v2 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR Xiong Hu Luo
@ 2020-09-18  6:17 ` Xiong Hu Luo
  2020-09-18 20:19   ` Segher Boessenkool
  2020-09-18 18:20 ` [PATCH v2 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR Segher Boessenkool
  2020-09-21  8:31 ` Richard Biener
  2 siblings, 1 reply; 24+ messages in thread
From: Xiong Hu Luo @ 2020-09-18  6:17 UTC (permalink / raw)
  To: gcc-patches
  Cc: segher, dje.gcc, wschmidt, guojiufu, linkw, richard.guenther,
	richard.sandiford, Xiong Hu Luo

vec_insert accepts 3 arguments, arg0 is input vector, arg1 is the value
to be insert, arg2 is the place to insert arg1 to arg0.  Current expander
generates stxv+stwx+lxv if arg2 is variable instead of constant, which
causes serious store hit load performance issue on Power.  This patch tries
 1) Build VIEW_CONVERT_EXPR for vec_insert (i, v, n) like v[n&3] = i to
unify the gimple code, then expander could use vec_set_optab to expand.
 2) Expand the IFN VEC_SET to fast instructions: lvsl+xxperm+xxsel.
In this way, "vec_insert (i, v, n)" and "v[n&3] = i" won't be expanded too
early in gimple stage if arg2 is variable, avoid generating store hit load
instructions.

For Power9 V4SI:
	addi 9,1,-16
	rldic 6,6,2,60
	stxv 34,-16(1)
	stwx 5,9,6
	lxv 34,-16(1)
=>
	addis 9,2,.LC0@toc@ha
	addi 9,9,.LC0@toc@l
	mtvsrwz 33,5
	lxv 32,0(9)
	sradi 9,6,2
	addze 9,9
	sldi 9,9,2
	subf 9,9,6
	subfic 9,9,3
	sldi 9,9,2
	subfic 9,9,20
	lvsl 13,0,9
	xxperm 33,33,45
	xxperm 32,32,45
	xxsel 34,34,33,32

Though instructions increase from 5 to 15, the performance is improved
60% in typical cases.
Tested with V2DI, V2DF V4SI, V4SF, V8HI, V16QI on Power9-LE and
Power8-BE, bootstrap tested pass.

gcc/ChangeLog:

2020-09-18  Xionghu Luo  <luoxhu@linux.ibm.com>

	* config/rs6000/altivec.md (altivec_lvsl_reg_<mode>2): Rename to
	 (altivec_lvsl_reg_<mode>2) and extend to SDI mode.
	* config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin):
	Ajdust variable index vec_insert to VIEW_CONVERT_EXPR.
	* config/rs6000/rs6000-protos.h (rs6000_expand_vector_set_var):
	New declare.
	* config/rs6000/rs6000.c (rs6000_expand_vector_set_var):
	New function.
	* config/rs6000/rs6000.md (FQHS): New mode iterator.
	(FD): New mode iterator.
	p8_mtvsrwz_v16qi<mode>2: New define_insn.
	p8_mtvsrd_v16qi<mode>2: New define_insn.
	* config/rs6000/vector.md: Add register operand2 match for
	vec_set index.
	* config/rs6000/vsx.md: Call gen_altivec_lvsl_reg_di2.

gcc/testsuite/ChangeLog:

2020-09-18  Xionghu Luo  <luoxhu@linux.ibm.com>

	* gcc.target/powerpc/pr79251.c: New test.
	* gcc.target/powerpc/pr79251-run.c: New test.
	* gcc.target/powerpc/pr79251.h: New header.
---
 gcc/config/rs6000/altivec.md                  |   4 +-
 gcc/config/rs6000/rs6000-c.c                  |  22 ++-
 gcc/config/rs6000/rs6000-protos.h             |   1 +
 gcc/config/rs6000/rs6000.c                    | 146 ++++++++++++++++++
 gcc/config/rs6000/rs6000.md                   |  19 +++
 gcc/config/rs6000/vector.md                   |  19 ++-
 gcc/config/rs6000/vsx.md                      |   2 +-
 .../gcc.target/powerpc/pr79251-run.c          |  29 ++++
 gcc/testsuite/gcc.target/powerpc/pr79251.c    |  15 ++
 gcc/testsuite/gcc.target/powerpc/pr79251.h    |  19 +++
 10 files changed, 257 insertions(+), 19 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251-run.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.h

diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
index 0a2e634d6b0..66b636059a6 100644
--- a/gcc/config/rs6000/altivec.md
+++ b/gcc/config/rs6000/altivec.md
@@ -2772,10 +2772,10 @@ (define_expand "altivec_lvsl"
   DONE;
 })
 
-(define_insn "altivec_lvsl_reg"
+(define_insn "altivec_lvsl_reg_<mode>2"
   [(set (match_operand:V16QI 0 "altivec_register_operand" "=v")
 	(unspec:V16QI
-	[(match_operand:DI 1 "gpc_reg_operand" "b")]
+	[(match_operand:SDI 1 "gpc_reg_operand" "b")]
 	UNSPEC_LVSL_REG))]
   "TARGET_ALTIVEC"
   "lvsl %0,0,%1"
diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
index 2fad3d94706..78abe49c833 100644
--- a/gcc/config/rs6000/rs6000-c.c
+++ b/gcc/config/rs6000/rs6000-c.c
@@ -1509,9 +1509,7 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl,
       tree arg1;
       tree arg2;
       tree arg1_type;
-      tree arg1_inner_type;
       tree decl, stmt;
-      tree innerptrtype;
       machine_mode mode;
 
       /* No second or third arguments. */
@@ -1563,8 +1561,13 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl,
 	  return build_call_expr (call, 3, arg1, arg0, arg2);
 	}
 
-      /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0. */
-      arg1_inner_type = TREE_TYPE (arg1_type);
+      /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0 with
+	 VIEW_CONVERT_EXPR.  i.e.:
+	 D.3192 = v1;
+	 _1 = n & 3;
+	 VIEW_CONVERT_EXPR<int[4]>(D.3192)[_1] = i;
+	 v1 = D.3192;
+	 D.3194 = v1;  */
       if (TYPE_VECTOR_SUBPARTS (arg1_type) == 1)
 	arg2 = build_int_cst (TREE_TYPE (arg2), 0);
       else
@@ -1593,15 +1596,8 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl,
 	  SET_EXPR_LOCATION (stmt, loc);
 	  stmt = build1 (COMPOUND_LITERAL_EXPR, arg1_type, stmt);
 	}
-
-      innerptrtype = build_pointer_type (arg1_inner_type);
-
-      stmt = build_unary_op (loc, ADDR_EXPR, stmt, 0);
-      stmt = convert (innerptrtype, stmt);
-      stmt = build_binary_op (loc, PLUS_EXPR, stmt, arg2, 1);
-      stmt = build_indirect_ref (loc, stmt, RO_NULL);
-      stmt = build2 (MODIFY_EXPR, TREE_TYPE (stmt), stmt,
-		     convert (TREE_TYPE (stmt), arg0));
+      stmt = build_array_ref (loc, stmt, arg2);
+      stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt, arg0);
       stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl);
       return stmt;
     }
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index 28e859f4381..f6f8bd65c2f 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -58,6 +58,7 @@ extern bool rs6000_split_128bit_ok_p (rtx []);
 extern void rs6000_expand_float128_convert (rtx, rtx, bool);
 extern void rs6000_expand_vector_init (rtx, rtx);
 extern void rs6000_expand_vector_set (rtx, rtx, int);
+extern void rs6000_expand_vector_set_var (rtx, rtx, rtx, rtx);
 extern void rs6000_expand_vector_extract (rtx, rtx, rtx);
 extern void rs6000_split_vec_extract_var (rtx, rtx, rtx, rtx, rtx);
 extern rtx rs6000_adjust_vec_address (rtx, rtx, rtx, rtx, machine_mode);
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index fe93cf6ff2b..d22d7999a61 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -6788,6 +6788,152 @@ rs6000_expand_vector_set (rtx target, rtx val, int elt)
   emit_insn (gen_rtx_SET (target, x));
 }
 
+/* Insert value from VEC into idx of TARGET.  */
+
+void
+rs6000_expand_vector_set_var (rtx target, rtx vec, rtx val, rtx idx)
+{
+  machine_mode mode = GET_MODE (vec);
+
+  if (VECTOR_MEM_VSX_P (mode) && CONST_INT_P (idx))
+    gcc_unreachable ();
+  else if (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)
+	   && TARGET_DIRECT_MOVE_64BIT)
+    {
+      gcc_assert (GET_MODE (idx) == E_SImode);
+      machine_mode inner_mode = GET_MODE (val);
+      HOST_WIDE_INT mode_mask = GET_MODE_MASK (inner_mode);
+
+      rtx tmp = gen_reg_rtx (GET_MODE (idx));
+      if (GET_MODE_SIZE (inner_mode) == 8)
+	{
+	  if (!BYTES_BIG_ENDIAN)
+	    {
+	      /*  idx = 1 - idx.  */
+	      emit_insn (gen_subsi3 (tmp, GEN_INT (1), idx));
+	      /*  idx = idx * 8.  */
+	      emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (3)));
+	      /*  idx = 16 - idx.  */
+	      emit_insn (gen_subsi3 (tmp, GEN_INT (16), tmp));
+	    }
+	  else
+	    {
+	      emit_insn (gen_ashlsi3 (tmp, idx, GEN_INT (3)));
+	      emit_insn (gen_subsi3 (tmp, GEN_INT (16), tmp));
+	    }
+	}
+      else if (GET_MODE_SIZE (inner_mode) == 4)
+	{
+	  if (!BYTES_BIG_ENDIAN)
+	    {
+	      /*  idx = 3 - idx.  */
+	      emit_insn (gen_subsi3 (tmp, GEN_INT (3), idx));
+	      /*  idx = idx * 4.  */
+	      emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (2)));
+	      /*  idx = 20 - idx.  */
+	      emit_insn (gen_subsi3 (tmp, GEN_INT (20), tmp));
+	    }
+	  else
+	  {
+	      emit_insn (gen_ashlsi3 (tmp, idx, GEN_INT (2)));
+	      emit_insn (gen_subsi3 (tmp, GEN_INT (20), tmp));
+	  }
+	}
+      else if (GET_MODE_SIZE (inner_mode) == 2)
+	{
+	  if (!BYTES_BIG_ENDIAN)
+	    {
+	      /*  idx = 7 - idx.  */
+	      emit_insn (gen_subsi3 (tmp, GEN_INT (7), idx));
+	      /*  idx = idx * 2.  */
+	      emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (1)));
+	      /*  idx = 22 - idx.  */
+	      emit_insn (gen_subsi3 (tmp, GEN_INT (22), tmp));
+	    }
+	  else
+	    {
+	      emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (1)));
+	      emit_insn (gen_subsi3 (tmp, GEN_INT (22), idx));
+	    }
+	}
+      else if (GET_MODE_SIZE (inner_mode) == 1)
+	if (!BYTES_BIG_ENDIAN)
+	  emit_insn (gen_addsi3 (tmp, idx, GEN_INT (8)));
+	else
+	  emit_insn (gen_subsi3 (tmp, GEN_INT (23), idx));
+      else
+	gcc_unreachable ();
+
+      /*  lxv vs32, mask.
+	  DImode: 0xffffffffffffffff0000000000000000
+	  SImode: 0x00000000ffffffff0000000000000000
+	  HImode: 0x000000000000ffff0000000000000000.
+	  QImode: 0x00000000000000ff0000000000000000.  */
+      rtx mask = gen_reg_rtx (V16QImode);
+      rtx mask_v2di = gen_reg_rtx (V2DImode);
+      rtvec v = rtvec_alloc (2);
+      if (!BYTES_BIG_ENDIAN)
+	{
+	  RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (DImode, 0);
+	  RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (DImode, mode_mask);
+	}
+      else
+      {
+	  RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (DImode, mode_mask);
+	  RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (DImode, 0);
+	}
+      emit_insn (
+	gen_vec_initv2didi (mask_v2di, gen_rtx_PARALLEL (V2DImode, v)));
+      rtx sub_mask = simplify_gen_subreg (V16QImode, mask_v2di, V2DImode, 0);
+      emit_insn (gen_rtx_SET (mask, sub_mask));
+
+      /*  mtvsrd[wz] f0,val.  */
+      rtx val_v16qi = gen_reg_rtx (V16QImode);
+      switch (inner_mode)
+	{
+	default:
+	  gcc_unreachable ();
+	  break;
+	case E_QImode:
+	  emit_insn (gen_p8_mtvsrwz_v16qiqi2 (val_v16qi, val));
+	  break;
+	case E_HImode:
+	  emit_insn (gen_p8_mtvsrwz_v16qihi2 (val_v16qi, val));
+	  break;
+	case E_SImode:
+	  emit_insn (gen_p8_mtvsrwz_v16qisi2 (val_v16qi, val));
+	  break;
+	case E_SFmode:
+	  emit_insn (gen_p8_mtvsrwz_v16qisf2 (val_v16qi, val));
+	  break;
+	case E_DImode:
+	  emit_insn (gen_p8_mtvsrd_v16qidi2 (val_v16qi, val));
+	  break;
+	case E_DFmode:
+	  emit_insn (gen_p8_mtvsrd_v16qidf2 (val_v16qi, val));
+	  break;
+	}
+
+      /*  lvsl    v1,0,idx.  */
+      rtx pcv = gen_reg_rtx (V16QImode);
+      emit_insn (gen_altivec_lvsl_reg_si2 (pcv, tmp));
+
+      /*  xxperm  vs0,vs0,vs33.  */
+      /*  xxperm  vs32,vs32,vs33.  */
+      rtx val_perm = gen_reg_rtx (V16QImode);
+      rtx mask_perm = gen_reg_rtx (V16QImode);
+      emit_insn (
+	gen_altivec_vperm_v8hiv16qi (val_perm, val_v16qi, val_v16qi, pcv));
+      emit_insn (gen_altivec_vperm_v8hiv16qi (mask_perm, mask, mask, pcv));
+
+      rtx sub_target = simplify_gen_subreg (V16QImode, vec, mode, 0);
+      emit_insn (gen_rtx_SET (target, sub_target));
+
+      /*  xxsel   vs34,vs34,vs0,vs32.  */
+      emit_insn (gen_vector_select_v16qi (target, target, val_perm, mask_perm));
+    }
+}
+
 /* Extract field ELT from VEC into TARGET.  */
 
 void
diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index 43b620ae1c0..b02fda836d4 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -8713,6 +8713,25 @@ (define_insn "p8_mtvsrwz"
   "mtvsrwz %x0,%1"
   [(set_attr "type" "mftgpr")])
 
+(define_mode_iterator FQHS [SF QI HI SI])
+(define_mode_iterator FD [DF DI])
+
+(define_insn "p8_mtvsrwz_v16qi<mode>2"
+  [(set (match_operand:V16QI 0 "register_operand" "=wa")
+	(unspec:V16QI [(match_operand:FQHS 1 "register_operand" "r")]
+		   UNSPEC_P8V_MTVSRWZ))]
+  "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
+  "mtvsrwz %x0,%1"
+  [(set_attr "type" "mftgpr")])
+
+(define_insn "p8_mtvsrd_v16qi<mode>2"
+  [(set (match_operand:V16QI 0 "register_operand" "=wa")
+	(unspec:V16QI [(match_operand:FD 1 "register_operand" "r")]
+		   UNSPEC_P8V_MTVSRD))]
+  "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
+  "mtvsrd %x0,%1"
+  [(set_attr "type" "mftgpr")])
+
 (define_insn_and_split "reload_fpr_from_gpr<mode>"
   [(set (match_operand:FMOVE64X 0 "register_operand" "=d")
 	(unspec:FMOVE64X [(match_operand:FMOVE64X 1 "register_operand" "r")]
diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md
index 796345c80d3..28e59c1c995 100644
--- a/gcc/config/rs6000/vector.md
+++ b/gcc/config/rs6000/vector.md
@@ -1227,11 +1227,24 @@ (define_expand "vec_init<mode><VEC_base_l>"
 (define_expand "vec_set<mode>"
   [(match_operand:VEC_E 0 "vlogical_operand")
    (match_operand:<VEC_base> 1 "register_operand")
-   (match_operand 2 "const_int_operand")]
+   (match_operand 2 "reg_or_cint_operand")]
   "VECTOR_MEM_ALTIVEC_OR_VSX_P (<MODE>mode)"
 {
-  rs6000_expand_vector_set (operands[0], operands[1], INTVAL (operands[2]));
-  DONE;
+  if (CONST_INT_P (operands[2]))
+    {
+      rs6000_expand_vector_set (operands[0], operands[1], INTVAL (operands[2]));
+      DONE;
+    }
+  else
+    {
+      rtx target = gen_reg_rtx (V16QImode);
+      rs6000_expand_vector_set_var (target, operands[0], operands[1],
+				    operands[2]);
+      rtx sub_target
+	= simplify_gen_subreg (GET_MODE (operands[0]), target, V16QImode, 0);
+      emit_insn (gen_rtx_SET (operands[0], sub_target));
+      DONE;
+    }
 })
 
 (define_expand "vec_extract<mode><VEC_base_l>"
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index dd750210758..7e82690d12d 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -5349,7 +5349,7 @@ (define_expand "xl_len_r"
   rtx rtx_vtmp = gen_reg_rtx (V16QImode);
   rtx tmp = gen_reg_rtx (DImode);
 
-  emit_insn (gen_altivec_lvsl_reg (shift_mask, operands[2]));
+  emit_insn (gen_altivec_lvsl_reg_di2 (shift_mask, operands[2]));
   emit_insn (gen_ashldi3 (tmp, operands[2], GEN_INT (56)));
   emit_insn (gen_lxvll (rtx_vtmp, operands[1], tmp));
   emit_insn (gen_altivec_vperm_v8hiv16qi (operands[0], rtx_vtmp, rtx_vtmp,
diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251-run.c b/gcc/testsuite/gcc.target/powerpc/pr79251-run.c
new file mode 100644
index 00000000000..840f6712ad2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr79251-run.c
@@ -0,0 +1,29 @@
+/* { dg-do run { target { lp64 && p9vector_hw } } } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9 -maltivec" } */
+
+#include <stddef.h>
+#include <altivec.h>
+#include "pr79251.h"
+
+TEST_VEC_INSERT_ALL (test)
+
+#define run_test(TYPE, num)                                                    \
+  {                                                                            \
+    vector TYPE v;                                                             \
+    vector TYPE u = {0x0};                                                     \
+    for (long k = 0; k < 16 / sizeof (TYPE); k++)                              \
+      v[k] = 0xaa;                                                             \
+    for (long k = 0; k < 16 / sizeof (TYPE); k++)                              \
+      {                                                                        \
+	u = test##num (v, 254, k);                                             \
+	if (u[k] != (TYPE) 254)                                                \
+	  __builtin_abort ();                                                  \
+      }                                                                        \
+  }
+
+int
+main (void)
+{
+  TEST_VEC_INSERT_ALL (run_test)
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251.c b/gcc/testsuite/gcc.target/powerpc/pr79251.c
new file mode 100644
index 00000000000..8124f503df9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr79251.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9 -maltivec" } */
+
+#include <stddef.h>
+#include <altivec.h>
+#include "pr79251.h"
+
+TEST_VEC_INSERT_ALL (test)
+
+/* { dg-final { scan-assembler-not {\mstxw\M} } } */
+/* { dg-final { scan-assembler-times {\mlvsl\M} 10 } } */
+/* { dg-final { scan-assembler-times {\mxxperm\M} 20 } } */
+/* { dg-final { scan-assembler-times {\mxxsel\M} 10 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251.h b/gcc/testsuite/gcc.target/powerpc/pr79251.h
new file mode 100644
index 00000000000..609371c96cd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr79251.h
@@ -0,0 +1,19 @@
+
+#define test(TYPE, num)                                                        \
+  __attribute__ ((noinline, noclone))                                          \
+    vector TYPE test##num (vector TYPE v, TYPE i, unsigned int n)              \
+  {                                                                            \
+    return vec_insert (i, v, n);                                               \
+  }
+
+#define TEST_VEC_INSERT_ALL(T)                                                 \
+  T (char, 0)                                                                  \
+  T (unsigned char, 1)                                                         \
+  T (short, 2)                                                                 \
+  T (unsigned short, 3)                                                        \
+  T (int, 4)                                                                   \
+  T (unsigned int, 5)                                                          \
+  T (long long, 6)                                                             \
+  T (unsigned long long, 7)                                                    \
+  T (float, 8)                                                                 \
+  T (double, 9)
-- 
2.27.0.90.geebb51ba8c


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v2 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR
  2020-09-18  6:17 [PATCH v2 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR Xiong Hu Luo
  2020-09-18  6:17 ` [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251] Xiong Hu Luo
@ 2020-09-18 18:20 ` Segher Boessenkool
  2020-09-21  8:31 ` Richard Biener
  2 siblings, 0 replies; 24+ messages in thread
From: Segher Boessenkool @ 2020-09-18 18:20 UTC (permalink / raw)
  To: Xiong Hu Luo
  Cc: gcc-patches, dje.gcc, wschmidt, guojiufu, linkw,
	richard.guenther, richard.sandiford

On Fri, Sep 18, 2020 at 01:17:40AM -0500, Xiong Hu Luo wrote:
> This patch enables transformation from ARRAY_REF(VIEW_CONVERT_EXPR) to
> VEC_SET internal function in gimple-isel pass if target supports
> vec_set with variable index by checking can_vec_set_var_idx_p.

> +      tree_code code = TREE_CODE (TREE_TYPE(view_op0));
                                              ^
Missing space here ---------------------------/

Thanks,


Segher

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]
  2020-09-18  6:17 ` [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251] Xiong Hu Luo
@ 2020-09-18 20:19   ` Segher Boessenkool
  2020-09-24  8:21     ` xionghu luo
  0 siblings, 1 reply; 24+ messages in thread
From: Segher Boessenkool @ 2020-09-18 20:19 UTC (permalink / raw)
  To: Xiong Hu Luo
  Cc: gcc-patches, dje.gcc, wschmidt, guojiufu, linkw,
	richard.guenther, richard.sandiford

Hi!

On Fri, Sep 18, 2020 at 01:17:41AM -0500, Xiong Hu Luo wrote:
> vec_insert accepts 3 arguments, arg0 is input vector, arg1 is the value
> to be insert, arg2 is the place to insert arg1 to arg0.  Current expander
> generates stxv+stwx+lxv if arg2 is variable instead of constant, which
> causes serious store hit load performance issue on Power.  This patch tries
>  1) Build VIEW_CONVERT_EXPR for vec_insert (i, v, n) like v[n&3] = i to
> unify the gimple code, then expander could use vec_set_optab to expand.
>  2) Expand the IFN VEC_SET to fast instructions: lvsl+xxperm+xxsel.
> In this way, "vec_insert (i, v, n)" and "v[n&3] = i" won't be expanded too
> early in gimple stage if arg2 is variable, avoid generating store hit load
> instructions.

Sounds good.

> For Power9 V4SI:
> 	addi 9,1,-16
> 	rldic 6,6,2,60
> 	stxv 34,-16(1)
> 	stwx 5,9,6
> 	lxv 34,-16(1)
> =>
> 	addis 9,2,.LC0@toc@ha
> 	addi 9,9,.LC0@toc@l
> 	mtvsrwz 33,5
> 	lxv 32,0(9)
> 	sradi 9,6,2
> 	addze 9,9

These last two insns are a signed modulo.  That seems wrong.

> 	sldi 9,9,2
> 	subf 9,9,6
> 	subfic 9,9,3
> 	sldi 9,9,2
> 	subfic 9,9,20

This should be optimised quite a bit.  Why isn't it?  Is this -O0
output?  That isn't interesting, please show -O2 instead.

> 	lvsl 13,0,9
> 	xxperm 33,33,45
> 	xxperm 32,32,45
> 	xxsel 34,34,33,32

It feels like this can be done in fewer insns.  Hrm.  Since you already
rotate one of the data and the mask, does it work better if you rotate
to some known position and then use vinsertw (and then rotate again)?

> Though instructions increase from 5 to 15, the performance is improved
> 60% in typical cases.

Yeah, great :-)

> Tested with V2DI, V2DF V4SI, V4SF, V8HI, V16QI

That is what the new testcases do, right?

> 	* config/rs6000/altivec.md (altivec_lvsl_reg_<mode>2): Rename to
> 	 (altivec_lvsl_reg_<mode>2) and extend to SDI mode.

(wrong leading space)

You say the same thing is renamed to the same thing (and not how we
normally do that).

"extend to SDI mode"...  Please rephrase, "extend" usually means sign-
or zero-extension in RTL.  But, see below.

> 	* config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin):
> 	Ajdust variable index vec_insert to VIEW_CONVERT_EXPR.

"Adjust", but this changelog entry does not tell me what the change is.

> 	* config/rs6000/rs6000-protos.h (rs6000_expand_vector_set_var):
> 	New declare.

(declaration)

> 	* config/rs6000/vsx.md: Call gen_altivec_lvsl_reg_di2.

From some specific define_something, surely?

> -(define_insn "altivec_lvsl_reg"
> +(define_insn "altivec_lvsl_reg_<mode>2"
>    [(set (match_operand:V16QI 0 "altivec_register_operand" "=v")
>  	(unspec:V16QI
> -	[(match_operand:DI 1 "gpc_reg_operand" "b")]
> +	[(match_operand:SDI 1 "gpc_reg_operand" "b")]
>  	UNSPEC_LVSL_REG))]
>    "TARGET_ALTIVEC"
>    "lvsl %0,0,%1"

"DI" is wrong, yes; but so is SDI.  It should just be P here, afaics?
SI on -m32, DI on -m64.

> +/* Insert value from VEC into idx of TARGET.  */

What does idx count?  Bytes?  Things the same size as "value"?

What does "value" mean, anyway?  Do you mean "VAL" (we use upercase for
all parameter names; "IDX" as well).

> +void
> +rs6000_expand_vector_set_var (rtx target, rtx vec, rtx val, rtx idx)
> +{
> +  machine_mode mode = GET_MODE (vec);
> +
> +  if (VECTOR_MEM_VSX_P (mode) && CONST_INT_P (idx))
> +    gcc_unreachable ();
> +  else if (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)
> +	   && TARGET_DIRECT_MOVE_64BIT)

You already handled the CONST_INT_P case a line before, CONST_INT_P is
always true here.

This actually does *nothing* if this if() isn't true, which cannot be
right.

Maybe you want to replace these four lines with just
  gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx));

and handle !TARGET_DIRECT_MOVE_64BIT some way as well (maybe this cannot
be called then; assert it as well, then, or at least document it is
required).

> +	  if (!BYTES_BIG_ENDIAN)
> +	    {
> +	      /*  idx = 1 - idx.  */
> +	      emit_insn (gen_subsi3 (tmp, GEN_INT (1), idx));
> +	      /*  idx = idx * 8.  */
> +	      emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (3)));
> +	      /*  idx = 16 - idx.  */
> +	      emit_insn (gen_subsi3 (tmp, GEN_INT (16), tmp));
> +	    }

Maybe combine this here already?  16 - 8*(1 - idx) is the same as
8 + 8*idx.  Which might not be correct at all?

(GEN_INT (1) is const1_rtx, btw.)

> +      else if (GET_MODE_SIZE (inner_mode) == 4)
> +	{
> +	  if (!BYTES_BIG_ENDIAN)
> +	    {
> +	      /*  idx = 3 - idx.  */
> +	      emit_insn (gen_subsi3 (tmp, GEN_INT (3), idx));
> +	      /*  idx = idx * 4.  */
> +	      emit_insn (gen_ashlsi3 (tmp, tmp, GEN_INT (2)));
> +	      /*  idx = 20 - idx.  */
> +	      emit_insn (gen_subsi3 (tmp, GEN_INT (20), tmp));
> +	    }
> +	  else
> +	  {
> +	      emit_insn (gen_ashlsi3 (tmp, idx, GEN_INT (2)));
> +	      emit_insn (gen_subsi3 (tmp, GEN_INT (20), tmp));
> +	  }
> +	}

Please have all the different sizes in just one block, and compute the
contants you need there.

> +      /*  lxv vs32, mask.
> +	  DImode: 0xffffffffffffffff0000000000000000
> +	  SImode: 0x00000000ffffffff0000000000000000
> +	  HImode: 0x000000000000ffff0000000000000000.
> +	  QImode: 0x00000000000000ff0000000000000000.  */
> +      rtx mask = gen_reg_rtx (V16QImode);
> +      rtx mask_v2di = gen_reg_rtx (V2DImode);
> +      rtvec v = rtvec_alloc (2);
> +      if (!BYTES_BIG_ENDIAN)
> +	{
> +	  RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (DImode, 0);
> +	  RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (DImode, mode_mask);
> +	}
> +      else
> +      {
> +	  RTVEC_ELT (v, 0) = gen_rtx_CONST_INT (DImode, mode_mask);
> +	  RTVEC_ELT (v, 1) = gen_rtx_CONST_INT (DImode, 0);
> +	}
> +      emit_insn (
> +	gen_vec_initv2didi (mask_v2di, gen_rtx_PARALLEL (V2DImode, v)));

Please use two (or more) statements if things like this do not fit.  If
it is hard to layout code, you almost always should factor the code a
bit, and it will be better for it.

> +      /*  mtvsrd[wz] f0,val.  */
> +      rtx val_v16qi = gen_reg_rtx (V16QImode);
> +      switch (inner_mode)

So it looks like we want a parameterized name for p8_mtvsrwz, and for
p8_mtvsrd as well.  And then you can do the switch as just an if() on
the size of inner_mode (whether it is <= 4 or not).

> +      /*  lvsl    v1,0,idx.  */
> +      rtx pcv = gen_reg_rtx (V16QImode);
> +      emit_insn (gen_altivec_lvsl_reg_si2 (pcv, tmp));
> +
> +      /*  xxperm  vs0,vs0,vs33.  */
> +      /*  xxperm  vs32,vs32,vs33.  */

But you generate vperm, instead!  vperm has a parameter more, and is
much easier to understand because of that.

> +(define_mode_iterator FQHS [SF QI HI SI])
> +(define_mode_iterator FD [DF DI])

Those are pretty bad names.  The concept does not make much sense
either?  SF is normally stored in registers as DF, so that makes it
even weirder.

Why have floating point modes here at all?

> +(define_insn "p8_mtvsrwz_v16qi<mode>2"
> +  [(set (match_operand:V16QI 0 "register_operand" "=wa")
> +	(unspec:V16QI [(match_operand:FQHS 1 "register_operand" "r")]
> +		   UNSPEC_P8V_MTVSRWZ))]
> +  "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
> +  "mtvsrwz %x0,%1"
> +  [(set_attr "type" "mftgpr")])

This insn does not require TARGET_POWERPC64, it only looks at the low
32 bits of a GPR after all.

> +(define_insn "p8_mtvsrd_v16qi<mode>2"
> +  [(set (match_operand:V16QI 0 "register_operand" "=wa")
> +	(unspec:V16QI [(match_operand:FD 1 "register_operand" "r")]
> +		   UNSPEC_P8V_MTVSRD))]
> +  "TARGET_POWERPC64 && TARGET_DIRECT_MOVE"
> +  "mtvsrd %x0,%1"
> +  [(set_attr "type" "mftgpr")])

All this can be written without unspecs (and it should bet like all
existing patterns that use it!)  Unspec is the great optimisation
inhibitor.

What do you need new patterns for, anyway?

>  (define_expand "vec_set<mode>"

[...]

> +  if (CONST_INT_P (operands[2]))
> +    {
> +      rs6000_expand_vector_set (operands[0], operands[1], INTVAL (operands[2]));
> +      DONE;
> +    }
> +  else
> +    {
> +      rtx target = gen_reg_rtx (V16QImode);
> +      rs6000_expand_vector_set_var (target, operands[0], operands[1],
> +				    operands[2]);
> +      rtx sub_target
> +	= simplify_gen_subreg (GET_MODE (operands[0]), target, V16QImode, 0);
> +      emit_insn (gen_rtx_SET (operands[0], sub_target));
> +      DONE;
> +    }

Please handle this in rs6000_expand_vector_set, instead.  It is fine
to call rs6000_vector_set_var early in that, for example.

> diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
> index dd750210758..7e82690d12d 100644
> --- a/gcc/config/rs6000/vsx.md
> +++ b/gcc/config/rs6000/vsx.md
> @@ -5349,7 +5349,7 @@ (define_expand "xl_len_r"
>    rtx rtx_vtmp = gen_reg_rtx (V16QImode);
>    rtx tmp = gen_reg_rtx (DImode);
>  
> -  emit_insn (gen_altivec_lvsl_reg (shift_mask, operands[2]));
> +  emit_insn (gen_altivec_lvsl_reg_di2 (shift_mask, operands[2]));

So this becomes
  emit_insn (gen_altivec_lvsl_reg (DImode, shift_mask, operands[2]));
if you use parameterized names.


Tests...  You don't need lp64 anywhere, but then you probably need to
disallow the 64-bit tests.  That may be hard?


When you resend, please split it into separate patches for separate
things.  Small patches are fine, no, *good* even!


Segher

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v2 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR
  2020-09-18  6:17 [PATCH v2 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR Xiong Hu Luo
  2020-09-18  6:17 ` [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251] Xiong Hu Luo
  2020-09-18 18:20 ` [PATCH v2 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR Segher Boessenkool
@ 2020-09-21  8:31 ` Richard Biener
  2020-09-22  3:55   ` [PATCH v3 " xionghu luo
  2 siblings, 1 reply; 24+ messages in thread
From: Richard Biener @ 2020-09-21  8:31 UTC (permalink / raw)
  To: Xiong Hu Luo
  Cc: GCC Patches, Segher Boessenkool, David Edelsohn, Bill Schmidt,
	Jiufu Guo, linkw, Richard Sandiford

On Fri, Sep 18, 2020 at 8:18 AM Xiong Hu Luo <luoxhu@linux.ibm.com> wrote:
>
> This patch enables transformation from ARRAY_REF(VIEW_CONVERT_EXPR) to
> VEC_SET internal function in gimple-isel pass if target supports
> vec_set with variable index by checking can_vec_set_var_idx_p.
>
> gcc/ChangeLog:
>
> 2020-09-18  Xionghu Luo  <luoxhu@linux.ibm.com>
>
>         * gimple-isel.cc (gimple_expand_vec_set_expr): New function.
>         (gimple_expand_vec_cond_exprs): Call gimple_expand_vec_set_expr.
>         * internal-fn.c (vec_set_direct): New define.
>         (expand_vec_set_optab_fn): New function.
>         (direct_vec_set_optab_supported_p): New define.
>         * internal-fn.def (VEC_SET): New DEF_INTERNAL_OPTAB_FN.
>         * optabs.c (can_vec_set_var_idx_p): New function.
>         * optabs.h (can_vec_set_var_idx_p): New declare.
> ---
>  gcc/gimple-isel.cc  | 116 +++++++++++++++++++++++++++++++++++++++++++-
>  gcc/internal-fn.c   |  36 ++++++++++++++
>  gcc/internal-fn.def |   2 +
>  gcc/optabs.c        |  17 +++++++
>  gcc/optabs.h        |   3 ++
>  5 files changed, 172 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/gimple-isel.cc b/gcc/gimple-isel.cc
> index b330cf4c20e..bc61e2895be 100644
> --- a/gcc/gimple-isel.cc
> +++ b/gcc/gimple-isel.cc
> @@ -35,6 +35,80 @@ along with GCC; see the file COPYING3.  If not see
>  #include "tree-cfg.h"
>  #include "bitmap.h"
>  #include "tree-ssa-dce.h"
> +#include "fold-const.h"
> +#include "gimple-fold.h"
> +#include "memmodel.h"
> +#include "optabs.h"
> +
> +/* Expand all ARRAY_REF(VIEW_CONVERT_EXPR) gimple assignments into calls to
> +   internal function based on vector type of selected expansion.
> +   i.e.:
> +     VIEW_CONVERT_EXPR<int[4]>(u)[_1] =  = i_4(D);
> +   =>
> +     _7 = u;
> +     _8 = .VEC_SET (_7, i_4(D), _1);
> +     u = _8;  */
> +
> +static gimple *
> +gimple_expand_vec_set_expr (gimple_stmt_iterator *gsi)
> +{
> +  enum tree_code code;
> +  gcall *new_stmt = NULL;
> +  gassign *ass_stmt = NULL;
> +
> +  /* Only consider code == GIMPLE_ASSIGN.  */
> +  gassign *stmt = dyn_cast<gassign *> (gsi_stmt (*gsi));
> +  if (!stmt)
> +    return NULL;
> +
> +  code = TREE_CODE (gimple_assign_lhs (stmt));

do the lhs = gimple_assign_lhs (stmt) before and elide cond,
putting the TREE_CODE into the if below.

> +  if (code != ARRAY_REF)
> +    return NULL;
> +
> +  tree lhs = gimple_assign_lhs (stmt);
> +  tree val = gimple_assign_rhs1 (stmt);
> +
> +  tree type = TREE_TYPE (lhs);
> +  tree op0 = TREE_OPERAND (lhs, 0);
> +  if (TREE_CODE (op0) == VIEW_CONVERT_EXPR

So I think we want to have an exact structural match first here, so

  if (TREE_CODE (op0) == VIEW_CONVERT_EXPR
      && DECL_P (TREE_OPERAND (op0, 0))
      && VECTOR_TYPE_P (TREE_TYPE (TREE_OPERAND (op0, 0)))
      && TYPE_MODE (TREE_TYPE (lhs)) == TYPE_MODE (TREE_TYPE
(TREE_TYPE (TREE_OPERAND (op0, 0))))

which means we're sure to do an element extract from a vector type
(and we know all vector types have sane element types).


> +      && tree_fits_uhwi_p (TYPE_SIZE (type)))
> +    {
> +      tree pos = TREE_OPERAND (lhs, 1);
> +      tree view_op0 = TREE_OPERAND (op0, 0);
> +      machine_mode outermode = TYPE_MODE (TREE_TYPE (view_op0));
> +      scalar_mode innermode = GET_MODE_INNER (outermode);
> +      tree_code code = TREE_CODE (TREE_TYPE(view_op0));
> +      if (!is_global_var (view_op0) && code == VECTOR_TYPE
> +         && tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (view_op0)))

why did you need those TYPE_SIZE checks?  As said earlier
you want !TREE_ADDRESSABLE (view_op0) and eventually
the stronger auto_var_in_fn_p (view_op0, cfun) rather than !is_global_var.

> +         && can_vec_set_var_idx_p (code, outermode, innermode,
> +                                   TYPE_MODE (TREE_TYPE (pos))))
> +       {
> +         location_t loc = gimple_location (stmt);
> +         tree var_src = make_ssa_name (TREE_TYPE (view_op0));
> +         tree var_dst = make_ssa_name (TREE_TYPE (view_op0));
> +
> +         ass_stmt = gimple_build_assign (var_src, view_op0);
> +         gimple_set_vuse (ass_stmt, gimple_vuse (stmt));
> +         gimple_set_location (ass_stmt, loc);
> +         gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT);
> +
> +         new_stmt
> +           = gimple_build_call_internal (IFN_VEC_SET, 3, var_src, val, pos);
> +         gimple_call_set_lhs (new_stmt, var_dst);
> +         gimple_set_location (new_stmt, loc);
> +         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
> +
> +         ass_stmt = gimple_build_assign (view_op0, var_dst);
> +         gimple_set_location (ass_stmt, loc);
> +         gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT);
> +
> +         gimple_move_vops (ass_stmt, stmt);
> +         gsi_remove (gsi, true);
> +       }
> +    }
> +
> +  return ass_stmt;
> +}
>
>  /* Expand all VEC_COND_EXPR gimple assignments into calls to internal
>     function based on type of selected expansion.  */
> @@ -187,8 +261,25 @@ gimple_expand_vec_cond_exprs (void)
>      {
>        for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
>         {
> -         gimple *g = gimple_expand_vec_cond_expr (&gsi,
> -                                                  &vec_cond_ssa_name_uses);
> +         gassign *stmt = dyn_cast<gassign *> (gsi_stmt (gsi));
> +         if (!stmt)
> +           continue;
> +
> +         enum tree_code code;
> +         gimple *g = NULL;
> +         code = gimple_assign_rhs_code (stmt);
> +         switch (code)
> +           {
> +           case VEC_COND_EXPR:
> +             g = gimple_expand_vec_cond_expr (&gsi, &vec_cond_ssa_name_uses);
> +             break;
> +           case ARRAY_REF:
> +             /*  TODO: generate IFN for vec_extract with variable index.  */

so why not do this here?

> +             break;
> +           default:
> +             break;
> +           }
> +
>           if (g != NULL)
>             {
>               tree lhs = gimple_assign_lhs (gsi_stmt (gsi));
> @@ -204,6 +295,27 @@ gimple_expand_vec_cond_exprs (void)
>
>    simple_dce_from_worklist (dce_ssa_names);
>
> +  FOR_EACH_BB_FN (bb, cfun)

but in a separate loop?

> +    {
> +      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> +       {
> +         gassign *stmt = dyn_cast<gassign *> (gsi_stmt (gsi));
> +         if (!stmt)
> +           continue;
> +
> +         enum tree_code code;
> +         code = TREE_CODE (gimple_assign_lhs (stmt));
> +         switch (code)
> +           {
> +           case ARRAY_REF:
> +             gimple_expand_vec_set_expr (&gsi);
> +             break;
> +           default:
> +             break;
> +           }
> +       }
> +    }
> +
>    return 0;
>  }
>
> diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
> index 8efc77d986b..36837381c04 100644
> --- a/gcc/internal-fn.c
> +++ b/gcc/internal-fn.c
> @@ -115,6 +115,7 @@ init_internal_fns ()
>  #define vec_condeq_direct { 0, 0, false }
>  #define scatter_store_direct { 3, 1, false }
>  #define len_store_direct { 3, 3, false }
> +#define vec_set_direct { 3, 3, false }
>  #define unary_direct { 0, 0, true }
>  #define binary_direct { 0, 0, true }
>  #define ternary_direct { 0, 0, true }
> @@ -2658,6 +2659,40 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>
>  #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn
>
> +static void
> +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab)

all new functions require a function level comment

> +{
> +  tree lhs = gimple_call_lhs (stmt);
> +  tree op0 = gimple_call_arg (stmt, 0);
> +  tree op1 = gimple_call_arg (stmt, 1);
> +  tree op2 = gimple_call_arg (stmt, 2);
> +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> +  rtx src = expand_expr (op0, NULL_RTX, VOIDmode, EXPAND_WRITE);
> +
> +  machine_mode outermode = TYPE_MODE (TREE_TYPE (op0));
> +  scalar_mode innermode = GET_MODE_INNER (outermode);
> +
> +  rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL);
> +  rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL);
> +
> +  class expand_operand ops[3];
> +  enum insn_code icode = optab_handler (optab, outermode);
> +
> +  if (icode != CODE_FOR_nothing)
> +    {
> +      pos = convert_to_mode (E_SImode, pos, 0);
> +
> +      create_fixed_operand (&ops[0], src);
> +      create_input_operand (&ops[1], value, innermode);
> +      create_input_operand (&ops[2], pos, GET_MODE (pos));
> +      if (maybe_expand_insn (icode, 3, ops))
> +       {
> +         emit_move_insn (target, src);

I think you need to assert that we end up here.

> +         return;
> +       }
> +    }
> +}
> +
>  static void
>  expand_ABNORMAL_DISPATCHER (internal_fn, gcall *)
>  {
> @@ -3253,6 +3288,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
>  #define direct_fold_left_optab_supported_p direct_optab_supported_p
>  #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
>  #define direct_check_ptrs_optab_supported_p direct_optab_supported_p
> +#define direct_vec_set_optab_supported_p direct_optab_supported_p
>
>  /* Return the optab used by internal function FN.  */
>
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 13e60828fcf..e6cfe1b6159 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -145,6 +145,8 @@ DEF_INTERNAL_OPTAB_FN (VCONDU, 0, vcondu, vec_condu)
>  DEF_INTERNAL_OPTAB_FN (VCONDEQ, 0, vcondeq, vec_condeq)
>  DEF_INTERNAL_OPTAB_FN (VCOND_MASK, 0, vcond_mask, vec_cond_mask)
>
> +DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
> +
>  DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
>
>  DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
> diff --git a/gcc/optabs.c b/gcc/optabs.c
> index 184827fdf4e..c8125670d2d 100644
> --- a/gcc/optabs.c
> +++ b/gcc/optabs.c
> @@ -3841,6 +3841,23 @@ can_vcond_compare_p (enum rtx_code code, machine_mode value_mode,
>          && insn_operand_matches (icode, 3, test);
>  }
>
> +bool
> +can_vec_set_var_idx_p (enum tree_code code, machine_mode vec_mode,
> +                      machine_mode value_mode, machine_mode idx_mode)

toplevel comment missing

> +{
> +  gcc_assert (code == VECTOR_TYPE);

what's the point of pasing 'code' here then?  Since the optab only has a single
mode, the vector mode, the value_mode is redundant as well.  And I guess
we might want to handle "arbitrary" index modes?  That is, the .md expanders
should not restrict its mode - I guess it simply uses VOIDmode at the moment
(for integer constants).  Not sure how to best do this without an explicit mode
in the optab ...

> +
> +  rtx reg1 = alloca_raw_REG (vec_mode, LAST_VIRTUAL_REGISTER + 1);
> +  rtx reg2 = alloca_raw_REG (value_mode, LAST_VIRTUAL_REGISTER + 2);
> +  rtx reg3 = alloca_raw_REG (idx_mode, LAST_VIRTUAL_REGISTER + 3);
> +
> +  enum insn_code icode = optab_handler (vec_set_optab, vec_mode);
> +
> +  return icode != CODE_FOR_nothing && insn_operand_matches (icode, 0, reg1)
> +        && insn_operand_matches (icode, 1, reg2)
> +        && insn_operand_matches (icode, 2, reg3);
> +}
> +
>  /* This function is called when we are going to emit a compare instruction that
>     compares the values found in X and Y, using the rtl operator COMPARISON.
>
> diff --git a/gcc/optabs.h b/gcc/optabs.h
> index 7c2ec257cb0..c018d127756 100644
> --- a/gcc/optabs.h
> +++ b/gcc/optabs.h
> @@ -249,6 +249,9 @@ extern int can_compare_p (enum rtx_code, machine_mode,
>     VALUE_MODE.  */
>  extern bool can_vcond_compare_p (enum rtx_code, machine_mode, machine_mode);
>
> +extern bool can_vec_set_var_idx_p (enum tree_code, machine_mode, machine_mode,
> +                                  machine_mode);
> +
>  extern rtx prepare_operand (enum insn_code, rtx, int, machine_mode,
>                             machine_mode, int);
>  /* Emit a pair of rtl insns to compare two rtx's and to jump
> --
> 2.27.0.90.geebb51ba8c
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH v3 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR
  2020-09-21  8:31 ` Richard Biener
@ 2020-09-22  3:55   ` xionghu luo
  2020-09-23 11:33     ` Richard Biener
  0 siblings, 1 reply; 24+ messages in thread
From: xionghu luo @ 2020-09-22  3:55 UTC (permalink / raw)
  To: Richard Biener
  Cc: GCC Patches, Segher Boessenkool, David Edelsohn, Bill Schmidt,
	Jiufu Guo, linkw, Richard Sandiford

[-- Attachment #1: Type: text/plain, Size: 9867 bytes --]

Thanks for the review,


On 2020/9/21 16:31, Richard Biener wrote:
>> +
>> +static gimple *
>> +gimple_expand_vec_set_expr (gimple_stmt_iterator *gsi)
>> +{
>> +  enum tree_code code;
>> +  gcall *new_stmt = NULL;
>> +  gassign *ass_stmt = NULL;
>> +
>> +  /* Only consider code == GIMPLE_ASSIGN.  */
>> +  gassign *stmt = dyn_cast<gassign *> (gsi_stmt (*gsi));
>> +  if (!stmt)
>> +    return NULL;
>> +
>> +  code = TREE_CODE (gimple_assign_lhs (stmt));
> 
> do the lhs = gimple_assign_lhs (stmt) before and elide cond,
> putting the TREE_CODE into the if below.

Done. 

> 
>> +  if (code != ARRAY_REF)
>> +    return NULL;
>> +
>> +  tree lhs = gimple_assign_lhs (stmt);
>> +  tree val = gimple_assign_rhs1 (stmt);
>> +
>> +  tree type = TREE_TYPE (lhs);
>> +  tree op0 = TREE_OPERAND (lhs, 0);
>> +  if (TREE_CODE (op0) == VIEW_CONVERT_EXPR
> 
> So I think we want to have an exact structural match first here, so
> 
>    if (TREE_CODE (op0) == VIEW_CONVERT_EXPR
>        && DECL_P (TREE_OPERAND (op0, 0))
>        && VECTOR_TYPE_P (TREE_TYPE (TREE_OPERAND (op0, 0)))
>        && TYPE_MODE (TREE_TYPE (lhs)) == TYPE_MODE (TREE_TYPE
> (TREE_TYPE (TREE_OPERAND (op0, 0))))
> 
> which means we're sure to do an element extract from a vector type
> (and we know all vector types have sane element types).

Done.

> 
> 
>> +      && tree_fits_uhwi_p (TYPE_SIZE (type)))
>> +    {
>> +      tree pos = TREE_OPERAND (lhs, 1);
>> +      tree view_op0 = TREE_OPERAND (op0, 0);
>> +      machine_mode outermode = TYPE_MODE (TREE_TYPE (view_op0));
>> +      scalar_mode innermode = GET_MODE_INNER (outermode);
>> +      tree_code code = TREE_CODE (TREE_TYPE(view_op0));
>> +      if (!is_global_var (view_op0) && code == VECTOR_TYPE
>> +         && tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (view_op0)))
> 
> why did you need those TYPE_SIZE checks?  As said earlier
> you want !TREE_ADDRESSABLE (view_op0) and eventually
> the stronger auto_var_in_fn_p (view_op0, cfun) rather than !is_global_var.

Done.

> 
>> +         && can_vec_set_var_idx_p (code, outermode, innermode,
>> +                                   TYPE_MODE (TREE_TYPE (pos))))
>> +       {
>> +         location_t loc = gimple_location (stmt);
>> +         tree var_src = make_ssa_name (TREE_TYPE (view_op0));
>> +         tree var_dst = make_ssa_name (TREE_TYPE (view_op0));
>> +
>> +         ass_stmt = gimple_build_assign (var_src, view_op0);
>> +         gimple_set_vuse (ass_stmt, gimple_vuse (stmt));
>> +         gimple_set_location (ass_stmt, loc);
>> +         gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT);
>> +
>> +         new_stmt
>> +           = gimple_build_call_internal (IFN_VEC_SET, 3, var_src, val, pos);
>> +         gimple_call_set_lhs (new_stmt, var_dst);
>> +         gimple_set_location (new_stmt, loc);
>> +         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
>> +
>> +         ass_stmt = gimple_build_assign (view_op0, var_dst);
>> +         gimple_set_location (ass_stmt, loc);
>> +         gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT);
>> +
>> +         gimple_move_vops (ass_stmt, stmt);
>> +         gsi_remove (gsi, true);
>> +       }
>> +    }
>> +
>> +  return ass_stmt;
>> +}
>>
>>   /* Expand all VEC_COND_EXPR gimple assignments into calls to internal
>>      function based on type of selected expansion.  */
>> @@ -187,8 +261,25 @@ gimple_expand_vec_cond_exprs (void)
>>       {
>>         for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
>>          {
>> -         gimple *g = gimple_expand_vec_cond_expr (&gsi,
>> -                                                  &vec_cond_ssa_name_uses);
>> +         gassign *stmt = dyn_cast<gassign *> (gsi_stmt (gsi));
>> +         if (!stmt)
>> +           continue;
>> +
>> +         enum tree_code code;
>> +         gimple *g = NULL;
>> +         code = gimple_assign_rhs_code (stmt);
>> +         switch (code)
>> +           {
>> +           case VEC_COND_EXPR:
>> +             g = gimple_expand_vec_cond_expr (&gsi, &vec_cond_ssa_name_uses);
>> +             break;
>> +           case ARRAY_REF:
>> +             /*  TODO: generate IFN for vec_extract with variable index.  */
> 
> so why not do this here?
> 
>> +             break;
>> +           default:
>> +             break;
>> +           }
>> +
>>            if (g != NULL)
>>              {
>>                tree lhs = gimple_assign_lhs (gsi_stmt (gsi));
>> @@ -204,6 +295,27 @@ gimple_expand_vec_cond_exprs (void)
>>
>>     simple_dce_from_worklist (dce_ssa_names);
>>
>> +  FOR_EACH_BB_FN (bb, cfun)
> 
> but in a separate loop?

The first loop is for rhs stmt process, this loop is for lhs stmt process.
I thought vec_extract also need to generate IFN before, but seems not
necessary now?  And that the first loop needs to update the lhs stmt while
then second doesn't.

> 
>> +    {
>> +      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
>> +       {
>> +         gassign *stmt = dyn_cast<gassign *> (gsi_stmt (gsi));
>> +         if (!stmt)
>> +           continue;
>> +
>> +         enum tree_code code;
>> +         code = TREE_CODE (gimple_assign_lhs (stmt));
>> +         switch (code)
>> +           {
>> +           case ARRAY_REF:
>> +             gimple_expand_vec_set_expr (&gsi);
>> +             break;
>> +           default:
>> +             break;
>> +           }
>> +       }
>> +    }
>> +
>>     return 0;
>>   }
>>
>> diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
>> index 8efc77d986b..36837381c04 100644
>> --- a/gcc/internal-fn.c
>> +++ b/gcc/internal-fn.c
>> @@ -115,6 +115,7 @@ init_internal_fns ()
>>   #define vec_condeq_direct { 0, 0, false }
>>   #define scatter_store_direct { 3, 1, false }
>>   #define len_store_direct { 3, 3, false }
>> +#define vec_set_direct { 3, 3, false }
>>   #define unary_direct { 0, 0, true }
>>   #define binary_direct { 0, 0, true }
>>   #define ternary_direct { 0, 0, true }
>> @@ -2658,6 +2659,40 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>>
>>   #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn
>>
>> +static void
>> +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> 
> all new functions require a function level comment

Done.

> 
>> +{
>> +  tree lhs = gimple_call_lhs (stmt);
>> +  tree op0 = gimple_call_arg (stmt, 0);
>> +  tree op1 = gimple_call_arg (stmt, 1);
>> +  tree op2 = gimple_call_arg (stmt, 2);
>> +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
>> +  rtx src = expand_expr (op0, NULL_RTX, VOIDmode, EXPAND_WRITE);
>> +
>> +  machine_mode outermode = TYPE_MODE (TREE_TYPE (op0));
>> +  scalar_mode innermode = GET_MODE_INNER (outermode);
>> +
>> +  rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL);
>> +  rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL);
>> +
>> +  class expand_operand ops[3];
>> +  enum insn_code icode = optab_handler (optab, outermode);
>> +
>> +  if (icode != CODE_FOR_nothing)
>> +    {
>> +      pos = convert_to_mode (E_SImode, pos, 0);
>> +
>> +      create_fixed_operand (&ops[0], src);
>> +      create_input_operand (&ops[1], value, innermode);
>> +      create_input_operand (&ops[2], pos, GET_MODE (pos));
>> +      if (maybe_expand_insn (icode, 3, ops))
>> +       {
>> +         emit_move_insn (target, src);
> 
> I think you need to assert that we end up here.

Added gcc_unreachable at the end of this function.

> 
>> +         return;
>> +       }
>> +    }
>> +}
>> +
>>   static void
>>   expand_ABNORMAL_DISPATCHER (internal_fn, gcall *)
>>   {
>> @@ -3253,6 +3288,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
>>   #define direct_fold_left_optab_supported_p direct_optab_supported_p
>>   #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
>>   #define direct_check_ptrs_optab_supported_p direct_optab_supported_p
>> +#define direct_vec_set_optab_supported_p direct_optab_supported_p
>>
>>   /* Return the optab used by internal function FN.  */
>>
>> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
>> index 13e60828fcf..e6cfe1b6159 100644
>> --- a/gcc/internal-fn.def
>> +++ b/gcc/internal-fn.def
>> @@ -145,6 +145,8 @@ DEF_INTERNAL_OPTAB_FN (VCONDU, 0, vcondu, vec_condu)
>>   DEF_INTERNAL_OPTAB_FN (VCONDEQ, 0, vcondeq, vec_condeq)
>>   DEF_INTERNAL_OPTAB_FN (VCOND_MASK, 0, vcond_mask, vec_cond_mask)
>>
>> +DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
>> +
>>   DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
>>
>>   DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
>> diff --git a/gcc/optabs.c b/gcc/optabs.c
>> index 184827fdf4e..c8125670d2d 100644
>> --- a/gcc/optabs.c
>> +++ b/gcc/optabs.c
>> @@ -3841,6 +3841,23 @@ can_vcond_compare_p (enum rtx_code code, machine_mode value_mode,
>>           && insn_operand_matches (icode, 3, test);
>>   }
>>
>> +bool
>> +can_vec_set_var_idx_p (enum tree_code code, machine_mode vec_mode,
>> +                      machine_mode value_mode, machine_mode idx_mode)
> 
> toplevel comment missing
> 
>> +{
>> +  gcc_assert (code == VECTOR_TYPE);
> 
> what's the point of pasing 'code' here then?  Since the optab only has a single
> mode, the vector mode, the value_mode is redundant as well.  And I guess
> we might want to handle "arbitrary" index modes?  That is, the .md expanders
> should not restrict its mode - I guess it simply uses VOIDmode at the moment
> (for integer constants).  Not sure how to best do this without an explicit mode
> in the optab ...

Yes, removed 'code' and value_mode by checking VECTOR_MODE_P and use GET_MODE_INNER
for value_mode.  ".md expanders" shall support for integer constants index mode, but
I guess they shouldn't be expanded by IFN as this function is for variable index
insert only?  Anyway, the v3 patch used VOIDmode check...


Thanks,
Xionghu


[-- Attachment #2: v3-0001-IFN-Implement-IFN_VEC_SET-for-ARRAY_REF-with-VIEW.patch --]
[-- Type: text/plain, Size: 10373 bytes --]

From 571717aea126380d3e36fdb4504f9a6337eed206 Mon Sep 17 00:00:00 2001
From: Xiong Hu Luo <luoxhu@linux.ibm.com>
Date: Mon, 14 Sep 2020 21:08:11 -0500
Subject: [PATCH v3 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with
 VIEW_CONVERT_EXPR

This patch enables transformation from ARRAY_REF(VIEW_CONVERT_EXPR) to
VEC_SET internal function in gimple-isel pass if target supports
vec_set with variable index by checking can_vec_set_var_idx_p.

gcc/ChangeLog:

2020-09-22  Xionghu Luo  <luoxhu@linux.ibm.com>

	* gimple-isel.cc (gimple_expand_vec_set_expr): New function.
	(gimple_expand_vec_cond_exprs): Rename to ...
	(gimple_expand_vec_exprs): ... this and call
	gimple_expand_vec_set_expr.
	* internal-fn.c (vec_set_direct): New define.
	(expand_vec_set_optab_fn): New function.
	(direct_vec_set_optab_supported_p): New define.
	* internal-fn.def (VEC_SET): New DEF_INTERNAL_OPTAB_FN.
	* optabs.c (can_vec_set_var_idx_p): New function.
	* optabs.h (can_vec_set_var_idx_p): New declaration.
---
 gcc/gimple-isel.cc  | 115 ++++++++++++++++++++++++++++++++++++++++++--
 gcc/internal-fn.c   |  39 +++++++++++++++
 gcc/internal-fn.def |   2 +
 gcc/optabs.c        |  21 ++++++++
 gcc/optabs.h        |   4 ++
 5 files changed, 177 insertions(+), 4 deletions(-)

diff --git a/gcc/gimple-isel.cc b/gcc/gimple-isel.cc
index b330cf4c20e..d16d67b1a21 100644
--- a/gcc/gimple-isel.cc
+++ b/gcc/gimple-isel.cc
@@ -35,6 +35,78 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-cfg.h"
 #include "bitmap.h"
 #include "tree-ssa-dce.h"
+#include "fold-const.h"
+#include "gimple-fold.h"
+#include "memmodel.h"
+#include "optabs.h"
+
+/* Expand all ARRAY_REF(VIEW_CONVERT_EXPR) gimple assignments into calls to
+   internal function based on vector type of selected expansion.
+   i.e.:
+     VIEW_CONVERT_EXPR<int[4]>(u)[_1] =  = i_4(D);
+   =>
+     _7 = u;
+     _8 = .VEC_SET (_7, i_4(D), _1);
+     u = _8;  */
+
+static gimple *
+gimple_expand_vec_set_expr (gimple_stmt_iterator *gsi)
+{
+  enum tree_code code;
+  gcall *new_stmt = NULL;
+  gassign *ass_stmt = NULL;
+
+  /* Only consider code == GIMPLE_ASSIGN.  */
+  gassign *stmt = dyn_cast<gassign *> (gsi_stmt (*gsi));
+  if (!stmt)
+    return NULL;
+
+  tree lhs = gimple_assign_lhs (stmt);
+  code = TREE_CODE (lhs);
+  if (code != ARRAY_REF)
+    return NULL;
+
+  tree val = gimple_assign_rhs1 (stmt);
+  tree type = TREE_TYPE (lhs);
+  tree op0 = TREE_OPERAND (lhs, 0);
+  if (TREE_CODE (op0) == VIEW_CONVERT_EXPR && DECL_P (TREE_OPERAND (op0, 0))
+      && VECTOR_TYPE_P (TREE_TYPE (TREE_OPERAND (op0, 0)))
+      && TYPE_MODE (TREE_TYPE (lhs))
+	   == TYPE_MODE (TREE_TYPE (TREE_TYPE (TREE_OPERAND (op0, 0)))))
+    {
+      tree pos = TREE_OPERAND (lhs, 1);
+      tree view_op0 = TREE_OPERAND (op0, 0);
+      machine_mode outermode = TYPE_MODE (TREE_TYPE (view_op0));
+      scalar_mode innermode = GET_MODE_INNER (outermode);
+      if (auto_var_in_fn_p (view_op0, cfun->decl)
+	  && !TREE_ADDRESSABLE (view_op0) && can_vec_set_var_idx_p (outermode))
+	{
+	  location_t loc = gimple_location (stmt);
+	  tree var_src = make_ssa_name (TREE_TYPE (view_op0));
+	  tree var_dst = make_ssa_name (TREE_TYPE (view_op0));
+
+	  ass_stmt = gimple_build_assign (var_src, view_op0);
+	  gimple_set_vuse (ass_stmt, gimple_vuse (stmt));
+	  gimple_set_location (ass_stmt, loc);
+	  gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT);
+
+	  new_stmt
+	    = gimple_build_call_internal (IFN_VEC_SET, 3, var_src, val, pos);
+	  gimple_call_set_lhs (new_stmt, var_dst);
+	  gimple_set_location (new_stmt, loc);
+	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
+
+	  ass_stmt = gimple_build_assign (view_op0, var_dst);
+	  gimple_set_location (ass_stmt, loc);
+	  gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT);
+
+	  gimple_move_vops (ass_stmt, stmt);
+	  gsi_remove (gsi, true);
+	}
+    }
+
+  return ass_stmt;
+}
 
 /* Expand all VEC_COND_EXPR gimple assignments into calls to internal
    function based on type of selected expansion.  */
@@ -176,7 +248,7 @@ gimple_expand_vec_cond_expr (gimple_stmt_iterator *gsi,
    VEC_COND_EXPR assignments.  */
 
 static unsigned int
-gimple_expand_vec_cond_exprs (void)
+gimple_expand_vec_exprs (void)
 {
   gimple_stmt_iterator gsi;
   basic_block bb;
@@ -187,8 +259,22 @@ gimple_expand_vec_cond_exprs (void)
     {
       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 	{
-	  gimple *g = gimple_expand_vec_cond_expr (&gsi,
-						   &vec_cond_ssa_name_uses);
+	  gassign *stmt = dyn_cast<gassign *> (gsi_stmt (gsi));
+	  if (!stmt)
+	    continue;
+
+	  enum tree_code code;
+	  gimple *g = NULL;
+	  code = gimple_assign_rhs_code (stmt);
+	  switch (code)
+	    {
+	    case VEC_COND_EXPR:
+	      g = gimple_expand_vec_cond_expr (&gsi, &vec_cond_ssa_name_uses);
+	      break;
+	    default:
+	      break;
+	    }
+
 	  if (g != NULL)
 	    {
 	      tree lhs = gimple_assign_lhs (gsi_stmt (gsi));
@@ -204,6 +290,27 @@ gimple_expand_vec_cond_exprs (void)
 
   simple_dce_from_worklist (dce_ssa_names);
 
+  FOR_EACH_BB_FN (bb, cfun)
+    {
+      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+	{
+	  gassign *stmt = dyn_cast<gassign *> (gsi_stmt (gsi));
+	  if (!stmt)
+	    continue;
+
+	  enum tree_code code;
+	  code = TREE_CODE (gimple_assign_lhs (stmt));
+	  switch (code)
+	    {
+	    case ARRAY_REF:
+	      gimple_expand_vec_set_expr (&gsi);
+	      break;
+	    default:
+	      break;
+	    }
+	}
+    }
+
   return 0;
 }
 
@@ -237,7 +344,7 @@ public:
 
   virtual unsigned int execute (function *)
     {
-      return gimple_expand_vec_cond_exprs ();
+      return gimple_expand_vec_exprs ();
     }
 
 }; // class pass_gimple_isel
diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index 8efc77d986b..f97aea44253 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -115,6 +115,7 @@ init_internal_fns ()
 #define vec_condeq_direct { 0, 0, false }
 #define scatter_store_direct { 3, 1, false }
 #define len_store_direct { 3, 3, false }
+#define vec_set_direct { 3, 3, false }
 #define unary_direct { 0, 0, true }
 #define binary_direct { 0, 0, true }
 #define ternary_direct { 0, 0, true }
@@ -2658,6 +2659,43 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 
 #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn
 
+/* Expand VEC_SET internal functions.  */
+
+static void
+expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
+{
+  tree lhs = gimple_call_lhs (stmt);
+  tree op0 = gimple_call_arg (stmt, 0);
+  tree op1 = gimple_call_arg (stmt, 1);
+  tree op2 = gimple_call_arg (stmt, 2);
+  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+  rtx src = expand_expr (op0, NULL_RTX, VOIDmode, EXPAND_WRITE);
+
+  machine_mode outermode = TYPE_MODE (TREE_TYPE (op0));
+  scalar_mode innermode = GET_MODE_INNER (outermode);
+
+  rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+  rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+
+  class expand_operand ops[3];
+  enum insn_code icode = optab_handler (optab, outermode);
+
+  if (icode != CODE_FOR_nothing)
+    {
+      pos = convert_to_mode (E_SImode, pos, 0);
+
+      create_fixed_operand (&ops[0], src);
+      create_input_operand (&ops[1], value, innermode);
+      create_input_operand (&ops[2], pos, GET_MODE (pos));
+      if (maybe_expand_insn (icode, 3, ops))
+	{
+	  emit_move_insn (target, src);
+	  return;
+	}
+    }
+  gcc_unreachable ();
+}
+
 static void
 expand_ABNORMAL_DISPATCHER (internal_fn, gcall *)
 {
@@ -3253,6 +3291,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
 #define direct_fold_left_optab_supported_p direct_optab_supported_p
 #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
 #define direct_check_ptrs_optab_supported_p direct_optab_supported_p
+#define direct_vec_set_optab_supported_p direct_optab_supported_p
 
 /* Return the optab used by internal function FN.  */
 
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 13e60828fcf..e6cfe1b6159 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -145,6 +145,8 @@ DEF_INTERNAL_OPTAB_FN (VCONDU, 0, vcondu, vec_condu)
 DEF_INTERNAL_OPTAB_FN (VCONDEQ, 0, vcondeq, vec_condeq)
 DEF_INTERNAL_OPTAB_FN (VCOND_MASK, 0, vcond_mask, vec_cond_mask)
 
+DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
+
 DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
 
 DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
diff --git a/gcc/optabs.c b/gcc/optabs.c
index 184827fdf4e..8e844028d92 100644
--- a/gcc/optabs.c
+++ b/gcc/optabs.c
@@ -3841,6 +3841,27 @@ can_vcond_compare_p (enum rtx_code code, machine_mode value_mode,
 	 && insn_operand_matches (icode, 3, test);
 }
 
+/* Return whether the backend can emit vector set instructions for inserting
+   element into vector at variable index position.  */
+
+bool
+can_vec_set_var_idx_p (machine_mode vec_mode)
+{
+  if (!VECTOR_MODE_P (vec_mode))
+    return false;
+
+  machine_mode inner_mode = GET_MODE_INNER (vec_mode);
+  rtx reg1 = alloca_raw_REG (vec_mode, LAST_VIRTUAL_REGISTER + 1);
+  rtx reg2 = alloca_raw_REG (inner_mode, LAST_VIRTUAL_REGISTER + 2);
+  rtx reg3 = alloca_raw_REG (VOIDmode, LAST_VIRTUAL_REGISTER + 3);
+
+  enum insn_code icode = optab_handler (vec_set_optab, vec_mode);
+
+  return icode != CODE_FOR_nothing && insn_operand_matches (icode, 0, reg1)
+	 && insn_operand_matches (icode, 1, reg2)
+	 && insn_operand_matches (icode, 2, reg3);
+}
+
 /* This function is called when we are going to emit a compare instruction that
    compares the values found in X and Y, using the rtl operator COMPARISON.
 
diff --git a/gcc/optabs.h b/gcc/optabs.h
index 7c2ec257cb0..0b14700ab3d 100644
--- a/gcc/optabs.h
+++ b/gcc/optabs.h
@@ -249,6 +249,10 @@ extern int can_compare_p (enum rtx_code, machine_mode,
    VALUE_MODE.  */
 extern bool can_vcond_compare_p (enum rtx_code, machine_mode, machine_mode);
 
+/* Return whether the backend can emit vector set instructions for inserting
+   element into vector at variable index position.  */
+extern bool can_vec_set_var_idx_p (machine_mode);
+
 extern rtx prepare_operand (enum insn_code, rtx, int, machine_mode,
 			    machine_mode, int);
 /* Emit a pair of rtl insns to compare two rtx's and to jump
-- 
2.27.0.90.geebb51ba8c


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR
  2020-09-22  3:55   ` [PATCH v3 " xionghu luo
@ 2020-09-23 11:33     ` Richard Biener
  2020-09-24  3:24       ` xionghu luo
  0 siblings, 1 reply; 24+ messages in thread
From: Richard Biener @ 2020-09-23 11:33 UTC (permalink / raw)
  To: xionghu luo
  Cc: GCC Patches, Segher Boessenkool, David Edelsohn, Bill Schmidt,
	Jiufu Guo, linkw, Richard Sandiford

On Tue, Sep 22, 2020 at 5:55 AM xionghu luo <luoxhu@linux.ibm.com> wrote:
>
> Thanks for the review,
>
>
> On 2020/9/21 16:31, Richard Biener wrote:
> >> +
> >> +static gimple *
> >> +gimple_expand_vec_set_expr (gimple_stmt_iterator *gsi)
> >> +{
> >> +  enum tree_code code;
> >> +  gcall *new_stmt = NULL;
> >> +  gassign *ass_stmt = NULL;
> >> +
> >> +  /* Only consider code == GIMPLE_ASSIGN.  */
> >> +  gassign *stmt = dyn_cast<gassign *> (gsi_stmt (*gsi));
> >> +  if (!stmt)
> >> +    return NULL;
> >> +
> >> +  code = TREE_CODE (gimple_assign_lhs (stmt));
> >
> > do the lhs = gimple_assign_lhs (stmt) before and elide cond,
> > putting the TREE_CODE into the if below.
>
> Done.
>
> >
> >> +  if (code != ARRAY_REF)
> >> +    return NULL;
> >> +
> >> +  tree lhs = gimple_assign_lhs (stmt);
> >> +  tree val = gimple_assign_rhs1 (stmt);
> >> +
> >> +  tree type = TREE_TYPE (lhs);
> >> +  tree op0 = TREE_OPERAND (lhs, 0);
> >> +  if (TREE_CODE (op0) == VIEW_CONVERT_EXPR
> >
> > So I think we want to have an exact structural match first here, so
> >
> >    if (TREE_CODE (op0) == VIEW_CONVERT_EXPR
> >        && DECL_P (TREE_OPERAND (op0, 0))
> >        && VECTOR_TYPE_P (TREE_TYPE (TREE_OPERAND (op0, 0)))
> >        && TYPE_MODE (TREE_TYPE (lhs)) == TYPE_MODE (TREE_TYPE
> > (TREE_TYPE (TREE_OPERAND (op0, 0))))
> >
> > which means we're sure to do an element extract from a vector type
> > (and we know all vector types have sane element types).
>
> Done.
>
> >
> >
> >> +      && tree_fits_uhwi_p (TYPE_SIZE (type)))
> >> +    {
> >> +      tree pos = TREE_OPERAND (lhs, 1);
> >> +      tree view_op0 = TREE_OPERAND (op0, 0);
> >> +      machine_mode outermode = TYPE_MODE (TREE_TYPE (view_op0));
> >> +      scalar_mode innermode = GET_MODE_INNER (outermode);
> >> +      tree_code code = TREE_CODE (TREE_TYPE(view_op0));
> >> +      if (!is_global_var (view_op0) && code == VECTOR_TYPE
> >> +         && tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (view_op0)))
> >
> > why did you need those TYPE_SIZE checks?  As said earlier
> > you want !TREE_ADDRESSABLE (view_op0) and eventually
> > the stronger auto_var_in_fn_p (view_op0, cfun) rather than !is_global_var.
>
> Done.
>
> >
> >> +         && can_vec_set_var_idx_p (code, outermode, innermode,
> >> +                                   TYPE_MODE (TREE_TYPE (pos))))
> >> +       {
> >> +         location_t loc = gimple_location (stmt);
> >> +         tree var_src = make_ssa_name (TREE_TYPE (view_op0));
> >> +         tree var_dst = make_ssa_name (TREE_TYPE (view_op0));
> >> +
> >> +         ass_stmt = gimple_build_assign (var_src, view_op0);
> >> +         gimple_set_vuse (ass_stmt, gimple_vuse (stmt));
> >> +         gimple_set_location (ass_stmt, loc);
> >> +         gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT);
> >> +
> >> +         new_stmt
> >> +           = gimple_build_call_internal (IFN_VEC_SET, 3, var_src, val, pos);
> >> +         gimple_call_set_lhs (new_stmt, var_dst);
> >> +         gimple_set_location (new_stmt, loc);
> >> +         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
> >> +
> >> +         ass_stmt = gimple_build_assign (view_op0, var_dst);
> >> +         gimple_set_location (ass_stmt, loc);
> >> +         gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT);
> >> +
> >> +         gimple_move_vops (ass_stmt, stmt);
> >> +         gsi_remove (gsi, true);
> >> +       }
> >> +    }
> >> +
> >> +  return ass_stmt;
> >> +}
> >>
> >>   /* Expand all VEC_COND_EXPR gimple assignments into calls to internal
> >>      function based on type of selected expansion.  */
> >> @@ -187,8 +261,25 @@ gimple_expand_vec_cond_exprs (void)
> >>       {
> >>         for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> >>          {
> >> -         gimple *g = gimple_expand_vec_cond_expr (&gsi,
> >> -                                                  &vec_cond_ssa_name_uses);
> >> +         gassign *stmt = dyn_cast<gassign *> (gsi_stmt (gsi));
> >> +         if (!stmt)
> >> +           continue;
> >> +
> >> +         enum tree_code code;
> >> +         gimple *g = NULL;
> >> +         code = gimple_assign_rhs_code (stmt);
> >> +         switch (code)
> >> +           {
> >> +           case VEC_COND_EXPR:
> >> +             g = gimple_expand_vec_cond_expr (&gsi, &vec_cond_ssa_name_uses);
> >> +             break;
> >> +           case ARRAY_REF:
> >> +             /*  TODO: generate IFN for vec_extract with variable index.  */
> >
> > so why not do this here?
> >
> >> +             break;
> >> +           default:
> >> +             break;
> >> +           }
> >> +
> >>            if (g != NULL)
> >>              {
> >>                tree lhs = gimple_assign_lhs (gsi_stmt (gsi));
> >> @@ -204,6 +295,27 @@ gimple_expand_vec_cond_exprs (void)
> >>
> >>     simple_dce_from_worklist (dce_ssa_names);
> >>
> >> +  FOR_EACH_BB_FN (bb, cfun)
> >
> > but in a separate loop?
>
> The first loop is for rhs stmt process, this loop is for lhs stmt process.
> I thought vec_extract also need to generate IFN before, but seems not
> necessary now?  And that the first loop needs to update the lhs stmt while
> then second doesn't.

That's not good reasons to separate them, please move all the processing
into one loop.

+         gassign *stmt = dyn_cast<gassign *> (gsi_stmt (gsi));
+         if (!stmt)
+           continue;
+
+         enum tree_code code;
+         code = TREE_CODE (gimple_assign_lhs (stmt));
+         switch (code)
+           {
+           case ARRAY_REF:
+             gimple_expand_vec_set_expr (&gsi);

you also do the assign and ARRAY_REF checking duplicate.

The patch likely wasn't bootstrapped because I've seen unused and
set-but-not-used
variables.

Otherwise the patch looks good to me - I guess you want to add the
vec_extract bits as well so you can overall assess the affect of the patch
on altivec code?  That said, the patch misses a testcase where we verify
we properly expand the vector to a pseudo now.

Richard.

> >
> >> +    {
> >> +      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> >> +       {
> >> +         gassign *stmt = dyn_cast<gassign *> (gsi_stmt (gsi));
> >> +         if (!stmt)
> >> +           continue;
> >> +
> >> +         enum tree_code code;
> >> +         code = TREE_CODE (gimple_assign_lhs (stmt));
> >> +         switch (code)
> >> +           {
> >> +           case ARRAY_REF:
> >> +             gimple_expand_vec_set_expr (&gsi);
> >> +             break;
> >> +           default:
> >> +             break;
> >> +           }
> >> +       }
> >> +    }
> >> +
> >>     return 0;
> >>   }
> >>
> >> diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
> >> index 8efc77d986b..36837381c04 100644
> >> --- a/gcc/internal-fn.c
> >> +++ b/gcc/internal-fn.c
> >> @@ -115,6 +115,7 @@ init_internal_fns ()
> >>   #define vec_condeq_direct { 0, 0, false }
> >>   #define scatter_store_direct { 3, 1, false }
> >>   #define len_store_direct { 3, 3, false }
> >> +#define vec_set_direct { 3, 3, false }
> >>   #define unary_direct { 0, 0, true }
> >>   #define binary_direct { 0, 0, true }
> >>   #define ternary_direct { 0, 0, true }
> >> @@ -2658,6 +2659,40 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> >>
> >>   #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn
> >>
> >> +static void
> >> +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> >
> > all new functions require a function level comment
>
> Done.
>
> >
> >> +{
> >> +  tree lhs = gimple_call_lhs (stmt);
> >> +  tree op0 = gimple_call_arg (stmt, 0);
> >> +  tree op1 = gimple_call_arg (stmt, 1);
> >> +  tree op2 = gimple_call_arg (stmt, 2);
> >> +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> >> +  rtx src = expand_expr (op0, NULL_RTX, VOIDmode, EXPAND_WRITE);
> >> +
> >> +  machine_mode outermode = TYPE_MODE (TREE_TYPE (op0));
> >> +  scalar_mode innermode = GET_MODE_INNER (outermode);
> >> +
> >> +  rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL);
> >> +  rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL);
> >> +
> >> +  class expand_operand ops[3];
> >> +  enum insn_code icode = optab_handler (optab, outermode);
> >> +
> >> +  if (icode != CODE_FOR_nothing)
> >> +    {
> >> +      pos = convert_to_mode (E_SImode, pos, 0);
> >> +
> >> +      create_fixed_operand (&ops[0], src);
> >> +      create_input_operand (&ops[1], value, innermode);
> >> +      create_input_operand (&ops[2], pos, GET_MODE (pos));
> >> +      if (maybe_expand_insn (icode, 3, ops))
> >> +       {
> >> +         emit_move_insn (target, src);
> >
> > I think you need to assert that we end up here.
>
> Added gcc_unreachable at the end of this function.
>
> >
> >> +         return;
> >> +       }
> >> +    }
> >> +}
> >> +
> >>   static void
> >>   expand_ABNORMAL_DISPATCHER (internal_fn, gcall *)
> >>   {
> >> @@ -3253,6 +3288,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
> >>   #define direct_fold_left_optab_supported_p direct_optab_supported_p
> >>   #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
> >>   #define direct_check_ptrs_optab_supported_p direct_optab_supported_p
> >> +#define direct_vec_set_optab_supported_p direct_optab_supported_p
> >>
> >>   /* Return the optab used by internal function FN.  */
> >>
> >> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> >> index 13e60828fcf..e6cfe1b6159 100644
> >> --- a/gcc/internal-fn.def
> >> +++ b/gcc/internal-fn.def
> >> @@ -145,6 +145,8 @@ DEF_INTERNAL_OPTAB_FN (VCONDU, 0, vcondu, vec_condu)
> >>   DEF_INTERNAL_OPTAB_FN (VCONDEQ, 0, vcondeq, vec_condeq)
> >>   DEF_INTERNAL_OPTAB_FN (VCOND_MASK, 0, vcond_mask, vec_cond_mask)
> >>
> >> +DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
> >> +
> >>   DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
> >>
> >>   DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
> >> diff --git a/gcc/optabs.c b/gcc/optabs.c
> >> index 184827fdf4e..c8125670d2d 100644
> >> --- a/gcc/optabs.c
> >> +++ b/gcc/optabs.c
> >> @@ -3841,6 +3841,23 @@ can_vcond_compare_p (enum rtx_code code, machine_mode value_mode,
> >>           && insn_operand_matches (icode, 3, test);
> >>   }
> >>
> >> +bool
> >> +can_vec_set_var_idx_p (enum tree_code code, machine_mode vec_mode,
> >> +                      machine_mode value_mode, machine_mode idx_mode)
> >
> > toplevel comment missing
> >
> >> +{
> >> +  gcc_assert (code == VECTOR_TYPE);
> >
> > what's the point of pasing 'code' here then?  Since the optab only has a single
> > mode, the vector mode, the value_mode is redundant as well.  And I guess
> > we might want to handle "arbitrary" index modes?  That is, the .md expanders
> > should not restrict its mode - I guess it simply uses VOIDmode at the moment
> > (for integer constants).  Not sure how to best do this without an explicit mode
> > in the optab ...
>
> Yes, removed 'code' and value_mode by checking VECTOR_MODE_P and use GET_MODE_INNER
> for value_mode.  ".md expanders" shall support for integer constants index mode, but
> I guess they shouldn't be expanded by IFN as this function is for variable index
> insert only?  Anyway, the v3 patch used VOIDmode check...
>
>
> Thanks,
> Xionghu
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR
  2020-09-23 11:33     ` Richard Biener
@ 2020-09-24  3:24       ` xionghu luo
  2020-09-24 12:39         ` Richard Sandiford
  0 siblings, 1 reply; 24+ messages in thread
From: xionghu luo @ 2020-09-24  3:24 UTC (permalink / raw)
  To: Richard Biener
  Cc: GCC Patches, Segher Boessenkool, David Edelsohn, Bill Schmidt,
	Jiufu Guo, linkw, Richard Sandiford

Hi,

On 2020/9/23 19:33, Richard Biener wrote:
>> The first loop is for rhs stmt process, this loop is for lhs stmt process.
>> I thought vec_extract also need to generate IFN before, but seems not
>> necessary now?  And that the first loop needs to update the lhs stmt while
>> then second doesn't.
> That's not good reasons to separate them, please move all the processing
> into one loop.
> 
> +         gassign *stmt = dyn_cast<gassign *> (gsi_stmt (gsi));
> +         if (!stmt)
> +           continue;
> +
> +         enum tree_code code;
> +         code = TREE_CODE (gimple_assign_lhs (stmt));
> +         switch (code)
> +           {
> +           case ARRAY_REF:
> +             gimple_expand_vec_set_expr (&gsi);
> 
> you also do the assign and ARRAY_REF checking duplicate.
> 
> The patch likely wasn't bootstrapped because I've seen unused and
> set-but-not-used
> variables.
> 
> Otherwise the patch looks good to me - I guess you want to add the
> vec_extract bits as well so you can overall assess the affect of the patch
> on altivec code?  That said, the patch misses a testcase where we verify
> we properly expand the vector to a pseudo now.

Thanks, fixed the bootstrap error.  Actually the "[PATCH v2 2/2] rs6000: Expand vec_insert
in expander instead of gimple [PR79251]" includes typed vec_insert tests for 
V4SI/V4SF/V8HI/V16QI/V2DI/V2DF of expanding the IFN VEC_SET and instruction count check,
but I am discussing and refining with Segher's comments, will split and send it later once
we reached agreement.  Not sure whether this is the testcase you mentioned? (As you said
*vec_extract*, but this patch series target for vec_insert only.)  

FYI, We are trying below or even better code generations:

       rlwinm 6,6,2,28,29
       mtvsrwz 0,5
       lvsr 1,0,6
       lvsl 0,0,6
       xxperm 34,34,33
       xxinsertw 34,0,12
       xxperm 34,34,32

Second thing is I removed the second loop and move the "gimple_expand_vec_set_expr (&gsi);"
up as your comments.  Thanks again.


IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR

This patch enables transformation from ARRAY_REF(VIEW_CONVERT_EXPR) to
VEC_SET internal function in gimple-isel pass if target supports
vec_set with variable index by checking can_vec_set_var_idx_p.

gcc/ChangeLog:

2020-09-24  Xionghu Luo  <luoxhu@linux.ibm.com>

	* gimple-isel.cc (gimple_expand_vec_set_expr): New function.
	(gimple_expand_vec_cond_exprs): Rename to ...
	(gimple_expand_vec_exprs): ... this and call
	gimple_expand_vec_set_expr.
	* internal-fn.c (vec_set_direct): New define.
	(expand_vec_set_optab_fn): New function.
	(direct_vec_set_optab_supported_p): New define.
	* internal-fn.def (VEC_SET): New DEF_INTERNAL_OPTAB_FN.
	* optabs.c (can_vec_set_var_idx_p): New function.
	* optabs.h (can_vec_set_var_idx_p): New declaration.
---
 gcc/gimple-isel.cc  | 75 +++++++++++++++++++++++++++++++++++++++++++--
 gcc/internal-fn.c   | 39 +++++++++++++++++++++++
 gcc/internal-fn.def |  2 ++
 gcc/optabs.c        | 21 +++++++++++++
 gcc/optabs.h        |  4 +++
 5 files changed, 139 insertions(+), 2 deletions(-)

diff --git a/gcc/gimple-isel.cc b/gcc/gimple-isel.cc
index b330cf4c20e..02513e04900 100644
--- a/gcc/gimple-isel.cc
+++ b/gcc/gimple-isel.cc
@@ -35,6 +35,74 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-cfg.h"
 #include "bitmap.h"
 #include "tree-ssa-dce.h"
+#include "memmodel.h"
+#include "optabs.h"
+
+/* Expand all ARRAY_REF(VIEW_CONVERT_EXPR) gimple assignments into calls to
+   internal function based on vector type of selected expansion.
+   i.e.:
+     VIEW_CONVERT_EXPR<int[4]>(u)[_1] =  = i_4(D);
+   =>
+     _7 = u;
+     _8 = .VEC_SET (_7, i_4(D), _1);
+     u = _8;  */
+
+static gimple *
+gimple_expand_vec_set_expr (gimple_stmt_iterator *gsi)
+{
+  enum tree_code code;
+  gcall *new_stmt = NULL;
+  gassign *ass_stmt = NULL;
+
+  /* Only consider code == GIMPLE_ASSIGN.  */
+  gassign *stmt = dyn_cast<gassign *> (gsi_stmt (*gsi));
+  if (!stmt)
+    return NULL;
+
+  tree lhs = gimple_assign_lhs (stmt);
+  code = TREE_CODE (lhs);
+  if (code != ARRAY_REF)
+    return NULL;
+
+  tree val = gimple_assign_rhs1 (stmt);
+  tree op0 = TREE_OPERAND (lhs, 0);
+  if (TREE_CODE (op0) == VIEW_CONVERT_EXPR && DECL_P (TREE_OPERAND (op0, 0))
+      && VECTOR_TYPE_P (TREE_TYPE (TREE_OPERAND (op0, 0)))
+      && TYPE_MODE (TREE_TYPE (lhs))
+	   == TYPE_MODE (TREE_TYPE (TREE_TYPE (TREE_OPERAND (op0, 0)))))
+    {
+      tree pos = TREE_OPERAND (lhs, 1);
+      tree view_op0 = TREE_OPERAND (op0, 0);
+      machine_mode outermode = TYPE_MODE (TREE_TYPE (view_op0));
+      if (auto_var_in_fn_p (view_op0, cfun->decl)
+	  && !TREE_ADDRESSABLE (view_op0) && can_vec_set_var_idx_p (outermode))
+	{
+	  location_t loc = gimple_location (stmt);
+	  tree var_src = make_ssa_name (TREE_TYPE (view_op0));
+	  tree var_dst = make_ssa_name (TREE_TYPE (view_op0));
+
+	  ass_stmt = gimple_build_assign (var_src, view_op0);
+	  gimple_set_vuse (ass_stmt, gimple_vuse (stmt));
+	  gimple_set_location (ass_stmt, loc);
+	  gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT);
+
+	  new_stmt
+	    = gimple_build_call_internal (IFN_VEC_SET, 3, var_src, val, pos);
+	  gimple_call_set_lhs (new_stmt, var_dst);
+	  gimple_set_location (new_stmt, loc);
+	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
+
+	  ass_stmt = gimple_build_assign (view_op0, var_dst);
+	  gimple_set_location (ass_stmt, loc);
+	  gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT);
+
+	  gimple_move_vops (ass_stmt, stmt);
+	  gsi_remove (gsi, true);
+	}
+    }
+
+  return ass_stmt;
+}
 
 /* Expand all VEC_COND_EXPR gimple assignments into calls to internal
    function based on type of selected expansion.  */
@@ -176,7 +244,7 @@ gimple_expand_vec_cond_expr (gimple_stmt_iterator *gsi,
    VEC_COND_EXPR assignments.  */
 
 static unsigned int
-gimple_expand_vec_cond_exprs (void)
+gimple_expand_vec_exprs (void)
 {
   gimple_stmt_iterator gsi;
   basic_block bb;
@@ -189,12 +257,15 @@ gimple_expand_vec_cond_exprs (void)
 	{
 	  gimple *g = gimple_expand_vec_cond_expr (&gsi,
 						   &vec_cond_ssa_name_uses);
+
 	  if (g != NULL)
 	    {
 	      tree lhs = gimple_assign_lhs (gsi_stmt (gsi));
 	      gimple_set_lhs (g, lhs);
 	      gsi_replace (&gsi, g, false);
 	    }
+
+	  gimple_expand_vec_set_expr (&gsi);
 	}
     }
 
@@ -237,7 +308,7 @@ public:
 
   virtual unsigned int execute (function *)
     {
-      return gimple_expand_vec_cond_exprs ();
+      return gimple_expand_vec_exprs ();
     }
 
 }; // class pass_gimple_isel
diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index 8efc77d986b..f97aea44253 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -115,6 +115,7 @@ init_internal_fns ()
 #define vec_condeq_direct { 0, 0, false }
 #define scatter_store_direct { 3, 1, false }
 #define len_store_direct { 3, 3, false }
+#define vec_set_direct { 3, 3, false }
 #define unary_direct { 0, 0, true }
 #define binary_direct { 0, 0, true }
 #define ternary_direct { 0, 0, true }
@@ -2658,6 +2659,43 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 
 #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn
 
+/* Expand VEC_SET internal functions.  */
+
+static void
+expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
+{
+  tree lhs = gimple_call_lhs (stmt);
+  tree op0 = gimple_call_arg (stmt, 0);
+  tree op1 = gimple_call_arg (stmt, 1);
+  tree op2 = gimple_call_arg (stmt, 2);
+  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+  rtx src = expand_expr (op0, NULL_RTX, VOIDmode, EXPAND_WRITE);
+
+  machine_mode outermode = TYPE_MODE (TREE_TYPE (op0));
+  scalar_mode innermode = GET_MODE_INNER (outermode);
+
+  rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+  rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+
+  class expand_operand ops[3];
+  enum insn_code icode = optab_handler (optab, outermode);
+
+  if (icode != CODE_FOR_nothing)
+    {
+      pos = convert_to_mode (E_SImode, pos, 0);
+
+      create_fixed_operand (&ops[0], src);
+      create_input_operand (&ops[1], value, innermode);
+      create_input_operand (&ops[2], pos, GET_MODE (pos));
+      if (maybe_expand_insn (icode, 3, ops))
+	{
+	  emit_move_insn (target, src);
+	  return;
+	}
+    }
+  gcc_unreachable ();
+}
+
 static void
 expand_ABNORMAL_DISPATCHER (internal_fn, gcall *)
 {
@@ -3253,6 +3291,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
 #define direct_fold_left_optab_supported_p direct_optab_supported_p
 #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
 #define direct_check_ptrs_optab_supported_p direct_optab_supported_p
+#define direct_vec_set_optab_supported_p direct_optab_supported_p
 
 /* Return the optab used by internal function FN.  */
 
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 13e60828fcf..e6cfe1b6159 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -145,6 +145,8 @@ DEF_INTERNAL_OPTAB_FN (VCONDU, 0, vcondu, vec_condu)
 DEF_INTERNAL_OPTAB_FN (VCONDEQ, 0, vcondeq, vec_condeq)
 DEF_INTERNAL_OPTAB_FN (VCOND_MASK, 0, vcond_mask, vec_cond_mask)
 
+DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
+
 DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
 
 DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
diff --git a/gcc/optabs.c b/gcc/optabs.c
index 184827fdf4e..8e844028d92 100644
--- a/gcc/optabs.c
+++ b/gcc/optabs.c
@@ -3841,6 +3841,27 @@ can_vcond_compare_p (enum rtx_code code, machine_mode value_mode,
 	 && insn_operand_matches (icode, 3, test);
 }
 
+/* Return whether the backend can emit vector set instructions for inserting
+   element into vector at variable index position.  */
+
+bool
+can_vec_set_var_idx_p (machine_mode vec_mode)
+{
+  if (!VECTOR_MODE_P (vec_mode))
+    return false;
+
+  machine_mode inner_mode = GET_MODE_INNER (vec_mode);
+  rtx reg1 = alloca_raw_REG (vec_mode, LAST_VIRTUAL_REGISTER + 1);
+  rtx reg2 = alloca_raw_REG (inner_mode, LAST_VIRTUAL_REGISTER + 2);
+  rtx reg3 = alloca_raw_REG (VOIDmode, LAST_VIRTUAL_REGISTER + 3);
+
+  enum insn_code icode = optab_handler (vec_set_optab, vec_mode);
+
+  return icode != CODE_FOR_nothing && insn_operand_matches (icode, 0, reg1)
+	 && insn_operand_matches (icode, 1, reg2)
+	 && insn_operand_matches (icode, 2, reg3);
+}
+
 /* This function is called when we are going to emit a compare instruction that
    compares the values found in X and Y, using the rtl operator COMPARISON.
 
diff --git a/gcc/optabs.h b/gcc/optabs.h
index 7c2ec257cb0..0b14700ab3d 100644
--- a/gcc/optabs.h
+++ b/gcc/optabs.h
@@ -249,6 +249,10 @@ extern int can_compare_p (enum rtx_code, machine_mode,
    VALUE_MODE.  */
 extern bool can_vcond_compare_p (enum rtx_code, machine_mode, machine_mode);
 
+/* Return whether the backend can emit vector set instructions for inserting
+   element into vector at variable index position.  */
+extern bool can_vec_set_var_idx_p (machine_mode);
+
 extern rtx prepare_operand (enum insn_code, rtx, int, machine_mode,
 			    machine_mode, int);
 /* Emit a pair of rtl insns to compare two rtx's and to jump
-- 
2.27.0.90.geebb51ba8c


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]
  2020-09-18 20:19   ` Segher Boessenkool
@ 2020-09-24  8:21     ` xionghu luo
  2020-09-24 13:27       ` Richard Biener
  0 siblings, 1 reply; 24+ messages in thread
From: xionghu luo @ 2020-09-24  8:21 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: gcc-patches, dje.gcc, wschmidt, guojiufu, linkw,
	richard.guenther, richard.sandiford

[-- Attachment #1: Type: text/plain, Size: 586 bytes --]

Hi Segher,

The attached two patches are updated and split from
 "[PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]"
as your comments.


[PATCH v3 2/3] rs6000: Fix lvsl&lvsr mode and change rs6000_expand_vector_set param

This one is preparation work of fix lvsl&lvsr arg mode and rs6000_expand_vector_set
parameter support for both constant and variable index input.


[PATCH v3 2/3] rs6000: Support variable insert and Expand vec_insert in expander [PR79251]

This one is Building VIEW_CONVERT_EXPR and expand the IFN VEC_SET to fast.


Thanks,
Xionghu

[-- Attachment #2: v3-0002-rs6000-Fix-lvsl-lvsr-mode-and-change-rs6000_expan.patch --]
[-- Type: text/plain, Size: 6880 bytes --]

From 9d74c488ad3c7cad8c276cc49749ec05158d1e96 Mon Sep 17 00:00:00 2001
From: Xiong Hu Luo <luoxhu@linux.ibm.com>
Date: Thu, 24 Sep 2020 00:52:35 -0500
Subject: [PATCH v3 2/3] rs6000: Fix lvsl&lvsr mode and change
 rs6000_expand_vector_set param

lvsl and lvsr looks only at the low 4 bits, use SI for index param.
 rs6000_expand_vector_set could accept insert either to constant position
or variable position, so change the operand to reg_or_cint_operand.

gcc/ChangeLog:

2020-09-24  Xionghu Luo  <luoxhu@linux.ibm.com>

	* config/rs6000/altivec.md (altivec_lvsl_reg): Change to
	SImode.
	(altivec_lvsr_reg): Likewise.
	* config/rs6000/rs6000-call.c (altivec_expand_vec_set_builtin):
	Change call param 2 from type int to rtx.
	* config/rs6000/rs6000-protos.h (rs6000_expand_vector_set):
	Likewise.
	* config/rs6000/rs6000.c (rs6000_expand_vector_init):
	Change call param 2 from type int to rtx.
	(rs6000_expand_vector_set): Likewise.
	* config/rs6000/vector.md (vec_set<mode>): Support both constant
	and variable index vec_set.
	* config/rs6000/vsx.md: Call gen_altivec_lvsl_reg with SImode.
---
 gcc/config/rs6000/altivec.md      |  4 ++--
 gcc/config/rs6000/rs6000-call.c   |  2 +-
 gcc/config/rs6000/rs6000-protos.h |  2 +-
 gcc/config/rs6000/rs6000.c        | 16 +++++++++-------
 gcc/config/rs6000/vector.md       |  4 ++--
 gcc/config/rs6000/vsx.md          |  3 ++-
 6 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/gcc/config/rs6000/altivec.md b/gcc/config/rs6000/altivec.md
index 0a2e634d6b0..a1c06c9ab8c 100644
--- a/gcc/config/rs6000/altivec.md
+++ b/gcc/config/rs6000/altivec.md
@@ -2775,7 +2775,7 @@ (define_expand "altivec_lvsl"
 (define_insn "altivec_lvsl_reg"
   [(set (match_operand:V16QI 0 "altivec_register_operand" "=v")
 	(unspec:V16QI
-	[(match_operand:DI 1 "gpc_reg_operand" "b")]
+	[(match_operand:SI 1 "gpc_reg_operand" "b")]
 	UNSPEC_LVSL_REG))]
   "TARGET_ALTIVEC"
   "lvsl %0,0,%1"
@@ -2813,7 +2813,7 @@ (define_expand "altivec_lvsr"
 (define_insn "altivec_lvsr_reg"
   [(set (match_operand:V16QI 0 "altivec_register_operand" "=v")
        (unspec:V16QI
-       [(match_operand:DI 1 "gpc_reg_operand" "b")]
+       [(match_operand:SI 1 "gpc_reg_operand" "b")]
        UNSPEC_LVSR_REG))]
   "TARGET_ALTIVEC"
   "lvsr %0,0,%1"
diff --git a/gcc/config/rs6000/rs6000-call.c b/gcc/config/rs6000/rs6000-call.c
index e39cfcf672b..51f278933bd 100644
--- a/gcc/config/rs6000/rs6000-call.c
+++ b/gcc/config/rs6000/rs6000-call.c
@@ -10655,7 +10655,7 @@ altivec_expand_vec_set_builtin (tree exp)
   op0 = force_reg (tmode, op0);
   op1 = force_reg (mode1, op1);
 
-  rs6000_expand_vector_set (op0, op1, elt);
+  rs6000_expand_vector_set (op0, op1, GEN_INT (elt));
 
   return op0;
 }
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index 28e859f4381..6a0fbc3ba2e 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -57,7 +57,7 @@ extern bool rs6000_move_128bit_ok_p (rtx []);
 extern bool rs6000_split_128bit_ok_p (rtx []);
 extern void rs6000_expand_float128_convert (rtx, rtx, bool);
 extern void rs6000_expand_vector_init (rtx, rtx);
-extern void rs6000_expand_vector_set (rtx, rtx, int);
+extern void rs6000_expand_vector_set (rtx, rtx, rtx);
 extern void rs6000_expand_vector_extract (rtx, rtx, rtx);
 extern void rs6000_split_vec_extract_var (rtx, rtx, rtx, rtx, rtx);
 extern rtx rs6000_adjust_vec_address (rtx, rtx, rtx, rtx, machine_mode);
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index fe93cf6ff2b..c46ec14f060 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -6669,7 +6669,8 @@ rs6000_expand_vector_init (rtx target, rtx vals)
       rs6000_expand_vector_init (target, copy);
 
       /* Insert variable.  */
-      rs6000_expand_vector_set (target, XVECEXP (vals, 0, one_var), one_var);
+      rs6000_expand_vector_set (target, XVECEXP (vals, 0, one_var),
+				GEN_INT (one_var));
       return;
     }
 
@@ -6683,10 +6684,10 @@ rs6000_expand_vector_init (rtx target, rtx vals)
   emit_move_insn (target, mem);
 }
 
-/* Set field ELT of TARGET to VAL.  */
+/* Set field ELT_RTX of TARGET to VAL.  */
 
 void
-rs6000_expand_vector_set (rtx target, rtx val, int elt)
+rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx)
 {
   machine_mode mode = GET_MODE (target);
   machine_mode inner_mode = GET_MODE_INNER (mode);
@@ -6700,7 +6701,6 @@ rs6000_expand_vector_set (rtx target, rtx val, int elt)
   if (VECTOR_MEM_VSX_P (mode))
     {
       rtx insn = NULL_RTX;
-      rtx elt_rtx = GEN_INT (elt);
 
       if (mode == V2DFmode)
 	insn = gen_vsx_set_v2df (target, target, val, elt_rtx);
@@ -6727,8 +6727,11 @@ rs6000_expand_vector_set (rtx target, rtx val, int elt)
 	}
     }
 
+  gcc_assert (CONST_INT_P (elt_rtx));
+
   /* Simplify setting single element vectors like V1TImode.  */
-  if (GET_MODE_SIZE (mode) == GET_MODE_SIZE (inner_mode) && elt == 0)
+  if (GET_MODE_SIZE (mode) == GET_MODE_SIZE (inner_mode)
+      && INTVAL (elt_rtx) == 0)
     {
       emit_move_insn (target, gen_lowpart (mode, val));
       return;
@@ -6751,8 +6754,7 @@ rs6000_expand_vector_set (rtx target, rtx val, int elt)
 
   /* Set permute mask to insert element into target.  */
   for (i = 0; i < width; ++i)
-    XVECEXP (mask, 0, elt*width + i)
-      = GEN_INT (i + 0x10);
+    XVECEXP (mask, 0, INTVAL (elt_rtx) * width + i) = GEN_INT (i + 0x10);
   x = gen_rtx_CONST_VECTOR (V16QImode, XVEC (mask, 0));
 
   if (BYTES_BIG_ENDIAN)
diff --git a/gcc/config/rs6000/vector.md b/gcc/config/rs6000/vector.md
index 796345c80d3..7aab1887cf5 100644
--- a/gcc/config/rs6000/vector.md
+++ b/gcc/config/rs6000/vector.md
@@ -1227,10 +1227,10 @@ (define_expand "vec_init<mode><VEC_base_l>"
 (define_expand "vec_set<mode>"
   [(match_operand:VEC_E 0 "vlogical_operand")
    (match_operand:<VEC_base> 1 "register_operand")
-   (match_operand 2 "const_int_operand")]
+   (match_operand 2 "reg_or_cint_operand")]
   "VECTOR_MEM_ALTIVEC_OR_VSX_P (<MODE>mode)"
 {
-  rs6000_expand_vector_set (operands[0], operands[1], INTVAL (operands[2]));
+  rs6000_expand_vector_set (operands[0], operands[1], operands[2]);
   DONE;
 })
 
diff --git a/gcc/config/rs6000/vsx.md b/gcc/config/rs6000/vsx.md
index dd750210758..96bfeb63e0d 100644
--- a/gcc/config/rs6000/vsx.md
+++ b/gcc/config/rs6000/vsx.md
@@ -5349,7 +5349,8 @@ (define_expand "xl_len_r"
   rtx rtx_vtmp = gen_reg_rtx (V16QImode);
   rtx tmp = gen_reg_rtx (DImode);
 
-  emit_insn (gen_altivec_lvsl_reg (shift_mask, operands[2]));
+  rtx op2_si = gen_rtx_REG (SImode, reg_or_subregno (operands[2]));
+  emit_insn (gen_altivec_lvsl_reg (shift_mask, op2_si));
   emit_insn (gen_ashldi3 (tmp, operands[2], GEN_INT (56)));
   emit_insn (gen_lxvll (rtx_vtmp, operands[1], tmp));
   emit_insn (gen_altivec_vperm_v8hiv16qi (operands[0], rtx_vtmp, rtx_vtmp,
-- 
2.27.0.90.geebb51ba8c


[-- Attachment #3: v3-0003-rs6000-Support-variable-insert-and-Expand-vec_ins.patch --]
[-- Type: text/plain, Size: 11110 bytes --]

From 0946aae4af8286a002a5625034fa7195628f2004 Mon Sep 17 00:00:00 2001
From: Xiong Hu Luo <luoxhu@linux.ibm.com>
Date: Thu, 24 Sep 2020 01:17:20 -0500
Subject: [PATCH v3 3/3] rs6000: Support variable insert and Expand vec_insert
 in expander [PR79251]

vec_insert accepts 3 arguments, arg0 is input vector, arg1 is the value
to be insert, arg2 is the place to insert arg1 to arg0.  Current expander
generates stxv+stwx+lxv if arg2 is variable instead of constant, which
causes serious store hit load performance issue on Power.  This patch tries
 1) Build VIEW_CONVERT_EXPR for vec_insert (i, v, n) like v[n&3] = i to
unify the gimple code, then expander could use vec_set_optab to expand.
 2) Expand the IFN VEC_SET to fast instructions: lvsr+insert+lvsl.
In this way, "vec_insert (i, v, n)" and "v[n&3] = i" won't be expanded too
early in gimple stage if arg2 is variable, avoid generating store hit load
instructions.

For Power9 V4SI:
	addi 9,1,-16
	rldic 6,6,2,60
	stxv 34,-16(1)
	stwx 5,9,6
	lxv 34,-16(1)
=>
	rlwinm 6,6,2,28,29
	mtvsrwz 0,5
	lvsr 1,0,6
	lvsl 0,0,6
	xxperm 34,34,33
	xxinsertw 34,0,12
	xxperm 34,34,32

Though instructions increase from 5 to 7, the performance is improved
60% in typical cases.
Tested with V2DI, V2DF V4SI, V4SF, V8HI, V16QI on Power9-LE.

gcc/ChangeLog:

2020-09-24  Xionghu Luo  <luoxhu@linux.ibm.com>

	* config/rs6000/rs6000-c.c (altivec_resolve_overloaded_builtin):
	Ajdust variable index vec_insert from address dereference to
	ARRAY_REF(VIEW_CONVERT_EXPR) tree expression.
	* config/rs6000/rs6000-protos.h (rs6000_expand_vector_set_var):
	New declaration.
	* config/rs6000/rs6000.c (rs6000_expand_vector_set_var): New function.
	* config/rs6000/vector.md (vec_set<mode>): Support both constant
	and variable index vec_set.

gcc/testsuite/ChangeLog:

2020-09-24  Xionghu Luo  <luoxhu@linux.ibm.com>

	* gcc.target/powerpc/pr79251.c: New test.
	* gcc.target/powerpc/pr79251-run.c: New test.
	* gcc.target/powerpc/pr79251.h: New header.
---
 gcc/config/rs6000/rs6000-c.c                  | 22 ++++----
 gcc/config/rs6000/rs6000-protos.h             |  1 +
 gcc/config/rs6000/rs6000.c                    | 50 +++++++++++++++++++
 .../gcc.target/powerpc/pr79251-run.c          | 29 +++++++++++
 gcc/testsuite/gcc.target/powerpc/pr79251.c    | 18 +++++++
 gcc/testsuite/gcc.target/powerpc/pr79251.h    | 19 +++++++
 6 files changed, 126 insertions(+), 13 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251-run.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr79251.h

diff --git a/gcc/config/rs6000/rs6000-c.c b/gcc/config/rs6000/rs6000-c.c
index 2fad3d94706..78abe49c833 100644
--- a/gcc/config/rs6000/rs6000-c.c
+++ b/gcc/config/rs6000/rs6000-c.c
@@ -1509,9 +1509,7 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl,
       tree arg1;
       tree arg2;
       tree arg1_type;
-      tree arg1_inner_type;
       tree decl, stmt;
-      tree innerptrtype;
       machine_mode mode;
 
       /* No second or third arguments. */
@@ -1563,8 +1561,13 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl,
 	  return build_call_expr (call, 3, arg1, arg0, arg2);
 	}
 
-      /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0. */
-      arg1_inner_type = TREE_TYPE (arg1_type);
+      /* Build *(((arg1_inner_type*)&(vector type){arg1})+arg2) = arg0 with
+	 VIEW_CONVERT_EXPR.  i.e.:
+	 D.3192 = v1;
+	 _1 = n & 3;
+	 VIEW_CONVERT_EXPR<int[4]>(D.3192)[_1] = i;
+	 v1 = D.3192;
+	 D.3194 = v1;  */
       if (TYPE_VECTOR_SUBPARTS (arg1_type) == 1)
 	arg2 = build_int_cst (TREE_TYPE (arg2), 0);
       else
@@ -1593,15 +1596,8 @@ altivec_resolve_overloaded_builtin (location_t loc, tree fndecl,
 	  SET_EXPR_LOCATION (stmt, loc);
 	  stmt = build1 (COMPOUND_LITERAL_EXPR, arg1_type, stmt);
 	}
-
-      innerptrtype = build_pointer_type (arg1_inner_type);
-
-      stmt = build_unary_op (loc, ADDR_EXPR, stmt, 0);
-      stmt = convert (innerptrtype, stmt);
-      stmt = build_binary_op (loc, PLUS_EXPR, stmt, arg2, 1);
-      stmt = build_indirect_ref (loc, stmt, RO_NULL);
-      stmt = build2 (MODIFY_EXPR, TREE_TYPE (stmt), stmt,
-		     convert (TREE_TYPE (stmt), arg0));
+      stmt = build_array_ref (loc, stmt, arg2);
+      stmt = fold_build2 (MODIFY_EXPR, TREE_TYPE (arg0), stmt, arg0);
       stmt = build2 (COMPOUND_EXPR, arg1_type, stmt, decl);
       return stmt;
     }
diff --git a/gcc/config/rs6000/rs6000-protos.h b/gcc/config/rs6000/rs6000-protos.h
index 6a0fbc3ba2e..9687767ec16 100644
--- a/gcc/config/rs6000/rs6000-protos.h
+++ b/gcc/config/rs6000/rs6000-protos.h
@@ -58,6 +58,7 @@ extern bool rs6000_split_128bit_ok_p (rtx []);
 extern void rs6000_expand_float128_convert (rtx, rtx, bool);
 extern void rs6000_expand_vector_init (rtx, rtx);
 extern void rs6000_expand_vector_set (rtx, rtx, rtx);
+extern void rs6000_expand_vector_set_var (rtx, rtx, rtx);
 extern void rs6000_expand_vector_extract (rtx, rtx, rtx);
 extern void rs6000_split_vec_extract_var (rtx, rtx, rtx, rtx, rtx);
 extern rtx rs6000_adjust_vec_address (rtx, rtx, rtx, rtx, machine_mode);
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index c46ec14f060..b53f946063b 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -6700,6 +6700,12 @@ rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx)
 
   if (VECTOR_MEM_VSX_P (mode))
     {
+      if (!CONST_INT_P (elt_rtx))
+	{
+	  rs6000_expand_vector_set_var (target, val, elt_rtx);
+	  return;
+	}
+
       rtx insn = NULL_RTX;
 
       if (mode == V2DFmode)
@@ -6790,6 +6796,50 @@ rs6000_expand_vector_set (rtx target, rtx val, rtx elt_rtx)
   emit_insn (gen_rtx_SET (target, x));
 }
 
+/* Insert VAL into IDX of TARGET, VAL size is same of the vector element, IDX
+   is variable and also counts by vector element size.  */
+
+void
+rs6000_expand_vector_set_var (rtx target, rtx val, rtx idx)
+{
+  machine_mode mode = GET_MODE (target);
+
+  gcc_assert (VECTOR_MEM_VSX_P (mode) && !CONST_INT_P (idx)
+	      && TARGET_DIRECT_MOVE_64BIT);
+
+  gcc_assert (GET_MODE (idx) == E_SImode);
+
+  machine_mode inner_mode = GET_MODE (val);
+
+  rtx tmp = gen_reg_rtx (GET_MODE (idx));
+  int width = GET_MODE_SIZE (inner_mode);
+
+  gcc_assert (width >= 1 && width <= 8);
+
+  /* Generate the IDX for permute shift, width is the vector element size.
+     idx = idx * width.  */
+  emit_insn (gen_mulsi3 (tmp, idx, GEN_INT (width)));
+
+  /*  lvsr    v1,0,idx.  */
+  rtx pcvr = gen_reg_rtx (V16QImode);
+  emit_insn (gen_altivec_lvsr_reg (pcvr, tmp));
+
+  /*  lvsl    v2,0,idx.  */
+  rtx pcvl = gen_reg_rtx (V16QImode);
+  emit_insn (gen_altivec_lvsl_reg (pcvl, tmp));
+
+  rtx sub_target = simplify_gen_subreg (V16QImode, target, mode, 0);
+
+  rtx perm;
+  perm = gen_altivec_vperm_v8hiv16qi (sub_target, sub_target, sub_target, pcvr);
+  emit_insn (perm);
+
+  rs6000_expand_vector_set (target, val, const0_rtx);
+
+  perm = gen_altivec_vperm_v8hiv16qi (sub_target, sub_target, sub_target, pcvl);
+  emit_insn (perm);
+}
+
 /* Extract field ELT from VEC into TARGET.  */
 
 void
diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251-run.c b/gcc/testsuite/gcc.target/powerpc/pr79251-run.c
new file mode 100644
index 00000000000..8f555d96622
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr79251-run.c
@@ -0,0 +1,29 @@
+/* { dg-do run { target { p9vector_hw } } } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9 -maltivec" } */
+
+#include <stddef.h>
+#include <altivec.h>
+#include "pr79251.h"
+
+TEST_VEC_INSERT_ALL (test)
+
+#define run_test(TYPE, num)                                                    \
+  {                                                                            \
+    vector TYPE v;                                                             \
+    vector TYPE u = {0x0};                                                     \
+    for (long k = 0; k < 16 / sizeof (TYPE); k++)                              \
+      v[k] = 0xaa;                                                             \
+    for (long k = 0; k < 16 / sizeof (TYPE); k++)                              \
+      {                                                                        \
+	u = test##num (v, 254, k);                                             \
+	if (u[k] != (TYPE) 254)                                                \
+	  __builtin_abort ();                                                  \
+      }                                                                        \
+  }
+
+int
+main (void)
+{
+  TEST_VEC_INSERT_ALL (run_test)
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251.c b/gcc/testsuite/gcc.target/powerpc/pr79251.c
new file mode 100644
index 00000000000..ec1cb255888
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr79251.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9 -maltivec" } */
+
+#include <stddef.h>
+#include <altivec.h>
+#include "pr79251.h"
+
+TEST_VEC_INSERT_ALL (test)
+
+/* { dg-final { scan-assembler-not {\mstxw\M} } } */
+/* { dg-final { scan-assembler-times {\mlvsl\M} 10 } } */
+/* { dg-final { scan-assembler-times {\mlvsr\M} 10 } } */
+/* { dg-final { scan-assembler-times {\mxxperm\M} 20 } } */
+/* { dg-final { scan-assembler-times {\mxxinsertw\M} 3 } } */
+/* { dg-final { scan-assembler-times {\mvinserth\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mvinsertb\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mxxpermdi\M} 3 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr79251.h b/gcc/testsuite/gcc.target/powerpc/pr79251.h
new file mode 100644
index 00000000000..addb067f9ed
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr79251.h
@@ -0,0 +1,19 @@
+
+#define test(TYPE, num)                                                        \
+  __attribute__ ((noinline, noclone))                                          \
+    vector TYPE test##num (vector TYPE v, TYPE i, signed int n)                \
+  {                                                                            \
+    return vec_insert (i, v, n);                                               \
+  }
+
+#define TEST_VEC_INSERT_ALL(T)                                                 \
+  T (char, 0)                                                                  \
+  T (unsigned char, 1)                                                         \
+  T (short, 2)                                                                 \
+  T (unsigned short, 3)                                                        \
+  T (int, 4)                                                                   \
+  T (unsigned int, 5)                                                          \
+  T (long long, 6)                                                             \
+  T (unsigned long long, 7)                                                    \
+  T (float, 8)                                                                 \
+  T (double, 9)
-- 
2.27.0.90.geebb51ba8c


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v3 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR
  2020-09-24  3:24       ` xionghu luo
@ 2020-09-24 12:39         ` Richard Sandiford
  2020-09-25  6:51           ` [PATCH v4 1/3] " xionghu luo
  0 siblings, 1 reply; 24+ messages in thread
From: Richard Sandiford @ 2020-09-24 12:39 UTC (permalink / raw)
  To: xionghu luo
  Cc: Richard Biener, GCC Patches, Segher Boessenkool, David Edelsohn,
	Bill Schmidt, Jiufu Guo, linkw

xionghu luo <luoxhu@linux.ibm.com> writes:
> @@ -2658,6 +2659,43 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  
>  #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn
>  
> +/* Expand VEC_SET internal functions.  */
> +
> +static void
> +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> +{
> +  tree lhs = gimple_call_lhs (stmt);
> +  tree op0 = gimple_call_arg (stmt, 0);
> +  tree op1 = gimple_call_arg (stmt, 1);
> +  tree op2 = gimple_call_arg (stmt, 2);
> +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> +  rtx src = expand_expr (op0, NULL_RTX, VOIDmode, EXPAND_WRITE);

I'm not sure about the expand_expr here.  ISTM that op0 is a normal
input and so should be expanded by expand_normal rather than
EXPAND_WRITE.  Also:

> +
> +  machine_mode outermode = TYPE_MODE (TREE_TYPE (op0));
> +  scalar_mode innermode = GET_MODE_INNER (outermode);
> +
> +  rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL);
> +  rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL);
> +
> +  class expand_operand ops[3];
> +  enum insn_code icode = optab_handler (optab, outermode);
> +
> +  if (icode != CODE_FOR_nothing)
> +    {
> +      pos = convert_to_mode (E_SImode, pos, 0);
> +
> +      create_fixed_operand (&ops[0], src);

...this would mean that if SRC happens to be a MEM, the pattern
must also accept a MEM.

ISTM that we're making more work for ourselves by not “fixing” the optab
to have a natural pure-input + pure-output interface. :-)  But if we
stick with the current optab interface, I think we need to:

- create a temporary register
- move SRC into the temporary register before the insn
- use create_fixed_operand with the temporary register for operand 0
- move the temporary register into TARGET after the insn

> +      create_input_operand (&ops[1], value, innermode);
> +      create_input_operand (&ops[2], pos, GET_MODE (pos));

For this I think we should use convert_operand_from on the original “pos”,
so that the target gets to choose what the mode of the operand is.

Thanks,
Richard

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]
  2020-09-24  8:21     ` xionghu luo
@ 2020-09-24 13:27       ` Richard Biener
  2020-09-24 14:55         ` Richard Biener
                           ` (2 more replies)
  0 siblings, 3 replies; 24+ messages in thread
From: Richard Biener @ 2020-09-24 13:27 UTC (permalink / raw)
  To: xionghu luo
  Cc: Segher Boessenkool, GCC Patches, David Edelsohn, Bill Schmidt,
	Jiufu Guo, linkw, Richard Sandiford

On Thu, Sep 24, 2020 at 10:21 AM xionghu luo <luoxhu@linux.ibm.com> wrote:
>
> Hi Segher,
>
> The attached two patches are updated and split from
>  "[PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]"
> as your comments.
>
>
> [PATCH v3 2/3] rs6000: Fix lvsl&lvsr mode and change rs6000_expand_vector_set param
>
> This one is preparation work of fix lvsl&lvsr arg mode and rs6000_expand_vector_set
> parameter support for both constant and variable index input.
>
>
> [PATCH v3 2/3] rs6000: Support variable insert and Expand vec_insert in expander [PR79251]
>
> This one is Building VIEW_CONVERT_EXPR and expand the IFN VEC_SET to fast.

I'll just comment that

        xxperm 34,34,33
        xxinsertw 34,0,12
        xxperm 34,34,32

doesn't look like a variable-position insert instruction but
this is a variable whole-vector rotate plus an insert at index zero
followed by a variable whole-vector rotate.  I'm not fluend in
ppc assembly but

        rlwinm 6,6,2,28,29
        mtvsrwz 0,5
        lvsr 1,0,6
        lvsl 0,0,6

possibly computes the shift masks for r33/r32?  though
I do not see those registers mentioned...

This might be a generic viable expansion strathegy btw,
which is why I asked before whether the CPU supports
inserts at a variable position ...  the building blocks are
already there with vec_set at constant zero position
plus vec_perm_const for the rotates.

But well, I did ask this question.  Multiple times.

ppc does _not_ have a VSX instruction
like xxinsertw r34, r8, r12 where r8 denotes
the vector element (or byte position or whatever).

So I don't think vec_set with a variable index is the
best approach.
Xionghu - you said even without the patch the stack
storage is eventually elided but

        addi 9,1,-16
        rldic 6,6,2,60
        stxv 34,-16(1)
        stwx 5,9,6
        lxv 34,-16(1)

still shows stack(?) store/load with a bad STLF penalty.

Richard.

>
> Thanks,
> Xionghu

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]
  2020-09-24 13:27       ` Richard Biener
@ 2020-09-24 14:55         ` Richard Biener
  2020-09-24 19:36           ` Segher Boessenkool
  2020-09-24 19:02         ` Segher Boessenkool
  2020-09-25  3:50         ` xionghu luo
  2 siblings, 1 reply; 24+ messages in thread
From: Richard Biener @ 2020-09-24 14:55 UTC (permalink / raw)
  To: xionghu luo
  Cc: Segher Boessenkool, GCC Patches, David Edelsohn, Bill Schmidt,
	Jiufu Guo, linkw, Richard Sandiford

On Thu, Sep 24, 2020 at 3:27 PM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Thu, Sep 24, 2020 at 10:21 AM xionghu luo <luoxhu@linux.ibm.com> wrote:
> >
> > Hi Segher,
> >
> > The attached two patches are updated and split from
> >  "[PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]"
> > as your comments.
> >
> >
> > [PATCH v3 2/3] rs6000: Fix lvsl&lvsr mode and change rs6000_expand_vector_set param
> >
> > This one is preparation work of fix lvsl&lvsr arg mode and rs6000_expand_vector_set
> > parameter support for both constant and variable index input.
> >
> >
> > [PATCH v3 2/3] rs6000: Support variable insert and Expand vec_insert in expander [PR79251]
> >
> > This one is Building VIEW_CONVERT_EXPR and expand the IFN VEC_SET to fast.
>
> I'll just comment that
>
>         xxperm 34,34,33
>         xxinsertw 34,0,12
>         xxperm 34,34,32

Btw, on x86_64 the following produces sth reasonable:

#define N 32
typedef int T;
typedef T V __attribute__((vector_size(N)));
V setg (V v, int idx, T val)
{
  V valv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
  V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == valv);
  v = (v & ~mask) | (valv & mask);
  return v;
}

        vmovd   %edi, %xmm1
        vpbroadcastd    %xmm1, %ymm1
        vpcmpeqd        .LC0(%rip), %ymm1, %ymm2
        vpblendvb       %ymm2, %ymm1, %ymm0, %ymm0
        ret

I'm quite sure you could do sth similar on power?

> doesn't look like a variable-position insert instruction but
> this is a variable whole-vector rotate plus an insert at index zero
> followed by a variable whole-vector rotate.  I'm not fluend in
> ppc assembly but
>
>         rlwinm 6,6,2,28,29
>         mtvsrwz 0,5
>         lvsr 1,0,6
>         lvsl 0,0,6
>
> possibly computes the shift masks for r33/r32?  though
> I do not see those registers mentioned...
>
> This might be a generic viable expansion strathegy btw,
> which is why I asked before whether the CPU supports
> inserts at a variable position ...  the building blocks are
> already there with vec_set at constant zero position
> plus vec_perm_const for the rotates.
>
> But well, I did ask this question.  Multiple times.
>
> ppc does _not_ have a VSX instruction
> like xxinsertw r34, r8, r12 where r8 denotes
> the vector element (or byte position or whatever).
>
> So I don't think vec_set with a variable index is the
> best approach.
> Xionghu - you said even without the patch the stack
> storage is eventually elided but
>
>         addi 9,1,-16
>         rldic 6,6,2,60
>         stxv 34,-16(1)
>         stwx 5,9,6
>         lxv 34,-16(1)
>
> still shows stack(?) store/load with a bad STLF penalty.
>
> Richard.
>
> >
> > Thanks,
> > Xionghu

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]
  2020-09-24 13:27       ` Richard Biener
  2020-09-24 14:55         ` Richard Biener
@ 2020-09-24 19:02         ` Segher Boessenkool
  2020-09-25  3:50         ` xionghu luo
  2 siblings, 0 replies; 24+ messages in thread
From: Segher Boessenkool @ 2020-09-24 19:02 UTC (permalink / raw)
  To: Richard Biener
  Cc: xionghu luo, GCC Patches, David Edelsohn, Bill Schmidt,
	Jiufu Guo, linkw, Richard Sandiford

Hi!

On Thu, Sep 24, 2020 at 03:27:48PM +0200, Richard Biener wrote:
> On Thu, Sep 24, 2020 at 10:21 AM xionghu luo <luoxhu@linux.ibm.com> wrote:
> I'll just comment that
> 
>         xxperm 34,34,33
>         xxinsertw 34,0,12
>         xxperm 34,34,32
> 
> doesn't look like a variable-position insert instruction but
> this is a variable whole-vector rotate plus an insert at index zero
> followed by a variable whole-vector rotate.  I'm not fluend in
> ppc assembly but
> 
>         rlwinm 6,6,2,28,29
>         mtvsrwz 0,5
>         lvsr 1,0,6
>         lvsl 0,0,6
> 
> possibly computes the shift masks for r33/r32?  though
> I do not see those registers mentioned...

v0/v1 (what the lvs[lr] write to) are the same as vs32/vs33.

The low half of the VSRs (vector-scalar registers) are the FP registers
(expanded to 16B each), and the high half are the original VRs (vector
registers).  AltiVec insns (like lvsl, lvsr) naturally only work on VRs,
as do some newer insns for which there wasn't enough budget in the
opcode space to have for VSRs (which take 6 bits each, while VRs take
only 5, just like FPRs and GPRs).

> This might be a generic viable expansion strathegy btw,
> which is why I asked before whether the CPU supports
> inserts at a variable position ...

ISA 3.1 (Power10) supports variable position inserts.  Power9 supports
fixed position inserts.  Older CPUs can of course construct it some
other way.

> ppc does _not_ have a VSX instruction
> like xxinsertw r34, r8, r12 where r8 denotes
> the vector element (or byte position or whatever).

vins[bhwd][v][lr]x does this.  Those are Power10 instructions.


Segher

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]
  2020-09-24 14:55         ` Richard Biener
@ 2020-09-24 19:36           ` Segher Boessenkool
  2020-09-25  6:58             ` Richard Biener
  0 siblings, 1 reply; 24+ messages in thread
From: Segher Boessenkool @ 2020-09-24 19:36 UTC (permalink / raw)
  To: Richard Biener
  Cc: xionghu luo, GCC Patches, David Edelsohn, Bill Schmidt,
	Jiufu Guo, linkw, Richard Sandiford

Hi!

On Thu, Sep 24, 2020 at 04:55:21PM +0200, Richard Biener wrote:
> Btw, on x86_64 the following produces sth reasonable:
> 
> #define N 32
> typedef int T;
> typedef T V __attribute__((vector_size(N)));
> V setg (V v, int idx, T val)
> {
>   V valv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
>   V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == valv);
>   v = (v & ~mask) | (valv & mask);
>   return v;
> }
> 
>         vmovd   %edi, %xmm1
>         vpbroadcastd    %xmm1, %ymm1
>         vpcmpeqd        .LC0(%rip), %ymm1, %ymm2
>         vpblendvb       %ymm2, %ymm1, %ymm0, %ymm0
>         ret
> 
> I'm quite sure you could do sth similar on power?

This only allows inserting aligned elements.  Which is probably fine
of course (we don't allow elements that straddle vector boundaries
either, anyway).

And yes, we can do that :-)

That should be
  #define N 32
  typedef int T;
  typedef T V __attribute__((vector_size(N)));
  V setg (V v, int idx, T val)
  {
    V valv = (V){val, val, val, val, val, val, val, val};
    V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
    V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
    v = (v & ~mask) | (valv & mask);
    return v;
  }

after which I get (-march=znver2)

setg:
        vmovd   %edi, %xmm1
        vmovd   %esi, %xmm2
        vpbroadcastd    %xmm1, %ymm1
        vpbroadcastd    %xmm2, %ymm2
        vpcmpeqd        .LC0(%rip), %ymm1, %ymm1
        vpandn  %ymm0, %ymm1, %ymm0
        vpand   %ymm2, %ymm1, %ymm1
        vpor    %ymm0, %ymm1, %ymm0
        ret

.LC0:
        .long   0
        .long   1
        .long   2
        .long   3
        .long   4
        .long   5
        .long   6
        .long   7

and for powerpc (changing it to 16B vectors, -mcpu=power9) it is

setg:
        addis 9,2,.LC0@toc@ha
        mtvsrws 32,5
        mtvsrws 33,6
        addi 9,9,.LC0@toc@l
        lxv 45,0(9)
        vcmpequw 0,0,13
        xxsel 34,34,33,32
        blr

.LC0:
        .long   0
        .long   1
        .long   2
        .long   3

(We can generate that 0..3 vector without doing loads; I guess x86 can
do that as well?  But it takes more than one insn to do (of course we
have to set up the memory address first *with* the load, heh).)

For power8 it becomes (we need to splat in separate insns):

setg:
        addis 9,2,.LC0@toc@ha
        mtvsrwz 32,5
        mtvsrwz 33,6
        addi 9,9,.LC0@toc@l
        lxvw4x 45,0,9
        xxspltw 32,32,1
        xxspltw 33,33,1
        vcmpequw 0,0,13
        xxsel 34,34,33,32
        blr


Segher

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]
  2020-09-24 13:27       ` Richard Biener
  2020-09-24 14:55         ` Richard Biener
  2020-09-24 19:02         ` Segher Boessenkool
@ 2020-09-25  3:50         ` xionghu luo
  2020-09-25  5:00           ` Richard Biener
  2 siblings, 1 reply; 24+ messages in thread
From: xionghu luo @ 2020-09-25  3:50 UTC (permalink / raw)
  To: Richard Biener
  Cc: Segher Boessenkool, GCC Patches, David Edelsohn, Bill Schmidt,
	Jiufu Guo, linkw, Richard Sandiford

Hi,

On 2020/9/24 21:27, Richard Biener wrote:
> On Thu, Sep 24, 2020 at 10:21 AM xionghu luo <luoxhu@linux.ibm.com> wrote:
> 
> I'll just comment that
> 
>          xxperm 34,34,33
>          xxinsertw 34,0,12
>          xxperm 34,34,32
> 
> doesn't look like a variable-position insert instruction but
> this is a variable whole-vector rotate plus an insert at index zero
> followed by a variable whole-vector rotate.  I'm not fluend in
> ppc assembly but
> 
>          rlwinm 6,6,2,28,29
>          mtvsrwz 0,5
>          lvsr 1,0,6
>          lvsl 0,0,6
> 
> possibly computes the shift masks for r33/r32?  though
> I do not see those registers mentioned...

For V4SI:
       rlwinm 6,6,2,28,29      // r6*4
       mtvsrwz 0,5             // vs0   <- r5  (0xfe)
       lvsr 1,0,6              // vs33  <- lvsr[r6]
       lvsl 0,0,6              // vs32  <- lvsl[r6] 
       xxperm 34,34,33       
       xxinsertw 34,0,12
       xxperm 34,34,32
       blr


idx = idx * 4; 
0    0      0x4000000030000000200000001   xxperm:0x4000000030000000200000001   vs33:0x101112131415161718191a1b1c1d1e1f  vs32:0x102030405060708090a0b0c0d0e0f
1    4      0x4000000030000000200000001   xxperm:0x1000000040000000300000002   vs33:0xc0d0e0f101112131415161718191a1b   vs32:0x405060708090a0b0c0d0e0f10111213
2    8      0x4000000030000000200000001   xxperm:0x2000000010000000400000003   vs33:0x8090a0b0c0d0e0f1011121314151617   vs32:0x8090a0b0c0d0e0f1011121314151617
3    12     0x4000000030000000200000001   xxperm:0x3000000020000000100000004   vs33:0x405060708090a0b0c0d0e0f10111213   vs32:0xc0d0e0f101112131415161718191a1b

vs34:
 0x40000000300000002000000fe
 0x400000003000000fe00000001
 0x4000000fe0000000200000001
0xfe000000030000000200000001


"xxinsertw 34,0,12" will always insert vs0[32:63] content to the forth word of
target vector, bits[96:127].  Then the second xxperm rotate the modified vector
back. 

All the instructions are register based operation, as Segher replied, power9
supports only fixed position inserts, so we need do some trick here to support
it instead of generate short store wide load instructions.


> 
> This might be a generic viable expansion strathegy btw,
> which is why I asked before whether the CPU supports
> inserts at a variable position ...  the building blocks are
> already there with vec_set at constant zero position
> plus vec_perm_const for the rotates.
> 
> But well, I did ask this question.  Multiple times.
> 
> ppc does _not_ have a VSX instruction
> like xxinsertw r34, r8, r12 where r8 denotes
> the vector element (or byte position or whatever).
> 
> So I don't think vec_set with a variable index is the
> best approach.
> Xionghu - you said even without the patch the stack
> storage is eventually elided but
> 
>          addi 9,1,-16
>          rldic 6,6,2,60
>          stxv 34,-16(1)
>          stwx 5,9,6
>          lxv 34,-16(1)
> 
> still shows stack(?) store/load with a bad STLF penalty.


Sorry that if I didn't describe clearly and misunderstood you, I mean if insert many
instructions(tested with a loop inserted) between "stwx 5,9,6" and "lxv 34,-16(1)",
the store hit load performance issue could be elided, but this is not the solution
we want.

I also changed your test as below and build for X86, seems it also generates
inefficient code?  What my patch does maybe different usage from your pasted
case? 

#define N 32
typedef int T;
typedef T V __attribute__((vector_size(N)));
  V setg3 (V v, int idx, T val)
{
    v[idx&31] = val;
    return v;
}

-O2 -S -mavx -march=znver2:

setg3:
        push    rbp
        and     edi, 31
        mov     rbp, rsp
        and     rsp, -32
        vmovdqa YMMWORD PTR [rsp-32], ymm0
        mov     DWORD PTR [rsp-32+rdi*4], esi
        vmovdqa ymm0, YMMWORD PTR [rsp-32]
        leave
        ret


While idx is constant: 

setg3:
        vpinsrd xmm1, xmm0, esi, 3
        vinserti128     ymm0, ymm0, xmm1, 0x0
        ret

And ARM with -O2 -S -march=armv8.2-a+sve (N change to 16): 

setg3:
        sub     sp, sp, #16
        and     x0, x0, 15
        str     q0, [sp]
        str     w1, [sp, x0, lsl 2]
        ldr     q0, [sp]
        add     sp, sp, 16
        ret

While idx is constant: 

setg3:
        ins     v0.s[3], w1
        ret


Though I've no idea how to optimize this on X86 and ARM with vector instructions
to avoid short store with wide load followed on stack.


Thanks,
Xionghu

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]
  2020-09-25  3:50         ` xionghu luo
@ 2020-09-25  5:00           ` Richard Biener
  0 siblings, 0 replies; 24+ messages in thread
From: Richard Biener @ 2020-09-25  5:00 UTC (permalink / raw)
  To: xionghu luo
  Cc: Segher Boessenkool, GCC Patches, David Edelsohn, Bill Schmidt,
	Jiufu Guo, linkw, Richard Sandiford

On September 25, 2020 5:50:40 AM GMT+02:00, xionghu luo <luoxhu@linux.ibm.com> wrote:
>Hi,
>
>On 2020/9/24 21:27, Richard Biener wrote:
>> On Thu, Sep 24, 2020 at 10:21 AM xionghu luo <luoxhu@linux.ibm.com>
>wrote:
>> 
>> I'll just comment that
>> 
>>          xxperm 34,34,33
>>          xxinsertw 34,0,12
>>          xxperm 34,34,32
>> 
>> doesn't look like a variable-position insert instruction but
>> this is a variable whole-vector rotate plus an insert at index zero
>> followed by a variable whole-vector rotate.  I'm not fluend in
>> ppc assembly but
>> 
>>          rlwinm 6,6,2,28,29
>>          mtvsrwz 0,5
>>          lvsr 1,0,6
>>          lvsl 0,0,6
>> 
>> possibly computes the shift masks for r33/r32?  though
>> I do not see those registers mentioned...
>
>For V4SI:
>       rlwinm 6,6,2,28,29      // r6*4
>       mtvsrwz 0,5             // vs0   <- r5  (0xfe)
>       lvsr 1,0,6              // vs33  <- lvsr[r6]
>       lvsl 0,0,6              // vs32  <- lvsl[r6] 
>       xxperm 34,34,33       
>       xxinsertw 34,0,12
>       xxperm 34,34,32
>       blr
>
>
>idx = idx * 4; 
>0    0      0x4000000030000000200000001  
>xxperm:0x4000000030000000200000001  
>vs33:0x101112131415161718191a1b1c1d1e1f 
>vs32:0x102030405060708090a0b0c0d0e0f
>1    4      0x4000000030000000200000001  
>xxperm:0x1000000040000000300000002  
>vs33:0xc0d0e0f101112131415161718191a1b  
>vs32:0x405060708090a0b0c0d0e0f10111213
>2    8      0x4000000030000000200000001  
>xxperm:0x2000000010000000400000003  
>vs33:0x8090a0b0c0d0e0f1011121314151617  
>vs32:0x8090a0b0c0d0e0f1011121314151617
>3    12     0x4000000030000000200000001  
>xxperm:0x3000000020000000100000004  
>vs33:0x405060708090a0b0c0d0e0f10111213  
>vs32:0xc0d0e0f101112131415161718191a1b
>
>vs34:
> 0x40000000300000002000000fe
> 0x400000003000000fe00000001
> 0x4000000fe0000000200000001
>0xfe000000030000000200000001
>
>
>"xxinsertw 34,0,12" will always insert vs0[32:63] content to the forth
>word of
>target vector, bits[96:127].  Then the second xxperm rotate the
>modified vector
>back. 
>
>All the instructions are register based operation, as Segher replied,
>power9
>supports only fixed position inserts, so we need do some trick here to
>support
>it instead of generate short store wide load instructions.

OK, I fair enough - I've heard power 10 does support those inserts. 

>
>> 
>> This might be a generic viable expansion strathegy btw,
>> which is why I asked before whether the CPU supports
>> inserts at a variable position ...  the building blocks are
>> already there with vec_set at constant zero position
>> plus vec_perm_const for the rotates.
>> 
>> But well, I did ask this question.  Multiple times.
>> 
>> ppc does _not_ have a VSX instruction
>> like xxinsertw r34, r8, r12 where r8 denotes
>> the vector element (or byte position or whatever).
>> 
>> So I don't think vec_set with a variable index is the
>> best approach.
>> Xionghu - you said even without the patch the stack
>> storage is eventually elided but
>> 
>>          addi 9,1,-16
>>          rldic 6,6,2,60
>>          stxv 34,-16(1)
>>          stwx 5,9,6
>>          lxv 34,-16(1)
>> 
>> still shows stack(?) store/load with a bad STLF penalty.
>
>
>Sorry that if I didn't describe clearly and misunderstood you, I mean
>if insert many
>instructions(tested with a loop inserted) between "stwx 5,9,6" and "lxv
>34,-16(1)",
>the store hit load performance issue could be elided, but this is not
>the solution
>we want.
>
>I also changed your test as below and build for X86, seems it also
>generates
>inefficient code?  What my patch does maybe different usage from your
>pasted
>case? 
>
>#define N 32
>typedef int T;
>typedef T V __attribute__((vector_size(N)));
>  V setg3 (V v, int idx, T val)
>{
>    v[idx&31] = val;
>    return v;
>}
>
>-O2 -S -mavx -march=znver2:
>
>setg3:
>        push    rbp
>        and     edi, 31
>        mov     rbp, rsp
>        and     rsp, -32
>        vmovdqa YMMWORD PTR [rsp-32], ymm0
>        mov     DWORD PTR [rsp-32+rdi*4], esi
>        vmovdqa ymm0, YMMWORD PTR [rsp-32]
>        leave
>        ret
>
>
>While idx is constant: 
>
>setg3:
>        vpinsrd xmm1, xmm0, esi, 3
>        vinserti128     ymm0, ymm0, xmm1, 0x0
>        ret
>
>And ARM with -O2 -S -march=armv8.2-a+sve (N change to 16): 
>
>setg3:
>        sub     sp, sp, #16
>        and     x0, x0, 15
>        str     q0, [sp]
>        str     w1, [sp, x0, lsl 2]
>        ldr     q0, [sp]
>        add     sp, sp, 16
>        ret
>
>While idx is constant: 
>
>setg3:
>        ins     v0.s[3], w1
>        ret
>
>
>Though I've no idea how to optimize this on X86 and ARM with vector
>instructions
>to avoid short store with wide load followed on stack.

Yes, all targets suffer from this. On x86 you can play similar tricks as on power9, 
See my attempt in the other mail (and Seghers fix). 

Richard. 

>
>Thanks,
>Xionghu


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH v4 1/3] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR
  2020-09-24 12:39         ` Richard Sandiford
@ 2020-09-25  6:51           ` xionghu luo
  2020-09-25 11:32             ` Richard Biener
  2020-09-25 13:28             ` Richard Sandiford
  0 siblings, 2 replies; 24+ messages in thread
From: xionghu luo @ 2020-09-25  6:51 UTC (permalink / raw)
  To: Richard Biener, GCC Patches, Segher Boessenkool, David Edelsohn,
	Bill Schmidt, Jiufu Guo, linkw, richard.sandiford

Hi,

On 2020/9/24 20:39, Richard Sandiford wrote:
> xionghu luo <luoxhu@linux.ibm.com> writes:
>> @@ -2658,6 +2659,43 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>>   
>>   #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn
>>   
>> +/* Expand VEC_SET internal functions.  */
>> +
>> +static void
>> +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>> +{
>> +  tree lhs = gimple_call_lhs (stmt);
>> +  tree op0 = gimple_call_arg (stmt, 0);
>> +  tree op1 = gimple_call_arg (stmt, 1);
>> +  tree op2 = gimple_call_arg (stmt, 2);
>> +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
>> +  rtx src = expand_expr (op0, NULL_RTX, VOIDmode, EXPAND_WRITE);
> 
> I'm not sure about the expand_expr here.  ISTM that op0 is a normal
> input and so should be expanded by expand_normal rather than
> EXPAND_WRITE.  Also:
> 
>> +
>> +  machine_mode outermode = TYPE_MODE (TREE_TYPE (op0));
>> +  scalar_mode innermode = GET_MODE_INNER (outermode);
>> +
>> +  rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL);
>> +  rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL);
>> +
>> +  class expand_operand ops[3];
>> +  enum insn_code icode = optab_handler (optab, outermode);
>> +
>> +  if (icode != CODE_FOR_nothing)
>> +    {
>> +      pos = convert_to_mode (E_SImode, pos, 0);
>> +
>> +      create_fixed_operand (&ops[0], src);
> 
> ...this would mean that if SRC happens to be a MEM, the pattern
> must also accept a MEM.
> 
> ISTM that we're making more work for ourselves by not “fixing” the optab
> to have a natural pure-input + pure-output interface. :-)  But if we
> stick with the current optab interface, I think we need to:
> 
> - create a temporary register
> - move SRC into the temporary register before the insn
> - use create_fixed_operand with the temporary register for operand 0
> - move the temporary register into TARGET after the insn
> 
>> +      create_input_operand (&ops[1], value, innermode);
>> +      create_input_operand (&ops[2], pos, GET_MODE (pos));
> 
> For this I think we should use convert_operand_from on the original “pos”,
> so that the target gets to choose what the mode of the operand is.
> 

Thanks a lot for the nice suggestions, fixed them all and updated the patch as below.


[PATCH v4 1/3] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR

This patch enables transformation from ARRAY_REF(VIEW_CONVERT_EXPR) to
VEC_SET internal function in gimple-isel pass if target supports
vec_set with variable index by checking can_vec_set_var_idx_p.

gcc/ChangeLog:

2020-09-25  Xionghu Luo  <luoxhu@linux.ibm.com>

	* gimple-isel.cc (gimple_expand_vec_set_expr): New function.
	(gimple_expand_vec_cond_exprs): Rename to ...
	(gimple_expand_vec_exprs): ... this and call
	gimple_expand_vec_set_expr.
	* internal-fn.c (vec_set_direct): New define.
	(expand_vec_set_optab_fn): New function.
	(direct_vec_set_optab_supported_p): New define.
	* internal-fn.def (VEC_SET): New DEF_INTERNAL_OPTAB_FN.
	* optabs.c (can_vec_set_var_idx_p): New function.
	* optabs.h (can_vec_set_var_idx_p): New declaration.
---
 gcc/gimple-isel.cc  | 75 +++++++++++++++++++++++++++++++++++++++++++--
 gcc/internal-fn.c   | 41 +++++++++++++++++++++++++
 gcc/internal-fn.def |  2 ++
 gcc/optabs.c        | 21 +++++++++++++
 gcc/optabs.h        |  4 +++
 5 files changed, 141 insertions(+), 2 deletions(-)

diff --git a/gcc/gimple-isel.cc b/gcc/gimple-isel.cc
index b330cf4c20e..02513e04900 100644
--- a/gcc/gimple-isel.cc
+++ b/gcc/gimple-isel.cc
@@ -35,6 +35,74 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-cfg.h"
 #include "bitmap.h"
 #include "tree-ssa-dce.h"
+#include "memmodel.h"
+#include "optabs.h"
+
+/* Expand all ARRAY_REF(VIEW_CONVERT_EXPR) gimple assignments into calls to
+   internal function based on vector type of selected expansion.
+   i.e.:
+     VIEW_CONVERT_EXPR<int[4]>(u)[_1] =  = i_4(D);
+   =>
+     _7 = u;
+     _8 = .VEC_SET (_7, i_4(D), _1);
+     u = _8;  */
+
+static gimple *
+gimple_expand_vec_set_expr (gimple_stmt_iterator *gsi)
+{
+  enum tree_code code;
+  gcall *new_stmt = NULL;
+  gassign *ass_stmt = NULL;
+
+  /* Only consider code == GIMPLE_ASSIGN.  */
+  gassign *stmt = dyn_cast<gassign *> (gsi_stmt (*gsi));
+  if (!stmt)
+    return NULL;
+
+  tree lhs = gimple_assign_lhs (stmt);
+  code = TREE_CODE (lhs);
+  if (code != ARRAY_REF)
+    return NULL;
+
+  tree val = gimple_assign_rhs1 (stmt);
+  tree op0 = TREE_OPERAND (lhs, 0);
+  if (TREE_CODE (op0) == VIEW_CONVERT_EXPR && DECL_P (TREE_OPERAND (op0, 0))
+      && VECTOR_TYPE_P (TREE_TYPE (TREE_OPERAND (op0, 0)))
+      && TYPE_MODE (TREE_TYPE (lhs))
+	   == TYPE_MODE (TREE_TYPE (TREE_TYPE (TREE_OPERAND (op0, 0)))))
+    {
+      tree pos = TREE_OPERAND (lhs, 1);
+      tree view_op0 = TREE_OPERAND (op0, 0);
+      machine_mode outermode = TYPE_MODE (TREE_TYPE (view_op0));
+      if (auto_var_in_fn_p (view_op0, cfun->decl)
+	  && !TREE_ADDRESSABLE (view_op0) && can_vec_set_var_idx_p (outermode))
+	{
+	  location_t loc = gimple_location (stmt);
+	  tree var_src = make_ssa_name (TREE_TYPE (view_op0));
+	  tree var_dst = make_ssa_name (TREE_TYPE (view_op0));
+
+	  ass_stmt = gimple_build_assign (var_src, view_op0);
+	  gimple_set_vuse (ass_stmt, gimple_vuse (stmt));
+	  gimple_set_location (ass_stmt, loc);
+	  gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT);
+
+	  new_stmt
+	    = gimple_build_call_internal (IFN_VEC_SET, 3, var_src, val, pos);
+	  gimple_call_set_lhs (new_stmt, var_dst);
+	  gimple_set_location (new_stmt, loc);
+	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
+
+	  ass_stmt = gimple_build_assign (view_op0, var_dst);
+	  gimple_set_location (ass_stmt, loc);
+	  gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT);
+
+	  gimple_move_vops (ass_stmt, stmt);
+	  gsi_remove (gsi, true);
+	}
+    }
+
+  return ass_stmt;
+}
 
 /* Expand all VEC_COND_EXPR gimple assignments into calls to internal
    function based on type of selected expansion.  */
@@ -176,7 +244,7 @@ gimple_expand_vec_cond_expr (gimple_stmt_iterator *gsi,
    VEC_COND_EXPR assignments.  */
 
 static unsigned int
-gimple_expand_vec_cond_exprs (void)
+gimple_expand_vec_exprs (void)
 {
   gimple_stmt_iterator gsi;
   basic_block bb;
@@ -189,12 +257,15 @@ gimple_expand_vec_cond_exprs (void)
 	{
 	  gimple *g = gimple_expand_vec_cond_expr (&gsi,
 						   &vec_cond_ssa_name_uses);
+
 	  if (g != NULL)
 	    {
 	      tree lhs = gimple_assign_lhs (gsi_stmt (gsi));
 	      gimple_set_lhs (g, lhs);
 	      gsi_replace (&gsi, g, false);
 	    }
+
+	  gimple_expand_vec_set_expr (&gsi);
 	}
     }
 
@@ -237,7 +308,7 @@ public:
 
   virtual unsigned int execute (function *)
     {
-      return gimple_expand_vec_cond_exprs ();
+      return gimple_expand_vec_exprs ();
     }
 
 }; // class pass_gimple_isel
diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index 8efc77d986b..20c1d31fb73 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -115,6 +115,7 @@ init_internal_fns ()
 #define vec_condeq_direct { 0, 0, false }
 #define scatter_store_direct { 3, 1, false }
 #define len_store_direct { 3, 3, false }
+#define vec_set_direct { 3, 3, false }
 #define unary_direct { 0, 0, true }
 #define binary_direct { 0, 0, true }
 #define ternary_direct { 0, 0, true }
@@ -2658,6 +2659,45 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 
 #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn
 
+/* Expand VEC_SET internal functions.  */
+
+static void
+expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
+{
+  tree lhs = gimple_call_lhs (stmt);
+  tree op0 = gimple_call_arg (stmt, 0);
+  tree op1 = gimple_call_arg (stmt, 1);
+  tree op2 = gimple_call_arg (stmt, 2);
+  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+  rtx src = expand_normal (op0);
+
+  machine_mode outermode = TYPE_MODE (TREE_TYPE (op0));
+  scalar_mode innermode = GET_MODE_INNER (outermode);
+
+  rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+  rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL);
+
+  class expand_operand ops[3];
+  enum insn_code icode = optab_handler (optab, outermode);
+
+  if (icode != CODE_FOR_nothing)
+    {
+      rtx temp = gen_reg_rtx (outermode);
+      emit_move_insn (temp, src);
+
+      create_fixed_operand (&ops[0], temp);
+      create_input_operand (&ops[1], value, innermode);
+      create_convert_operand_from (&ops[2], pos, TYPE_MODE (TREE_TYPE (op2)),
+				   true);
+      if (maybe_expand_insn (icode, 3, ops))
+	{
+	  emit_move_insn (target, temp);
+	  return;
+	}
+    }
+  gcc_unreachable ();
+}
+
 static void
 expand_ABNORMAL_DISPATCHER (internal_fn, gcall *)
 {
@@ -3253,6 +3293,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
 #define direct_fold_left_optab_supported_p direct_optab_supported_p
 #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
 #define direct_check_ptrs_optab_supported_p direct_optab_supported_p
+#define direct_vec_set_optab_supported_p direct_optab_supported_p
 
 /* Return the optab used by internal function FN.  */
 
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index 13e60828fcf..e6cfe1b6159 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -145,6 +145,8 @@ DEF_INTERNAL_OPTAB_FN (VCONDU, 0, vcondu, vec_condu)
 DEF_INTERNAL_OPTAB_FN (VCONDEQ, 0, vcondeq, vec_condeq)
 DEF_INTERNAL_OPTAB_FN (VCOND_MASK, 0, vcond_mask, vec_cond_mask)
 
+DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
+
 DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
 
 DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
diff --git a/gcc/optabs.c b/gcc/optabs.c
index 184827fdf4e..8e844028d92 100644
--- a/gcc/optabs.c
+++ b/gcc/optabs.c
@@ -3841,6 +3841,27 @@ can_vcond_compare_p (enum rtx_code code, machine_mode value_mode,
 	 && insn_operand_matches (icode, 3, test);
 }
 
+/* Return whether the backend can emit vector set instructions for inserting
+   element into vector at variable index position.  */
+
+bool
+can_vec_set_var_idx_p (machine_mode vec_mode)
+{
+  if (!VECTOR_MODE_P (vec_mode))
+    return false;
+
+  machine_mode inner_mode = GET_MODE_INNER (vec_mode);
+  rtx reg1 = alloca_raw_REG (vec_mode, LAST_VIRTUAL_REGISTER + 1);
+  rtx reg2 = alloca_raw_REG (inner_mode, LAST_VIRTUAL_REGISTER + 2);
+  rtx reg3 = alloca_raw_REG (VOIDmode, LAST_VIRTUAL_REGISTER + 3);
+
+  enum insn_code icode = optab_handler (vec_set_optab, vec_mode);
+
+  return icode != CODE_FOR_nothing && insn_operand_matches (icode, 0, reg1)
+	 && insn_operand_matches (icode, 1, reg2)
+	 && insn_operand_matches (icode, 2, reg3);
+}
+
 /* This function is called when we are going to emit a compare instruction that
    compares the values found in X and Y, using the rtl operator COMPARISON.
 
diff --git a/gcc/optabs.h b/gcc/optabs.h
index 7c2ec257cb0..0b14700ab3d 100644
--- a/gcc/optabs.h
+++ b/gcc/optabs.h
@@ -249,6 +249,10 @@ extern int can_compare_p (enum rtx_code, machine_mode,
    VALUE_MODE.  */
 extern bool can_vcond_compare_p (enum rtx_code, machine_mode, machine_mode);
 
+/* Return whether the backend can emit vector set instructions for inserting
+   element into vector at variable index position.  */
+extern bool can_vec_set_var_idx_p (machine_mode);
+
 extern rtx prepare_operand (enum insn_code, rtx, int, machine_mode,
 			    machine_mode, int);
 /* Emit a pair of rtl insns to compare two rtx's and to jump
-- 
2.27.0.90.geebb51ba8c


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]
  2020-09-24 19:36           ` Segher Boessenkool
@ 2020-09-25  6:58             ` Richard Biener
  2020-09-25 12:21               ` Richard Sandiford
  2020-09-25 22:39               ` Segher Boessenkool
  0 siblings, 2 replies; 24+ messages in thread
From: Richard Biener @ 2020-09-25  6:58 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: xionghu luo, GCC Patches, David Edelsohn, Bill Schmidt,
	Jiufu Guo, linkw, Richard Sandiford

On Thu, Sep 24, 2020 at 9:38 PM Segher Boessenkool
<segher@kernel.crashing.org> wrote:
>
> Hi!
>
> On Thu, Sep 24, 2020 at 04:55:21PM +0200, Richard Biener wrote:
> > Btw, on x86_64 the following produces sth reasonable:
> >
> > #define N 32
> > typedef int T;
> > typedef T V __attribute__((vector_size(N)));
> > V setg (V v, int idx, T val)
> > {
> >   V valv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
> >   V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == valv);
> >   v = (v & ~mask) | (valv & mask);
> >   return v;
> > }
> >
> >         vmovd   %edi, %xmm1
> >         vpbroadcastd    %xmm1, %ymm1
> >         vpcmpeqd        .LC0(%rip), %ymm1, %ymm2
> >         vpblendvb       %ymm2, %ymm1, %ymm0, %ymm0
> >         ret
> >
> > I'm quite sure you could do sth similar on power?
>
> This only allows inserting aligned elements.  Which is probably fine
> of course (we don't allow elements that straddle vector boundaries
> either, anyway).
>
> And yes, we can do that :-)
>
> That should be
>   #define N 32
>   typedef int T;
>   typedef T V __attribute__((vector_size(N)));
>   V setg (V v, int idx, T val)
>   {
>     V valv = (V){val, val, val, val, val, val, val, val};
>     V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
>     V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
>     v = (v & ~mask) | (valv & mask);
>     return v;
>   }

Whoops yeah, simplified it a bit too much ;)

> after which I get (-march=znver2)
>
> setg:
>         vmovd   %edi, %xmm1
>         vmovd   %esi, %xmm2
>         vpbroadcastd    %xmm1, %ymm1
>         vpbroadcastd    %xmm2, %ymm2
>         vpcmpeqd        .LC0(%rip), %ymm1, %ymm1
>         vpandn  %ymm0, %ymm1, %ymm0
>         vpand   %ymm2, %ymm1, %ymm1
>         vpor    %ymm0, %ymm1, %ymm0
>         ret

I get with -march=znver2 -O2

        vmovd   %edi, %xmm1
        vmovd   %esi, %xmm2
        vpbroadcastd    %xmm1, %ymm1
        vpbroadcastd    %xmm2, %ymm2
        vpcmpeqd        .LC0(%rip), %ymm1, %ymm1
        vpblendvb       %ymm1, %ymm2, %ymm0, %ymm0

and with -mavx512vl

        vpbroadcastd    %edi, %ymm1
        vpcmpd  $0, .LC0(%rip), %ymm1, %k1
        vpbroadcastd    %esi, %ymm0{%k1}

broadcast-with-mask - heh, would be interesting if we manage
to combine v[idx1] = val; v[idx2] = val; ;)

Now, with SSE4.2 the 16byte case compiles to

setg:
.LFB0:
        .cfi_startproc
        movd    %edi, %xmm3
        movdqa  %xmm0, %xmm1
        movd    %esi, %xmm4
        pshufd  $0, %xmm3, %xmm0
        pcmpeqd .LC0(%rip), %xmm0
        movdqa  %xmm0, %xmm2
        pandn   %xmm1, %xmm2
        pshufd  $0, %xmm4, %xmm1
        pand    %xmm1, %xmm0
        por     %xmm2, %xmm0
        ret

since there's no blend with a variable mask IIRC.

with aarch64 and SVE it doesn't handle the 32byte case at all,
the 16byte case compiles to

setg:
.LFB0:
        .cfi_startproc
        adrp    x2, .LC0
        dup     v1.4s, w0
        dup     v2.4s, w1
        ldr     q3, [x2, #:lo12:.LC0]
        cmeq    v1.4s, v1.4s, v3.4s
        bit     v0.16b, v2.16b, v1.16b

which looks equivalent to the AVX2 code.

For all of those varying the vector element type may also
cause "issues" I guess.

> .LC0:
>         .long   0
>         .long   1
>         .long   2
>         .long   3
>         .long   4
>         .long   5
>         .long   6
>         .long   7
>
> and for powerpc (changing it to 16B vectors, -mcpu=power9) it is
>
> setg:
>         addis 9,2,.LC0@toc@ha
>         mtvsrws 32,5
>         mtvsrws 33,6
>         addi 9,9,.LC0@toc@l
>         lxv 45,0(9)
>         vcmpequw 0,0,13
>         xxsel 34,34,33,32
>         blr
>
> .LC0:
>         .long   0
>         .long   1
>         .long   2
>         .long   3
>
> (We can generate that 0..3 vector without doing loads; I guess x86 can
> do that as well?  But it takes more than one insn to do (of course we
> have to set up the memory address first *with* the load, heh).)
>
> For power8 it becomes (we need to splat in separate insns):
>
> setg:
>         addis 9,2,.LC0@toc@ha
>         mtvsrwz 32,5
>         mtvsrwz 33,6
>         addi 9,9,.LC0@toc@l
>         lxvw4x 45,0,9
>         xxspltw 32,32,1
>         xxspltw 33,33,1
>         vcmpequw 0,0,13
>         xxsel 34,34,33,32
>         blr
>
>
> Segher

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4 1/3] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR
  2020-09-25  6:51           ` [PATCH v4 1/3] " xionghu luo
@ 2020-09-25 11:32             ` Richard Biener
  2020-09-25 13:28             ` Richard Sandiford
  1 sibling, 0 replies; 24+ messages in thread
From: Richard Biener @ 2020-09-25 11:32 UTC (permalink / raw)
  To: xionghu luo
  Cc: GCC Patches, Segher Boessenkool, David Edelsohn, Bill Schmidt,
	Jiufu Guo, linkw, Richard Sandiford

On Fri, Sep 25, 2020 at 8:51 AM xionghu luo <luoxhu@linux.ibm.com> wrote:
>
> Hi,
>
> On 2020/9/24 20:39, Richard Sandiford wrote:
> > xionghu luo <luoxhu@linux.ibm.com> writes:
> >> @@ -2658,6 +2659,43 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> >>
> >>   #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn
> >>
> >> +/* Expand VEC_SET internal functions.  */
> >> +
> >> +static void
> >> +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> >> +{
> >> +  tree lhs = gimple_call_lhs (stmt);
> >> +  tree op0 = gimple_call_arg (stmt, 0);
> >> +  tree op1 = gimple_call_arg (stmt, 1);
> >> +  tree op2 = gimple_call_arg (stmt, 2);
> >> +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> >> +  rtx src = expand_expr (op0, NULL_RTX, VOIDmode, EXPAND_WRITE);
> >
> > I'm not sure about the expand_expr here.  ISTM that op0 is a normal
> > input and so should be expanded by expand_normal rather than
> > EXPAND_WRITE.  Also:
> >
> >> +
> >> +  machine_mode outermode = TYPE_MODE (TREE_TYPE (op0));
> >> +  scalar_mode innermode = GET_MODE_INNER (outermode);
> >> +
> >> +  rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL);
> >> +  rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL);
> >> +
> >> +  class expand_operand ops[3];
> >> +  enum insn_code icode = optab_handler (optab, outermode);
> >> +
> >> +  if (icode != CODE_FOR_nothing)
> >> +    {
> >> +      pos = convert_to_mode (E_SImode, pos, 0);
> >> +
> >> +      create_fixed_operand (&ops[0], src);
> >
> > ...this would mean that if SRC happens to be a MEM, the pattern
> > must also accept a MEM.
> >
> > ISTM that we're making more work for ourselves by not “fixing” the optab
> > to have a natural pure-input + pure-output interface. :-)  But if we
> > stick with the current optab interface, I think we need to:
> >
> > - create a temporary register
> > - move SRC into the temporary register before the insn
> > - use create_fixed_operand with the temporary register for operand 0
> > - move the temporary register into TARGET after the insn
> >
> >> +      create_input_operand (&ops[1], value, innermode);
> >> +      create_input_operand (&ops[2], pos, GET_MODE (pos));
> >
> > For this I think we should use convert_operand_from on the original “pos”,
> > so that the target gets to choose what the mode of the operand is.
> >
>
> Thanks a lot for the nice suggestions, fixed them all and updated the patch as below.
>
>
> [PATCH v4 1/3] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR
>
> This patch enables transformation from ARRAY_REF(VIEW_CONVERT_EXPR) to
> VEC_SET internal function in gimple-isel pass if target supports
> vec_set with variable index by checking can_vec_set_var_idx_p.

OK with me if Richard is happy with the updated patch.

Thanks,
Richard.

> gcc/ChangeLog:
>
> 2020-09-25  Xionghu Luo  <luoxhu@linux.ibm.com>
>
>         * gimple-isel.cc (gimple_expand_vec_set_expr): New function.
>         (gimple_expand_vec_cond_exprs): Rename to ...
>         (gimple_expand_vec_exprs): ... this and call
>         gimple_expand_vec_set_expr.
>         * internal-fn.c (vec_set_direct): New define.
>         (expand_vec_set_optab_fn): New function.
>         (direct_vec_set_optab_supported_p): New define.
>         * internal-fn.def (VEC_SET): New DEF_INTERNAL_OPTAB_FN.
>         * optabs.c (can_vec_set_var_idx_p): New function.
>         * optabs.h (can_vec_set_var_idx_p): New declaration.
> ---
>  gcc/gimple-isel.cc  | 75 +++++++++++++++++++++++++++++++++++++++++++--
>  gcc/internal-fn.c   | 41 +++++++++++++++++++++++++
>  gcc/internal-fn.def |  2 ++
>  gcc/optabs.c        | 21 +++++++++++++
>  gcc/optabs.h        |  4 +++
>  5 files changed, 141 insertions(+), 2 deletions(-)
>
> diff --git a/gcc/gimple-isel.cc b/gcc/gimple-isel.cc
> index b330cf4c20e..02513e04900 100644
> --- a/gcc/gimple-isel.cc
> +++ b/gcc/gimple-isel.cc
> @@ -35,6 +35,74 @@ along with GCC; see the file COPYING3.  If not see
>  #include "tree-cfg.h"
>  #include "bitmap.h"
>  #include "tree-ssa-dce.h"
> +#include "memmodel.h"
> +#include "optabs.h"
> +
> +/* Expand all ARRAY_REF(VIEW_CONVERT_EXPR) gimple assignments into calls to
> +   internal function based on vector type of selected expansion.
> +   i.e.:
> +     VIEW_CONVERT_EXPR<int[4]>(u)[_1] =  = i_4(D);
> +   =>
> +     _7 = u;
> +     _8 = .VEC_SET (_7, i_4(D), _1);
> +     u = _8;  */
> +
> +static gimple *
> +gimple_expand_vec_set_expr (gimple_stmt_iterator *gsi)
> +{
> +  enum tree_code code;
> +  gcall *new_stmt = NULL;
> +  gassign *ass_stmt = NULL;
> +
> +  /* Only consider code == GIMPLE_ASSIGN.  */
> +  gassign *stmt = dyn_cast<gassign *> (gsi_stmt (*gsi));
> +  if (!stmt)
> +    return NULL;
> +
> +  tree lhs = gimple_assign_lhs (stmt);
> +  code = TREE_CODE (lhs);
> +  if (code != ARRAY_REF)
> +    return NULL;
> +
> +  tree val = gimple_assign_rhs1 (stmt);
> +  tree op0 = TREE_OPERAND (lhs, 0);
> +  if (TREE_CODE (op0) == VIEW_CONVERT_EXPR && DECL_P (TREE_OPERAND (op0, 0))
> +      && VECTOR_TYPE_P (TREE_TYPE (TREE_OPERAND (op0, 0)))
> +      && TYPE_MODE (TREE_TYPE (lhs))
> +          == TYPE_MODE (TREE_TYPE (TREE_TYPE (TREE_OPERAND (op0, 0)))))
> +    {
> +      tree pos = TREE_OPERAND (lhs, 1);
> +      tree view_op0 = TREE_OPERAND (op0, 0);
> +      machine_mode outermode = TYPE_MODE (TREE_TYPE (view_op0));
> +      if (auto_var_in_fn_p (view_op0, cfun->decl)
> +         && !TREE_ADDRESSABLE (view_op0) && can_vec_set_var_idx_p (outermode))
> +       {
> +         location_t loc = gimple_location (stmt);
> +         tree var_src = make_ssa_name (TREE_TYPE (view_op0));
> +         tree var_dst = make_ssa_name (TREE_TYPE (view_op0));
> +
> +         ass_stmt = gimple_build_assign (var_src, view_op0);
> +         gimple_set_vuse (ass_stmt, gimple_vuse (stmt));
> +         gimple_set_location (ass_stmt, loc);
> +         gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT);
> +
> +         new_stmt
> +           = gimple_build_call_internal (IFN_VEC_SET, 3, var_src, val, pos);
> +         gimple_call_set_lhs (new_stmt, var_dst);
> +         gimple_set_location (new_stmt, loc);
> +         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
> +
> +         ass_stmt = gimple_build_assign (view_op0, var_dst);
> +         gimple_set_location (ass_stmt, loc);
> +         gsi_insert_before (gsi, ass_stmt, GSI_SAME_STMT);
> +
> +         gimple_move_vops (ass_stmt, stmt);
> +         gsi_remove (gsi, true);
> +       }
> +    }
> +
> +  return ass_stmt;
> +}
>
>  /* Expand all VEC_COND_EXPR gimple assignments into calls to internal
>     function based on type of selected expansion.  */
> @@ -176,7 +244,7 @@ gimple_expand_vec_cond_expr (gimple_stmt_iterator *gsi,
>     VEC_COND_EXPR assignments.  */
>
>  static unsigned int
> -gimple_expand_vec_cond_exprs (void)
> +gimple_expand_vec_exprs (void)
>  {
>    gimple_stmt_iterator gsi;
>    basic_block bb;
> @@ -189,12 +257,15 @@ gimple_expand_vec_cond_exprs (void)
>         {
>           gimple *g = gimple_expand_vec_cond_expr (&gsi,
>                                                    &vec_cond_ssa_name_uses);
> +
>           if (g != NULL)
>             {
>               tree lhs = gimple_assign_lhs (gsi_stmt (gsi));
>               gimple_set_lhs (g, lhs);
>               gsi_replace (&gsi, g, false);
>             }
> +
> +         gimple_expand_vec_set_expr (&gsi);
>         }
>      }
>
> @@ -237,7 +308,7 @@ public:
>
>    virtual unsigned int execute (function *)
>      {
> -      return gimple_expand_vec_cond_exprs ();
> +      return gimple_expand_vec_exprs ();
>      }
>
>  }; // class pass_gimple_isel
> diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
> index 8efc77d986b..20c1d31fb73 100644
> --- a/gcc/internal-fn.c
> +++ b/gcc/internal-fn.c
> @@ -115,6 +115,7 @@ init_internal_fns ()
>  #define vec_condeq_direct { 0, 0, false }
>  #define scatter_store_direct { 3, 1, false }
>  #define len_store_direct { 3, 3, false }
> +#define vec_set_direct { 3, 3, false }
>  #define unary_direct { 0, 0, true }
>  #define binary_direct { 0, 0, true }
>  #define ternary_direct { 0, 0, true }
> @@ -2658,6 +2659,45 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>
>  #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn
>
> +/* Expand VEC_SET internal functions.  */
> +
> +static void
> +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> +{
> +  tree lhs = gimple_call_lhs (stmt);
> +  tree op0 = gimple_call_arg (stmt, 0);
> +  tree op1 = gimple_call_arg (stmt, 1);
> +  tree op2 = gimple_call_arg (stmt, 2);
> +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> +  rtx src = expand_normal (op0);
> +
> +  machine_mode outermode = TYPE_MODE (TREE_TYPE (op0));
> +  scalar_mode innermode = GET_MODE_INNER (outermode);
> +
> +  rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL);
> +  rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL);
> +
> +  class expand_operand ops[3];
> +  enum insn_code icode = optab_handler (optab, outermode);
> +
> +  if (icode != CODE_FOR_nothing)
> +    {
> +      rtx temp = gen_reg_rtx (outermode);
> +      emit_move_insn (temp, src);
> +
> +      create_fixed_operand (&ops[0], temp);
> +      create_input_operand (&ops[1], value, innermode);
> +      create_convert_operand_from (&ops[2], pos, TYPE_MODE (TREE_TYPE (op2)),
> +                                  true);
> +      if (maybe_expand_insn (icode, 3, ops))
> +       {
> +         emit_move_insn (target, temp);
> +         return;
> +       }
> +    }
> +  gcc_unreachable ();
> +}
> +
>  static void
>  expand_ABNORMAL_DISPATCHER (internal_fn, gcall *)
>  {
> @@ -3253,6 +3293,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
>  #define direct_fold_left_optab_supported_p direct_optab_supported_p
>  #define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
>  #define direct_check_ptrs_optab_supported_p direct_optab_supported_p
> +#define direct_vec_set_optab_supported_p direct_optab_supported_p
>
>  /* Return the optab used by internal function FN.  */
>
> diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
> index 13e60828fcf..e6cfe1b6159 100644
> --- a/gcc/internal-fn.def
> +++ b/gcc/internal-fn.def
> @@ -145,6 +145,8 @@ DEF_INTERNAL_OPTAB_FN (VCONDU, 0, vcondu, vec_condu)
>  DEF_INTERNAL_OPTAB_FN (VCONDEQ, 0, vcondeq, vec_condeq)
>  DEF_INTERNAL_OPTAB_FN (VCOND_MASK, 0, vcond_mask, vec_cond_mask)
>
> +DEF_INTERNAL_OPTAB_FN (VEC_SET, 0, vec_set, vec_set)
> +
>  DEF_INTERNAL_OPTAB_FN (LEN_STORE, 0, len_store, len_store)
>
>  DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
> diff --git a/gcc/optabs.c b/gcc/optabs.c
> index 184827fdf4e..8e844028d92 100644
> --- a/gcc/optabs.c
> +++ b/gcc/optabs.c
> @@ -3841,6 +3841,27 @@ can_vcond_compare_p (enum rtx_code code, machine_mode value_mode,
>          && insn_operand_matches (icode, 3, test);
>  }
>
> +/* Return whether the backend can emit vector set instructions for inserting
> +   element into vector at variable index position.  */
> +
> +bool
> +can_vec_set_var_idx_p (machine_mode vec_mode)
> +{
> +  if (!VECTOR_MODE_P (vec_mode))
> +    return false;
> +
> +  machine_mode inner_mode = GET_MODE_INNER (vec_mode);
> +  rtx reg1 = alloca_raw_REG (vec_mode, LAST_VIRTUAL_REGISTER + 1);
> +  rtx reg2 = alloca_raw_REG (inner_mode, LAST_VIRTUAL_REGISTER + 2);
> +  rtx reg3 = alloca_raw_REG (VOIDmode, LAST_VIRTUAL_REGISTER + 3);
> +
> +  enum insn_code icode = optab_handler (vec_set_optab, vec_mode);
> +
> +  return icode != CODE_FOR_nothing && insn_operand_matches (icode, 0, reg1)
> +        && insn_operand_matches (icode, 1, reg2)
> +        && insn_operand_matches (icode, 2, reg3);
> +}
> +
>  /* This function is called when we are going to emit a compare instruction that
>     compares the values found in X and Y, using the rtl operator COMPARISON.
>
> diff --git a/gcc/optabs.h b/gcc/optabs.h
> index 7c2ec257cb0..0b14700ab3d 100644
> --- a/gcc/optabs.h
> +++ b/gcc/optabs.h
> @@ -249,6 +249,10 @@ extern int can_compare_p (enum rtx_code, machine_mode,
>     VALUE_MODE.  */
>  extern bool can_vcond_compare_p (enum rtx_code, machine_mode, machine_mode);
>
> +/* Return whether the backend can emit vector set instructions for inserting
> +   element into vector at variable index position.  */
> +extern bool can_vec_set_var_idx_p (machine_mode);
> +
>  extern rtx prepare_operand (enum insn_code, rtx, int, machine_mode,
>                             machine_mode, int);
>  /* Emit a pair of rtl insns to compare two rtx's and to jump
> --
> 2.27.0.90.geebb51ba8c
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]
  2020-09-25  6:58             ` Richard Biener
@ 2020-09-25 12:21               ` Richard Sandiford
  2020-09-25 22:39               ` Segher Boessenkool
  1 sibling, 0 replies; 24+ messages in thread
From: Richard Sandiford @ 2020-09-25 12:21 UTC (permalink / raw)
  To: Richard Biener
  Cc: Segher Boessenkool, xionghu luo, GCC Patches, David Edelsohn,
	Bill Schmidt, Jiufu Guo, linkw

Richard Biener <richard.guenther@gmail.com> writes:
> On Thu, Sep 24, 2020 at 9:38 PM Segher Boessenkool
> <segher@kernel.crashing.org> wrote:
>>
>> Hi!
>>
>> On Thu, Sep 24, 2020 at 04:55:21PM +0200, Richard Biener wrote:
>> > Btw, on x86_64 the following produces sth reasonable:
>> >
>> > #define N 32
>> > typedef int T;
>> > typedef T V __attribute__((vector_size(N)));
>> > V setg (V v, int idx, T val)
>> > {
>> >   V valv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
>> >   V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == valv);
>> >   v = (v & ~mask) | (valv & mask);
>> >   return v;
>> > }
>> >
>> >         vmovd   %edi, %xmm1
>> >         vpbroadcastd    %xmm1, %ymm1
>> >         vpcmpeqd        .LC0(%rip), %ymm1, %ymm2
>> >         vpblendvb       %ymm2, %ymm1, %ymm0, %ymm0
>> >         ret
>> >
>> > I'm quite sure you could do sth similar on power?
>>
>> This only allows inserting aligned elements.  Which is probably fine
>> of course (we don't allow elements that straddle vector boundaries
>> either, anyway).
>>
>> And yes, we can do that :-)
>>
>> That should be
>>   #define N 32
>>   typedef int T;
>>   typedef T V __attribute__((vector_size(N)));
>>   V setg (V v, int idx, T val)
>>   {
>>     V valv = (V){val, val, val, val, val, val, val, val};
>>     V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
>>     V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
>>     v = (v & ~mask) | (valv & mask);
>>     return v;
>>   }
>
> Whoops yeah, simplified it a bit too much ;)
>
>> after which I get (-march=znver2)
>>
>> setg:
>>         vmovd   %edi, %xmm1
>>         vmovd   %esi, %xmm2
>>         vpbroadcastd    %xmm1, %ymm1
>>         vpbroadcastd    %xmm2, %ymm2
>>         vpcmpeqd        .LC0(%rip), %ymm1, %ymm1
>>         vpandn  %ymm0, %ymm1, %ymm0
>>         vpand   %ymm2, %ymm1, %ymm1
>>         vpor    %ymm0, %ymm1, %ymm0
>>         ret
>
> I get with -march=znver2 -O2
>
>         vmovd   %edi, %xmm1
>         vmovd   %esi, %xmm2
>         vpbroadcastd    %xmm1, %ymm1
>         vpbroadcastd    %xmm2, %ymm2
>         vpcmpeqd        .LC0(%rip), %ymm1, %ymm1
>         vpblendvb       %ymm1, %ymm2, %ymm0, %ymm0
>
> and with -mavx512vl
>
>         vpbroadcastd    %edi, %ymm1
>         vpcmpd  $0, .LC0(%rip), %ymm1, %k1
>         vpbroadcastd    %esi, %ymm0{%k1}
>
> broadcast-with-mask - heh, would be interesting if we manage
> to combine v[idx1] = val; v[idx2] = val; ;)
>
> Now, with SSE4.2 the 16byte case compiles to
>
> setg:
> .LFB0:
>         .cfi_startproc
>         movd    %edi, %xmm3
>         movdqa  %xmm0, %xmm1
>         movd    %esi, %xmm4
>         pshufd  $0, %xmm3, %xmm0
>         pcmpeqd .LC0(%rip), %xmm0
>         movdqa  %xmm0, %xmm2
>         pandn   %xmm1, %xmm2
>         pshufd  $0, %xmm4, %xmm1
>         pand    %xmm1, %xmm0
>         por     %xmm2, %xmm0
>         ret
>
> since there's no blend with a variable mask IIRC.
>
> with aarch64 and SVE it doesn't handle the 32byte case at all,

FWIW, the SVE version with -msve-vector-bits=256 is:

        ptrue   p0.b, vl32
        mov     z1.s, w1
        index   z2.s, #0, #1
        ld1w    z0.s, p0/z, [x0]
        cmpeq   p1.s, p0/z, z1.s, z2.s
        mov     z0.s, p1/m, w2
        st1w    z0.s, p0, [x8]

where the ptrue, ld1w and st1w are just because generic 256-bit
vectors are passed in memory; the real operation is:

        mov     z1.s, w1
        index   z2.s, #0, #1
        cmpeq   p1.s, p0/z, z1.s, z2.s
        mov     z0.s, p1/m, w2

Thanks,
Richard

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4 1/3] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR
  2020-09-25  6:51           ` [PATCH v4 1/3] " xionghu luo
  2020-09-25 11:32             ` Richard Biener
@ 2020-09-25 13:28             ` Richard Sandiford
  2020-09-27  5:45               ` xionghu luo
  1 sibling, 1 reply; 24+ messages in thread
From: Richard Sandiford @ 2020-09-25 13:28 UTC (permalink / raw)
  To: xionghu luo
  Cc: Richard Biener, GCC Patches, Segher Boessenkool, David Edelsohn,
	Bill Schmidt, Jiufu Guo, linkw

xionghu luo <luoxhu@linux.ibm.com> writes:
> @@ -2658,6 +2659,45 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>  
>  #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn
>  
> +/* Expand VEC_SET internal functions.  */
> +
> +static void
> +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
> +{
> +  tree lhs = gimple_call_lhs (stmt);
> +  tree op0 = gimple_call_arg (stmt, 0);
> +  tree op1 = gimple_call_arg (stmt, 1);
> +  tree op2 = gimple_call_arg (stmt, 2);
> +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
> +  rtx src = expand_normal (op0);
> +
> +  machine_mode outermode = TYPE_MODE (TREE_TYPE (op0));
> +  scalar_mode innermode = GET_MODE_INNER (outermode);
> +
> +  rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL);
> +  rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL);

These two can just use expand_normal.  Might be easier to read if
they come immediately after the expand_normal (op0).

LGTM with that change for the internal-fn.c stuff, thanks.

Richard

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]
  2020-09-25  6:58             ` Richard Biener
  2020-09-25 12:21               ` Richard Sandiford
@ 2020-09-25 22:39               ` Segher Boessenkool
  2020-09-28  8:00                 ` Richard Biener
  1 sibling, 1 reply; 24+ messages in thread
From: Segher Boessenkool @ 2020-09-25 22:39 UTC (permalink / raw)
  To: Richard Biener
  Cc: xionghu luo, GCC Patches, David Edelsohn, Bill Schmidt,
	Jiufu Guo, linkw, Richard Sandiford

On Fri, Sep 25, 2020 at 08:58:35AM +0200, Richard Biener wrote:
> On Thu, Sep 24, 2020 at 9:38 PM Segher Boessenkool
> <segher@kernel.crashing.org> wrote:
> > after which I get (-march=znver2)
> >
> > setg:
> >         vmovd   %edi, %xmm1
> >         vmovd   %esi, %xmm2
> >         vpbroadcastd    %xmm1, %ymm1
> >         vpbroadcastd    %xmm2, %ymm2
> >         vpcmpeqd        .LC0(%rip), %ymm1, %ymm1
> >         vpandn  %ymm0, %ymm1, %ymm0
> >         vpand   %ymm2, %ymm1, %ymm1
> >         vpor    %ymm0, %ymm1, %ymm0
> >         ret
> 
> I get with -march=znver2 -O2
> 
>         vmovd   %edi, %xmm1
>         vmovd   %esi, %xmm2
>         vpbroadcastd    %xmm1, %ymm1
>         vpbroadcastd    %xmm2, %ymm2
>         vpcmpeqd        .LC0(%rip), %ymm1, %ymm1
>         vpblendvb       %ymm1, %ymm2, %ymm0, %ymm0

Ah, maybe my x86 compiler it too old...
  x86_64-linux-gcc (GCC) 10.0.0 20190919 (experimental)
not exactly old, huh.  I wonder what I do wrong then.

> Now, with SSE4.2 the 16byte case compiles to
> 
> setg:
> .LFB0:
>         .cfi_startproc
>         movd    %edi, %xmm3
>         movdqa  %xmm0, %xmm1
>         movd    %esi, %xmm4
>         pshufd  $0, %xmm3, %xmm0
>         pcmpeqd .LC0(%rip), %xmm0
>         movdqa  %xmm0, %xmm2
>         pandn   %xmm1, %xmm2
>         pshufd  $0, %xmm4, %xmm1
>         pand    %xmm1, %xmm0
>         por     %xmm2, %xmm0
>         ret
> 
> since there's no blend with a variable mask IIRC.

PowerPC got at least *that* right since time immemorial :-)

> with aarch64 and SVE it doesn't handle the 32byte case at all,
> the 16byte case compiles to
> 
> setg:
> .LFB0:
>         .cfi_startproc
>         adrp    x2, .LC0
>         dup     v1.4s, w0
>         dup     v2.4s, w1
>         ldr     q3, [x2, #:lo12:.LC0]
>         cmeq    v1.4s, v1.4s, v3.4s
>         bit     v0.16b, v2.16b, v1.16b
> 
> which looks equivalent to the AVX2 code.

Yes, and we can do pretty much the same on Power, too.

> For all of those varying the vector element type may also
> cause "issues" I guess.

For us, as long as it stays 16B vectors, all should be fine.  There may
be issues in the compiler, but at least the hardware has no problem with
it ;-)

> > and for powerpc (changing it to 16B vectors, -mcpu=power9) it is
> >
> > setg:
> >         addis 9,2,.LC0@toc@ha
> >         mtvsrws 32,5
> >         mtvsrws 33,6
> >         addi 9,9,.LC0@toc@l
> >         lxv 45,0(9)
> >         vcmpequw 0,0,13
> >         xxsel 34,34,33,32
> >         blr

The -mcpu=power10 code right now is just

        plxv 45,.LC0@pcrel
        mtvsrws 32,5
        mtvsrws 33,6
        vcmpequw 0,0,13
        xxsel 34,34,33,32
        blr

(exactly the same, but less memory address setup cost), so doing
something like this as a generic version would work quite well pretty
much everywhere I think!


Segher

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v4 1/3] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR
  2020-09-25 13:28             ` Richard Sandiford
@ 2020-09-27  5:45               ` xionghu luo
  0 siblings, 0 replies; 24+ messages in thread
From: xionghu luo @ 2020-09-27  5:45 UTC (permalink / raw)
  To: Richard Biener, GCC Patches, Segher Boessenkool, David Edelsohn,
	Bill Schmidt, Jiufu Guo, linkw, richard.sandiford



On 2020/9/25 21:28, Richard Sandiford wrote:
> xionghu luo <luoxhu@linux.ibm.com> writes:
>> @@ -2658,6 +2659,45 @@ expand_vect_cond_mask_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>>   
>>   #define expand_vec_cond_mask_optab_fn expand_vect_cond_mask_optab_fn
>>   
>> +/* Expand VEC_SET internal functions.  */
>> +
>> +static void
>> +expand_vec_set_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
>> +{
>> +  tree lhs = gimple_call_lhs (stmt);
>> +  tree op0 = gimple_call_arg (stmt, 0);
>> +  tree op1 = gimple_call_arg (stmt, 1);
>> +  tree op2 = gimple_call_arg (stmt, 2);
>> +  rtx target = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
>> +  rtx src = expand_normal (op0);
>> +
>> +  machine_mode outermode = TYPE_MODE (TREE_TYPE (op0));
>> +  scalar_mode innermode = GET_MODE_INNER (outermode);
>> +
>> +  rtx value = expand_expr (op1, NULL_RTX, VOIDmode, EXPAND_NORMAL);
>> +  rtx pos = expand_expr (op2, NULL_RTX, VOIDmode, EXPAND_NORMAL);
> 
> These two can just use expand_normal.  Might be easier to read if
> they come immediately after the expand_normal (op0).
> 
> LGTM with that change for the internal-fn.c stuff, thanks.
> 

Thank you, updated and committed as r11-3486.  Tested and confirmed Power/X86/ARM
still not supporting vec_set with register index, so there are no ICE regressions
caused by generating IFN VEC_SET but not properly expanded.


Thanks,
Xionghu

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251]
  2020-09-25 22:39               ` Segher Boessenkool
@ 2020-09-28  8:00                 ` Richard Biener
  0 siblings, 0 replies; 24+ messages in thread
From: Richard Biener @ 2020-09-28  8:00 UTC (permalink / raw)
  To: Segher Boessenkool
  Cc: xionghu luo, GCC Patches, David Edelsohn, Bill Schmidt,
	Jiufu Guo, linkw, Richard Sandiford

On Sat, Sep 26, 2020 at 12:41 AM Segher Boessenkool
<segher@kernel.crashing.org> wrote:
>
> On Fri, Sep 25, 2020 at 08:58:35AM +0200, Richard Biener wrote:
> > On Thu, Sep 24, 2020 at 9:38 PM Segher Boessenkool
> > <segher@kernel.crashing.org> wrote:
> > > after which I get (-march=znver2)
> > >
> > > setg:
> > >         vmovd   %edi, %xmm1
> > >         vmovd   %esi, %xmm2
> > >         vpbroadcastd    %xmm1, %ymm1
> > >         vpbroadcastd    %xmm2, %ymm2
> > >         vpcmpeqd        .LC0(%rip), %ymm1, %ymm1
> > >         vpandn  %ymm0, %ymm1, %ymm0
> > >         vpand   %ymm2, %ymm1, %ymm1
> > >         vpor    %ymm0, %ymm1, %ymm0
> > >         ret
> >
> > I get with -march=znver2 -O2
> >
> >         vmovd   %edi, %xmm1
> >         vmovd   %esi, %xmm2
> >         vpbroadcastd    %xmm1, %ymm1
> >         vpbroadcastd    %xmm2, %ymm2
> >         vpcmpeqd        .LC0(%rip), %ymm1, %ymm1
> >         vpblendvb       %ymm1, %ymm2, %ymm0, %ymm0
>
> Ah, maybe my x86 compiler it too old...
>   x86_64-linux-gcc (GCC) 10.0.0 20190919 (experimental)
> not exactly old, huh.  I wonder what I do wrong then.
>
> > Now, with SSE4.2 the 16byte case compiles to
> >
> > setg:
> > .LFB0:
> >         .cfi_startproc
> >         movd    %edi, %xmm3
> >         movdqa  %xmm0, %xmm1
> >         movd    %esi, %xmm4
> >         pshufd  $0, %xmm3, %xmm0
> >         pcmpeqd .LC0(%rip), %xmm0
> >         movdqa  %xmm0, %xmm2
> >         pandn   %xmm1, %xmm2
> >         pshufd  $0, %xmm4, %xmm1
> >         pand    %xmm1, %xmm0
> >         por     %xmm2, %xmm0
> >         ret
> >
> > since there's no blend with a variable mask IIRC.
>
> PowerPC got at least *that* right since time immemorial :-)
>
> > with aarch64 and SVE it doesn't handle the 32byte case at all,
> > the 16byte case compiles to
> >
> > setg:
> > .LFB0:
> >         .cfi_startproc
> >         adrp    x2, .LC0
> >         dup     v1.4s, w0
> >         dup     v2.4s, w1
> >         ldr     q3, [x2, #:lo12:.LC0]
> >         cmeq    v1.4s, v1.4s, v3.4s
> >         bit     v0.16b, v2.16b, v1.16b
> >
> > which looks equivalent to the AVX2 code.
>
> Yes, and we can do pretty much the same on Power, too.
>
> > For all of those varying the vector element type may also
> > cause "issues" I guess.
>
> For us, as long as it stays 16B vectors, all should be fine.  There may
> be issues in the compiler, but at least the hardware has no problem with
> it ;-)
>
> > > and for powerpc (changing it to 16B vectors, -mcpu=power9) it is
> > >
> > > setg:
> > >         addis 9,2,.LC0@toc@ha
> > >         mtvsrws 32,5
> > >         mtvsrws 33,6
> > >         addi 9,9,.LC0@toc@l
> > >         lxv 45,0(9)
> > >         vcmpequw 0,0,13
> > >         xxsel 34,34,33,32
> > >         blr
>
> The -mcpu=power10 code right now is just
>
>         plxv 45,.LC0@pcrel
>         mtvsrws 32,5
>         mtvsrws 33,6
>         vcmpequw 0,0,13
>         xxsel 34,34,33,32
>         blr
>
> (exactly the same, but less memory address setup cost), so doing
> something like this as a generic version would work quite well pretty
> much everywhere I think!

Given we don't have a good way to query for variable blend support
the only change would be to use the bit and/or way which probably
would be fine (and eventually combined).  I guess that could be done
in ISEL as well (given support for generating the mask, of course).

Of course that makes it difficult for targets to opt-out (generate
the in-memory variant)...

Richard.

>
> Segher

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2020-09-28  8:01 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-18  6:17 [PATCH v2 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR Xiong Hu Luo
2020-09-18  6:17 ` [PATCH v2 2/2] rs6000: Expand vec_insert in expander instead of gimple [PR79251] Xiong Hu Luo
2020-09-18 20:19   ` Segher Boessenkool
2020-09-24  8:21     ` xionghu luo
2020-09-24 13:27       ` Richard Biener
2020-09-24 14:55         ` Richard Biener
2020-09-24 19:36           ` Segher Boessenkool
2020-09-25  6:58             ` Richard Biener
2020-09-25 12:21               ` Richard Sandiford
2020-09-25 22:39               ` Segher Boessenkool
2020-09-28  8:00                 ` Richard Biener
2020-09-24 19:02         ` Segher Boessenkool
2020-09-25  3:50         ` xionghu luo
2020-09-25  5:00           ` Richard Biener
2020-09-18 18:20 ` [PATCH v2 1/2] IFN: Implement IFN_VEC_SET for ARRAY_REF with VIEW_CONVERT_EXPR Segher Boessenkool
2020-09-21  8:31 ` Richard Biener
2020-09-22  3:55   ` [PATCH v3 " xionghu luo
2020-09-23 11:33     ` Richard Biener
2020-09-24  3:24       ` xionghu luo
2020-09-24 12:39         ` Richard Sandiford
2020-09-25  6:51           ` [PATCH v4 1/3] " xionghu luo
2020-09-25 11:32             ` Richard Biener
2020-09-25 13:28             ` Richard Sandiford
2020-09-27  5:45               ` xionghu luo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).