[committed] amdgcn: multi-size vector reductions
From: Andrew Stubbs @ 2022-10-31 13:03 UTC
  To: gcc-patches

My recent patch to add additional vector lengths didn't yet address 
vector reductions.

This patch adds the missing support. Shorter vectors use fewer reduction 
steps, and the means to extract the final value has been adjusted.

Still lacking is any useful cost information, so for loops the vect 
pass will almost always continue to choose the most expensive 64-lane 
vectors, and therefore 64-lane reductions, with masking for smaller 
sizes. This works, but could probably be improved.

For SLP the compiler will usually use a more appropriately sized 
reduction, so this patch is useful for that case.
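
As a purely hypothetical illustration (these functions are not from 
the patch, and the exact choices depend on compile flags and the cost 
model): a runtime-bound loop reduction will still tend to get the 
masked 64-lane treatment described above, while a fixed-width 
reduction is a candidate for SLP and a shorter vector.

float
loop_reduc (float *a, int n)
{
  /* Loop reduction: the vect pass will likely still choose 64 lanes,
     with masking for the tail.  */
  float sum = 0.0f;
  for (int i = 0; i < n; i++)
    sum += a[i];
  return sum;
}

float
slp_reduc (float *a)
{
  /* Fixed width: SLP can use a reduction sized to match, e.g. four
     lanes here.  */
  return a[0] + a[1] + a[2] + a[3];
}

(Floating-point reductions like these are only vectorized at all 
under -fassociative-math or an option that implies it.)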

Andrew

[-- Attachment #2: 221031-vector-reductions.patch --]

amdgcn: multi-size vector reductions

Add support for vector reductions for any vector width by switching iterators
and generalising the code slightly.  There's no one-instruction way to move an
item from lane 31 to lane 0 (63, 15, 7, 3, and 1 are all fine though), and
vec_extract is probably fewer cycles anyway, so now we always reduce to an
SGPR.
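
To sketch the strategy in plain C (illustrative only; the backend
really combines all lanes in parallel, using the DPP shift
instructions emitted by gcn_expand_dpp_shr_insn):

  /* Tree-reduce VF lanes in log2(VF) pairwise steps; VF is a power
     of two.  The result accumulates towards the last lane,
     lanes[vf - 1], which reduc_<reduc_op>_scal_<mode> now extracts
     with vec_extract.  */
  void
  reduce_sketch (int *lanes, int vf)
  {
    for (int shift = 1; shift < vf; shift <<= 1)
      for (int i = vf - 1; i >= shift; i--)
        lanes[i] += lanes[i - shift];
  }

So a V4SI reduction takes two steps where V64SI takes six.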

gcc/ChangeLog:

	* config/gcn/gcn-valu.md (V64_SI): Delete iterator.
	(V64_DI): Likewise.
	(V64_1REG): Likewise.
	(V64_INT_1REG): Likewise.
	(V64_2REG): Likewise.
	(V64_ALL): Likewise.
	(V64_FP): Likewise.
	(reduc_<reduc_op>_scal_<mode>): Use V_ALL. Use gen_vec_extract.
	(fold_left_plus_<mode>): Use V_FP.
	(*<reduc_op>_dpp_shr_<mode>): Use V_1REG.
	(*<reduc_op>_dpp_shr_<mode>): Use V_DI.
	(*plus_carry_dpp_shr_<mode>): Use V_INT_1REG.
	(*plus_carry_in_dpp_shr_<mode>): Use V_SI.
	(*plus_carry_dpp_shr_<mode>): Use V_DI.
	(mov_from_lane63_<mode>): Delete.
	(mov_from_lane63_<mode>): Delete.
	* config/gcn/gcn.cc (gcn_expand_reduc_scalar): Support partial vectors.
	* config/gcn/gcn.md (unspec): Remove UNSPEC_MOV_FROM_LANE63.

diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index 00c0e3be1ea..6274d2e9228 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -32,11 +32,6 @@ (define_mode_iterator V_DI
 (define_mode_iterator V_DF
 		      [V2DF V4DF V8DF V16DF V32DF V64DF])
 
-(define_mode_iterator V64_SI
-		      [V64SI])
-(define_mode_iterator V64_DI
-		      [V64DI])
-
 ; Vector modes for sub-dword modes
 (define_mode_iterator V_QIHI
 		      [V2QI V2HI
@@ -77,13 +72,6 @@ (define_mode_iterator V_FP_1REG
 		       V32HF V32SF
 		       V64HF V64SF])
 
-; V64_* modes are for where more general support is unimplemented
-; (e.g. reductions)
-(define_mode_iterator V64_1REG
-		      [V64QI V64HI V64SI V64HF V64SF])
-(define_mode_iterator V64_INT_1REG
-		      [V64QI V64HI V64SI])
-
 ; Vector modes for two vector registers
 (define_mode_iterator V_2REG
 		      [V2DI V2DF
@@ -93,9 +81,6 @@ (define_mode_iterator V_2REG
 		       V32DI V32DF
 		       V64DI V64DF])
 
-(define_mode_iterator V64_2REG
-		      [V64DI V64DF])
-
 ; Vector modes with native support
 (define_mode_iterator V_noQI
 		      [V2HI V2HF V2SI V2SF V2DI V2DF
@@ -158,11 +143,6 @@ (define_mode_iterator V_FP
 		       V32HF V32SF V32DF
 		       V64HF V64SF V64DF])
 
-(define_mode_iterator V64_ALL
-		      [V64QI V64HI V64HF V64SI V64SF V64DI V64DF])
-(define_mode_iterator V64_FP
-		      [V64HF V64SF V64DF])
-
 (define_mode_attr scalar_mode
   [(V2QI "qi") (V2HI "hi") (V2SI "si")
    (V2HF "hf") (V2SF "sf") (V2DI "di") (V2DF "df")
@@ -3528,15 +3508,16 @@ (define_int_attr reduc_insn [(UNSPEC_SMIN_DPP_SHR "v_min%i0")
 (define_expand "reduc_<reduc_op>_scal_<mode>"
   [(set (match_operand:<SCALAR_MODE> 0 "register_operand")
 	(unspec:<SCALAR_MODE>
-	  [(match_operand:V64_ALL 1 "register_operand")]
+	  [(match_operand:V_ALL 1 "register_operand")]
 	  REDUC_UNSPEC))]
   ""
   {
     rtx tmp = gcn_expand_reduc_scalar (<MODE>mode, operands[1],
 				       <reduc_unspec>);
 
-    /* The result of the reduction is in lane 63 of tmp.  */
-    emit_insn (gen_mov_from_lane63_<mode> (operands[0], tmp));
+    rtx last_lane = GEN_INT (GET_MODE_NUNITS (<MODE>mode) - 1);
+    emit_insn (gen_vec_extract<mode><scalar_mode> (operands[0], tmp,
+						   last_lane));
 
     DONE;
   })
@@ -3547,7 +3528,7 @@ (define_expand "reduc_<reduc_op>_scal_<mode>"
 (define_expand "fold_left_plus_<mode>"
  [(match_operand:<SCALAR_MODE> 0 "register_operand")
   (match_operand:<SCALAR_MODE> 1 "gcn_alu_operand")
-  (match_operand:V64_FP 2 "gcn_alu_operand")]
+  (match_operand:V_FP 2 "gcn_alu_operand")]
   "can_create_pseudo_p ()
    && (flag_openacc || flag_openmp
        || flag_associative_math)"
@@ -3563,11 +3544,11 @@ (define_expand "fold_left_plus_<mode>"
    })
 
 (define_insn "*<reduc_op>_dpp_shr_<mode>"
-  [(set (match_operand:V64_1REG 0 "register_operand"   "=v")
-	(unspec:V64_1REG
-	  [(match_operand:V64_1REG 1 "register_operand" "v")
-	   (match_operand:V64_1REG 2 "register_operand" "v")
-	   (match_operand:SI 3 "const_int_operand"      "n")]
+  [(set (match_operand:V_1REG 0 "register_operand"   "=v")
+	(unspec:V_1REG
+	  [(match_operand:V_1REG 1 "register_operand" "v")
+	   (match_operand:V_1REG 2 "register_operand" "v")
+	   (match_operand:SI 3 "const_int_operand"    "n")]
 	  REDUC_UNSPEC))]
   ; GCN3 requires a carry out, GCN5 not
   "!(TARGET_GCN3 && SCALAR_INT_MODE_P (<SCALAR_MODE>mode)
@@ -3580,11 +3561,11 @@ (define_insn "*<reduc_op>_dpp_shr_<mode>"
    (set_attr "length" "8")])
 
 (define_insn_and_split "*<reduc_op>_dpp_shr_<mode>"
-  [(set (match_operand:V64_DI 0 "register_operand"    "=v")
-	(unspec:V64_DI
-	  [(match_operand:V64_DI 1 "register_operand" "v")
-	   (match_operand:V64_DI 2 "register_operand" "v")
-	   (match_operand:SI 3 "const_int_operand"    "n")]
+  [(set (match_operand:V_DI 0 "register_operand"    "=v")
+	(unspec:V_DI
+	  [(match_operand:V_DI 1 "register_operand" "v")
+	   (match_operand:V_DI 2 "register_operand" "v")
+	   (match_operand:SI 3 "const_int_operand"  "n")]
 	  REDUC_2REG_UNSPEC))]
   ""
   "#"
@@ -3609,10 +3590,10 @@ (define_insn_and_split "*<reduc_op>_dpp_shr_<mode>"
 ; Special cases for addition.
 
 (define_insn "*plus_carry_dpp_shr_<mode>"
-  [(set (match_operand:V64_INT_1REG 0 "register_operand"   "=v")
-	(unspec:V64_INT_1REG
-	  [(match_operand:V64_INT_1REG 1 "register_operand" "v")
-	   (match_operand:V64_INT_1REG 2 "register_operand" "v")
+  [(set (match_operand:V_INT_1REG 0 "register_operand"   "=v")
+	(unspec:V_INT_1REG
+	  [(match_operand:V_INT_1REG 1 "register_operand" "v")
+	   (match_operand:V_INT_1REG 2 "register_operand" "v")
 	   (match_operand:SI 3 "const_int_operand"	  "n")]
 	  UNSPEC_PLUS_CARRY_DPP_SHR))
    (clobber (reg:DI VCC_REG))]
@@ -3626,12 +3607,12 @@ (define_insn "*plus_carry_dpp_shr_<mode>"
    (set_attr "length" "8")])
 
 (define_insn "*plus_carry_in_dpp_shr_<mode>"
-  [(set (match_operand:V64_SI 0 "register_operand"    "=v")
-	(unspec:V64_SI
-	  [(match_operand:V64_SI 1 "register_operand" "v")
-	   (match_operand:V64_SI 2 "register_operand" "v")
-	   (match_operand:SI 3 "const_int_operand"    "n")
-	   (match_operand:DI 4 "register_operand"     "cV")]
+  [(set (match_operand:V_SI 0 "register_operand"    "=v")
+	(unspec:V_SI
+	  [(match_operand:V_SI 1 "register_operand" "v")
+	   (match_operand:V_SI 2 "register_operand" "v")
+	   (match_operand:SI 3 "const_int_operand"  "n")
+	   (match_operand:DI 4 "register_operand"   "cV")]
 	  UNSPEC_PLUS_CARRY_IN_DPP_SHR))
    (clobber (reg:DI VCC_REG))]
   ""
@@ -3644,11 +3625,11 @@ (define_insn "*plus_carry_in_dpp_shr_<mode>"
    (set_attr "length" "8")])
 
 (define_insn_and_split "*plus_carry_dpp_shr_<mode>"
-  [(set (match_operand:V64_DI 0 "register_operand"    "=v")
-	(unspec:V64_DI
-	  [(match_operand:V64_DI 1 "register_operand" "v")
-	   (match_operand:V64_DI 2 "register_operand" "v")
-	   (match_operand:SI 3 "const_int_operand"    "n")]
+  [(set (match_operand:V_DI 0 "register_operand"    "=v")
+	(unspec:V_DI
+	  [(match_operand:V_DI 1 "register_operand" "v")
+	   (match_operand:V_DI 2 "register_operand" "v")
+	   (match_operand:SI 3 "const_int_operand"  "n")]
 	  UNSPEC_PLUS_CARRY_DPP_SHR))
    (clobber (reg:DI VCC_REG))]
   ""
@@ -3675,38 +3656,6 @@ (define_insn_and_split "*plus_carry_dpp_shr_<mode>"
   [(set_attr "type" "vmult")
    (set_attr "length" "16")])
 
-; Instructions to move a scalar value from lane 63 of a vector register.
-(define_insn "mov_from_lane63_<mode>"
-  [(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=Sg,v")
-	(unspec:<SCALAR_MODE>
-	  [(match_operand:V64_1REG 1 "register_operand"	  "  v,v")]
-	  UNSPEC_MOV_FROM_LANE63))]
-  ""
-  "@
-   v_readlane_b32\t%0, %1, 63
-   v_mov_b32\t%0, %1 wave_ror:1"
-  [(set_attr "type" "vop3a,vop_dpp")
-   (set_attr "exec" "none,*")
-   (set_attr "length" "8")])
-
-(define_insn "mov_from_lane63_<mode>"
-  [(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=Sg,v")
-	(unspec:<SCALAR_MODE>
-	  [(match_operand:V64_2REG 1 "register_operand"	  "  v,v")]
-	  UNSPEC_MOV_FROM_LANE63))]
-  ""
-  "@
-   v_readlane_b32\t%L0, %L1, 63\;v_readlane_b32\t%H0, %H1, 63
-   * if (REGNO (operands[0]) <= REGNO (operands[1]))	\
-       return \"v_mov_b32\t%L0, %L1 wave_ror:1\;\"	\
-	      \"v_mov_b32\t%H0, %H1 wave_ror:1\";	\
-     else						\
-       return \"v_mov_b32\t%H0, %H1 wave_ror:1\;\"	\
-	      \"v_mov_b32\t%L0, %L1 wave_ror:1\";"
-  [(set_attr "type" "vop3a,vop_dpp")
-   (set_attr "exec" "none,*")
-   (set_attr "length" "8")])
-
 ;; }}}
 ;; {{{ Miscellaneous
 
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index a561976d7f5..b9d9170f167 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -4918,23 +4918,23 @@ gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn,
 
    The vector register SRC of mode MODE is reduced using the operation given
    by UNSPEC, and the scalar result is returned in lane 63 of a vector
-   register.  */
-/* FIXME: Implement reductions for sizes other than V64.
-          (They're currently disabled in the machine description.)  */
+   register (or lane 31, 15, 7, 3, 1 for partial vectors).  */
 
 rtx
 gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
 {
   machine_mode orig_mode = mode;
+  machine_mode scalar_mode = GET_MODE_INNER (mode);
+  int vf = GET_MODE_NUNITS (mode);
   bool use_moves = (((unspec == UNSPEC_SMIN_DPP_SHR
 		      || unspec == UNSPEC_SMAX_DPP_SHR
 		      || unspec == UNSPEC_UMIN_DPP_SHR
 		      || unspec == UNSPEC_UMAX_DPP_SHR)
-		     && (mode == V64DImode
-			 || mode == V64DFmode))
+		     && (scalar_mode == DImode
+			 || scalar_mode == DFmode))
 		    || (unspec == UNSPEC_PLUS_DPP_SHR
-			&& mode == V64DFmode));
+			&& scalar_mode == DFmode));
   rtx_code code = (unspec == UNSPEC_SMIN_DPP_SHR ? SMIN
 		   : unspec == UNSPEC_SMAX_DPP_SHR ? SMAX
 		   : unspec == UNSPEC_UMIN_DPP_SHR ? UMIN
 		   : unspec == UNSPEC_UMAX_DPP_SHR ? UMAX
@@ -4944,23 +4946,23 @@ gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
 		       || unspec == UNSPEC_SMAX_DPP_SHR
 		       || unspec == UNSPEC_UMIN_DPP_SHR
 		       || unspec == UNSPEC_UMAX_DPP_SHR)
-		      && (mode == V64QImode
-			  || mode == V64HImode));
+		      && (scalar_mode == QImode
+			  || scalar_mode == HImode));
   bool unsignedp = (unspec == UNSPEC_UMIN_DPP_SHR
 		    || unspec == UNSPEC_UMAX_DPP_SHR);
   bool use_plus_carry = unspec == UNSPEC_PLUS_DPP_SHR
 			&& GET_MODE_CLASS (mode) == MODE_VECTOR_INT
-			&& (TARGET_GCN3 || mode == V64DImode);
+			&& (TARGET_GCN3 || scalar_mode == DImode);
 
   if (use_plus_carry)
     unspec = UNSPEC_PLUS_CARRY_DPP_SHR;
 
   if (use_extends)
     {
-      rtx tmp = gen_reg_rtx (V64SImode);
+      mode = VnMODE (vf, SImode);
+      rtx tmp = gen_reg_rtx (mode);
       convert_move (tmp, src, unsignedp);
       src = tmp;
-      mode = V64SImode;
     }
 
   /* Perform reduction by first performing the reduction operation on every
@@ -4968,7 +4970,8 @@ gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
      iteration (thereby effectively reducing every 4 lanes) and so on until
      all lanes are reduced.  */
   rtx in, out = force_reg (mode, src);
-  for (int i = 0, shift = 1; i < 6; i++, shift <<= 1)
+  int iterations = exact_log2 (vf);
+  for (int i = 0, shift = 1; i < iterations; i++, shift <<= 1)
     {
       rtx shift_val = gen_rtx_CONST_INT (VOIDmode, shift);
       in = out;
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index a3c9523cd6d..6c1a438f9d1 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -78,7 +78,6 @@ (define_c_enum "unspec" [
   UNSPEC_PLUS_CARRY_DPP_SHR UNSPEC_PLUS_CARRY_IN_DPP_SHR
   UNSPEC_AND_DPP_SHR UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR
   UNSPEC_MOV_DPP_SHR
-  UNSPEC_MOV_FROM_LANE63
   UNSPEC_GATHER
   UNSPEC_SCATTER
   UNSPEC_RCP
