amdgcn: multi-size vector reductions

Add support for vector reductions of any vector width by switching to the
generic mode iterators and generalising the code slightly.  There's no
one-instruction way to move an item from lane 31 to lane 0 (lanes 63, 15, 7,
3, and 1 are all fine, though), and vec_extract probably takes fewer cycles
anyway, so now we always reduce to an SGPR.

gcc/ChangeLog:

	* config/gcn/gcn-valu.md (V64_SI): Delete iterator.
	(V64_DI): Likewise.
	(V64_1REG): Likewise.
	(V64_INT_1REG): Likewise.
	(V64_2REG): Likewise.
	(V64_ALL): Likewise.
	(V64_FP): Likewise.
	(reduc_<reduc_op>_scal_<mode>): Use V_ALL. Use gen_vec_extract.
	(fold_left_plus_<mode>): Use V_FP.
	(*<reduc_op>_dpp_shr_<mode>): Use V_1REG.
	(*<reduc_op>_dpp_shr_<mode>): Use V_DI.
	(*plus_carry_dpp_shr_<mode>): Use V_INT_1REG.
	(*plus_carry_in_dpp_shr_<mode>): Use V_SI.
	(*plus_carry_dpp_shr_<mode>): Use V_DI.
	(mov_from_lane63_<mode>): Delete.
	(mov_from_lane63_<mode>): Delete.
	* config/gcn/gcn.cc (gcn_expand_reduc_scalar): Support partial vectors.
	* config/gcn/gcn.md (unspec): Remove UNSPEC_MOV_FROM_LANE63.

diff --git a/gcc/config/gcn/gcn-valu.md b/gcc/config/gcn/gcn-valu.md
index 00c0e3be1ea..6274d2e9228 100644
--- a/gcc/config/gcn/gcn-valu.md
+++ b/gcc/config/gcn/gcn-valu.md
@@ -32,11 +32,6 @@ (define_mode_iterator V_DI
 (define_mode_iterator V_DF
 		      [V2DF V4DF V8DF V16DF V32DF V64DF])
 
-(define_mode_iterator V64_SI
-		      [V64SI])
-(define_mode_iterator V64_DI
-		      [V64DI])
-
 ; Vector modes for sub-dword modes
 (define_mode_iterator V_QIHI
 		      [V2QI V2HI
@@ -77,13 +72,6 @@ (define_mode_iterator V_FP_1REG
 		       V32HF V32SF
 		       V64HF V64SF])
 
-; V64_* modes are for where more general support is unimplemented
-; (e.g. reductions)
-(define_mode_iterator V64_1REG
-		      [V64QI V64HI V64SI V64HF V64SF])
-(define_mode_iterator V64_INT_1REG
-		      [V64QI V64HI V64SI])
-
 ; Vector modes for two vector registers
 (define_mode_iterator V_2REG
 		      [V2DI V2DF
@@ -93,9 +81,6 @@ (define_mode_iterator V_2REG
 		       V32DI V32DF
 		       V64DI V64DF])
 
-(define_mode_iterator V64_2REG
-		      [V64DI V64DF])
-
 ; Vector modes with native support
 (define_mode_iterator V_noQI
 		      [V2HI V2HF V2SI V2SF V2DI V2DF
@@ -158,11 +143,6 @@ (define_mode_iterator V_FP
 		       V32HF V32SF V32DF
 		       V64HF V64SF V64DF])
 
-(define_mode_iterator V64_ALL
-		      [V64QI V64HI V64HF V64SI V64SF V64DI V64DF])
-(define_mode_iterator V64_FP
-		      [V64HF V64SF V64DF])
-
 (define_mode_attr scalar_mode
   [(V2QI "qi") (V2HI "hi") (V2SI "si")
    (V2HF "hf") (V2SF "sf") (V2DI "di") (V2DF "df")
@@ -3528,15 +3508,16 @@ (define_int_attr reduc_insn [(UNSPEC_SMIN_DPP_SHR "v_min%i0")
 (define_expand "reduc_<reduc_op>_scal_<mode>"
   [(set (match_operand:<SCALAR_MODE> 0 "register_operand")
 	(unspec:<SCALAR_MODE>
-	  [(match_operand:V64_ALL 1 "register_operand")]
+	  [(match_operand:V_ALL 1 "register_operand")]
 	  REDUC_UNSPEC))]
   ""
   {
     rtx tmp = gcn_expand_reduc_scalar (<MODE>mode, operands[1],
 				       <reduc_unspec>);
 
-    /* The result of the reduction is in lane 63 of tmp.  */
-    emit_insn (gen_mov_from_lane63_<mode> (operands[0], tmp));
+    rtx last_lane = GEN_INT (GET_MODE_NUNITS (<MODE>mode) - 1);
+    emit_insn (gen_vec_extract<mode><scalar_mode> (operands[0], tmp,
+						   last_lane));
 
     DONE;
   })
@@ -3547,7 +3528,7 @@ (define_expand "reduc_<reduc_op>_scal_<mode>"
 (define_expand "fold_left_plus_<mode>"
  [(match_operand:<SCALAR_MODE> 0 "register_operand")
   (match_operand:<SCALAR_MODE> 1 "gcn_alu_operand")
-  (match_operand:V64_FP 2 "gcn_alu_operand")]
+  (match_operand:V_FP 2 "gcn_alu_operand")]
   "can_create_pseudo_p ()
    && (flag_openacc || flag_openmp
        || flag_associative_math)"
@@ -3563,11 +3544,11 @@ (define_expand "fold_left_plus_<mode>"
    })
 
 (define_insn "*<reduc_op>_dpp_shr_<mode>"
-  [(set (match_operand:V64_1REG 0 "register_operand"   "=v")
-	(unspec:V64_1REG
-	  [(match_operand:V64_1REG 1 "register_operand" "v")
-	   (match_operand:V64_1REG 2 "register_operand" "v")
-	   (match_operand:SI 3 "const_int_operand"      "n")]
+  [(set (match_operand:V_1REG 0 "register_operand"   "=v")
+	(unspec:V_1REG
+	  [(match_operand:V_1REG 1 "register_operand" "v")
+	   (match_operand:V_1REG 2 "register_operand" "v")
+	   (match_operand:SI 3 "const_int_operand"        "n")]
 	  REDUC_UNSPEC))]
   ; GCN3 requires a carry out, GCN5 not
   "!(TARGET_GCN3 && SCALAR_INT_MODE_P (<SCALAR_MODE>mode)
@@ -3580,11 +3561,11 @@ (define_insn "*<reduc_op>_dpp_shr_<mode>"
    (set_attr "length" "8")])
 
 (define_insn_and_split "*<reduc_op>_dpp_shr_<mode>"
-  [(set (match_operand:V64_DI 0 "register_operand"    "=v")
-	(unspec:V64_DI
-	  [(match_operand:V64_DI 1 "register_operand" "v")
-	   (match_operand:V64_DI 2 "register_operand" "v")
-	   (match_operand:SI 3 "const_int_operand"    "n")]
+  [(set (match_operand:V_DI 0 "register_operand"    "=v")
+	(unspec:V_DI
+	  [(match_operand:V_DI 1 "register_operand" "v")
+	   (match_operand:V_DI 2 "register_operand" "v")
+	   (match_operand:SI 3 "const_int_operand"  "n")]
 	  REDUC_2REG_UNSPEC))]
   ""
   "#"
@@ -3609,10 +3590,10 @@ (define_insn_and_split "*<reduc_op>_dpp_shr_<mode>"
 ; Special cases for addition.
 
 (define_insn "*plus_carry_dpp_shr_<mode>"
-  [(set (match_operand:V64_INT_1REG 0 "register_operand"   "=v")
-	(unspec:V64_INT_1REG
-	  [(match_operand:V64_INT_1REG 1 "register_operand" "v")
-	   (match_operand:V64_INT_1REG 2 "register_operand" "v")
+  [(set (match_operand:V_INT_1REG 0 "register_operand"   "=v")
+	(unspec:V_INT_1REG
+	  [(match_operand:V_INT_1REG 1 "register_operand" "v")
+	   (match_operand:V_INT_1REG 2 "register_operand" "v")
 	   (match_operand:SI 3 "const_int_operand"	  "n")]
 	  UNSPEC_PLUS_CARRY_DPP_SHR))
    (clobber (reg:DI VCC_REG))]
@@ -3626,12 +3607,12 @@ (define_insn "*plus_carry_dpp_shr_<mode>"
    (set_attr "length" "8")])
 
 (define_insn "*plus_carry_in_dpp_shr_<mode>"
-  [(set (match_operand:V64_SI 0 "register_operand"    "=v")
-	(unspec:V64_SI
-	  [(match_operand:V64_SI 1 "register_operand" "v")
-	   (match_operand:V64_SI 2 "register_operand" "v")
-	   (match_operand:SI 3 "const_int_operand"    "n")
-	   (match_operand:DI 4 "register_operand"     "cV")]
+  [(set (match_operand:V_SI 0 "register_operand"    "=v")
+	(unspec:V_SI
+	  [(match_operand:V_SI 1 "register_operand" "v")
+	   (match_operand:V_SI 2 "register_operand" "v")
+	   (match_operand:SI 3 "const_int_operand"  "n")
+	   (match_operand:DI 4 "register_operand"   "cV")]
 	  UNSPEC_PLUS_CARRY_IN_DPP_SHR))
    (clobber (reg:DI VCC_REG))]
   ""
@@ -3644,11 +3625,11 @@ (define_insn "*plus_carry_in_dpp_shr_<mode>"
    (set_attr "length" "8")])
 
 (define_insn_and_split "*plus_carry_dpp_shr_<mode>"
-  [(set (match_operand:V64_DI 0 "register_operand"    "=v")
-	(unspec:V64_DI
-	  [(match_operand:V64_DI 1 "register_operand" "v")
-	   (match_operand:V64_DI 2 "register_operand" "v")
-	   (match_operand:SI 3 "const_int_operand"    "n")]
+  [(set (match_operand:V_DI 0 "register_operand"    "=v")
+	(unspec:V_DI
+	  [(match_operand:V_DI 1 "register_operand" "v")
+	   (match_operand:V_DI 2 "register_operand" "v")
+	   (match_operand:SI 3 "const_int_operand"  "n")]
 	  UNSPEC_PLUS_CARRY_DPP_SHR))
    (clobber (reg:DI VCC_REG))]
   ""
@@ -3675,38 +3656,6 @@ (define_insn_and_split "*plus_carry_dpp_shr_<mode>"
   [(set_attr "type" "vmult")
    (set_attr "length" "16")])
 
-; Instructions to move a scalar value from lane 63 of a vector register.
-(define_insn "mov_from_lane63_<mode>"
-  [(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=Sg,v")
-	(unspec:<SCALAR_MODE>
-	  [(match_operand:V64_1REG 1 "register_operand"	  "  v,v")]
-	  UNSPEC_MOV_FROM_LANE63))]
-  ""
-  "@
-   v_readlane_b32\t%0, %1, 63
-   v_mov_b32\t%0, %1 wave_ror:1"
-  [(set_attr "type" "vop3a,vop_dpp")
-   (set_attr "exec" "none,*")
-   (set_attr "length" "8")])
-
-(define_insn "mov_from_lane63_<mode>"
-  [(set (match_operand:<SCALAR_MODE> 0 "register_operand" "=Sg,v")
-	(unspec:<SCALAR_MODE>
-	  [(match_operand:V64_2REG 1 "register_operand"	  "  v,v")]
-	  UNSPEC_MOV_FROM_LANE63))]
-  ""
-  "@
-   v_readlane_b32\t%L0, %L1, 63\;v_readlane_b32\t%H0, %H1, 63
-   * if (REGNO (operands[0]) <= REGNO (operands[1]))	\
-       return \"v_mov_b32\t%L0, %L1 wave_ror:1\;\"	\
-	      \"v_mov_b32\t%H0, %H1 wave_ror:1\";	\
-     else						\
-       return \"v_mov_b32\t%H0, %H1 wave_ror:1\;\"	\
-	      \"v_mov_b32\t%L0, %L1 wave_ror:1\";"
-  [(set_attr "type" "vop3a,vop_dpp")
-   (set_attr "exec" "none,*")
-   (set_attr "length" "8")])
-
 ;; }}}
 ;; {{{ Miscellaneous
 
diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc
index a561976d7f5..b9d9170f167 100644
--- a/gcc/config/gcn/gcn.cc
+++ b/gcc/config/gcn/gcn.cc
@@ -4918,23 +4918,23 @@ gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn,
 
    The vector register SRC of mode MODE is reduced using the operation given
    by UNSPEC, and the scalar result is returned in lane 63 of a vector
-   register.  */
-/* FIXME: Implement reductions for sizes other than V64.
-          (They're currently disabled in the machine description.)  */
+   register (or lane 31, 15, 7, 3, 1 for partial vectors).  */
 
 rtx
 gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
 {
   machine_mode orig_mode = mode;
+  machine_mode scalar_mode = GET_MODE_INNER (mode);
+  int vf = GET_MODE_NUNITS (mode);
   bool use_moves = (((unspec == UNSPEC_SMIN_DPP_SHR
 		      || unspec == UNSPEC_SMAX_DPP_SHR
 		      || unspec == UNSPEC_UMIN_DPP_SHR
 		      || unspec == UNSPEC_UMAX_DPP_SHR)
-		     && (mode == V64DImode
-			 || mode == V64DFmode))
+		     && (scalar_mode == DImode
+			 || scalar_mode == DFmode))
 		    || (unspec == UNSPEC_PLUS_DPP_SHR
-			&& mode == V64DFmode));
+			&& scalar_mode == DFmode));
   rtx_code code = (unspec == UNSPEC_SMIN_DPP_SHR ? SMIN
 		   : unspec == UNSPEC_SMAX_DPP_SHR ? SMAX
 		   : unspec == UNSPEC_UMIN_DPP_SHR ? UMIN
 		   : unspec == UNSPEC_UMAX_DPP_SHR ? UMAX
@@ -4944,23 +4944,23 @@ gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
 		       || unspec == UNSPEC_SMAX_DPP_SHR
 		       || unspec == UNSPEC_UMIN_DPP_SHR
 		       || unspec == UNSPEC_UMAX_DPP_SHR)
-		      && (mode == V64QImode
-			  || mode == V64HImode));
+		      && (scalar_mode == QImode
+			  || scalar_mode == HImode));
   bool unsignedp = (unspec == UNSPEC_UMIN_DPP_SHR
 		    || unspec == UNSPEC_UMAX_DPP_SHR);
   bool use_plus_carry = unspec == UNSPEC_PLUS_DPP_SHR
 			&& GET_MODE_CLASS (mode) == MODE_VECTOR_INT
-			&& (TARGET_GCN3 || mode == V64DImode);
+			&& (TARGET_GCN3 || scalar_mode == DImode);
 
   if (use_plus_carry)
     unspec = UNSPEC_PLUS_CARRY_DPP_SHR;
 
   if (use_extends)
     {
-      rtx tmp = gen_reg_rtx (V64SImode);
+      mode = VnMODE (vf, SImode);
+      rtx tmp = gen_reg_rtx (mode);
       convert_move (tmp, src, unsignedp);
       src = tmp;
-      mode = V64SImode;
     }
 
   /* Perform reduction by first performing the reduction operation on every
@@ -4968,7 +4968,8 @@ gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec)
      iteration (thereby effectively reducing every 4 lanes) and so on until
      all lanes are reduced.  */
   rtx in, out = force_reg (mode, src);
-  for (int i = 0, shift = 1; i < 6; i++, shift <<= 1)
+  int iterations = exact_log2 (vf);
+  for (int i = 0, shift = 1; i < iterations; i++, shift <<= 1)
     {
       rtx shift_val = gen_rtx_CONST_INT (VOIDmode, shift);
       in = out;
diff --git a/gcc/config/gcn/gcn.md b/gcc/config/gcn/gcn.md
index a3c9523cd6d..6c1a438f9d1 100644
--- a/gcc/config/gcn/gcn.md
+++ b/gcc/config/gcn/gcn.md
@@ -78,7 +78,6 @@ (define_c_enum "unspec" [
   UNSPEC_PLUS_CARRY_DPP_SHR UNSPEC_PLUS_CARRY_IN_DPP_SHR
   UNSPEC_AND_DPP_SHR UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR
   UNSPEC_MOV_DPP_SHR
-  UNSPEC_MOV_FROM_LANE63
   UNSPEC_GATHER
   UNSPEC_SCATTER
   UNSPEC_RCP