public inbox for gcc-patches@gcc.gnu.org
From: Feng Xue OS <fxue@os.amperecomputing.com>
To: Richard Biener <richard.guenther@gmail.com>
Cc: "gcc-patches@gcc.gnu.org" <gcc-patches@gcc.gnu.org>
Subject: Re: [PATCH 4/8] vect: Determine input vectype for multiple lane-reducing
Date: Wed, 26 Jun 2024 14:48:23 +0000	[thread overview]
Message-ID: <LV2PR01MB7839AA7184054EAB80571117F7D62@LV2PR01MB7839.prod.exchangelabs.com> (raw)
In-Reply-To: <LV2PR01MB78399700661AC569E6CA40BAF7C82@LV2PR01MB7839.prod.exchangelabs.com>

[-- Attachment #1: Type: text/plain, Size: 19707 bytes --]

Updated the patches based on comments.

The input vectype of a reduction PHI statement must be determined before
vect cost computation for the reduction. Since a lane-reducing operation has
a different input vectype from a normal one, we need to traverse all reduction
statements to find the input vectype with the least lanes, and record that on
the PHI statement.
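
As an illustration (a hypothetical scalar loop with made-up array names,
assuming 128-bit vectors), the two lane-reducing statements below have
input vectypes <16 * char> and <8 * short>, while the reduction PHI for
sum works on <4 * int>; the PHI should record <8 * short>, the input
vectype with the least lanes:

      int sum = 0;

      loop () {
         sum += (int) char_a[i] * (int) char_b[i];    /* dot_prod, input <16 * char> */
         sum += (int) short_c[i] * (int) short_d[i];  /* dot_prod, input <8 * short> */
      }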

---
 gcc/tree-vect-loop.cc | 79 ++++++++++++++++++++++++++++++-------------
 1 file changed, 56 insertions(+), 23 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 347dac97e49..419f4b08d2b 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -7643,7 +7643,9 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
     {
       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
-      if (STMT_VINFO_REDUC_IDX (vdef) == -1)
+      int reduc_idx = STMT_VINFO_REDUC_IDX (vdef);
+
+      if (reduc_idx == -1)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -7686,10 +7688,57 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
              return false;
            }
        }
-      else if (!stmt_info)
-       /* First non-conversion stmt.  */
-       stmt_info = vdef;
-      reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
+      else
+       {
+         /* First non-conversion stmt.  */
+         if (!stmt_info)
+           stmt_info = vdef;
+
+         if (lane_reducing_op_p (op.code))
+           {
+             enum vect_def_type dt;
+             tree vectype_op;
+
+             /* The last operand of lane-reducing operation is for
+                reduction.  */
+             gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
+
+             if (!vect_is_simple_use (op.ops[0], loop_vinfo, &dt, &vectype_op))
+               return false;
+
+             tree type_op = TREE_TYPE (op.ops[0]);
+
+             if (!vectype_op)
+               {
+                 vectype_op = get_vectype_for_scalar_type (loop_vinfo,
+                                                           type_op);
+                 if (!vectype_op)
+                   return false;
+               }
+
+             /* For lane-reducing operation vectorizable analysis needs the
+                reduction PHI information */
+             STMT_VINFO_REDUC_DEF (def) = phi_info;
+
+             /* Each lane-reducing operation has its own input vectype, while
+                reduction PHI will record the input vectype with the least
+                lanes.  */
+             STMT_VINFO_REDUC_VECTYPE_IN (vdef) = vectype_op;
+
+             /* To accommodate lane-reducing operations of mixed input
+                vectypes, choose input vectype with the least lanes for the
+                reduction PHI statement, which would result in the most
+                ncopies for vectorized reduction results.  */
+             if (!vectype_in
+                 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
+                      < GET_MODE_SIZE (SCALAR_TYPE_MODE (type_op))))
+               vectype_in = vectype_op;
+           }
+         else
+           vectype_in = STMT_VINFO_VECTYPE (phi_info);
+       }
+
+      reduc_def = op.ops[reduc_idx];
       reduc_chain_length++;
       if (!stmt_info && slp_node)
        slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
@@ -7747,6 +7796,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,

   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
+  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
+
   gimple_match_op op;
   if (!gimple_extract_op (stmt_info->stmt, &op))
     gcc_unreachable ();
@@ -7831,16 +7882,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
          = get_vectype_for_scalar_type (loop_vinfo,
                                         TREE_TYPE (op.ops[i]), slp_op[i]);

-      /* To properly compute ncopies we are interested in the widest
-        non-reduction input type in case we're looking at a widening
-        accumulation that we later handle in vect_transform_reduction.  */
-      if (lane_reducing
-         && vectype_op[i]
-         && (!vectype_in
-             || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
-                 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
-       vectype_in = vectype_op[i];
-
       /* Record how the non-reduction-def value of COND_EXPR is defined.
         ???  For a chain of multiple CONDs we'd have to match them up all.  */
       if (op.code == COND_EXPR && reduc_chain_length == 1)
@@ -7859,14 +7900,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
            }
        }
     }
-  if (!vectype_in)
-    vectype_in = STMT_VINFO_VECTYPE (phi_info);
-  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
-
-  /* Each lane-reducing operation has its own input vectype, while reduction
-     PHI records the input vectype with least lanes.  */
-  if (lane_reducing)
-    STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;

   enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
   STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
--
2.17.1


________________________________________
From: Feng Xue OS <fxue@os.amperecomputing.com>
Sent: Thursday, June 20, 2024 1:47 PM
To: Richard Biener
Cc: gcc-patches@gcc.gnu.org
Subject: Re: [PATCH 4/8] vect: Determine input vectype for multiple lane-reducing

>> +         if (lane_reducing_op_p (op.code))
>> +           {
>> +             unsigned group_size = slp_node ? SLP_TREE_LANES (slp_node) : 0;
>> +             tree op_type = TREE_TYPE (op.ops[0]);
>> +             tree new_vectype_in = get_vectype_for_scalar_type (loop_vinfo,
>> +                                                                op_type,
>> +                                                                group_size);
>
> I think doing it this way does not adhere to the vector type size constraint
> with loop vectorization.  You should use vect_is_simple_use like the
> original code did as the actual vector definition determines the vector type
> used.

OK, though this might be wordy.

Actually, STMT_VINFO_REDUC_VECTYPE_IN is logically equivalent to the nunits_vectype
that is determined in vect_determine_vf_for_stmt_1(). So how about setting the type
in that function?

>
> You are always using op.ops[0] here - I think that works because
> reduc_idx is the last operand of all lane-reducing ops.  But then
> we should assert reduc_idx != 0 here and add a comment.

Already added in the following assertion.

>> +
>> +             /* The last operand of lane-reducing operation is for
>> +                reduction.  */
>> +             gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);

                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>> +
>> +             /* For lane-reducing operation vectorizable analysis needs the
>> +                reduction PHI information */
>> +             STMT_VINFO_REDUC_DEF (def) = phi_info;
>> +
>> +             if (!new_vectype_in)
>> +               return false;
>> +
>> +             /* Each lane-reducing operation has its own input vectype, while
>> +                reduction PHI will record the input vectype with the least
>> +                lanes.  */
>> +             STMT_VINFO_REDUC_VECTYPE_IN (vdef) = new_vectype_in;
>> +
>> +             /* To accommodate lane-reducing operations of mixed input
>> +                vectypes, choose input vectype with the least lanes for the
>> +                reduction PHI statement, which would result in the most
>> +                ncopies for vectorized reduction results.  */
>> +             if (!vectype_in
>> +                 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
>> +                      < GET_MODE_SIZE (SCALAR_TYPE_MODE (op_type))))
>> +               vectype_in = new_vectype_in;
>
> I know this is a fragile area but I always wonder since the accumulating operand
> is the largest (all lane-reducing ops are widening), and that will be
> equal to the
> type of the PHI node, how this condition can be ever true.

In the original code, the accumulating operand is skipped! And rightly so: we
should not count that operand, which is exactly why we call the operation
lane-reducing. With inputs of <16 * char> and <8 * short>, for example,
comparing only the non-accumulating operands can select <8 * short>, while the
<4 * int> accumulator never takes part in the comparison.

>
> ncopies is determined by the VF, so the comment is at least misleading.
>
>> +           }
>> +         else
>> +           vectype_in = STMT_VINFO_VECTYPE (phi_info);
>
> Please initialize vectype_in from phi_info before the loop (that
> should never be NULL).
>

Maybe not, as explained below.

> I'll note that with your patch it seems we'd initialize vectype_in to
> the biggest
> non-accumulation vector type involved in lane-reducing ops but the accumulating
> type might still be larger.   Why, when we have multiple lane-reducing
> ops, would
> we chose the largest input here?  I see we eventually do
>
>   if (slp_node)
>     ncopies = 1;
>   else
>     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
>
> but then IIRC we always force a single cycle def for lane-reducing ops(?).


> In particular for vect_transform_reduction and SLP we rely on
> SLP_TREE_NUMBER_OF_VEC_STMTS while non-SLP uses
> STMT_VINFO_REDUC_VECTYPE_IN.
>
> So I wonder what breaks when we set vectype_in = vector type of PHI?
>

Yes. That is right, nothing is broken. Suppose that a loop contains three dot_prods,
two of which are <16 * char> and one is <8 * short>, and we choose <4 * int> as vectype_in:

With the patch #7, we get:

      vector<4> int sum_v0 = { 0, 0, 0, 0 };
      vector<4> int sum_v1 = { 0, 0, 0, 0 };
      vector<4> int sum_v2 = { 0, 0, 0, 0 };
      vector<4> int sum_v3 = { 0, 0, 0, 0 };

      loop () {
         sum_v0 = dot_prod<16 * char>(char_a0, char_a1, sum_v0);

         sum_v0 = dot_prod<16 * char>(char_b0, char_b1, sum_v0);

         sum_v0 = dot_prod<8 * short>(short_c0_lo, short_c1_lo, sum_v0);
         sum_v1 = dot_prod<8 * short>(short_c0_hi, short_c1_hi, sum_v1);

         sum_v2 = sum_v2;
         sum_v3 = sum_v3;
      }

The def/use cycles (sum_v2 and sum_v3) would finally be optimized away, so
this gives the same result as setting vectype_in to <8 * short>.

With the patch #8, we get:

      vector<4> int sum_v0 = { 0, 0, 0, 0 };
      vector<4> int sum_v1 = { 0, 0, 0, 0 };
      vector<4> int sum_v2 = { 0, 0, 0, 0 };
      vector<4> int sum_v3 = { 0, 0, 0, 0 };

      loop () {
         sum_v0 = dot_prod<16 * char>(char_a0, char_a1, sum_v0);

         sum_v1 = dot_prod<16 * char>(char_b0, char_b1, sum_v1);

         sum_v2 = dot_prod<8 * short>(short_c0_lo, short_c1_lo, sum_v2);
         sum_v3 = dot_prod<8 * short>(short_c0_hi, short_c1_hi, sum_v3);
      }

All dot_prods are assigned to separate def/use cycles, with no dependency
among them. More def/use cycles mean higher instruction parallelism, but
also extra cost in the epilogue to combine the results.

So we consider a somewhat more compact def/use layout similar to
single-defuse-cycle, in which the two <16 * char> dot_prods are independent
and cycles 2 and 3 are not used; this is better than the 1st scheme.

      vector<4> int sum_v0 = { 0, 0, 0, 0 };
      vector<4> int sum_v1 = { 0, 0, 0, 0 };

      loop () {
         sum_v0 = dot_prod<16 * char>(char_a0, char_a1, sum_v0);

         sum_v1 = dot_prod<16 * char>(char_b0, char_b1, sum_v1);

         sum_v0 = dot_prod<8 * short>(short_c0_lo, short_c1_lo, sum_v0);
         sum_v1 = dot_prod<8 * short>(short_c0_hi, short_c1_hi, sum_v1);
      }

For this purpose, we need to track the vectype_in that results in the most
ncopies; for this case, the type is <8 * short>.
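
As a hypothetical check (assuming 128-bit vectors, so VF = 16 for the
loop above): ncopies = VF / lanes of vectype_in, so <16 * char> would
give 16 / 16 = 1 copy while <8 * short> gives 16 / 8 = 2 copies, which
matches the two def/use cycles (sum_v0 and sum_v1) in the compact
layout.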

BTW: would you please also take a look at patches #7 and #8?

Thanks,
Feng

________________________________________
From: Richard Biener <richard.guenther@gmail.com>
Sent: Wednesday, June 19, 2024 9:01 PM
To: Feng Xue OS
Cc: gcc-patches@gcc.gnu.org
Subject: Re: [PATCH 4/8] vect: Determine input vectype for multiple lane-reducing

On Sun, Jun 16, 2024 at 9:25 AM Feng Xue OS <fxue@os.amperecomputing.com> wrote:
>
> The input vectype of reduction PHI statement must be determined before
> vect cost computation for the reduction. Since lane-reducing operation has
> different input vectype from normal one, so we need to traverse all reduction
> statements to find out the input vectype with the least lanes, and set that to
> the PHI statement.
>
> Thanks,
> Feng
>
> ---
> gcc/
>         * tree-vect-loop.cc (vectorizable_reduction): Determine input vectype
>         during traversal of reduction statements.
> ---
>  gcc/tree-vect-loop.cc | 72 +++++++++++++++++++++++++++++--------------
>  1 file changed, 49 insertions(+), 23 deletions(-)
>
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 0f7b125e72d..39aa5cb1197 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -7643,7 +7643,9 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>      {
>        stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
>        stmt_vec_info vdef = vect_stmt_to_vectorize (def);
> -      if (STMT_VINFO_REDUC_IDX (vdef) == -1)
> +      int reduc_idx = STMT_VINFO_REDUC_IDX (vdef);
> +
> +      if (reduc_idx == -1)
>         {
>           if (dump_enabled_p ())
>             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> @@ -7686,10 +7688,50 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>               return false;
>             }
>         }
> -      else if (!stmt_info)
> -       /* First non-conversion stmt.  */
> -       stmt_info = vdef;
> -      reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
> +      else
> +       {
> +         /* First non-conversion stmt.  */
> +         if (!stmt_info)
> +           stmt_info = vdef;
> +
> +         if (lane_reducing_op_p (op.code))
> +           {
> +             unsigned group_size = slp_node ? SLP_TREE_LANES (slp_node) : 0;
> +             tree op_type = TREE_TYPE (op.ops[0]);
> +             tree new_vectype_in = get_vectype_for_scalar_type (loop_vinfo,
> +                                                                op_type,
> +                                                                group_size);

I think doing it this way does not adhere to the vector type size constraint
with loop vectorization.  You should use vect_is_simple_use like the
original code did as the actual vector definition determines the vector type
used.

You are always using op.ops[0] here - I think that works because
reduc_idx is the last operand of all lane-reducing ops.  But then
we should assert reduc_idx != 0 here and add a comment.

> +
> +             /* The last operand of lane-reducing operation is for
> +                reduction.  */
> +             gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
> +
> +             /* For lane-reducing operation vectorizable analysis needs the
> +                reduction PHI information */
> +             STMT_VINFO_REDUC_DEF (def) = phi_info;
> +
> +             if (!new_vectype_in)
> +               return false;
> +
> +             /* Each lane-reducing operation has its own input vectype, while
> +                reduction PHI will record the input vectype with the least
> +                lanes.  */
> +             STMT_VINFO_REDUC_VECTYPE_IN (vdef) = new_vectype_in;
> +
> +             /* To accommodate lane-reducing operations of mixed input
> +                vectypes, choose input vectype with the least lanes for the
> +                reduction PHI statement, which would result in the most
> +                ncopies for vectorized reduction results.  */
> +             if (!vectype_in
> +                 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
> +                      < GET_MODE_SIZE (SCALAR_TYPE_MODE (op_type))))
> +               vectype_in = new_vectype_in;

I know this is a fragile area but I always wonder since the accumulating operand
is the largest (all lane-reducing ops are widening), and that will be
equal to the
type of the PHI node, how this condition can be ever true.

ncopies is determined by the VF, so the comment is at least misleading.

> +           }
> +         else
> +           vectype_in = STMT_VINFO_VECTYPE (phi_info);

Please initialize vectype_in from phi_info before the loop (that
should never be NULL).

I'll note that with your patch it seems we'd initialize vectype_in to
the biggest
non-accumulation vector type involved in lane-reducing ops but the accumulating
type might still be larger.   Why, when we have multiple lane-reducing
ops, would
we chose the largest input here?  I see we eventually do

  if (slp_node)
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, vectype_in);

but then IIRC we always force a single cycle def for lane-reducing ops(?).
In particular for vect_transform_reduction and SLP we rely on
SLP_TREE_NUMBER_OF_VEC_STMTS while non-SLP uses
STMT_VINFO_REDUC_VECTYPE_IN.

So I wonder what breaks when we set vectype_in = vector type of PHI?

Richard.

> +       }
> +
> +      reduc_def = op.ops[reduc_idx];
>        reduc_chain_length++;
>        if (!stmt_info && slp_node)
>         slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
> @@ -7747,6 +7789,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>
>    tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
>    STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
> +  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
> +
>    gimple_match_op op;
>    if (!gimple_extract_op (stmt_info->stmt, &op))
>      gcc_unreachable ();
> @@ -7831,16 +7875,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>           = get_vectype_for_scalar_type (loop_vinfo,
>                                          TREE_TYPE (op.ops[i]), slp_op[i]);
>
> -      /* To properly compute ncopies we are interested in the widest
> -        non-reduction input type in case we're looking at a widening
> -        accumulation that we later handle in vect_transform_reduction.  */
> -      if (lane_reducing
> -         && vectype_op[i]
> -         && (!vectype_in
> -             || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
> -                 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
> -       vectype_in = vectype_op[i];
> -
>        /* Record how the non-reduction-def value of COND_EXPR is defined.
>          ???  For a chain of multiple CONDs we'd have to match them up all.  */
>        if (op.code == COND_EXPR && reduc_chain_length == 1)
> @@ -7859,14 +7893,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>             }
>         }
>      }
> -  if (!vectype_in)
> -    vectype_in = STMT_VINFO_VECTYPE (phi_info);
> -  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
> -
> -  /* Each lane-reducing operation has its own input vectype, while reduction
> -     PHI records the input vectype with least lanes.  */
> -  if (lane_reducing)
> -    STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
>
>    enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
>    STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
> --
> 2.17.1

[-- Attachment #2: 0001-vect-Determine-input-vectype-for-multiple-lane-reduc.patch --]
[-- Type: text/x-patch; name="0001-vect-Determine-input-vectype-for-multiple-lane-reduc.patch", Size: 5129 bytes --]

From 74c4d39f823ae94767f135babea45af16b401811 Mon Sep 17 00:00:00 2001
From: Feng Xue <fxue@os.amperecomputing.com>
Date: Sun, 16 Jun 2024 13:00:32 +0800
Subject: [PATCH 1/3] vect: Determine input vectype for multiple lane-reducing
 operations

The input vectype of a reduction PHI statement must be determined before
vect cost computation for the reduction. Since a lane-reducing operation has
a different input vectype from a normal one, we need to traverse all reduction
statements to find the input vectype with the least lanes, and record that on
the PHI statement.

2024-06-16 Feng Xue <fxue@os.amperecomputing.com>

gcc/
	* tree-vect-loop.cc (vectorizable_reduction): Determine input vectype
	during traversal of reduction statements.
---
 gcc/tree-vect-loop.cc | 79 ++++++++++++++++++++++++++++++-------------
 1 file changed, 56 insertions(+), 23 deletions(-)

diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
index 347dac97e49..419f4b08d2b 100644
--- a/gcc/tree-vect-loop.cc
+++ b/gcc/tree-vect-loop.cc
@@ -7643,7 +7643,9 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
     {
       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
-      if (STMT_VINFO_REDUC_IDX (vdef) == -1)
+      int reduc_idx = STMT_VINFO_REDUC_IDX (vdef);
+
+      if (reduc_idx == -1)
 	{
 	  if (dump_enabled_p ())
 	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -7686,10 +7688,57 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	      return false;
 	    }
 	}
-      else if (!stmt_info)
-	/* First non-conversion stmt.  */
-	stmt_info = vdef;
-      reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
+      else
+	{
+	  /* First non-conversion stmt.  */
+	  if (!stmt_info)
+	    stmt_info = vdef;
+
+	  if (lane_reducing_op_p (op.code))
+	    {
+	      enum vect_def_type dt;
+	      tree vectype_op;
+
+	      /* The last operand of lane-reducing operation is for
+		 reduction.  */
+	      gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
+
+	      if (!vect_is_simple_use (op.ops[0], loop_vinfo, &dt, &vectype_op))
+		return false;
+	      
+	      tree type_op = TREE_TYPE (op.ops[0]);
+
+	      if (!vectype_op)
+		{	      
+		  vectype_op = get_vectype_for_scalar_type (loop_vinfo,
+							    type_op);
+		  if (!vectype_op)
+		    return false;
+		}
+
+	      /* For lane-reducing operation vectorizable analysis needs the
+		 reduction PHI information */
+	      STMT_VINFO_REDUC_DEF (def) = phi_info;
+
+	      /* Each lane-reducing operation has its own input vectype, while
+		 reduction PHI will record the input vectype with the least
+		 lanes.  */
+	      STMT_VINFO_REDUC_VECTYPE_IN (vdef) = vectype_op;
+
+	      /* To accommodate lane-reducing operations of mixed input
+		 vectypes, choose input vectype with the least lanes for the
+		 reduction PHI statement, which would result in the most
+		 ncopies for vectorized reduction results.  */
+	      if (!vectype_in
+		  || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
+		       < GET_MODE_SIZE (SCALAR_TYPE_MODE (type_op))))
+		vectype_in = vectype_op;
+	    }
+	  else
+	    vectype_in = STMT_VINFO_VECTYPE (phi_info);
+	}
+
+      reduc_def = op.ops[reduc_idx];
       reduc_chain_length++;
       if (!stmt_info && slp_node)
 	slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
@@ -7747,6 +7796,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 
   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
+  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
+
   gimple_match_op op;
   if (!gimple_extract_op (stmt_info->stmt, &op))
     gcc_unreachable ();
@@ -7831,16 +7882,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	  = get_vectype_for_scalar_type (loop_vinfo,
 					 TREE_TYPE (op.ops[i]), slp_op[i]);
 
-      /* To properly compute ncopies we are interested in the widest
-	 non-reduction input type in case we're looking at a widening
-	 accumulation that we later handle in vect_transform_reduction.  */
-      if (lane_reducing
-	  && vectype_op[i]
-	  && (!vectype_in
-	      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
-		  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
-	vectype_in = vectype_op[i];
-
       /* Record how the non-reduction-def value of COND_EXPR is defined.
 	 ???  For a chain of multiple CONDs we'd have to match them up all.  */
       if (op.code == COND_EXPR && reduc_chain_length == 1)
@@ -7859,14 +7900,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
 	    }
 	}
     }
-  if (!vectype_in)
-    vectype_in = STMT_VINFO_VECTYPE (phi_info);
-  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
-
-  /* Each lane-reducing operation has its own input vectype, while reduction
-     PHI records the input vectype with least lanes.  */
-  if (lane_reducing)
-    STMT_VINFO_REDUC_VECTYPE_IN (stmt_info) = vectype_in;
 
   enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
   STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
-- 
2.17.1



Thread overview: 6+ messages
2024-06-16  7:25 Feng Xue OS
2024-06-19 13:01 ` Richard Biener
2024-06-20  5:47   ` Feng Xue OS
2024-06-20 11:52     ` Richard Biener
2024-06-26 14:48     ` Feng Xue OS [this message]
2024-06-28 12:59       ` Richard Biener
