Re: [PING, PATCH2/2, PR52252] Vectorization for load/store groups of size 3.

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* Re: [PING, PATCH2/2, PR52252] Vectorization for load/store groups of size 3.
@ 2014-06-03 13:22 Evgeny Stupachenko
  0 siblings, 0 replies; 3+ messages in thread
From: Evgeny Stupachenko @ 2014-06-03 13:22 UTC (permalink / raw)
  To: GCC Patches, Richard Biener, Jakub Jelinek, Uros Bizjak

I've added a bug report for the stores group case:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61403


On Wed, May 28, 2014 at 5:18 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
> Ping.
> Test is modified according to the fix in the test for loads.
>
> diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-st.c
> b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
> new file mode 100644
> index 0000000..e7161f7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
> @@ -0,0 +1,21 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-mssse3" { target { i?86-*-* x86_64-*-* } } } */
> +
> +#define byte unsigned char
> +
> +void
> +matrix_mul (byte *in, byte *out, int size)
> +{
> +  int i;
> +  for (i = 0; i < size; i++)
> +    {
> +      out[0] = in[0] + in[1] + in[3];
> +      out[1] = in[0] + in[2] + in[4];
> +      out[2] = in[1] + in[2] + in[4];
> +      in += 4;
> +      out += 3;
> +    }
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" {
> target { i?86-*-* x86_64-*-* } } } } */
> +/* { dg-final { cleanup-tree-dump "vect" } } */
>
>
> On Tue, May 6, 2014 at 6:39 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
>> The 2nd part of the patch is for the stores group.
>> Bootstrap and make check passed on x86.
>>
>> Is it ok?
>>
>> 2014-05-06  Evgeny Stupachenko  <evstupac@gmail.com>
>>
>>         * tree-vect-data-refs.c (vect_grouped_store_supported): New
>>         check for stores group of length 3.
>>         (vect_permute_store_chain): New permutations for stores group of
>>         length 3.
>>         * tree-vect-stmts.c (vect_model_store_cost): Change cost
>>         of vec_perm_shuffle for the new permutations.
>>
>> ChangeLog for testsuite:
>>
>> 2014-05-06  Evgeny Stupachenko  <evstupac@gmail.com>
>>
>>        PR tree-optimization/52252
>>        * gcc.dg/vect/pr52252-st.c: Test on stores group of size 3.
>>
>> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
>> index ef710cf..fb0e30d 100644
>> --- a/gcc/tree-vect-data-refs.c
>> +++ b/gcc/tree-vect-data-refs.c
>> @@ -4365,13 +4365,14 @@ vect_grouped_store_supported (tree vectype,
>> unsigned HOST_WIDE_INT count)
>>  {
>>    enum machine_mode mode = TYPE_MODE (vectype);
>>
>> -  /* vect_permute_store_chain requires the group size to be a power of two.  */
>> -  if (exact_log2 (count) == -1)
>> +  /* vect_permute_store_chain requires the group size to be equal to 3 or
>> +     be a power of two.  */
>> +  if (count != 3 && exact_log2 (count) == -1)
>>      {
>>        if (dump_enabled_p ())
>>         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> -                         "the size of the group of accesses"
>> -                         " is not a power of 2\n");
>> +                        "the size of the group of accesses"
>> +                        " is not a power of 2 or not eqaul to 3\n");
>>        return false;
>>      }
>>
>> @@ -4380,23 +4381,76 @@ vect_grouped_store_supported (tree vectype,
>> unsigned HOST_WIDE_INT count)
>>      {
>>        unsigned int i, nelt = GET_MODE_NUNITS (mode);
>>        unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
>> -      for (i = 0; i < nelt / 2; i++)
>> +
>> +      if (count == 3)
>>         {
>> -         sel[i * 2] = i;
>> -         sel[i * 2 + 1] = i + nelt;
>> +         unsigned int j0 = 0, j1 = 0, j2 = 0;
>> +         unsigned int i, j;
>> +
>> +         for (j = 0; j < 3; j++)
>> +           {
>> +             int nelt0 = ((3 - j) * nelt) % 3;
>> +             int nelt1 = ((3 - j) * nelt + 1) % 3;
>> +             int nelt2 = ((3 - j) * nelt + 2) % 3;
>> +             for (i = 0; i < nelt; i++)
>> +               {
>> +                 if (3 * i + nelt0 < nelt)
>> +                   sel[3 * i + nelt0] = j0++;
>> +                 if (3 * i + nelt1 < nelt)
>> +                   sel[3 * i + nelt1] = nelt + j1++;
>> +                 if (3 * i + nelt2 < nelt)
>> +                   sel[3 * i + nelt2] = 0;
>> +               }
>> +             if (!can_vec_perm_p (mode, false, sel))
>> +               {
>> +                 if (dump_enabled_p ())
>> +                   dump_printf (MSG_MISSED_OPTIMIZATION,
>> +                                "permutaion op not supported by target.\n");
>> +                 return false;
>> +               }
>> +
>> +             for (i = 0; i < nelt; i++)
>> +               {
>> +                 if (3 * i + nelt0 < nelt)
>> +                   sel[3 * i + nelt0] = 3 * i + nelt0;
>> +                 if (3 * i + nelt1 < nelt)
>> +                   sel[3 * i + nelt1] = 3 * i + nelt1;
>> +                 if (3 * i + nelt2 < nelt)
>> +                   sel[3 * i + nelt2] = nelt + j2++;
>> +               }
>> +             if (!can_vec_perm_p (mode, false, sel))
>> +               {
>> +                 if (dump_enabled_p ())
>> +                   dump_printf (MSG_MISSED_OPTIMIZATION,
>> +                                "permutaion op not supported by target.\n");
>> +                 return false;
>> +               }
>> +           }
>> +         return true;
>>         }
>> -      if (can_vec_perm_p (mode, false, sel))
>> +      else
>>         {
>> -         for (i = 0; i < nelt; i++)
>> -           sel[i] += nelt / 2;
>> -         if (can_vec_perm_p (mode, false, sel))
>> -           return true;
>> +         /* If length is not equal to 3 then only power of 2 is supported.  */
>> +         gcc_assert (exact_log2 (count) != -1);
>> +
>> +         for (i = 0; i < nelt / 2; i++)
>> +           {
>> +             sel[i * 2] = i;
>> +             sel[i * 2 + 1] = i + nelt;
>> +           }
>> +           if (can_vec_perm_p (mode, false, sel))
>> +             {
>> +               for (i = 0; i < nelt; i++)
>> +                 sel[i] += nelt / 2;
>> +               if (can_vec_perm_p (mode, false, sel))
>> +                 return true;
>> +             }
>>         }
>>      }
>>
>>    if (dump_enabled_p ())
>>      dump_printf (MSG_MISSED_OPTIMIZATION,
>> -                 "interleave op not supported by target.\n");
>> +                "permutaion op not supported by target.\n");
>>    return false;
>>  }
>>
>> @@ -4416,9 +4470,9 @@ vect_store_lanes_supported (tree vectype,
>> unsigned HOST_WIDE_INT count)
>>  /* Function vect_permute_store_chain.
>>
>>     Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
>> -   a power of 2, generate interleave_high/low stmts to reorder the data
>> -   correctly for the stores.  Return the final references for stores in
>> -   RESULT_CHAIN.
>> +   a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
>> +   the data correctly for the stores.  Return the final references for stores
>> +   in RESULT_CHAIN.
>>
>>     E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
>>     The input is 4 vectors each containing 8 elements.  We assign a number to
>> @@ -4485,7 +4539,9 @@ vect_permute_store_chain (vec<tree> dr_chain,
>>    gimple perm_stmt;
>>    tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
>>    tree perm_mask_low, perm_mask_high;
>> -  unsigned int i, n;
>> +  tree data_ref;
>> +  tree perm3_mask_low, perm3_mask_high;
>> +  unsigned int i, n, log_length = exact_log2 (length);
>>    unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
>>    unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
>>
>> @@ -4493,47 +4549,116 @@ vect_permute_store_chain (vec<tree> dr_chain,
>>    memcpy (result_chain->address (), dr_chain.address (),
>>           length * sizeof (tree));
>>
>> -  for (i = 0, n = nelt / 2; i < n; i++)
>> +  if (length == 3)
>>      {
>> -      sel[i * 2] = i;
>> -      sel[i * 2 + 1] = i + nelt;
>> -    }
>> -  perm_mask_high = vect_gen_perm_mask (vectype, sel);
>> -  gcc_assert (perm_mask_high != NULL);
>> +      unsigned int j0 = 0, j1 = 0, j2 = 0;
>>
>> -  for (i = 0; i < nelt; i++)
>> -    sel[i] += nelt / 2;
>> -  perm_mask_low = vect_gen_perm_mask (vectype, sel);
>> -  gcc_assert (perm_mask_low != NULL);
>> +      for (j = 0; j < 3; j++)
>> +        {
>> +         int nelt0 = ((3 - j) * nelt) % 3;
>> +         int nelt1 = ((3 - j) * nelt + 1) % 3;
>> +         int nelt2 = ((3 - j) * nelt + 2) % 3;
>>
>> -  for (i = 0, n = exact_log2 (length); i < n; i++)
>> -    {
>> -      for (j = 0; j < length/2; j++)
>> -       {
>> -         vect1 = dr_chain[j];
>> -         vect2 = dr_chain[j+length/2];
>> +         for (i = 0; i < nelt; i++)
>> +           {
>> +             if (3 * i + nelt0 < nelt)
>> +               sel[3 * i + nelt0] = j0++;
>> +             if (3 * i + nelt1 < nelt)
>> +               sel[3 * i + nelt1] = nelt + j1++;
>> +             if (3 * i + nelt2 < nelt)
>> +               sel[3 * i + nelt2] = 0;
>> +           }
>> +         perm3_mask_low = vect_gen_perm_mask (vectype, sel);
>> +         gcc_assert (perm3_mask_low != NULL);
>> +
>> +         for (i = 0; i < nelt; i++)
>> +           {
>> +             if (3 * i + nelt0 < nelt)
>> +               sel[3 * i + nelt0] = 3 * i + nelt0;
>> +             if (3 * i + nelt1 < nelt)
>> +               sel[3 * i + nelt1] = 3 * i + nelt1;
>> +             if (3 * i + nelt2 < nelt)
>> +               sel[3 * i + nelt2] = nelt + j2++;
>> +           }
>> +         perm3_mask_high = vect_gen_perm_mask (vectype, sel);
>> +         gcc_assert (perm3_mask_high != NULL);
>> +
>> +         vect1 = dr_chain[0];
>> +         vect2 = dr_chain[1];
>>
>>           /* Create interleaving stmt:
>> -            high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}>  */
>> -         high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
>> -         perm_stmt
>> -           = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
>> -                                           vect1, vect2, perm_mask_high);
>> +            low = VEC_PERM_EXPR <vect1, vect2,
>> +                                 {j, nelt, *, j + 1, nelt + j + 1, *,
>> +                                  j + 2, nelt + j + 2, *, ...}>  */
>> +         data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
>> +         perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>> +                                                   vect1, vect2,
>> +                                                   perm3_mask_low);
>>           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>> -         (*result_chain)[2*j] = high;
>>
>> +         vect1 = data_ref;
>> +         vect2 = dr_chain[2];
>>           /* Create interleaving stmt:
>> -            low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
>> -                                                nelt*3/2+1, ...}>  */
>> -         low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
>> -         perm_stmt
>> -           = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
>> -                                           vect1, vect2, perm_mask_low);
>> +            low = VEC_PERM_EXPR <vect1, vect2,
>> +                                 {0, 1, nelt + j, 3, 4, nelt + j + 1,
>> +                                  6, 7, nelt + j + 2, ...}>  */
>> +         data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
>> +         perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>> +                                                   vect1, vect2,
>> +                                                   perm3_mask_high);
>>           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>> -         (*result_chain)[2*j+1] = low;
>> +         (*result_chain)[j] = data_ref;
>> +       }
>> +    }
>> +  else
>> +    {
>> +      /* If length is not equal to 3 then only power of 2 is supported.  */
>> +      gcc_assert (exact_log2 (length) != -1);
>> +
>> +      for (i = 0, n = nelt / 2; i < n; i++)
>> +       {
>> +         sel[i * 2] = i;
>> +         sel[i * 2 + 1] = i + nelt;
>>         }
>> -      memcpy (dr_chain.address (), result_chain->address (),
>> -             length * sizeof (tree));
>> +       perm_mask_high = vect_gen_perm_mask (vectype, sel);
>> +       gcc_assert (perm_mask_high != NULL);
>> +
>> +       for (i = 0; i < nelt; i++)
>> +         sel[i] += nelt / 2;
>> +       perm_mask_low = vect_gen_perm_mask (vectype, sel);
>> +       gcc_assert (perm_mask_low != NULL);
>> +
>> +       for (i = 0, n = log_length; i < n; i++)
>> +         {
>> +           for (j = 0; j < length/2; j++)
>> +             {
>> +               vect1 = dr_chain[j];
>> +               vect2 = dr_chain[j+length/2];
>> +
>> +               /* Create interleaving stmt:
>> +                  high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
>> +                                                       ...}>  */
>> +               high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
>> +               perm_stmt
>> +                 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
>> +                                                 vect1, vect2, perm_mask_high);
>> +               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>> +               (*result_chain)[2*j] = high;
>> +
>> +               /* Create interleaving stmt:
>> +                  low = VEC_PERM_EXPR <vect1, vect2,
>> +                                       {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
>> +                                        ...}>  */
>> +               low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
>> +               perm_stmt
>> +                 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
>> +                                                 vect1, vect2, perm_mask_low);
>> +               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>> +               (*result_chain)[2*j+1] = low;
>> +             }
>> +           memcpy (dr_chain.address (), result_chain->address (),
>> +                   length * sizeof (tree));
>> +         }
>>      }
>>  }
>>
>> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
>> index b87c143..24d0b94 100644
>> --- a/gcc/tree-vect-stmts.c
>> +++ b/gcc/tree-vect-stmts.c
>> @@ -974,9 +974,9 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
>>       include the cost of the permutes.  */
>>    if (!store_lanes_p && group_size > 1)
>>      {
>> -      /* Uses a high and low interleave operation for each needed permute.  */
>> -
>> -      int nstmts = ncopies * exact_log2 (group_size) * group_size;
>> +      /* Uses a high and low interleave or shuffle operations for each
>> +        needed permute.  */
>> +      int nstmts = ncopies * ceil_log2 (group_size) * group_size;
>>        inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm,
>>                                       stmt_info, 0, vect_body);
>>
>>
>> diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-st.c
>> b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
>> new file mode 100644
>> index 0000000..cc1e72e
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
>> @@ -0,0 +1,21 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2 -g -ftree-vectorize -mssse3
>> -fdump-tree-vect-details" { target { i?86-*-* x86_64-*-* } } } */
>> +
>> +#define byte unsigned char
>> +
>> +void
>> +matrix_mul (byte *in, byte *out, int size)
>> +{
>> +  int i;
>> +  for (i = 0; i < size; i++)
>> +    {
>> +      out[0] = in[0] + in[1] + in[3];
>> +      out[1] = in[0] + in[2] + in[4];
>> +      out[2] = in[1] + in[2] + in[4];
>> +      in += 4;
>> +      out += 3;
>> +    }
>> +}
>> +
>> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
>> +/* { dg-final { cleanup-tree-dump "vect" } } */

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PING, PATCH2/2, PR52252] Vectorization for load/store groups of size 3.
  2014-06-10 10:11 Evgeny Stupachenko
@ 2014-06-10 11:25 ` Richard Biener
  0 siblings, 0 replies; 3+ messages in thread
From: Richard Biener @ 2014-06-10 11:25 UTC (permalink / raw)
  To: Evgeny Stupachenko; +Cc: GCC Patches, Jakub Jelinek, Uros Bizjak

On Tue, 10 Jun 2014, Evgeny Stupachenko wrote:

> ping.
> The changes are similar to those already committed for the loads group.

Ok.

Thanks,
Richard.

> On Tue, Jun 3, 2014 at 5:22 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
> > I've added a bug report for the stores group case:
> > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61403
> >
> >
> > On Wed, May 28, 2014 at 5:18 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
> >> Ping.
> >> Test is modified according to the fix in the test for loads.
> >>
> >> diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-st.c
> >> b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
> >> new file mode 100644
> >> index 0000000..e7161f7
> >> --- /dev/null
> >> +++ b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
> >> @@ -0,0 +1,21 @@
> >> +/* { dg-do compile } */
> >> +/* { dg-additional-options "-mssse3" { target { i?86-*-* x86_64-*-* } } } */
> >> +
> >> +#define byte unsigned char
> >> +
> >> +void
> >> +matrix_mul (byte *in, byte *out, int size)
> >> +{
> >> +  int i;
> >> +  for (i = 0; i < size; i++)
> >> +    {
> >> +      out[0] = in[0] + in[1] + in[3];
> >> +      out[1] = in[0] + in[2] + in[4];
> >> +      out[2] = in[1] + in[2] + in[4];
> >> +      in += 4;
> >> +      out += 3;
> >> +    }
> >> +}
> >> +
> >> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" {
> >> target { i?86-*-* x86_64-*-* } } } } */
> >> +/* { dg-final { cleanup-tree-dump "vect" } } */
> >>
> >>
> >> On Tue, May 6, 2014 at 6:39 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
> >>> The 2nd part of the patch is for the stores group.
> >>> Bootstrap and make check passed on x86.
> >>>
> >>> Is it ok?
> >>>
> >>> 2014-05-06  Evgeny Stupachenko  <evstupac@gmail.com>
> >>>
> >>>         * tree-vect-data-refs.c (vect_grouped_store_supported): New
> >>>         check for stores group of length 3.
> >>>         (vect_permute_store_chain): New permutations for stores group of
> >>>         length 3.
> >>>         * tree-vect-stmts.c (vect_model_store_cost): Change cost
> >>>         of vec_perm_shuffle for the new permutations.
> >>>
> >>> ChangeLog for testsuite:
> >>>
> >>> 2014-05-06  Evgeny Stupachenko  <evstupac@gmail.com>
> >>>
> >>>        PR tree-optimization/52252
> >>>        * gcc.dg/vect/pr52252-st.c: Test on stores group of size 3.
> >>>
> >>> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
> >>> index ef710cf..fb0e30d 100644
> >>> --- a/gcc/tree-vect-data-refs.c
> >>> +++ b/gcc/tree-vect-data-refs.c
> >>> @@ -4365,13 +4365,14 @@ vect_grouped_store_supported (tree vectype,
> >>> unsigned HOST_WIDE_INT count)
> >>>  {
> >>>    enum machine_mode mode = TYPE_MODE (vectype);
> >>>
> >>> -  /* vect_permute_store_chain requires the group size to be a power of two.  */
> >>> -  if (exact_log2 (count) == -1)
> >>> +  /* vect_permute_store_chain requires the group size to be equal to 3 or
> >>> +     be a power of two.  */
> >>> +  if (count != 3 && exact_log2 (count) == -1)
> >>>      {
> >>>        if (dump_enabled_p ())
> >>>         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> >>> -                         "the size of the group of accesses"
> >>> -                         " is not a power of 2\n");
> >>> +                        "the size of the group of accesses"
> >>> +                        " is not a power of 2 or not eqaul to 3\n");
> >>>        return false;
> >>>      }
> >>>
> >>> @@ -4380,23 +4381,76 @@ vect_grouped_store_supported (tree vectype,
> >>> unsigned HOST_WIDE_INT count)
> >>>      {
> >>>        unsigned int i, nelt = GET_MODE_NUNITS (mode);
> >>>        unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
> >>> -      for (i = 0; i < nelt / 2; i++)
> >>> +
> >>> +      if (count == 3)
> >>>         {
> >>> -         sel[i * 2] = i;
> >>> -         sel[i * 2 + 1] = i + nelt;
> >>> +         unsigned int j0 = 0, j1 = 0, j2 = 0;
> >>> +         unsigned int i, j;
> >>> +
> >>> +         for (j = 0; j < 3; j++)
> >>> +           {
> >>> +             int nelt0 = ((3 - j) * nelt) % 3;
> >>> +             int nelt1 = ((3 - j) * nelt + 1) % 3;
> >>> +             int nelt2 = ((3 - j) * nelt + 2) % 3;
> >>> +             for (i = 0; i < nelt; i++)
> >>> +               {
> >>> +                 if (3 * i + nelt0 < nelt)
> >>> +                   sel[3 * i + nelt0] = j0++;
> >>> +                 if (3 * i + nelt1 < nelt)
> >>> +                   sel[3 * i + nelt1] = nelt + j1++;
> >>> +                 if (3 * i + nelt2 < nelt)
> >>> +                   sel[3 * i + nelt2] = 0;
> >>> +               }
> >>> +             if (!can_vec_perm_p (mode, false, sel))
> >>> +               {
> >>> +                 if (dump_enabled_p ())
> >>> +                   dump_printf (MSG_MISSED_OPTIMIZATION,
> >>> +                                "permutaion op not supported by target.\n");
> >>> +                 return false;
> >>> +               }
> >>> +
> >>> +             for (i = 0; i < nelt; i++)
> >>> +               {
> >>> +                 if (3 * i + nelt0 < nelt)
> >>> +                   sel[3 * i + nelt0] = 3 * i + nelt0;
> >>> +                 if (3 * i + nelt1 < nelt)
> >>> +                   sel[3 * i + nelt1] = 3 * i + nelt1;
> >>> +                 if (3 * i + nelt2 < nelt)
> >>> +                   sel[3 * i + nelt2] = nelt + j2++;
> >>> +               }
> >>> +             if (!can_vec_perm_p (mode, false, sel))
> >>> +               {
> >>> +                 if (dump_enabled_p ())
> >>> +                   dump_printf (MSG_MISSED_OPTIMIZATION,
> >>> +                                "permutaion op not supported by target.\n");
> >>> +                 return false;
> >>> +               }
> >>> +           }
> >>> +         return true;
> >>>         }
> >>> -      if (can_vec_perm_p (mode, false, sel))
> >>> +      else
> >>>         {
> >>> -         for (i = 0; i < nelt; i++)
> >>> -           sel[i] += nelt / 2;
> >>> -         if (can_vec_perm_p (mode, false, sel))
> >>> -           return true;
> >>> +         /* If length is not equal to 3 then only power of 2 is supported.  */
> >>> +         gcc_assert (exact_log2 (count) != -1);
> >>> +
> >>> +         for (i = 0; i < nelt / 2; i++)
> >>> +           {
> >>> +             sel[i * 2] = i;
> >>> +             sel[i * 2 + 1] = i + nelt;
> >>> +           }
> >>> +           if (can_vec_perm_p (mode, false, sel))
> >>> +             {
> >>> +               for (i = 0; i < nelt; i++)
> >>> +                 sel[i] += nelt / 2;
> >>> +               if (can_vec_perm_p (mode, false, sel))
> >>> +                 return true;
> >>> +             }
> >>>         }
> >>>      }
> >>>
> >>>    if (dump_enabled_p ())
> >>>      dump_printf (MSG_MISSED_OPTIMIZATION,
> >>> -                 "interleave op not supported by target.\n");
> >>> +                "permutaion op not supported by target.\n");
> >>>    return false;
> >>>  }
> >>>
> >>> @@ -4416,9 +4470,9 @@ vect_store_lanes_supported (tree vectype,
> >>> unsigned HOST_WIDE_INT count)
> >>>  /* Function vect_permute_store_chain.
> >>>
> >>>     Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
> >>> -   a power of 2, generate interleave_high/low stmts to reorder the data
> >>> -   correctly for the stores.  Return the final references for stores in
> >>> -   RESULT_CHAIN.
> >>> +   a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
> >>> +   the data correctly for the stores.  Return the final references for stores
> >>> +   in RESULT_CHAIN.
> >>>
> >>>     E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
> >>>     The input is 4 vectors each containing 8 elements.  We assign a number to
> >>> @@ -4485,7 +4539,9 @@ vect_permute_store_chain (vec<tree> dr_chain,
> >>>    gimple perm_stmt;
> >>>    tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
> >>>    tree perm_mask_low, perm_mask_high;
> >>> -  unsigned int i, n;
> >>> +  tree data_ref;
> >>> +  tree perm3_mask_low, perm3_mask_high;
> >>> +  unsigned int i, n, log_length = exact_log2 (length);
> >>>    unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
> >>>    unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
> >>>
> >>> @@ -4493,47 +4549,116 @@ vect_permute_store_chain (vec<tree> dr_chain,
> >>>    memcpy (result_chain->address (), dr_chain.address (),
> >>>           length * sizeof (tree));
> >>>
> >>> -  for (i = 0, n = nelt / 2; i < n; i++)
> >>> +  if (length == 3)
> >>>      {
> >>> -      sel[i * 2] = i;
> >>> -      sel[i * 2 + 1] = i + nelt;
> >>> -    }
> >>> -  perm_mask_high = vect_gen_perm_mask (vectype, sel);
> >>> -  gcc_assert (perm_mask_high != NULL);
> >>> +      unsigned int j0 = 0, j1 = 0, j2 = 0;
> >>>
> >>> -  for (i = 0; i < nelt; i++)
> >>> -    sel[i] += nelt / 2;
> >>> -  perm_mask_low = vect_gen_perm_mask (vectype, sel);
> >>> -  gcc_assert (perm_mask_low != NULL);
> >>> +      for (j = 0; j < 3; j++)
> >>> +        {
> >>> +         int nelt0 = ((3 - j) * nelt) % 3;
> >>> +         int nelt1 = ((3 - j) * nelt + 1) % 3;
> >>> +         int nelt2 = ((3 - j) * nelt + 2) % 3;
> >>>
> >>> -  for (i = 0, n = exact_log2 (length); i < n; i++)
> >>> -    {
> >>> -      for (j = 0; j < length/2; j++)
> >>> -       {
> >>> -         vect1 = dr_chain[j];
> >>> -         vect2 = dr_chain[j+length/2];
> >>> +         for (i = 0; i < nelt; i++)
> >>> +           {
> >>> +             if (3 * i + nelt0 < nelt)
> >>> +               sel[3 * i + nelt0] = j0++;
> >>> +             if (3 * i + nelt1 < nelt)
> >>> +               sel[3 * i + nelt1] = nelt + j1++;
> >>> +             if (3 * i + nelt2 < nelt)
> >>> +               sel[3 * i + nelt2] = 0;
> >>> +           }
> >>> +         perm3_mask_low = vect_gen_perm_mask (vectype, sel);
> >>> +         gcc_assert (perm3_mask_low != NULL);
> >>> +
> >>> +         for (i = 0; i < nelt; i++)
> >>> +           {
> >>> +             if (3 * i + nelt0 < nelt)
> >>> +               sel[3 * i + nelt0] = 3 * i + nelt0;
> >>> +             if (3 * i + nelt1 < nelt)
> >>> +               sel[3 * i + nelt1] = 3 * i + nelt1;
> >>> +             if (3 * i + nelt2 < nelt)
> >>> +               sel[3 * i + nelt2] = nelt + j2++;
> >>> +           }
> >>> +         perm3_mask_high = vect_gen_perm_mask (vectype, sel);
> >>> +         gcc_assert (perm3_mask_high != NULL);
> >>> +
> >>> +         vect1 = dr_chain[0];
> >>> +         vect2 = dr_chain[1];
> >>>
> >>>           /* Create interleaving stmt:
> >>> -            high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}>  */
> >>> -         high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
> >>> -         perm_stmt
> >>> -           = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
> >>> -                                           vect1, vect2, perm_mask_high);
> >>> +            low = VEC_PERM_EXPR <vect1, vect2,
> >>> +                                 {j, nelt, *, j + 1, nelt + j + 1, *,
> >>> +                                  j + 2, nelt + j + 2, *, ...}>  */
> >>> +         data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
> >>> +         perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
> >>> +                                                   vect1, vect2,
> >>> +                                                   perm3_mask_low);
> >>>           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
> >>> -         (*result_chain)[2*j] = high;
> >>>
> >>> +         vect1 = data_ref;
> >>> +         vect2 = dr_chain[2];
> >>>           /* Create interleaving stmt:
> >>> -            low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
> >>> -                                                nelt*3/2+1, ...}>  */
> >>> -         low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
> >>> -         perm_stmt
> >>> -           = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
> >>> -                                           vect1, vect2, perm_mask_low);
> >>> +            low = VEC_PERM_EXPR <vect1, vect2,
> >>> +                                 {0, 1, nelt + j, 3, 4, nelt + j + 1,
> >>> +                                  6, 7, nelt + j + 2, ...}>  */
> >>> +         data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
> >>> +         perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
> >>> +                                                   vect1, vect2,
> >>> +                                                   perm3_mask_high);
> >>>           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
> >>> -         (*result_chain)[2*j+1] = low;
> >>> +         (*result_chain)[j] = data_ref;
> >>> +       }
> >>> +    }
> >>> +  else
> >>> +    {
> >>> +      /* If length is not equal to 3 then only power of 2 is supported.  */
> >>> +      gcc_assert (exact_log2 (length) != -1);
> >>> +
> >>> +      for (i = 0, n = nelt / 2; i < n; i++)
> >>> +       {
> >>> +         sel[i * 2] = i;
> >>> +         sel[i * 2 + 1] = i + nelt;
> >>>         }
> >>> -      memcpy (dr_chain.address (), result_chain->address (),
> >>> -             length * sizeof (tree));
> >>> +       perm_mask_high = vect_gen_perm_mask (vectype, sel);
> >>> +       gcc_assert (perm_mask_high != NULL);
> >>> +
> >>> +       for (i = 0; i < nelt; i++)
> >>> +         sel[i] += nelt / 2;
> >>> +       perm_mask_low = vect_gen_perm_mask (vectype, sel);
> >>> +       gcc_assert (perm_mask_low != NULL);
> >>> +
> >>> +       for (i = 0, n = log_length; i < n; i++)
> >>> +         {
> >>> +           for (j = 0; j < length/2; j++)
> >>> +             {
> >>> +               vect1 = dr_chain[j];
> >>> +               vect2 = dr_chain[j+length/2];
> >>> +
> >>> +               /* Create interleaving stmt:
> >>> +                  high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
> >>> +                                                       ...}>  */
> >>> +               high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
> >>> +               perm_stmt
> >>> +                 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
> >>> +                                                 vect1, vect2, perm_mask_high);
> >>> +               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
> >>> +               (*result_chain)[2*j] = high;
> >>> +
> >>> +               /* Create interleaving stmt:
> >>> +                  low = VEC_PERM_EXPR <vect1, vect2,
> >>> +                                       {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
> >>> +                                        ...}>  */
> >>> +               low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
> >>> +               perm_stmt
> >>> +                 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
> >>> +                                                 vect1, vect2, perm_mask_low);
> >>> +               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
> >>> +               (*result_chain)[2*j+1] = low;
> >>> +             }
> >>> +           memcpy (dr_chain.address (), result_chain->address (),
> >>> +                   length * sizeof (tree));
> >>> +         }
> >>>      }
> >>>  }
> >>>
> >>> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> >>> index b87c143..24d0b94 100644
> >>> --- a/gcc/tree-vect-stmts.c
> >>> +++ b/gcc/tree-vect-stmts.c
> >>> @@ -974,9 +974,9 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
> >>>       include the cost of the permutes.  */
> >>>    if (!store_lanes_p && group_size > 1)
> >>>      {
> >>> -      /* Uses a high and low interleave operation for each needed permute.  */
> >>> -
> >>> -      int nstmts = ncopies * exact_log2 (group_size) * group_size;
> >>> +      /* Uses a high and low interleave or shuffle operations for each
> >>> +        needed permute.  */
> >>> +      int nstmts = ncopies * ceil_log2 (group_size) * group_size;
> >>>        inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm,
> >>>                                       stmt_info, 0, vect_body);
> >>>
> >>>
> >>> diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-st.c
> >>> b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
> >>> new file mode 100644
> >>> index 0000000..cc1e72e
> >>> --- /dev/null
> >>> +++ b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
> >>> @@ -0,0 +1,21 @@
> >>> +/* { dg-do compile } */
> >>> +/* { dg-options "-O2 -g -ftree-vectorize -mssse3
> >>> -fdump-tree-vect-details" { target { i?86-*-* x86_64-*-* } } } */
> >>> +
> >>> +#define byte unsigned char
> >>> +
> >>> +void
> >>> +matrix_mul (byte *in, byte *out, int size)
> >>> +{
> >>> +  int i;
> >>> +  for (i = 0; i < size; i++)
> >>> +    {
> >>> +      out[0] = in[0] + in[1] + in[3];
> >>> +      out[1] = in[0] + in[2] + in[4];
> >>> +      out[2] = in[1] + in[2] + in[4];
> >>> +      in += 4;
> >>> +      out += 3;
> >>> +    }
> >>> +}
> >>> +
> >>> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> >>> +/* { dg-final { cleanup-tree-dump "vect" } } */
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE / SUSE Labs
SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PING, PATCH2/2, PR52252] Vectorization for load/store groups of size 3.
@ 2014-06-10 10:11 Evgeny Stupachenko
  2014-06-10 11:25 ` Richard Biener
  0 siblings, 1 reply; 3+ messages in thread
From: Evgeny Stupachenko @ 2014-06-10 10:11 UTC (permalink / raw)
  To: GCC Patches, Richard Biener, Jakub Jelinek, Uros Bizjak

Ping.
The changes are similar to those already committed for the loads group.

On Tue, Jun 3, 2014 at 5:22 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
> I've added a bug report for the stores group case:
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61403
>
>
> On Wed, May 28, 2014 at 5:18 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
>> Ping.
>> Test is modified according to the fix in the test for loads.
>>
>> diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-st.c
>> b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
>> new file mode 100644
>> index 0000000..e7161f7
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
>> @@ -0,0 +1,21 @@
>> +/* { dg-do compile } */
>> +/* { dg-additional-options "-mssse3" { target { i?86-*-* x86_64-*-* } } } */
>> +
>> +#define byte unsigned char
>> +
>> +void
>> +matrix_mul (byte *in, byte *out, int size)
>> +{
>> +  int i;
>> +  for (i = 0; i < size; i++)
>> +    {
>> +      out[0] = in[0] + in[1] + in[3];
>> +      out[1] = in[0] + in[2] + in[4];
>> +      out[2] = in[1] + in[2] + in[4];
>> +      in += 4;
>> +      out += 3;
>> +    }
>> +}
>> +
>> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" {
>> target { i?86-*-* x86_64-*-* } } } } */
>> +/* { dg-final { cleanup-tree-dump "vect" } } */
>>
>>
>> On Tue, May 6, 2014 at 6:39 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
>>> 2nd part of patch is on stores group.
>>> Bootstrap and make check passed on x86.
>>>
>>> Is it ok?
>>>
>>> 2014-05-06  Evgeny Stupachenko  <evstupac@gmail.com>
>>>
>>>         * tree-vect-data-refs.c (vect_grouped_store_supported): New
>>>         check for stores group of length 3.
>>>         (vect_permute_store_chain): New permutations for stores group of
>>>         length 3.
>>>         * tree-vect-stmts.c (vect_model_store_cost): Change cost
>>>         of vec_perm_shuffle for the new permutations.
>>>
>>> ChangeLog for testsuite:
>>>
>>> 2014-05-06  Evgeny Stupachenko  <evstupac@gmail.com>
>>>
>>>        PR tree-optimization/52252
>>>        * gcc.dg/vect/pr52252-st.c: Test on stores group of size 3.
>>>
>>> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
>>> index ef710cf..fb0e30d 100644
>>> --- a/gcc/tree-vect-data-refs.c
>>> +++ b/gcc/tree-vect-data-refs.c
>>> @@ -4365,13 +4365,14 @@ vect_grouped_store_supported (tree vectype,
>>> unsigned HOST_WIDE_INT count)
>>>  {
>>>    enum machine_mode mode = TYPE_MODE (vectype);
>>>
>>> -  /* vect_permute_store_chain requires the group size to be a power of two.  */
>>> -  if (exact_log2 (count) == -1)
>>> +  /* vect_permute_store_chain requires the group size to be equal to 3 or
>>> +     be a power of two.  */
>>> +  if (count != 3 && exact_log2 (count) == -1)
>>>      {
>>>        if (dump_enabled_p ())
>>>         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>>> -                         "the size of the group of accesses"
>>> -                         " is not a power of 2\n");
>>> +                        "the size of the group of accesses"
>>> +                        " is not a power of 2 or not eqaul to 3\n");
>>>        return false;
>>>      }
>>>
>>> @@ -4380,23 +4381,76 @@ vect_grouped_store_supported (tree vectype,
>>> unsigned HOST_WIDE_INT count)
>>>      {
>>>        unsigned int i, nelt = GET_MODE_NUNITS (mode);
>>>        unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
>>> -      for (i = 0; i < nelt / 2; i++)
>>> +
>>> +      if (count == 3)
>>>         {
>>> -         sel[i * 2] = i;
>>> -         sel[i * 2 + 1] = i + nelt;
>>> +         unsigned int j0 = 0, j1 = 0, j2 = 0;
>>> +         unsigned int i, j;
>>> +
>>> +         for (j = 0; j < 3; j++)
>>> +           {
>>> +             int nelt0 = ((3 - j) * nelt) % 3;
>>> +             int nelt1 = ((3 - j) * nelt + 1) % 3;
>>> +             int nelt2 = ((3 - j) * nelt + 2) % 3;
>>> +             for (i = 0; i < nelt; i++)
>>> +               {
>>> +                 if (3 * i + nelt0 < nelt)
>>> +                   sel[3 * i + nelt0] = j0++;
>>> +                 if (3 * i + nelt1 < nelt)
>>> +                   sel[3 * i + nelt1] = nelt + j1++;
>>> +                 if (3 * i + nelt2 < nelt)
>>> +                   sel[3 * i + nelt2] = 0;
>>> +               }
>>> +             if (!can_vec_perm_p (mode, false, sel))
>>> +               {
>>> +                 if (dump_enabled_p ())
>>> +                   dump_printf (MSG_MISSED_OPTIMIZATION,
>>> +                                "permutaion op not supported by target.\n");
>>> +                 return false;
>>> +               }
>>> +
>>> +             for (i = 0; i < nelt; i++)
>>> +               {
>>> +                 if (3 * i + nelt0 < nelt)
>>> +                   sel[3 * i + nelt0] = 3 * i + nelt0;
>>> +                 if (3 * i + nelt1 < nelt)
>>> +                   sel[3 * i + nelt1] = 3 * i + nelt1;
>>> +                 if (3 * i + nelt2 < nelt)
>>> +                   sel[3 * i + nelt2] = nelt + j2++;
>>> +               }
>>> +             if (!can_vec_perm_p (mode, false, sel))
>>> +               {
>>> +                 if (dump_enabled_p ())
>>> +                   dump_printf (MSG_MISSED_OPTIMIZATION,
>>> +                                "permutaion op not supported by target.\n");
>>> +                 return false;
>>> +               }
>>> +           }
>>> +         return true;
>>>         }
>>> -      if (can_vec_perm_p (mode, false, sel))
>>> +      else
>>>         {
>>> -         for (i = 0; i < nelt; i++)
>>> -           sel[i] += nelt / 2;
>>> -         if (can_vec_perm_p (mode, false, sel))
>>> -           return true;
>>> +         /* If length is not equal to 3 then only power of 2 is supported.  */
>>> +         gcc_assert (exact_log2 (count) != -1);
>>> +
>>> +         for (i = 0; i < nelt / 2; i++)
>>> +           {
>>> +             sel[i * 2] = i;
>>> +             sel[i * 2 + 1] = i + nelt;
>>> +           }
>>> +           if (can_vec_perm_p (mode, false, sel))
>>> +             {
>>> +               for (i = 0; i < nelt; i++)
>>> +                 sel[i] += nelt / 2;
>>> +               if (can_vec_perm_p (mode, false, sel))
>>> +                 return true;
>>> +             }
>>>         }
>>>      }
>>>
>>>    if (dump_enabled_p ())
>>>      dump_printf (MSG_MISSED_OPTIMIZATION,
>>> -                 "interleave op not supported by target.\n");
>>> +                "permutaion op not supported by target.\n");
>>>    return false;
>>>  }
>>>
>>> @@ -4416,9 +4470,9 @@ vect_store_lanes_supported (tree vectype,
>>> unsigned HOST_WIDE_INT count)
>>>  /* Function vect_permute_store_chain.
>>>
>>>     Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
>>> -   a power of 2, generate interleave_high/low stmts to reorder the data
>>> -   correctly for the stores.  Return the final references for stores in
>>> -   RESULT_CHAIN.
>>> +   a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
>>> +   the data correctly for the stores.  Return the final references for stores
>>> +   in RESULT_CHAIN.
>>>
>>>     E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
>>>     The input is 4 vectors each containing 8 elements.  We assign a number to
>>> @@ -4485,7 +4539,9 @@ vect_permute_store_chain (vec<tree> dr_chain,
>>>    gimple perm_stmt;
>>>    tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
>>>    tree perm_mask_low, perm_mask_high;
>>> -  unsigned int i, n;
>>> +  tree data_ref;
>>> +  tree perm3_mask_low, perm3_mask_high;
>>> +  unsigned int i, n, log_length = exact_log2 (length);
>>>    unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
>>>    unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
>>>
>>> @@ -4493,47 +4549,116 @@ vect_permute_store_chain (vec<tree> dr_chain,
>>>    memcpy (result_chain->address (), dr_chain.address (),
>>>           length * sizeof (tree));
>>>
>>> -  for (i = 0, n = nelt / 2; i < n; i++)
>>> +  if (length == 3)
>>>      {
>>> -      sel[i * 2] = i;
>>> -      sel[i * 2 + 1] = i + nelt;
>>> -    }
>>> -  perm_mask_high = vect_gen_perm_mask (vectype, sel);
>>> -  gcc_assert (perm_mask_high != NULL);
>>> +      unsigned int j0 = 0, j1 = 0, j2 = 0;
>>>
>>> -  for (i = 0; i < nelt; i++)
>>> -    sel[i] += nelt / 2;
>>> -  perm_mask_low = vect_gen_perm_mask (vectype, sel);
>>> -  gcc_assert (perm_mask_low != NULL);
>>> +      for (j = 0; j < 3; j++)
>>> +        {
>>> +         int nelt0 = ((3 - j) * nelt) % 3;
>>> +         int nelt1 = ((3 - j) * nelt + 1) % 3;
>>> +         int nelt2 = ((3 - j) * nelt + 2) % 3;
>>>
>>> -  for (i = 0, n = exact_log2 (length); i < n; i++)
>>> -    {
>>> -      for (j = 0; j < length/2; j++)
>>> -       {
>>> -         vect1 = dr_chain[j];
>>> -         vect2 = dr_chain[j+length/2];
>>> +         for (i = 0; i < nelt; i++)
>>> +           {
>>> +             if (3 * i + nelt0 < nelt)
>>> +               sel[3 * i + nelt0] = j0++;
>>> +             if (3 * i + nelt1 < nelt)
>>> +               sel[3 * i + nelt1] = nelt + j1++;
>>> +             if (3 * i + nelt2 < nelt)
>>> +               sel[3 * i + nelt2] = 0;
>>> +           }
>>> +         perm3_mask_low = vect_gen_perm_mask (vectype, sel);
>>> +         gcc_assert (perm3_mask_low != NULL);
>>> +
>>> +         for (i = 0; i < nelt; i++)
>>> +           {
>>> +             if (3 * i + nelt0 < nelt)
>>> +               sel[3 * i + nelt0] = 3 * i + nelt0;
>>> +             if (3 * i + nelt1 < nelt)
>>> +               sel[3 * i + nelt1] = 3 * i + nelt1;
>>> +             if (3 * i + nelt2 < nelt)
>>> +               sel[3 * i + nelt2] = nelt + j2++;
>>> +           }
>>> +         perm3_mask_high = vect_gen_perm_mask (vectype, sel);
>>> +         gcc_assert (perm3_mask_high != NULL);
>>> +
>>> +         vect1 = dr_chain[0];
>>> +         vect2 = dr_chain[1];
>>>
>>>           /* Create interleaving stmt:
>>> -            high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}>  */
>>> -         high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
>>> -         perm_stmt
>>> -           = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
>>> -                                           vect1, vect2, perm_mask_high);
>>> +            low = VEC_PERM_EXPR <vect1, vect2,
>>> +                                 {j, nelt, *, j + 1, nelt + j + 1, *,
>>> +                                  j + 2, nelt + j + 2, *, ...}>  */
>>> +         data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
>>> +         perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>>> +                                                   vect1, vect2,
>>> +                                                   perm3_mask_low);
>>>           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>>> -         (*result_chain)[2*j] = high;
>>>
>>> +         vect1 = data_ref;
>>> +         vect2 = dr_chain[2];
>>>           /* Create interleaving stmt:
>>> -            low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
>>> -                                                nelt*3/2+1, ...}>  */
>>> -         low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
>>> -         perm_stmt
>>> -           = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
>>> -                                           vect1, vect2, perm_mask_low);
>>> +            low = VEC_PERM_EXPR <vect1, vect2,
>>> +                                 {0, 1, nelt + j, 3, 4, nelt + j + 1,
>>> +                                  6, 7, nelt + j + 2, ...}>  */
>>> +         data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
>>> +         perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>>> +                                                   vect1, vect2,
>>> +                                                   perm3_mask_high);
>>>           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>>> -         (*result_chain)[2*j+1] = low;
>>> +         (*result_chain)[j] = data_ref;
>>> +       }
>>> +    }
>>> +  else
>>> +    {
>>> +      /* If length is not equal to 3 then only power of 2 is supported.  */
>>> +      gcc_assert (exact_log2 (length) != -1);
>>> +
>>> +      for (i = 0, n = nelt / 2; i < n; i++)
>>> +       {
>>> +         sel[i * 2] = i;
>>> +         sel[i * 2 + 1] = i + nelt;
>>>         }
>>> -      memcpy (dr_chain.address (), result_chain->address (),
>>> -             length * sizeof (tree));
>>> +       perm_mask_high = vect_gen_perm_mask (vectype, sel);
>>> +       gcc_assert (perm_mask_high != NULL);
>>> +
>>> +       for (i = 0; i < nelt; i++)
>>> +         sel[i] += nelt / 2;
>>> +       perm_mask_low = vect_gen_perm_mask (vectype, sel);
>>> +       gcc_assert (perm_mask_low != NULL);
>>> +
>>> +       for (i = 0, n = log_length; i < n; i++)
>>> +         {
>>> +           for (j = 0; j < length/2; j++)
>>> +             {
>>> +               vect1 = dr_chain[j];
>>> +               vect2 = dr_chain[j+length/2];
>>> +
>>> +               /* Create interleaving stmt:
>>> +                  high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
>>> +                                                       ...}>  */
>>> +               high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
>>> +               perm_stmt
>>> +                 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
>>> +                                                 vect1, vect2, perm_mask_high);
>>> +               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>>> +               (*result_chain)[2*j] = high;
>>> +
>>> +               /* Create interleaving stmt:
>>> +                  low = VEC_PERM_EXPR <vect1, vect2,
>>> +                                       {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
>>> +                                        ...}>  */
>>> +               low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
>>> +               perm_stmt
>>> +                 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
>>> +                                                 vect1, vect2, perm_mask_low);
>>> +               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>>> +               (*result_chain)[2*j+1] = low;
>>> +             }
>>> +           memcpy (dr_chain.address (), result_chain->address (),
>>> +                   length * sizeof (tree));
>>> +         }
>>>      }
>>>  }
>>>
>>> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
>>> index b87c143..24d0b94 100644
>>> --- a/gcc/tree-vect-stmts.c
>>> +++ b/gcc/tree-vect-stmts.c
>>> @@ -974,9 +974,9 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
>>>       include the cost of the permutes.  */
>>>    if (!store_lanes_p && group_size > 1)
>>>      {
>>> -      /* Uses a high and low interleave operation for each needed permute.  */
>>> -
>>> -      int nstmts = ncopies * exact_log2 (group_size) * group_size;
>>> +      /* Uses a high and low interleave or shuffle operations for each
>>> +        needed permute.  */
>>> +      int nstmts = ncopies * ceil_log2 (group_size) * group_size;
>>>        inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm,
>>>                                       stmt_info, 0, vect_body);
>>>
>>>
>>> diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-st.c
>>> b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
>>> new file mode 100644
>>> index 0000000..cc1e72e
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
>>> @@ -0,0 +1,21 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O2 -g -ftree-vectorize -mssse3
>>> -fdump-tree-vect-details" { target { i?86-*-* x86_64-*-* } } } */
>>> +
>>> +#define byte unsigned char
>>> +
>>> +void
>>> +matrix_mul (byte *in, byte *out, int size)
>>> +{
>>> +  int i;
>>> +  for (i = 0; i < size; i++)
>>> +    {
>>> +      out[0] = in[0] + in[1] + in[3];
>>> +      out[1] = in[0] + in[2] + in[4];
>>> +      out[2] = in[1] + in[2] + in[4];
>>> +      in += 4;
>>> +      out += 3;
>>> +    }
>>> +}
>>> +
>>> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
>>> +/* { dg-final { cleanup-tree-dump "vect" } } */

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2014-06-10 11:25 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-06-03 13:22 [PING, PATCH2/2, PR52252] Vectorization for load/store groups of size 3 Evgeny Stupachenko
2014-06-10 10:11 Evgeny Stupachenko
2014-06-10 11:25 ` Richard Biener

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).