* Re: [PING, PATCH2/2, PR52252] Vectorization for load/store groups of size 3.
@ 2014-06-03 13:22 Evgeny Stupachenko
0 siblings, 0 replies; 3+ messages in thread
From: Evgeny Stupachenko @ 2014-06-03 13:22 UTC (permalink / raw)
To: GCC Patches, Richard Biener, Jakub Jelinek, Uros Bizjak
I've added a bug report for the stores group case:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61403
On Wed, May 28, 2014 at 5:18 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
> Ping.
> Test is modified according to the fix in the test for loads.
>
> diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-st.c
> b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
> new file mode 100644
> index 0000000..e7161f7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
> @@ -0,0 +1,21 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-mssse3" { target { i?86-*-* x86_64-*-* } } } */
> +
> +#define byte unsigned char
> +
> +void
> +matrix_mul (byte *in, byte *out, int size)
> +{
> + int i;
> + for (i = 0; i < size; i++)
> + {
> + out[0] = in[0] + in[1] + in[3];
> + out[1] = in[0] + in[2] + in[4];
> + out[2] = in[1] + in[2] + in[4];
> + in += 4;
> + out += 3;
> + }
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" {
> target { i?86-*-* x86_64-*-* } } } } */
> +/* { dg-final { cleanup-tree-dump "vect" } } */
>
>
> On Tue, May 6, 2014 at 6:39 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
>> 2nd part of patch is on stores group.
>> Bootstrap and make check passed on x86.
>>
>> Is it ok?
>>
>> 2014-05-06 Evgeny Stupachenko <evstupac@gmail.com>
>>
>> * tree-vect-data-refs.c (vect_grouped_store_supported): New
>> check for stores group of length 3.
>> (vect_permute_store_chain): New permutations for stores group of
>> length 3.
>> * tree-vect-stmts.c (vect_model_store_cost): Change cost
>> of vec_perm_shuffle for the new permutations.
>>
>> ChangeLog for testsuite:
>>
>> 2014-05-06 Evgeny Stupachenko <evstupac@gmail.com>
>>
>> PR tree-optimization/52252
>> * gcc.dg/vect/pr52252-st.c: Test on stores group of size 3.
>>
>> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
>> index ef710cf..fb0e30d 100644
>> --- a/gcc/tree-vect-data-refs.c
>> +++ b/gcc/tree-vect-data-refs.c
>> @@ -4365,13 +4365,14 @@ vect_grouped_store_supported (tree vectype,
>> unsigned HOST_WIDE_INT count)
>> {
>> enum machine_mode mode = TYPE_MODE (vectype);
>>
>> - /* vect_permute_store_chain requires the group size to be a power of two. */
>> - if (exact_log2 (count) == -1)
>> + /* vect_permute_store_chain requires the group size to be equal to 3 or
>> + be a power of two. */
>> + if (count != 3 && exact_log2 (count) == -1)
>> {
>> if (dump_enabled_p ())
>> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>> - "the size of the group of accesses"
>> - " is not a power of 2\n");
>> + "the size of the group of accesses"
>> + " is not a power of 2 or not eqaul to 3\n");
>> return false;
>> }
>>
>> @@ -4380,23 +4381,76 @@ vect_grouped_store_supported (tree vectype,
>> unsigned HOST_WIDE_INT count)
>> {
>> unsigned int i, nelt = GET_MODE_NUNITS (mode);
>> unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
>> - for (i = 0; i < nelt / 2; i++)
>> +
>> + if (count == 3)
>> {
>> - sel[i * 2] = i;
>> - sel[i * 2 + 1] = i + nelt;
>> + unsigned int j0 = 0, j1 = 0, j2 = 0;
>> + unsigned int i, j;
>> +
>> + for (j = 0; j < 3; j++)
>> + {
>> + int nelt0 = ((3 - j) * nelt) % 3;
>> + int nelt1 = ((3 - j) * nelt + 1) % 3;
>> + int nelt2 = ((3 - j) * nelt + 2) % 3;
>> + for (i = 0; i < nelt; i++)
>> + {
>> + if (3 * i + nelt0 < nelt)
>> + sel[3 * i + nelt0] = j0++;
>> + if (3 * i + nelt1 < nelt)
>> + sel[3 * i + nelt1] = nelt + j1++;
>> + if (3 * i + nelt2 < nelt)
>> + sel[3 * i + nelt2] = 0;
>> + }
>> + if (!can_vec_perm_p (mode, false, sel))
>> + {
>> + if (dump_enabled_p ())
>> + dump_printf (MSG_MISSED_OPTIMIZATION,
>> + "permutaion op not supported by target.\n");
>> + return false;
>> + }
>> +
>> + for (i = 0; i < nelt; i++)
>> + {
>> + if (3 * i + nelt0 < nelt)
>> + sel[3 * i + nelt0] = 3 * i + nelt0;
>> + if (3 * i + nelt1 < nelt)
>> + sel[3 * i + nelt1] = 3 * i + nelt1;
>> + if (3 * i + nelt2 < nelt)
>> + sel[3 * i + nelt2] = nelt + j2++;
>> + }
>> + if (!can_vec_perm_p (mode, false, sel))
>> + {
>> + if (dump_enabled_p ())
>> + dump_printf (MSG_MISSED_OPTIMIZATION,
>> + "permutaion op not supported by target.\n");
>> + return false;
>> + }
>> + }
>> + return true;
>> }
>> - if (can_vec_perm_p (mode, false, sel))
>> + else
>> {
>> - for (i = 0; i < nelt; i++)
>> - sel[i] += nelt / 2;
>> - if (can_vec_perm_p (mode, false, sel))
>> - return true;
>> + /* If length is not equal to 3 then only power of 2 is supported. */
>> + gcc_assert (exact_log2 (count) != -1);
>> +
>> + for (i = 0; i < nelt / 2; i++)
>> + {
>> + sel[i * 2] = i;
>> + sel[i * 2 + 1] = i + nelt;
>> + }
>> + if (can_vec_perm_p (mode, false, sel))
>> + {
>> + for (i = 0; i < nelt; i++)
>> + sel[i] += nelt / 2;
>> + if (can_vec_perm_p (mode, false, sel))
>> + return true;
>> + }
>> }
>> }
>>
>> if (dump_enabled_p ())
>> dump_printf (MSG_MISSED_OPTIMIZATION,
>> - "interleave op not supported by target.\n");
>> + "permutaion op not supported by target.\n");
>> return false;
>> }
>>
>> @@ -4416,9 +4470,9 @@ vect_store_lanes_supported (tree vectype,
>> unsigned HOST_WIDE_INT count)
>> /* Function vect_permute_store_chain.
>>
>> Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
>> - a power of 2, generate interleave_high/low stmts to reorder the data
>> - correctly for the stores. Return the final references for stores in
>> - RESULT_CHAIN.
>> + a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
>> + the data correctly for the stores. Return the final references for stores
>> + in RESULT_CHAIN.
>>
>> E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
>> The input is 4 vectors each containing 8 elements. We assign a number to
>> @@ -4485,7 +4539,9 @@ vect_permute_store_chain (vec<tree> dr_chain,
>> gimple perm_stmt;
>> tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
>> tree perm_mask_low, perm_mask_high;
>> - unsigned int i, n;
>> + tree data_ref;
>> + tree perm3_mask_low, perm3_mask_high;
>> + unsigned int i, n, log_length = exact_log2 (length);
>> unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
>> unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
>>
>> @@ -4493,47 +4549,116 @@ vect_permute_store_chain (vec<tree> dr_chain,
>> memcpy (result_chain->address (), dr_chain.address (),
>> length * sizeof (tree));
>>
>> - for (i = 0, n = nelt / 2; i < n; i++)
>> + if (length == 3)
>> {
>> - sel[i * 2] = i;
>> - sel[i * 2 + 1] = i + nelt;
>> - }
>> - perm_mask_high = vect_gen_perm_mask (vectype, sel);
>> - gcc_assert (perm_mask_high != NULL);
>> + unsigned int j0 = 0, j1 = 0, j2 = 0;
>>
>> - for (i = 0; i < nelt; i++)
>> - sel[i] += nelt / 2;
>> - perm_mask_low = vect_gen_perm_mask (vectype, sel);
>> - gcc_assert (perm_mask_low != NULL);
>> + for (j = 0; j < 3; j++)
>> + {
>> + int nelt0 = ((3 - j) * nelt) % 3;
>> + int nelt1 = ((3 - j) * nelt + 1) % 3;
>> + int nelt2 = ((3 - j) * nelt + 2) % 3;
>>
>> - for (i = 0, n = exact_log2 (length); i < n; i++)
>> - {
>> - for (j = 0; j < length/2; j++)
>> - {
>> - vect1 = dr_chain[j];
>> - vect2 = dr_chain[j+length/2];
>> + for (i = 0; i < nelt; i++)
>> + {
>> + if (3 * i + nelt0 < nelt)
>> + sel[3 * i + nelt0] = j0++;
>> + if (3 * i + nelt1 < nelt)
>> + sel[3 * i + nelt1] = nelt + j1++;
>> + if (3 * i + nelt2 < nelt)
>> + sel[3 * i + nelt2] = 0;
>> + }
>> + perm3_mask_low = vect_gen_perm_mask (vectype, sel);
>> + gcc_assert (perm3_mask_low != NULL);
>> +
>> + for (i = 0; i < nelt; i++)
>> + {
>> + if (3 * i + nelt0 < nelt)
>> + sel[3 * i + nelt0] = 3 * i + nelt0;
>> + if (3 * i + nelt1 < nelt)
>> + sel[3 * i + nelt1] = 3 * i + nelt1;
>> + if (3 * i + nelt2 < nelt)
>> + sel[3 * i + nelt2] = nelt + j2++;
>> + }
>> + perm3_mask_high = vect_gen_perm_mask (vectype, sel);
>> + gcc_assert (perm3_mask_high != NULL);
>> +
>> + vect1 = dr_chain[0];
>> + vect2 = dr_chain[1];
>>
>> /* Create interleaving stmt:
>> - high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
>> - high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
>> - perm_stmt
>> - = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
>> - vect1, vect2, perm_mask_high);
>> + low = VEC_PERM_EXPR <vect1, vect2,
>> + {j, nelt, *, j + 1, nelt + j + 1, *,
>> + j + 2, nelt + j + 2, *, ...}> */
>> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
>> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>> + vect1, vect2,
>> + perm3_mask_low);
>> vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>> - (*result_chain)[2*j] = high;
>>
>> + vect1 = data_ref;
>> + vect2 = dr_chain[2];
>> /* Create interleaving stmt:
>> - low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
>> - nelt*3/2+1, ...}> */
>> - low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
>> - perm_stmt
>> - = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
>> - vect1, vect2, perm_mask_low);
>> + low = VEC_PERM_EXPR <vect1, vect2,
>> + {0, 1, nelt + j, 3, 4, nelt + j + 1,
>> + 6, 7, nelt + j + 2, ...}> */
>> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
>> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>> + vect1, vect2,
>> + perm3_mask_high);
>> vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>> - (*result_chain)[2*j+1] = low;
>> + (*result_chain)[j] = data_ref;
>> + }
>> + }
>> + else
>> + {
>> + /* If length is not equal to 3 then only power of 2 is supported. */
>> + gcc_assert (exact_log2 (length) != -1);
>> +
>> + for (i = 0, n = nelt / 2; i < n; i++)
>> + {
>> + sel[i * 2] = i;
>> + sel[i * 2 + 1] = i + nelt;
>> }
>> - memcpy (dr_chain.address (), result_chain->address (),
>> - length * sizeof (tree));
>> + perm_mask_high = vect_gen_perm_mask (vectype, sel);
>> + gcc_assert (perm_mask_high != NULL);
>> +
>> + for (i = 0; i < nelt; i++)
>> + sel[i] += nelt / 2;
>> + perm_mask_low = vect_gen_perm_mask (vectype, sel);
>> + gcc_assert (perm_mask_low != NULL);
>> +
>> + for (i = 0, n = log_length; i < n; i++)
>> + {
>> + for (j = 0; j < length/2; j++)
>> + {
>> + vect1 = dr_chain[j];
>> + vect2 = dr_chain[j+length/2];
>> +
>> + /* Create interleaving stmt:
>> + high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
>> + ...}> */
>> + high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
>> + perm_stmt
>> + = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
>> + vect1, vect2, perm_mask_high);
>> + vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>> + (*result_chain)[2*j] = high;
>> +
>> + /* Create interleaving stmt:
>> + low = VEC_PERM_EXPR <vect1, vect2,
>> + {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
>> + ...}> */
>> + low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
>> + perm_stmt
>> + = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
>> + vect1, vect2, perm_mask_low);
>> + vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>> + (*result_chain)[2*j+1] = low;
>> + }
>> + memcpy (dr_chain.address (), result_chain->address (),
>> + length * sizeof (tree));
>> + }
>> }
>> }
>>
>> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
>> index b87c143..24d0b94 100644
>> --- a/gcc/tree-vect-stmts.c
>> +++ b/gcc/tree-vect-stmts.c
>> @@ -974,9 +974,9 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
>> include the cost of the permutes. */
>> if (!store_lanes_p && group_size > 1)
>> {
>> - /* Uses a high and low interleave operation for each needed permute. */
>> -
>> - int nstmts = ncopies * exact_log2 (group_size) * group_size;
>> + /* Uses a high and low interleave or shuffle operations for each
>> + needed permute. */
>> + int nstmts = ncopies * ceil_log2 (group_size) * group_size;
>> inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm,
>> stmt_info, 0, vect_body);
>>
>>
>> diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-st.c
>> b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
>> new file mode 100644
>> index 0000000..cc1e72e
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
>> @@ -0,0 +1,21 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2 -g -ftree-vectorize -mssse3
>> -fdump-tree-vect-details" { target { i?86-*-* x86_64-*-* } } } */
>> +
>> +#define byte unsigned char
>> +
>> +void
>> +matrix_mul (byte *in, byte *out, int size)
>> +{
>> + int i;
>> + for (i = 0; i < size; i++)
>> + {
>> + out[0] = in[0] + in[1] + in[3];
>> + out[1] = in[0] + in[2] + in[4];
>> + out[2] = in[1] + in[2] + in[4];
>> + in += 4;
>> + out += 3;
>> + }
>> +}
>> +
>> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
>> +/* { dg-final { cleanup-tree-dump "vect" } } */
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PING, PATCH2/2, PR52252] Vectorization for load/store groups of size 3.
2014-06-10 10:11 Evgeny Stupachenko
@ 2014-06-10 11:25 ` Richard Biener
0 siblings, 0 replies; 3+ messages in thread
From: Richard Biener @ 2014-06-10 11:25 UTC (permalink / raw)
To: Evgeny Stupachenko; +Cc: GCC Patches, Jakub Jelinek, Uros Bizjak
On Tue, 10 Jun 2014, Evgeny Stupachenko wrote:
> ping.
> The changes are similar to those already committed for the loads group.
Ok.
Thanks,
Richard.
> On Tue, Jun 3, 2014 at 5:22 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
> > I've added a bug report for the stores group case:
> > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61403
> >
> >
> > On Wed, May 28, 2014 at 5:18 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
> >> Ping.
> >> Test is modified according to the fix in the test for loads.
> >>
> >> diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-st.c
> >> b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
> >> new file mode 100644
> >> index 0000000..e7161f7
> >> --- /dev/null
> >> +++ b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
> >> @@ -0,0 +1,21 @@
> >> +/* { dg-do compile } */
> >> +/* { dg-additional-options "-mssse3" { target { i?86-*-* x86_64-*-* } } } */
> >> +
> >> +#define byte unsigned char
> >> +
> >> +void
> >> +matrix_mul (byte *in, byte *out, int size)
> >> +{
> >> + int i;
> >> + for (i = 0; i < size; i++)
> >> + {
> >> + out[0] = in[0] + in[1] + in[3];
> >> + out[1] = in[0] + in[2] + in[4];
> >> + out[2] = in[1] + in[2] + in[4];
> >> + in += 4;
> >> + out += 3;
> >> + }
> >> +}
> >> +
> >> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" {
> >> target { i?86-*-* x86_64-*-* } } } } */
> >> +/* { dg-final { cleanup-tree-dump "vect" } } */
> >>
> >>
> >> On Tue, May 6, 2014 at 6:39 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
> >>> 2nd part of patch is on stores group.
> >>> Bootstrap and make check passed on x86.
> >>>
> >>> Is it ok?
> >>>
> >>> 2014-05-06 Evgeny Stupachenko <evstupac@gmail.com>
> >>>
> >>> * tree-vect-data-refs.c (vect_grouped_store_supported): New
> >>> check for stores group of length 3.
> >>> (vect_permute_store_chain): New permutations for stores group of
> >>> length 3.
> >>> * tree-vect-stmts.c (vect_model_store_cost): Change cost
> >>> of vec_perm_shuffle for the new permutations.
> >>>
> >>> ChangeLog for testsuite:
> >>>
> >>> 2014-05-06 Evgeny Stupachenko <evstupac@gmail.com>
> >>>
> >>> PR tree-optimization/52252
> >>> * gcc.dg/vect/pr52252-st.c: Test on stores group of size 3.
> >>>
> >>> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
> >>> index ef710cf..fb0e30d 100644
> >>> --- a/gcc/tree-vect-data-refs.c
> >>> +++ b/gcc/tree-vect-data-refs.c
> >>> @@ -4365,13 +4365,14 @@ vect_grouped_store_supported (tree vectype,
> >>> unsigned HOST_WIDE_INT count)
> >>> {
> >>> enum machine_mode mode = TYPE_MODE (vectype);
> >>>
> >>> - /* vect_permute_store_chain requires the group size to be a power of two. */
> >>> - if (exact_log2 (count) == -1)
> >>> + /* vect_permute_store_chain requires the group size to be equal to 3 or
> >>> + be a power of two. */
> >>> + if (count != 3 && exact_log2 (count) == -1)
> >>> {
> >>> if (dump_enabled_p ())
> >>> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> >>> - "the size of the group of accesses"
> >>> - " is not a power of 2\n");
> >>> + "the size of the group of accesses"
> >>> + " is not a power of 2 or not eqaul to 3\n");
> >>> return false;
> >>> }
> >>>
> >>> @@ -4380,23 +4381,76 @@ vect_grouped_store_supported (tree vectype,
> >>> unsigned HOST_WIDE_INT count)
> >>> {
> >>> unsigned int i, nelt = GET_MODE_NUNITS (mode);
> >>> unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
> >>> - for (i = 0; i < nelt / 2; i++)
> >>> +
> >>> + if (count == 3)
> >>> {
> >>> - sel[i * 2] = i;
> >>> - sel[i * 2 + 1] = i + nelt;
> >>> + unsigned int j0 = 0, j1 = 0, j2 = 0;
> >>> + unsigned int i, j;
> >>> +
> >>> + for (j = 0; j < 3; j++)
> >>> + {
> >>> + int nelt0 = ((3 - j) * nelt) % 3;
> >>> + int nelt1 = ((3 - j) * nelt + 1) % 3;
> >>> + int nelt2 = ((3 - j) * nelt + 2) % 3;
> >>> + for (i = 0; i < nelt; i++)
> >>> + {
> >>> + if (3 * i + nelt0 < nelt)
> >>> + sel[3 * i + nelt0] = j0++;
> >>> + if (3 * i + nelt1 < nelt)
> >>> + sel[3 * i + nelt1] = nelt + j1++;
> >>> + if (3 * i + nelt2 < nelt)
> >>> + sel[3 * i + nelt2] = 0;
> >>> + }
> >>> + if (!can_vec_perm_p (mode, false, sel))
> >>> + {
> >>> + if (dump_enabled_p ())
> >>> + dump_printf (MSG_MISSED_OPTIMIZATION,
> >>> + "permutaion op not supported by target.\n");
> >>> + return false;
> >>> + }
> >>> +
> >>> + for (i = 0; i < nelt; i++)
> >>> + {
> >>> + if (3 * i + nelt0 < nelt)
> >>> + sel[3 * i + nelt0] = 3 * i + nelt0;
> >>> + if (3 * i + nelt1 < nelt)
> >>> + sel[3 * i + nelt1] = 3 * i + nelt1;
> >>> + if (3 * i + nelt2 < nelt)
> >>> + sel[3 * i + nelt2] = nelt + j2++;
> >>> + }
> >>> + if (!can_vec_perm_p (mode, false, sel))
> >>> + {
> >>> + if (dump_enabled_p ())
> >>> + dump_printf (MSG_MISSED_OPTIMIZATION,
> >>> + "permutaion op not supported by target.\n");
> >>> + return false;
> >>> + }
> >>> + }
> >>> + return true;
> >>> }
> >>> - if (can_vec_perm_p (mode, false, sel))
> >>> + else
> >>> {
> >>> - for (i = 0; i < nelt; i++)
> >>> - sel[i] += nelt / 2;
> >>> - if (can_vec_perm_p (mode, false, sel))
> >>> - return true;
> >>> + /* If length is not equal to 3 then only power of 2 is supported. */
> >>> + gcc_assert (exact_log2 (count) != -1);
> >>> +
> >>> + for (i = 0; i < nelt / 2; i++)
> >>> + {
> >>> + sel[i * 2] = i;
> >>> + sel[i * 2 + 1] = i + nelt;
> >>> + }
> >>> + if (can_vec_perm_p (mode, false, sel))
> >>> + {
> >>> + for (i = 0; i < nelt; i++)
> >>> + sel[i] += nelt / 2;
> >>> + if (can_vec_perm_p (mode, false, sel))
> >>> + return true;
> >>> + }
> >>> }
> >>> }
> >>>
> >>> if (dump_enabled_p ())
> >>> dump_printf (MSG_MISSED_OPTIMIZATION,
> >>> - "interleave op not supported by target.\n");
> >>> + "permutaion op not supported by target.\n");
> >>> return false;
> >>> }
> >>>
> >>> @@ -4416,9 +4470,9 @@ vect_store_lanes_supported (tree vectype,
> >>> unsigned HOST_WIDE_INT count)
> >>> /* Function vect_permute_store_chain.
> >>>
> >>> Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
> >>> - a power of 2, generate interleave_high/low stmts to reorder the data
> >>> - correctly for the stores. Return the final references for stores in
> >>> - RESULT_CHAIN.
> >>> + a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
> >>> + the data correctly for the stores. Return the final references for stores
> >>> + in RESULT_CHAIN.
> >>>
> >>> E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
> >>> The input is 4 vectors each containing 8 elements. We assign a number to
> >>> @@ -4485,7 +4539,9 @@ vect_permute_store_chain (vec<tree> dr_chain,
> >>> gimple perm_stmt;
> >>> tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
> >>> tree perm_mask_low, perm_mask_high;
> >>> - unsigned int i, n;
> >>> + tree data_ref;
> >>> + tree perm3_mask_low, perm3_mask_high;
> >>> + unsigned int i, n, log_length = exact_log2 (length);
> >>> unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
> >>> unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
> >>>
> >>> @@ -4493,47 +4549,116 @@ vect_permute_store_chain (vec<tree> dr_chain,
> >>> memcpy (result_chain->address (), dr_chain.address (),
> >>> length * sizeof (tree));
> >>>
> >>> - for (i = 0, n = nelt / 2; i < n; i++)
> >>> + if (length == 3)
> >>> {
> >>> - sel[i * 2] = i;
> >>> - sel[i * 2 + 1] = i + nelt;
> >>> - }
> >>> - perm_mask_high = vect_gen_perm_mask (vectype, sel);
> >>> - gcc_assert (perm_mask_high != NULL);
> >>> + unsigned int j0 = 0, j1 = 0, j2 = 0;
> >>>
> >>> - for (i = 0; i < nelt; i++)
> >>> - sel[i] += nelt / 2;
> >>> - perm_mask_low = vect_gen_perm_mask (vectype, sel);
> >>> - gcc_assert (perm_mask_low != NULL);
> >>> + for (j = 0; j < 3; j++)
> >>> + {
> >>> + int nelt0 = ((3 - j) * nelt) % 3;
> >>> + int nelt1 = ((3 - j) * nelt + 1) % 3;
> >>> + int nelt2 = ((3 - j) * nelt + 2) % 3;
> >>>
> >>> - for (i = 0, n = exact_log2 (length); i < n; i++)
> >>> - {
> >>> - for (j = 0; j < length/2; j++)
> >>> - {
> >>> - vect1 = dr_chain[j];
> >>> - vect2 = dr_chain[j+length/2];
> >>> + for (i = 0; i < nelt; i++)
> >>> + {
> >>> + if (3 * i + nelt0 < nelt)
> >>> + sel[3 * i + nelt0] = j0++;
> >>> + if (3 * i + nelt1 < nelt)
> >>> + sel[3 * i + nelt1] = nelt + j1++;
> >>> + if (3 * i + nelt2 < nelt)
> >>> + sel[3 * i + nelt2] = 0;
> >>> + }
> >>> + perm3_mask_low = vect_gen_perm_mask (vectype, sel);
> >>> + gcc_assert (perm3_mask_low != NULL);
> >>> +
> >>> + for (i = 0; i < nelt; i++)
> >>> + {
> >>> + if (3 * i + nelt0 < nelt)
> >>> + sel[3 * i + nelt0] = 3 * i + nelt0;
> >>> + if (3 * i + nelt1 < nelt)
> >>> + sel[3 * i + nelt1] = 3 * i + nelt1;
> >>> + if (3 * i + nelt2 < nelt)
> >>> + sel[3 * i + nelt2] = nelt + j2++;
> >>> + }
> >>> + perm3_mask_high = vect_gen_perm_mask (vectype, sel);
> >>> + gcc_assert (perm3_mask_high != NULL);
> >>> +
> >>> + vect1 = dr_chain[0];
> >>> + vect2 = dr_chain[1];
> >>>
> >>> /* Create interleaving stmt:
> >>> - high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
> >>> - high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
> >>> - perm_stmt
> >>> - = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
> >>> - vect1, vect2, perm_mask_high);
> >>> + low = VEC_PERM_EXPR <vect1, vect2,
> >>> + {j, nelt, *, j + 1, nelt + j + 1, *,
> >>> + j + 2, nelt + j + 2, *, ...}> */
> >>> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
> >>> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
> >>> + vect1, vect2,
> >>> + perm3_mask_low);
> >>> vect_finish_stmt_generation (stmt, perm_stmt, gsi);
> >>> - (*result_chain)[2*j] = high;
> >>>
> >>> + vect1 = data_ref;
> >>> + vect2 = dr_chain[2];
> >>> /* Create interleaving stmt:
> >>> - low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
> >>> - nelt*3/2+1, ...}> */
> >>> - low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
> >>> - perm_stmt
> >>> - = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
> >>> - vect1, vect2, perm_mask_low);
> >>> + low = VEC_PERM_EXPR <vect1, vect2,
> >>> + {0, 1, nelt + j, 3, 4, nelt + j + 1,
> >>> + 6, 7, nelt + j + 2, ...}> */
> >>> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
> >>> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
> >>> + vect1, vect2,
> >>> + perm3_mask_high);
> >>> vect_finish_stmt_generation (stmt, perm_stmt, gsi);
> >>> - (*result_chain)[2*j+1] = low;
> >>> + (*result_chain)[j] = data_ref;
> >>> + }
> >>> + }
> >>> + else
> >>> + {
> >>> + /* If length is not equal to 3 then only power of 2 is supported. */
> >>> + gcc_assert (exact_log2 (length) != -1);
> >>> +
> >>> + for (i = 0, n = nelt / 2; i < n; i++)
> >>> + {
> >>> + sel[i * 2] = i;
> >>> + sel[i * 2 + 1] = i + nelt;
> >>> }
> >>> - memcpy (dr_chain.address (), result_chain->address (),
> >>> - length * sizeof (tree));
> >>> + perm_mask_high = vect_gen_perm_mask (vectype, sel);
> >>> + gcc_assert (perm_mask_high != NULL);
> >>> +
> >>> + for (i = 0; i < nelt; i++)
> >>> + sel[i] += nelt / 2;
> >>> + perm_mask_low = vect_gen_perm_mask (vectype, sel);
> >>> + gcc_assert (perm_mask_low != NULL);
> >>> +
> >>> + for (i = 0, n = log_length; i < n; i++)
> >>> + {
> >>> + for (j = 0; j < length/2; j++)
> >>> + {
> >>> + vect1 = dr_chain[j];
> >>> + vect2 = dr_chain[j+length/2];
> >>> +
> >>> + /* Create interleaving stmt:
> >>> + high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
> >>> + ...}> */
> >>> + high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
> >>> + perm_stmt
> >>> + = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
> >>> + vect1, vect2, perm_mask_high);
> >>> + vect_finish_stmt_generation (stmt, perm_stmt, gsi);
> >>> + (*result_chain)[2*j] = high;
> >>> +
> >>> + /* Create interleaving stmt:
> >>> + low = VEC_PERM_EXPR <vect1, vect2,
> >>> + {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
> >>> + ...}> */
> >>> + low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
> >>> + perm_stmt
> >>> + = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
> >>> + vect1, vect2, perm_mask_low);
> >>> + vect_finish_stmt_generation (stmt, perm_stmt, gsi);
> >>> + (*result_chain)[2*j+1] = low;
> >>> + }
> >>> + memcpy (dr_chain.address (), result_chain->address (),
> >>> + length * sizeof (tree));
> >>> + }
> >>> }
> >>> }
> >>>
> >>> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> >>> index b87c143..24d0b94 100644
> >>> --- a/gcc/tree-vect-stmts.c
> >>> +++ b/gcc/tree-vect-stmts.c
> >>> @@ -974,9 +974,9 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
> >>> include the cost of the permutes. */
> >>> if (!store_lanes_p && group_size > 1)
> >>> {
> >>> - /* Uses a high and low interleave operation for each needed permute. */
> >>> -
> >>> - int nstmts = ncopies * exact_log2 (group_size) * group_size;
> >>> + /* Uses a high and low interleave or shuffle operations for each
> >>> + needed permute. */
> >>> + int nstmts = ncopies * ceil_log2 (group_size) * group_size;
> >>> inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm,
> >>> stmt_info, 0, vect_body);
> >>>
> >>>
> >>> diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-st.c
> >>> b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
> >>> new file mode 100644
> >>> index 0000000..cc1e72e
> >>> --- /dev/null
> >>> +++ b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
> >>> @@ -0,0 +1,21 @@
> >>> +/* { dg-do compile } */
> >>> +/* { dg-options "-O2 -g -ftree-vectorize -mssse3
> >>> -fdump-tree-vect-details" { target { i?86-*-* x86_64-*-* } } } */
> >>> +
> >>> +#define byte unsigned char
> >>> +
> >>> +void
> >>> +matrix_mul (byte *in, byte *out, int size)
> >>> +{
> >>> + int i;
> >>> + for (i = 0; i < size; i++)
> >>> + {
> >>> + out[0] = in[0] + in[1] + in[3];
> >>> + out[1] = in[0] + in[2] + in[4];
> >>> + out[2] = in[1] + in[2] + in[4];
> >>> + in += 4;
> >>> + out += 3;
> >>> + }
> >>> +}
> >>> +
> >>> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> >>> +/* { dg-final { cleanup-tree-dump "vect" } } */
>
>
--
Richard Biener <rguenther@suse.de>
SUSE / SUSE Labs
SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [PING, PATCH2/2, PR52252] Vectorization for load/store groups of size 3.
@ 2014-06-10 10:11 Evgeny Stupachenko
2014-06-10 11:25 ` Richard Biener
0 siblings, 1 reply; 3+ messages in thread
From: Evgeny Stupachenko @ 2014-06-10 10:11 UTC (permalink / raw)
To: GCC Patches, Richard Biener, Jakub Jelinek, Uros Bizjak
ping.
The changes are similar to those already committed for the loads group.
On Tue, Jun 3, 2014 at 5:22 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
> I've added a bug report for the stores group case:
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61403
>
>
> On Wed, May 28, 2014 at 5:18 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
>> Ping.
>> Test is modified according to the fix in the test for loads.
>>
>> diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-st.c
>> b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
>> new file mode 100644
>> index 0000000..e7161f7
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
>> @@ -0,0 +1,21 @@
>> +/* { dg-do compile } */
>> +/* { dg-additional-options "-mssse3" { target { i?86-*-* x86_64-*-* } } } */
>> +
>> +#define byte unsigned char
>> +
>> +void
>> +matrix_mul (byte *in, byte *out, int size)
>> +{
>> + int i;
>> + for (i = 0; i < size; i++)
>> + {
>> + out[0] = in[0] + in[1] + in[3];
>> + out[1] = in[0] + in[2] + in[4];
>> + out[2] = in[1] + in[2] + in[4];
>> + in += 4;
>> + out += 3;
>> + }
>> +}
>> +
>> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" {
>> target { i?86-*-* x86_64-*-* } } } } */
>> +/* { dg-final { cleanup-tree-dump "vect" } } */
>>
>>
>> On Tue, May 6, 2014 at 6:39 PM, Evgeny Stupachenko <evstupac@gmail.com> wrote:
>>> The 2nd part of the patch is for the stores group.
>>> Bootstrap and make check passed on x86.
>>>
>>> Is it ok?
>>>
>>> 2014-05-06 Evgeny Stupachenko <evstupac@gmail.com>
>>>
>>> * tree-vect-data-refs.c (vect_grouped_store_supported): New
>>> check for stores group of length 3.
>>> (vect_permute_store_chain): New permutations for stores group of
>>> length 3.
>>> * tree-vect-stmts.c (vect_model_store_cost): Change cost
>>> of vec_perm_shuffle for the new permutations.
>>>
>>> ChangeLog for testsuite:
>>>
>>> 2014-05-06 Evgeny Stupachenko <evstupac@gmail.com>
>>>
>>> PR tree-optimization/52252
>>> * gcc.dg/vect/pr52252-st.c: Test on stores group of size 3.
>>>
>>> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
>>> index ef710cf..fb0e30d 100644
>>> --- a/gcc/tree-vect-data-refs.c
>>> +++ b/gcc/tree-vect-data-refs.c
>>> @@ -4365,13 +4365,14 @@ vect_grouped_store_supported (tree vectype,
>>> unsigned HOST_WIDE_INT count)
>>> {
>>> enum machine_mode mode = TYPE_MODE (vectype);
>>>
>>> - /* vect_permute_store_chain requires the group size to be a power of two. */
>>> - if (exact_log2 (count) == -1)
>>> + /* vect_permute_store_chain requires the group size to be equal to 3 or
>>> + be a power of two. */
>>> + if (count != 3 && exact_log2 (count) == -1)
>>> {
>>> if (dump_enabled_p ())
>>> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>>> - "the size of the group of accesses"
>>> - " is not a power of 2\n");
>>> + "the size of the group of accesses"
>>> + " is not a power of 2 or not eqaul to 3\n");
>>> return false;
>>> }
>>>
>>> @@ -4380,23 +4381,76 @@ vect_grouped_store_supported (tree vectype,
>>> unsigned HOST_WIDE_INT count)
>>> {
>>> unsigned int i, nelt = GET_MODE_NUNITS (mode);
>>> unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
>>> - for (i = 0; i < nelt / 2; i++)
>>> +
>>> + if (count == 3)
>>> {
>>> - sel[i * 2] = i;
>>> - sel[i * 2 + 1] = i + nelt;
>>> + unsigned int j0 = 0, j1 = 0, j2 = 0;
>>> + unsigned int i, j;
>>> +
>>> + for (j = 0; j < 3; j++)
>>> + {
>>> + int nelt0 = ((3 - j) * nelt) % 3;
>>> + int nelt1 = ((3 - j) * nelt + 1) % 3;
>>> + int nelt2 = ((3 - j) * nelt + 2) % 3;
>>> + for (i = 0; i < nelt; i++)
>>> + {
>>> + if (3 * i + nelt0 < nelt)
>>> + sel[3 * i + nelt0] = j0++;
>>> + if (3 * i + nelt1 < nelt)
>>> + sel[3 * i + nelt1] = nelt + j1++;
>>> + if (3 * i + nelt2 < nelt)
>>> + sel[3 * i + nelt2] = 0;
>>> + }
>>> + if (!can_vec_perm_p (mode, false, sel))
>>> + {
>>> + if (dump_enabled_p ())
>>> + dump_printf (MSG_MISSED_OPTIMIZATION,
>>> + "permutaion op not supported by target.\n");
>>> + return false;
>>> + }
>>> +
>>> + for (i = 0; i < nelt; i++)
>>> + {
>>> + if (3 * i + nelt0 < nelt)
>>> + sel[3 * i + nelt0] = 3 * i + nelt0;
>>> + if (3 * i + nelt1 < nelt)
>>> + sel[3 * i + nelt1] = 3 * i + nelt1;
>>> + if (3 * i + nelt2 < nelt)
>>> + sel[3 * i + nelt2] = nelt + j2++;
>>> + }
>>> + if (!can_vec_perm_p (mode, false, sel))
>>> + {
>>> + if (dump_enabled_p ())
>>> + dump_printf (MSG_MISSED_OPTIMIZATION,
>>> + "permutaion op not supported by target.\n");
>>> + return false;
>>> + }
>>> + }
>>> + return true;
>>> }
>>> - if (can_vec_perm_p (mode, false, sel))
>>> + else
>>> {
>>> - for (i = 0; i < nelt; i++)
>>> - sel[i] += nelt / 2;
>>> - if (can_vec_perm_p (mode, false, sel))
>>> - return true;
>>> + /* If length is not equal to 3 then only power of 2 is supported. */
>>> + gcc_assert (exact_log2 (count) != -1);
>>> +
>>> + for (i = 0; i < nelt / 2; i++)
>>> + {
>>> + sel[i * 2] = i;
>>> + sel[i * 2 + 1] = i + nelt;
>>> + }
>>> + if (can_vec_perm_p (mode, false, sel))
>>> + {
>>> + for (i = 0; i < nelt; i++)
>>> + sel[i] += nelt / 2;
>>> + if (can_vec_perm_p (mode, false, sel))
>>> + return true;
>>> + }
>>> }
>>> }
>>>
>>> if (dump_enabled_p ())
>>> dump_printf (MSG_MISSED_OPTIMIZATION,
>>> - "interleave op not supported by target.\n");
>>> + "permutaion op not supported by target.\n");
>>> return false;
>>> }
>>>
>>> @@ -4416,9 +4470,9 @@ vect_store_lanes_supported (tree vectype,
>>> unsigned HOST_WIDE_INT count)
>>> /* Function vect_permute_store_chain.
>>>
>>> Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
>>> - a power of 2, generate interleave_high/low stmts to reorder the data
>>> - correctly for the stores. Return the final references for stores in
>>> - RESULT_CHAIN.
>>> + a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
>>> + the data correctly for the stores. Return the final references for stores
>>> + in RESULT_CHAIN.
>>>
>>> E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
>>> The input is 4 vectors each containing 8 elements. We assign a number to
>>> @@ -4485,7 +4539,9 @@ vect_permute_store_chain (vec<tree> dr_chain,
>>> gimple perm_stmt;
>>> tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
>>> tree perm_mask_low, perm_mask_high;
>>> - unsigned int i, n;
>>> + tree data_ref;
>>> + tree perm3_mask_low, perm3_mask_high;
>>> + unsigned int i, n, log_length = exact_log2 (length);
>>> unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
>>> unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
>>>
>>> @@ -4493,47 +4549,116 @@ vect_permute_store_chain (vec<tree> dr_chain,
>>> memcpy (result_chain->address (), dr_chain.address (),
>>> length * sizeof (tree));
>>>
>>> - for (i = 0, n = nelt / 2; i < n; i++)
>>> + if (length == 3)
>>> {
>>> - sel[i * 2] = i;
>>> - sel[i * 2 + 1] = i + nelt;
>>> - }
>>> - perm_mask_high = vect_gen_perm_mask (vectype, sel);
>>> - gcc_assert (perm_mask_high != NULL);
>>> + unsigned int j0 = 0, j1 = 0, j2 = 0;
>>>
>>> - for (i = 0; i < nelt; i++)
>>> - sel[i] += nelt / 2;
>>> - perm_mask_low = vect_gen_perm_mask (vectype, sel);
>>> - gcc_assert (perm_mask_low != NULL);
>>> + for (j = 0; j < 3; j++)
>>> + {
>>> + int nelt0 = ((3 - j) * nelt) % 3;
>>> + int nelt1 = ((3 - j) * nelt + 1) % 3;
>>> + int nelt2 = ((3 - j) * nelt + 2) % 3;
>>>
>>> - for (i = 0, n = exact_log2 (length); i < n; i++)
>>> - {
>>> - for (j = 0; j < length/2; j++)
>>> - {
>>> - vect1 = dr_chain[j];
>>> - vect2 = dr_chain[j+length/2];
>>> + for (i = 0; i < nelt; i++)
>>> + {
>>> + if (3 * i + nelt0 < nelt)
>>> + sel[3 * i + nelt0] = j0++;
>>> + if (3 * i + nelt1 < nelt)
>>> + sel[3 * i + nelt1] = nelt + j1++;
>>> + if (3 * i + nelt2 < nelt)
>>> + sel[3 * i + nelt2] = 0;
>>> + }
>>> + perm3_mask_low = vect_gen_perm_mask (vectype, sel);
>>> + gcc_assert (perm3_mask_low != NULL);
>>> +
>>> + for (i = 0; i < nelt; i++)
>>> + {
>>> + if (3 * i + nelt0 < nelt)
>>> + sel[3 * i + nelt0] = 3 * i + nelt0;
>>> + if (3 * i + nelt1 < nelt)
>>> + sel[3 * i + nelt1] = 3 * i + nelt1;
>>> + if (3 * i + nelt2 < nelt)
>>> + sel[3 * i + nelt2] = nelt + j2++;
>>> + }
>>> + perm3_mask_high = vect_gen_perm_mask (vectype, sel);
>>> + gcc_assert (perm3_mask_high != NULL);
>>> +
>>> + vect1 = dr_chain[0];
>>> + vect2 = dr_chain[1];
>>>
>>> /* Create interleaving stmt:
>>> - high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
>>> - high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
>>> - perm_stmt
>>> - = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
>>> - vect1, vect2, perm_mask_high);
>>> + low = VEC_PERM_EXPR <vect1, vect2,
>>> + {j, nelt, *, j + 1, nelt + j + 1, *,
>>> + j + 2, nelt + j + 2, *, ...}> */
>>> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
>>> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>>> + vect1, vect2,
>>> + perm3_mask_low);
>>> vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>>> - (*result_chain)[2*j] = high;
>>>
>>> + vect1 = data_ref;
>>> + vect2 = dr_chain[2];
>>> /* Create interleaving stmt:
>>> - low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
>>> - nelt*3/2+1, ...}> */
>>> - low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
>>> - perm_stmt
>>> - = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
>>> - vect1, vect2, perm_mask_low);
>>> + low = VEC_PERM_EXPR <vect1, vect2,
>>> + {0, 1, nelt + j, 3, 4, nelt + j + 1,
>>> + 6, 7, nelt + j + 2, ...}> */
>>> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
>>> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
>>> + vect1, vect2,
>>> + perm3_mask_high);
>>> vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>>> - (*result_chain)[2*j+1] = low;
>>> + (*result_chain)[j] = data_ref;
>>> + }
>>> + }
>>> + else
>>> + {
>>> + /* If length is not equal to 3 then only power of 2 is supported. */
>>> + gcc_assert (exact_log2 (length) != -1);
>>> +
>>> + for (i = 0, n = nelt / 2; i < n; i++)
>>> + {
>>> + sel[i * 2] = i;
>>> + sel[i * 2 + 1] = i + nelt;
>>> }
>>> - memcpy (dr_chain.address (), result_chain->address (),
>>> - length * sizeof (tree));
>>> + perm_mask_high = vect_gen_perm_mask (vectype, sel);
>>> + gcc_assert (perm_mask_high != NULL);
>>> +
>>> + for (i = 0; i < nelt; i++)
>>> + sel[i] += nelt / 2;
>>> + perm_mask_low = vect_gen_perm_mask (vectype, sel);
>>> + gcc_assert (perm_mask_low != NULL);
>>> +
>>> + for (i = 0, n = log_length; i < n; i++)
>>> + {
>>> + for (j = 0; j < length/2; j++)
>>> + {
>>> + vect1 = dr_chain[j];
>>> + vect2 = dr_chain[j+length/2];
>>> +
>>> + /* Create interleaving stmt:
>>> + high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
>>> + ...}> */
>>> + high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
>>> + perm_stmt
>>> + = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
>>> + vect1, vect2, perm_mask_high);
>>> + vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>>> + (*result_chain)[2*j] = high;
>>> +
>>> + /* Create interleaving stmt:
>>> + low = VEC_PERM_EXPR <vect1, vect2,
>>> + {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
>>> + ...}> */
>>> + low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
>>> + perm_stmt
>>> + = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
>>> + vect1, vect2, perm_mask_low);
>>> + vect_finish_stmt_generation (stmt, perm_stmt, gsi);
>>> + (*result_chain)[2*j+1] = low;
>>> + }
>>> + memcpy (dr_chain.address (), result_chain->address (),
>>> + length * sizeof (tree));
>>> + }
>>> }
>>> }
>>>
>>> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
>>> index b87c143..24d0b94 100644
>>> --- a/gcc/tree-vect-stmts.c
>>> +++ b/gcc/tree-vect-stmts.c
>>> @@ -974,9 +974,9 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
>>> include the cost of the permutes. */
>>> if (!store_lanes_p && group_size > 1)
>>> {
>>> - /* Uses a high and low interleave operation for each needed permute. */
>>> -
>>> - int nstmts = ncopies * exact_log2 (group_size) * group_size;
>>> + /* Uses a high and low interleave or shuffle operations for each
>>> + needed permute. */
>>> + int nstmts = ncopies * ceil_log2 (group_size) * group_size;
>>> inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm,
>>> stmt_info, 0, vect_body);
>>>
>>>
>>> diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-st.c
>>> b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
>>> new file mode 100644
>>> index 0000000..cc1e72e
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.dg/vect/pr52252-st.c
>>> @@ -0,0 +1,21 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O2 -g -ftree-vectorize -mssse3
>>> -fdump-tree-vect-details" { target { i?86-*-* x86_64-*-* } } } */
>>> +
>>> +#define byte unsigned char
>>> +
>>> +void
>>> +matrix_mul (byte *in, byte *out, int size)
>>> +{
>>> + int i;
>>> + for (i = 0; i < size; i++)
>>> + {
>>> + out[0] = in[0] + in[1] + in[3];
>>> + out[1] = in[0] + in[2] + in[4];
>>> + out[2] = in[1] + in[2] + in[4];
>>> + in += 4;
>>> + out += 3;
>>> + }
>>> +}
>>> +
>>> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
>>> +/* { dg-final { cleanup-tree-dump "vect" } } */
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2014-06-10 11:25 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-06-03 13:22 [PING, PATCH2/2, PR52252] Vectorization for load/store groups of size 3 Evgeny Stupachenko
2014-06-10 10:11 Evgeny Stupachenko
2014-06-10 11:25 ` Richard Biener
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).