From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 31949 invoked by alias); 10 Jun 2014 11:25:31 -0000 Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Id: List-Archive: List-Post: List-Help: Sender: gcc-patches-owner@gcc.gnu.org Received: (qmail 31939 invoked by uid 89); 10 Jun 2014 11:25:31 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-3.6 required=5.0 tests=AWL,BAYES_00,RP_MATCHES_RCVD autolearn=ham version=3.3.2 X-HELO: mx2.suse.de Received: from cantor2.suse.de (HELO mx2.suse.de) (195.135.220.15) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with (CAMELLIA256-SHA encrypted) ESMTPS; Tue, 10 Jun 2014 11:25:29 +0000 Received: from relay1.suse.de (charybdis-ext.suse.de [195.135.220.254]) by mx2.suse.de (Postfix) with ESMTP id C35FBAB0C; Tue, 10 Jun 2014 11:25:25 +0000 (UTC) Date: Tue, 10 Jun 2014 11:25:00 -0000 From: Richard Biener To: Evgeny Stupachenko cc: GCC Patches , Jakub Jelinek , Uros Bizjak Subject: Re: [PING, PATCH2/2, PR52252] Vectorization for load/store groups of size 3. In-Reply-To: Message-ID: References: User-Agent: Alpine 2.11 (LSU 23 2013-08-11) MIME-Version: 1.0 Content-Type: TEXT/PLAIN; charset=US-ASCII X-SW-Source: 2014-06/txt/msg00808.txt.bz2 On Tue, 10 Jun 2014, Evgeny Stupachenko wrote: > ping. > The changes are similar to already committed on loads group. Ok. Thanks, Richard. > On Tue, Jun 3, 2014 at 5:22 PM, Evgeny Stupachenko wrote: > > I've added a bug report for the stores group case: > > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61403 > > > > > > On Wed, May 28, 2014 at 5:18 PM, Evgeny Stupachenko wrote: > >> Ping. > >> Test is modified according to the fix in the test for loads. 
> >> > >> diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-st.c > >> b/gcc/testsuite/gcc.dg/vect/pr52252-st.c > >> new file mode 100644 > >> index 0000000..e7161f7 > >> --- /dev/null > >> +++ b/gcc/testsuite/gcc.dg/vect/pr52252-st.c > >> @@ -0,0 +1,21 @@ > >> +/* { dg-do compile } */ > >> +/* { dg-additional-options "-mssse3" { target { i?86-*-* x86_64-*-* } } } */ > >> + > >> +#define byte unsigned char > >> + > >> +void > >> +matrix_mul (byte *in, byte *out, int size) > >> +{ > >> + int i; > >> + for (i = 0; i < size; i++) > >> + { > >> + out[0] = in[0] + in[1] + in[3]; > >> + out[1] = in[0] + in[2] + in[4]; > >> + out[2] = in[1] + in[2] + in[4]; > >> + in += 4; > >> + out += 3; > >> + } > >> +} > >> + > >> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { > >> target { i?86-*-* x86_64-*-* } } } } */ > >> +/* { dg-final { cleanup-tree-dump "vect" } } */ > >> > >> > >> On Tue, May 6, 2014 at 6:39 PM, Evgeny Stupachenko wrote: > >>> 2nd part of patch is on stores group. > >>> Bootstrap and make check passed on x86. > >>> > >>> Is it ok? > >>> > >>> 2014-05-06 Evgeny Stupachenko > >>> > >>> * tree-vect-data-refs.c (vect_grouped_store_supported): New > >>> check for stores group of length 3. > >>> (vect_permute_store_chain): New permutations for stores group of > >>> length 3. > >>> * tree-vect-stmts.c (vect_model_store_cost): Change cost > >>> of vec_perm_shuffle for the new permutations. > >>> > >>> ChangeLog for testsuite: > >>> > >>> 2014-05-06 Evgeny Stupachenko > >>> > >>> PR tree-optimization/52252 > >>> * gcc.dg/vect/pr52252-st.c: Test on stores group of size 3. 
> >>> > >>> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c > >>> index ef710cf..fb0e30d 100644 > >>> --- a/gcc/tree-vect-data-refs.c > >>> +++ b/gcc/tree-vect-data-refs.c > >>> @@ -4365,13 +4365,14 @@ vect_grouped_store_supported (tree vectype, > >>> unsigned HOST_WIDE_INT count) > >>> { > >>> enum machine_mode mode = TYPE_MODE (vectype); > >>> > >>> - /* vect_permute_store_chain requires the group size to be a power of two. */ > >>> - if (exact_log2 (count) == -1) > >>> + /* vect_permute_store_chain requires the group size to be equal to 3 or > >>> + be a power of two. */ > >>> + if (count != 3 && exact_log2 (count) == -1) > >>> { > >>> if (dump_enabled_p ()) > >>> dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > >>> - "the size of the group of accesses" > >>> - " is not a power of 2\n"); > >>> + "the size of the group of accesses" > >>> + " is not a power of 2 or not eqaul to 3\n"); > >>> return false; > >>> } > >>> > >>> @@ -4380,23 +4381,76 @@ vect_grouped_store_supported (tree vectype, > >>> unsigned HOST_WIDE_INT count) > >>> { > >>> unsigned int i, nelt = GET_MODE_NUNITS (mode); > >>> unsigned char *sel = XALLOCAVEC (unsigned char, nelt); > >>> - for (i = 0; i < nelt / 2; i++) > >>> + > >>> + if (count == 3) > >>> { > >>> - sel[i * 2] = i; > >>> - sel[i * 2 + 1] = i + nelt; > >>> + unsigned int j0 = 0, j1 = 0, j2 = 0; > >>> + unsigned int i, j; > >>> + > >>> + for (j = 0; j < 3; j++) > >>> + { > >>> + int nelt0 = ((3 - j) * nelt) % 3; > >>> + int nelt1 = ((3 - j) * nelt + 1) % 3; > >>> + int nelt2 = ((3 - j) * nelt + 2) % 3; > >>> + for (i = 0; i < nelt; i++) > >>> + { > >>> + if (3 * i + nelt0 < nelt) > >>> + sel[3 * i + nelt0] = j0++; > >>> + if (3 * i + nelt1 < nelt) > >>> + sel[3 * i + nelt1] = nelt + j1++; > >>> + if (3 * i + nelt2 < nelt) > >>> + sel[3 * i + nelt2] = 0; > >>> + } > >>> + if (!can_vec_perm_p (mode, false, sel)) > >>> + { > >>> + if (dump_enabled_p ()) > >>> + dump_printf (MSG_MISSED_OPTIMIZATION, > >>> + 
"permutaion op not supported by target.\n"); > >>> + return false; > >>> + } > >>> + > >>> + for (i = 0; i < nelt; i++) > >>> + { > >>> + if (3 * i + nelt0 < nelt) > >>> + sel[3 * i + nelt0] = 3 * i + nelt0; > >>> + if (3 * i + nelt1 < nelt) > >>> + sel[3 * i + nelt1] = 3 * i + nelt1; > >>> + if (3 * i + nelt2 < nelt) > >>> + sel[3 * i + nelt2] = nelt + j2++; > >>> + } > >>> + if (!can_vec_perm_p (mode, false, sel)) > >>> + { > >>> + if (dump_enabled_p ()) > >>> + dump_printf (MSG_MISSED_OPTIMIZATION, > >>> + "permutaion op not supported by target.\n"); > >>> + return false; > >>> + } > >>> + } > >>> + return true; > >>> } > >>> - if (can_vec_perm_p (mode, false, sel)) > >>> + else > >>> { > >>> - for (i = 0; i < nelt; i++) > >>> - sel[i] += nelt / 2; > >>> - if (can_vec_perm_p (mode, false, sel)) > >>> - return true; > >>> + /* If length is not equal to 3 then only power of 2 is supported. */ > >>> + gcc_assert (exact_log2 (count) != -1); > >>> + > >>> + for (i = 0; i < nelt / 2; i++) > >>> + { > >>> + sel[i * 2] = i; > >>> + sel[i * 2 + 1] = i + nelt; > >>> + } > >>> + if (can_vec_perm_p (mode, false, sel)) > >>> + { > >>> + for (i = 0; i < nelt; i++) > >>> + sel[i] += nelt / 2; > >>> + if (can_vec_perm_p (mode, false, sel)) > >>> + return true; > >>> + } > >>> } > >>> } > >>> > >>> if (dump_enabled_p ()) > >>> dump_printf (MSG_MISSED_OPTIMIZATION, > >>> - "interleave op not supported by target.\n"); > >>> + "permutaion op not supported by target.\n"); > >>> return false; > >>> } > >>> > >>> @@ -4416,9 +4470,9 @@ vect_store_lanes_supported (tree vectype, > >>> unsigned HOST_WIDE_INT count) > >>> /* Function vect_permute_store_chain. > >>> > >>> Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be > >>> - a power of 2, generate interleave_high/low stmts to reorder the data > >>> - correctly for the stores. Return the final references for stores in > >>> - RESULT_CHAIN. 
> >>> + a power of 2 or equal to 3, generate interleave_high/low stmts to reorder > >>> + the data correctly for the stores. Return the final references for stores > >>> + in RESULT_CHAIN. > >>> > >>> E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. > >>> The input is 4 vectors each containing 8 elements. We assign a number to > >>> @@ -4485,7 +4539,9 @@ vect_permute_store_chain (vec dr_chain, > >>> gimple perm_stmt; > >>> tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); > >>> tree perm_mask_low, perm_mask_high; > >>> - unsigned int i, n; > >>> + tree data_ref; > >>> + tree perm3_mask_low, perm3_mask_high; > >>> + unsigned int i, n, log_length = exact_log2 (length); > >>> unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype); > >>> unsigned char *sel = XALLOCAVEC (unsigned char, nelt); > >>> > >>> @@ -4493,47 +4549,116 @@ vect_permute_store_chain (vec dr_chain, > >>> memcpy (result_chain->address (), dr_chain.address (), > >>> length * sizeof (tree)); > >>> > >>> - for (i = 0, n = nelt / 2; i < n; i++) > >>> + if (length == 3) > >>> { > >>> - sel[i * 2] = i; > >>> - sel[i * 2 + 1] = i + nelt; > >>> - } > >>> - perm_mask_high = vect_gen_perm_mask (vectype, sel); > >>> - gcc_assert (perm_mask_high != NULL); > >>> + unsigned int j0 = 0, j1 = 0, j2 = 0; > >>> > >>> - for (i = 0; i < nelt; i++) > >>> - sel[i] += nelt / 2; > >>> - perm_mask_low = vect_gen_perm_mask (vectype, sel); > >>> - gcc_assert (perm_mask_low != NULL); > >>> + for (j = 0; j < 3; j++) > >>> + { > >>> + int nelt0 = ((3 - j) * nelt) % 3; > >>> + int nelt1 = ((3 - j) * nelt + 1) % 3; > >>> + int nelt2 = ((3 - j) * nelt + 2) % 3; > >>> > >>> - for (i = 0, n = exact_log2 (length); i < n; i++) > >>> - { > >>> - for (j = 0; j < length/2; j++) > >>> - { > >>> - vect1 = dr_chain[j]; > >>> - vect2 = dr_chain[j+length/2]; > >>> + for (i = 0; i < nelt; i++) > >>> + { > >>> + if (3 * i + nelt0 < nelt) > >>> + sel[3 * i + nelt0] = j0++; > >>> + if (3 * i + nelt1 < nelt) > >>> + sel[3 * i + 
nelt1] = nelt + j1++; > >>> + if (3 * i + nelt2 < nelt) > >>> + sel[3 * i + nelt2] = 0; > >>> + } > >>> + perm3_mask_low = vect_gen_perm_mask (vectype, sel); > >>> + gcc_assert (perm3_mask_low != NULL); > >>> + > >>> + for (i = 0; i < nelt; i++) > >>> + { > >>> + if (3 * i + nelt0 < nelt) > >>> + sel[3 * i + nelt0] = 3 * i + nelt0; > >>> + if (3 * i + nelt1 < nelt) > >>> + sel[3 * i + nelt1] = 3 * i + nelt1; > >>> + if (3 * i + nelt2 < nelt) > >>> + sel[3 * i + nelt2] = nelt + j2++; > >>> + } > >>> + perm3_mask_high = vect_gen_perm_mask (vectype, sel); > >>> + gcc_assert (perm3_mask_high != NULL); > >>> + > >>> + vect1 = dr_chain[0]; > >>> + vect2 = dr_chain[1]; > >>> > >>> /* Create interleaving stmt: > >>> - high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */ > >>> - high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); > >>> - perm_stmt > >>> - = gimple_build_assign_with_ops (VEC_PERM_EXPR, high, > >>> - vect1, vect2, perm_mask_high); > >>> + low = VEC_PERM_EXPR <vect1, vect2, > >>> + {j, nelt, *, j + 1, nelt + j + 1, *, > >>> + j + 2, nelt + j + 2, *, ...}> */ > >>> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); > >>> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, > >>> + vect1, vect2, > >>> + perm3_mask_low); > >>> vect_finish_stmt_generation (stmt, perm_stmt, gsi); > >>> - (*result_chain)[2*j] = high; > >>> > >>> + vect1 = data_ref; > >>> + vect2 = dr_chain[2]; > >>> /* Create interleaving stmt: > >>> - low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1, > >>> - nelt*3/2+1, ...}> */ > >>> - low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); > >>> - perm_stmt > >>> - = gimple_build_assign_with_ops (VEC_PERM_EXPR, low, > >>> - vect1, vect2, perm_mask_low); > >>> + low = VEC_PERM_EXPR <vect1, vect2, > >>> + {0, 1, nelt + j, 3, 4, nelt + j + 1, > >>> + 6, 7, nelt + j + 2, ...}> */ > >>> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); > >>> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, > >>> + vect1, vect2, > >>> + perm3_mask_high); > >>> 
vect_finish_stmt_generation (stmt, perm_stmt, gsi); > >>> - (*result_chain)[2*j+1] = low; > >>> + (*result_chain)[j] = data_ref; > >>> + } > >>> + } > >>> + else > >>> + { > >>> + /* If length is not equal to 3 then only power of 2 is supported. */ > >>> + gcc_assert (exact_log2 (length) != -1); > >>> + > >>> + for (i = 0, n = nelt / 2; i < n; i++) > >>> + { > >>> + sel[i * 2] = i; > >>> + sel[i * 2 + 1] = i + nelt; > >>> } > >>> - memcpy (dr_chain.address (), result_chain->address (), > >>> - length * sizeof (tree)); > >>> + perm_mask_high = vect_gen_perm_mask (vectype, sel); > >>> + gcc_assert (perm_mask_high != NULL); > >>> + > >>> + for (i = 0; i < nelt; i++) > >>> + sel[i] += nelt / 2; > >>> + perm_mask_low = vect_gen_perm_mask (vectype, sel); > >>> + gcc_assert (perm_mask_low != NULL); > >>> + > >>> + for (i = 0, n = log_length; i < n; i++) > >>> + { > >>> + for (j = 0; j < length/2; j++) > >>> + { > >>> + vect1 = dr_chain[j]; > >>> + vect2 = dr_chain[j+length/2]; > >>> + > >>> + /* Create interleaving stmt: > >>> + high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, > >>> + ...}> */ > >>> + high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); > >>> + perm_stmt > >>> + = gimple_build_assign_with_ops (VEC_PERM_EXPR, high, > >>> + vect1, vect2, perm_mask_high); > >>> + vect_finish_stmt_generation (stmt, perm_stmt, gsi); > >>> + (*result_chain)[2*j] = high; > >>> + > >>> + /* Create interleaving stmt: > >>> + low = VEC_PERM_EXPR <vect1, vect2, > >>> + {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1, > >>> + ...}> */ > >>> + low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); > >>> + perm_stmt > >>> + = gimple_build_assign_with_ops (VEC_PERM_EXPR, low, > >>> + vect1, vect2, perm_mask_low); > >>> + vect_finish_stmt_generation (stmt, perm_stmt, gsi); > >>> + (*result_chain)[2*j+1] = low; > >>> + } > >>> + memcpy (dr_chain.address (), result_chain->address (), > >>> + length * sizeof (tree)); > >>> + } > >>> } > >>> } > >>> > >>> diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c > >>> index 
b87c143..24d0b94 100644 > >>> --- a/gcc/tree-vect-stmts.c > >>> +++ b/gcc/tree-vect-stmts.c > >>> @@ -974,9 +974,9 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies, > >>> include the cost of the permutes. */ > >>> if (!store_lanes_p && group_size > 1) > >>> { > >>> - /* Uses a high and low interleave operation for each needed permute. */ > >>> - > >>> - int nstmts = ncopies * exact_log2 (group_size) * group_size; > >>> + /* Uses a high and low interleave or shuffle operations for each > >>> + needed permute. */ > >>> + int nstmts = ncopies * ceil_log2 (group_size) * group_size; > >>> inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm, > >>> stmt_info, 0, vect_body); > >>> > >>> > >>> diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-st.c > >>> b/gcc/testsuite/gcc.dg/vect/pr52252-st.c > >>> new file mode 100644 > >>> index 0000000..cc1e72e > >>> --- /dev/null > >>> +++ b/gcc/testsuite/gcc.dg/vect/pr52252-st.c > >>> @@ -0,0 +1,21 @@ > >>> +/* { dg-do compile } */ > >>> +/* { dg-options "-O2 -g -ftree-vectorize -mssse3 > >>> -fdump-tree-vect-details" { target { i?86-*-* x86_64-*-* } } } */ > >>> + > >>> +#define byte unsigned char > >>> + > >>> +void > >>> +matrix_mul (byte *in, byte *out, int size) > >>> +{ > >>> + int i; > >>> + for (i = 0; i < size; i++) > >>> + { > >>> + out[0] = in[0] + in[1] + in[3]; > >>> + out[1] = in[0] + in[2] + in[4]; > >>> + out[2] = in[1] + in[2] + in[4]; > >>> + in += 4; > >>> + out += 3; > >>> + } > >>> +} > >>> + > >>> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ > >>> +/* { dg-final { cleanup-tree-dump "vect" } } */ > > -- Richard Biener SUSE / SUSE Labs SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746 GF: Jeff Hawn, Jennifer Guild, Felix Imend"orffer