On Wed, 20 Mar 2019, Kewen.Lin wrote:

> Hi,
>
> Please refer to the link below for the previous threads.
> https://gcc.gnu.org/ml/gcc-patches/2019-03/msg00348.html
>
> Compared to patch v2, I've moved the vector operation target check
> up together with the vector type target check.  Besides, I ran
> bootstrap and regtest on powerpc64-linux-gnu (BE), and updated the
> testcases' requirements and options for robustness.
>
> Is it OK for GCC 10?
>
>
> gcc/ChangeLog
>
> 2019-03-20  Kewen Lin
>
> 	PR target/88497
> 	* tree-ssa-reassoc.c (reassociate_bb): Swap the positions of
> 	GIMPLE_BINARY_RHS check and gimple_visited_p check, call new
> 	function undistribute_bitref_for_vector.
> 	(undistribute_bitref_for_vector): New function.
> 	(cleanup_vinfo_map): Likewise.
> 	(unsigned_cmp): Likewise.
>
> gcc/testsuite/ChangeLog
>
> 2019-03-20  Kewen Lin
>
> 	* gcc.dg/tree-ssa/pr88497-1.c: New test.
> 	* gcc.dg/tree-ssa/pr88497-2.c: Likewise.
> 	* gcc.dg/tree-ssa/pr88497-3.c: Likewise.
> 	* gcc.dg/tree-ssa/pr88497-4.c: Likewise.
> 	* gcc.dg/tree-ssa/pr88497-5.c: Likewise.
>
> ---
>  gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c |  44 +++++
>  gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c |  33 ++++
>  gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c |  33 ++++
>  gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c |  33 ++++
>  gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c |  33 ++++
>  gcc/tree-ssa-reassoc.c                    | 306 +++++++++++++++++++++++++++++-
>  6 files changed, 477 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c
> new file mode 100644
> index 0000000..99c9af8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-1.c
> @@ -0,0 +1,44 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_double } */
> +/* { dg-require-effective-target powerpc_vsx_ok { target { powerpc*-*-* } } } */
> +/* { dg-options "-O2 -ffast-math" } */
> +/* { dg-options "-O2 -ffast-math -mvsx -fdump-tree-reassoc1" { target { powerpc*-*-* } } } */

Use /* { dg-additional-options "-mvsx" { target { powerpc... - that
saves duplicate typing.  I guess that x86_64/i?86 also works if you
enable SSE2 - can you check that and make adjustments accordingly?

> +
> +/* Test that reassoc can undistribute a vector bit_field_ref summation.
> +
> +   arg1 and arg2 are two arrays whose elements are of type vector double.
> +   Assuming:
> +     A0 = arg1[0], A1 = arg1[1], A2 = arg1[2], A3 = arg1[3],
> +     B0 = arg2[0], B1 = arg2[1], B2 = arg2[2], B3 = arg2[3],
> +
> +   then:
> +     V0 = A0 * B0, V1 = A1 * B1, V2 = A2 * B2, V3 = A3 * B3,
> +
> +   and reassoc transforms
> +
> +     accumulator += V0[0] + V0[1] + V1[0] + V1[1] + V2[0] + V2[1]
> +                  + V3[0] + V3[1];
> +
> +   into:
> +
> +     T = V0 + V1 + V2 + V3
> +     accumulator += T[0] + T[1];
> +
> +   which needs fewer bit_field_refs: only two for a vector of 128 or
> +   more bits.  */
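[To make the dg-additional-options suggestion above concrete, the
header could be reduced to something like the following sketch.  It is
untested, and the -msse2 line for x86 is an assumption pending the
SSE2 check asked for:

/* { dg-do compile } */
/* { dg-require-effective-target vect_double } */
/* { dg-require-effective-target powerpc_vsx_ok { target { powerpc*-*-* } } } */
/* { dg-options "-O2 -ffast-math -fdump-tree-reassoc1" } */
/* { dg-additional-options "-mvsx" { target { powerpc*-*-* } } } */
/* { dg-additional-options "-msse2" { target { i?86-*-* x86_64-*-* } } } */

The single dg-options line then applies everywhere and only the ISA
flag is per-target.]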
> +
> +typedef double v2df __attribute__ ((vector_size (16)));
> +double
> +test (double accumulator, v2df arg1[], v2df arg2[])
> +{
> +  v2df temp;
> +  temp = arg1[0] * arg2[0];
> +  accumulator += temp[0] + temp[1];
> +  temp = arg1[1] * arg2[1];
> +  accumulator += temp[0] + temp[1];
> +  temp = arg1[2] * arg2[2];
> +  accumulator += temp[0] + temp[1];
> +  temp = arg1[3] * arg2[3];
> +  accumulator += temp[0] + temp[1];
> +  return accumulator;
> +}
> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 2 "reassoc1" { target { powerpc*-*-* } } } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c
> new file mode 100644
> index 0000000..61ed0bf5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-2.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_float } */
> +/* { dg-require-effective-target powerpc_altivec_ok { target { powerpc*-*-* } } } */
> +/* { dg-options "-O2 -ffast-math" } */
> +/* { dg-options "-O2 -ffast-math -maltivec -fdump-tree-reassoc1" { target { powerpc*-*-* } } } */
> +
> +/* Test that reassoc can undistribute vector bit_field_refs on multiplication.
> +
> +   v1, v2, v3 and v4 are of type vector float.
> +
> +   reassoc transforms
> +
> +     accumulator *= v1[0] * v1[1] * v1[2] * v1[3] *
> +                    v2[0] * v2[1] * v2[2] * v2[3] *
> +                    v3[0] * v3[1] * v3[2] * v3[3] *
> +                    v4[0] * v4[1] * v4[2] * v4[3];
> +
> +   into:
> +
> +     T = v1 * v2 * v3 * v4;
> +     accumulator *= T[0] * T[1] * T[2] * T[3];
> +
> +   which needs fewer bit_field_refs: only four for a vector of 128 or
> +   more bits.  */
> +
> +typedef float v4si __attribute__ ((vector_size (16)));
> +float
> +test (float accumulator, v4si v1, v4si v2, v4si v3, v4si v4)
> +{
> +  accumulator *= v1[0] * v1[1] * v1[2] * v1[3];
> +  accumulator *= v2[0] * v2[1] * v2[2] * v2[3];
> +  accumulator *= v3[0] * v3[1] * v3[2] * v3[3];
> +  accumulator *= v4[0] * v4[1] * v4[2] * v4[3];
> +  return accumulator;
> +}
> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c
> new file mode 100644
> index 0000000..3790afc
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-3.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-require-effective-target powerpc_altivec_ok { target { powerpc*-*-* } } } */
> +/* { dg-options "-O2 -ffast-math" } */
> +/* { dg-options "-O2 -ffast-math -maltivec -fdump-tree-reassoc1" { target { powerpc*-*-* } } } */
> +
> +/* Test that reassoc can undistribute vector bit_field_refs on bitwise AND.
> +
> +   v1, v2, v3 and v4 are of type vector int.
> +
> +   reassoc transforms
> +
> +     accumulator &= v1[0] & v1[1] & v1[2] & v1[3] &
> +                    v2[0] & v2[1] & v2[2] & v2[3] &
> +                    v3[0] & v3[1] & v3[2] & v3[3] &
> +                    v4[0] & v4[1] & v4[2] & v4[3];
> +
> +   into:
> +
> +     T = v1 & v2 & v3 & v4;
> +     accumulator &= T[0] & T[1] & T[2] & T[3];
> +
> +   which needs fewer bit_field_refs: only four for a vector of 128 or
> +   more bits.  */
> +
> +typedef int v4si __attribute__ ((vector_size (16)));
> +int
> +test (int accumulator, v4si v1, v4si v2, v4si v3, v4si v4)
> +{
> +  accumulator &= v1[0] & v1[1] & v1[2] & v1[3];
> +  accumulator &= v2[0] & v2[1] & v2[2] & v2[3];
> +  accumulator &= v3[0] & v3[1] & v3[2] & v3[3];
> +  accumulator &= v4[0] & v4[1] & v4[2] & v4[3];
> +  return accumulator;
> +}
> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c
> new file mode 100644
> index 0000000..1864aad
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-4.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-require-effective-target powerpc_altivec_ok { target { powerpc*-*-* } } } */
> +/* { dg-options "-O2 -ffast-math" } */
> +/* { dg-options "-O2 -ffast-math -maltivec -fdump-tree-reassoc1" { target { powerpc*-*-* } } } */
> +
> +/* Test that reassoc can undistribute vector bit_field_refs on bitwise IOR.
> +
> +   v1, v2, v3 and v4 are of type vector int.
> +
> +   reassoc transforms
> +
> +     accumulator |= v1[0] | v1[1] | v1[2] | v1[3] |
> +                    v2[0] | v2[1] | v2[2] | v2[3] |
> +                    v3[0] | v3[1] | v3[2] | v3[3] |
> +                    v4[0] | v4[1] | v4[2] | v4[3];
> +
> +   into:
> +
> +     T = v1 | v2 | v3 | v4;
> +     accumulator |= T[0] | T[1] | T[2] | T[3];
> +
> +   which needs fewer bit_field_refs: only four for a vector of 128 or
> +   more bits.  */
> +
> +typedef int v4si __attribute__ ((vector_size (16)));
> +int
> +test (int accumulator, v4si v1, v4si v2, v4si v3, v4si v4)
> +{
> +  accumulator |= v1[0] | v1[1] | v1[2] | v1[3];
> +  accumulator |= v2[0] | v2[1] | v2[2] | v2[3];
> +  accumulator |= v3[0] | v3[1] | v3[2] | v3[3];
> +  accumulator |= v4[0] | v4[1] | v4[2] | v4[3];
> +  return accumulator;
> +}
> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c
> new file mode 100644
> index 0000000..f747372
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr88497-5.c
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-require-effective-target powerpc_altivec_ok { target { powerpc*-*-* } } } */
> +/* { dg-options "-O2 -ffast-math" } */
> +/* { dg-options "-O2 -ffast-math -maltivec -fdump-tree-reassoc1" { target { powerpc*-*-* } } } */
> +
> +/* Test that reassoc can undistribute vector bit_field_refs on bitwise XOR.
> +
> +   v1, v2, v3 and v4 are of type vector int.
> +
> +   reassoc transforms
> +
> +     accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3] ^
> +                    v2[0] ^ v2[1] ^ v2[2] ^ v2[3] ^
> +                    v3[0] ^ v3[1] ^ v3[2] ^ v3[3] ^
> +                    v4[0] ^ v4[1] ^ v4[2] ^ v4[3];
> +
> +   into:
> +
> +     T = v1 ^ v2 ^ v3 ^ v4;
> +     accumulator ^= T[0] ^ T[1] ^ T[2] ^ T[3];
> +
> +   which needs fewer bit_field_refs: only four for a vector of 128 or
> +   more bits.  */
> +
> +typedef int v4si __attribute__ ((vector_size (16)));
> +int
> +test (int accumulator, v4si v1, v4si v2, v4si v3, v4si v4)
> +{
> +  accumulator ^= v1[0] ^ v1[1] ^ v1[2] ^ v1[3];
> +  accumulator ^= v2[0] ^ v2[1] ^ v2[2] ^ v2[3];
> +  accumulator ^= v3[0] ^ v3[1] ^ v3[2] ^ v3[3];
> +  accumulator ^= v4[0] ^ v4[1] ^ v4[2] ^ v4[3];
> +  return accumulator;
> +}
> +/* { dg-final { scan-tree-dump-times "BIT_FIELD_REF" 4 "reassoc1" { target { powerpc*-*-* } } } } */
> diff --git a/gcc/tree-ssa-reassoc.c b/gcc/tree-ssa-reassoc.c
> index e1c4dfe..a6cd85a 100644
> --- a/gcc/tree-ssa-reassoc.c
> +++ b/gcc/tree-ssa-reassoc.c
> @@ -1772,6 +1772,295 @@ undistribute_ops_list (enum tree_code opcode,
>    return changed;
>  }
>
> +/* Hold the information of one specific VECTOR_TYPE SSA_NAME.
> +    - offsets: the different BIT_FIELD_REF offsets accessing the same VECTOR.
> +    - ops_indexes: the index into *OPS for each relevant BIT_FIELD_REF.  */
> +struct v_info
> +{
> +  auto_vec<unsigned HOST_WIDE_INT> offsets;

with SVE this probably needs to be poly_int64 or so

> +  auto_vec<unsigned> ops_indexes;
> +};

To have fewer allocations you could use an auto_vec<..., 32> elements;
or even a hash_map<tree, vec<...>> where you use .release () in the
cleanup and .create (TYPE_VECTOR_SUBPARTS (vector_type)) during
collecting, and then can use quick_push () (ah, no - duplicates...).

> +
> +typedef struct v_info *v_info_ptr;
> +
> +/* Comparison function for qsort on unsigned BIT_FIELD_REF offsets.  */
> +static int
> +unsigned_cmp (const void *p_i, const void *p_j)
> +{
> +  if (*(const unsigned HOST_WIDE_INT *) p_i
> +      >= *(const unsigned HOST_WIDE_INT *) p_j)
> +    return 1;
> +  else
> +    return -1;

That's an issue with some qsort implementations, which compare an
element against itself.  I think you should add the equality case.

> +}
> +
> +/* Clean up the hash map for VECTOR information.  */
> +static void
> +cleanup_vinfo_map (hash_map<tree, v_info_ptr> &info_map)
> +{
> +  for (hash_map<tree, v_info_ptr>::iterator it = info_map.begin ();
> +       it != info_map.end (); ++it)
> +    {
> +      v_info_ptr info = (*it).second;
> +      delete info;
> +      (*it).second = NULL;
> +    }
> +}
> +
> +/* Perform un-distribution of BIT_FIELD_REF on VECTOR_TYPE.
> +     V1[0] + V1[1] + ... + V1[k] + V2[0] + V2[1] + ... + V2[k] + ... Vn[k]
> +   is transformed to
> +     Vs = (V1 + V2 + ... + Vn)
> +     Vs[0] + Vs[1] + ... + Vs[k]
> +
> +   The basic steps are:
> +
> +   1) Check the addition chain *OPS, looking for summands that come from
> +      a BIT_FIELD_REF on a VECTOR type.  Put the information into
> +      v_info_map for each satisfying summand, using the VECTOR SSA_NAME
> +      as the key.
> +
> +   2) For each key (VECTOR SSA_NAME), validate that all of its
> +      BIT_FIELD_REFs are contiguous and cover the whole VECTOR perfectly
> +      without any holes.  Obtain one VECTOR list which contains the
> +      candidates to be transformed.
> +
> +   3) Build the addition statements for all VECTOR candidates and
> +      generate BIT_FIELD_REFs accordingly.
> +
> +   TODO:
> +   1) The current implementation restricts all candidate VECTORs to have
> +      the same VECTOR type, but it can be extended to handle groups of
> +      different VECTOR types in the future, if any profitable cases are
> +      found.
> +   2) The current implementation requires the whole VECTORs to be fully
> +      covered, but it can be extended to support partial coverage
> +      (elements that are adjacent but do not fill the whole vector);
> +      that may need a cost model to decide whether to do it.  */
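[To make the suggested comparator fix concrete, here is a minimal
sketch of unsigned_cmp with the equality case added, following the
usual qsort contract:

static int
unsigned_cmp (const void *p_i, const void *p_j)
{
  unsigned HOST_WIDE_INT i = *(const unsigned HOST_WIDE_INT *) p_i;
  unsigned HOST_WIDE_INT j = *(const unsigned HOST_WIDE_INT *) p_j;
  /* Return negative, zero or positive for <, == and >, so that an
     element compared against itself yields 0.  */
  if (i > j)
    return 1;
  else if (i < j)
    return -1;
  else
    return 0;
}]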
> +static bool
> +undistribute_bitref_for_vector (enum tree_code opcode,
> +				vec<operand_entry *> *ops,
> +				struct loop *loop)
> +{
> +  if (ops->length () <= 1)
> +    return false;
> +
> +  if (opcode != PLUS_EXPR && opcode != MULT_EXPR && opcode != BIT_XOR_EXPR
> +      && opcode != BIT_IOR_EXPR && opcode != BIT_AND_EXPR)
> +    return false;
> +
> +  hash_map<tree, v_info_ptr> v_info_map;
> +  operand_entry *oe1;
> +  unsigned i;
> +
> +  /* Find the summands that come from VECTOR BIT_FIELD_REFs in the
> +     addition chain and put the information into the map.  */
> +  FOR_EACH_VEC_ELT (*ops, i, oe1)
> +    {
> +      enum tree_code dcode;
> +      gimple *oe1def;
> +
> +      if (TREE_CODE (oe1->op) != SSA_NAME)
> +	continue;
> +      oe1def = SSA_NAME_DEF_STMT (oe1->op);
> +      if (!is_gimple_assign (oe1def))
> +	continue;
> +      dcode = gimple_assign_rhs_code (oe1def);
> +      if (dcode != BIT_FIELD_REF || !is_reassociable_op (oe1def, dcode, loop))
> +	continue;
> +
> +      tree rhs = gimple_op (oe1def, 1);

Use gimple_assign_rhs1 (oe1def);

> +      tree op0 = TREE_OPERAND (rhs, 0);
> +      tree vec_type = TREE_TYPE (op0);
> +
> +      if (TREE_CODE (op0) != SSA_NAME || TREE_CODE (vec_type) != VECTOR_TYPE)
> +	continue;

VECTOR_TYPE_P

> +      tree op1 = TREE_OPERAND (rhs, 1);
> +      tree op2 = TREE_OPERAND (rhs, 2);

Instead of op0/1/2, use more descriptive names for the temporaries?
bf_name, bf_size, bf_offset?

> +
> +      tree elem_type = TREE_TYPE (vec_type);
> +      unsigned HOST_WIDE_INT size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
> +      if (size != TREE_INT_CST_LOW (op1))

  if (!tree_int_cst_equal (TYPE_SIZE (elem_type), op1))

avoids some of the TREE_INT_CST_LOW uses we like to avoid.  You are
missing a check that op2 % op1 is zero.  Given that you store a
HOST_WIDE_INT offset, you also want to check that op1/op2 are
INTEGER_CSTs (beware of SVE...).  There's also a poly-int friendly
multiple_p predicate, so you could make the initial checks SVE
friendly and bail out on a non-INTEGER_CST offset later.

> +	continue;
> +
> +      /* Ignore it if the target machine can't support this VECTOR type.  */
> +      if (!VECTOR_MODE_P (TYPE_MODE (vec_type)))
> +	continue;
> +
> +      /* Ignore it if the target machine can't support this type of VECTOR
> +	 operation.  */
> +      optab op_tab = optab_for_tree_code (opcode, vec_type, optab_vector);
> +      if (optab_handler (op_tab, TYPE_MODE (vec_type)) == CODE_FOR_nothing)
> +	continue;
> +
> +      v_info_ptr *info_ptr = v_info_map.get (op0);
> +      if (info_ptr)
> +	{
> +	  v_info_ptr info = *info_ptr;
> +	  info->offsets.safe_push (TREE_INT_CST_LOW (op2));
> +	  info->ops_indexes.safe_push (i);
> +	}
> +      else
> +	{
> +	  v_info_ptr info = new v_info;
> +	  info->offsets.safe_push (TREE_INT_CST_LOW (op2));
> +	  info->ops_indexes.safe_push (i);
> +	  v_info_map.put (op0, info);
> +	}
> +    }
> +
> +  /* We need at least two VECTORs to combine.  */
> +  if (v_info_map.elements () <= 1)
> +    {
> +      cleanup_vinfo_map (v_info_map);
> +      return false;
> +    }
> +
> +  /* Use the first VECTOR and its information as the reference.
> +     First validate it, that is: 1) the sorted offsets are adjacent
> +     with no holes, and 2) they fill the whole VECTOR perfectly.  */
> +  hash_map<tree, v_info_ptr>::iterator it = v_info_map.begin ();
> +  tree ref_vec = (*it).first;
> +  v_info_ptr ref_info = (*it).second;
> +  ref_info->offsets.qsort (unsigned_cmp);
> +  tree vec_type = TREE_TYPE (ref_vec);
> +  tree elem_type = TREE_TYPE (vec_type);
> +  unsigned HOST_WIDE_INT elem_size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));

Please use tree_to_uhwi (TYPE_SIZE (elem_type)) so that we ICE instead
of miscompiling should it ever not fit.
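[A sketch combining the offset/size checks suggested earlier
(tree_int_cst_equal, the op2 % op1 test, and a poly-int friendly
bail-out), using the suggested bf_* names; the exact predicates and
placement are an assumption, not part of the patch:

/* BIT_FIELD_REF <bf_name, bf_size, bf_offset>.  */
if (!tree_int_cst_equal (TYPE_SIZE (elem_type), bf_size))
  continue;

/* The extracted lane has to sit on an element boundary.  */
poly_uint64 bf_size_p, bf_offset_p;
if (!poly_int_tree_p (bf_size, &bf_size_p)
    || !poly_int_tree_p (bf_offset, &bf_offset_p)
    || !multiple_p (bf_offset_p, bf_size_p))
  continue;

/* The offset is stored as unsigned HOST_WIDE_INT later on, so bail
   out on anything non-constant (e.g. SVE vectors) for now.  */
if (!tree_fits_uhwi_p (bf_offset))
  continue;]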
> +  unsigned HOST_WIDE_INT curr;
> +  unsigned HOST_WIDE_INT prev = ref_info->offsets[0];
> +
> +  /* Continuity check.  */
> +  FOR_EACH_VEC_ELT_FROM (ref_info->offsets, i, curr, 1)
> +    {
> +      if (curr != (prev + elem_size))
> +	{
> +	  cleanup_vinfo_map (v_info_map);
> +	  return false;
> +	}
> +      prev = curr;
> +    }
> +
> +  /* Check whether the offsets fill the whole VECTOR.  */
> +  if ((prev + elem_size) != TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (ref_vec))))

compare_tree_int () might be handy here.

> +    {
> +      cleanup_vinfo_map (v_info_map);
> +      return false;
> +    }
> +
> +  auto_vec<tree> vectors (v_info_map.elements ());
> +  vectors.quick_push (ref_vec);
> +
> +  /* Use ref_vec to filter the others.  */
> +  for (++it; it != v_info_map.end (); ++it)
> +    {
> +      tree vec = (*it).first;
> +      v_info_ptr info = (*it).second;
> +      if (TREE_TYPE (ref_vec) != TREE_TYPE (vec))

types_compatible_p (), otherwise you reject const v2df vs. v2df for
example.

> +	continue;
> +      if (ref_info->offsets.length () != info->offsets.length ())
> +	continue;
> +      bool same_offset = true;
> +      info->offsets.qsort (unsigned_cmp);
> +      for (unsigned i = 0; i < ref_info->offsets.length (); i++)
> +	{
> +	  if (ref_info->offsets[i] != info->offsets[i])
> +	    {
> +	      same_offset = false;
> +	      break;
> +	    }
> +	}
> +      if (!same_offset)
> +	continue;
> +      vectors.quick_push (vec);
> +    }
> +
> +  if (vectors.length () < 2)
> +    {
> +      cleanup_vinfo_map (v_info_map);
> +      return false;
> +    }

Since you are using a hash table keyed off SSA name pointers, code
generation will depend on host addresses here: if there is one
mismatching vector type, which vector becomes ref_vec depends on the
order of elements in the hash table.  That's a no-no.

Even if doing it like the above looks like it might save compile time
by avoiding qsort calls, I think the appropriate thing to do is to
partition the vector candidates into sets of compatible vectors
(consider summing two v2df and two v4df vectors, for example) and then
take out the ones you disqualify because they do not cover the vector
in full.

I think whether the vector is fully covered can be checked much more
cheaply as well, by using an sbitmap and clearing a bit whenever the
corresponding lane is in the vector (and bailing out if the bit was
already clear, since you then have a duplicate).  So start with
bitmap_ones () and at the end check bitmap_empty_p ().  I don't think
you depend on the vector being sorted later?

> +  tree tr;
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +    {
> +      fprintf (dump_file, "The bit_field_ref vector list for undistribute: ");
> +      FOR_EACH_VEC_ELT (vectors, i, tr)
> +	{
> +	  print_generic_expr (dump_file, tr);
> +	  fprintf (dump_file, " ");
> +	}
> +      fprintf (dump_file, "\n");
> +    }
> +
> +  /* Build the sum for all candidate VECTORs.  */
> +  unsigned idx;
> +  gimple *sum = NULL;
> +  v_info_ptr info;
> +  tree sum_vec = ref_vec;
> +  FOR_EACH_VEC_ELT_FROM (vectors, i, tr, 1)
> +    {
> +      sum = build_and_add_sum (TREE_TYPE (ref_vec), sum_vec, tr, opcode);
> +      info = *(v_info_map.get (tr));
> +      unsigned j;
> +      FOR_EACH_VEC_ELT (info->ops_indexes, j, idx)
> +	{
> +	  gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op);
> +	  gimple_set_visited (def, true);

You set the visited flag to get the op's definition DCEd eventually,
right?  Note this in a comment.
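[Returning to the sbitmap idea above, a sketch of the lane-coverage
check; it assumes a constant number of lanes (i.e. no SVE), and the
names and loop shape are illustrative only:

/* One bit per lane; a set bit means the lane was not seen yet.  */
unsigned int nunits = TYPE_VECTOR_SUBPARTS (vec_type).to_constant ();
auto_sbitmap lanes (nunits);
bitmap_ones (lanes);

bool covered = true;
unsigned int k;
unsigned HOST_WIDE_INT off;
FOR_EACH_VEC_ELT (info->offsets, k, off)
  {
    unsigned HOST_WIDE_INT lane = off / elem_size;
    /* A duplicate lane disqualifies the candidate.  */
    if (!bitmap_bit_p (lanes, lane))
      {
	covered = false;
	break;
      }
    bitmap_clear_bit (lanes, lane);
  }
/* Fully covered iff every lane was seen exactly once.  */
covered = covered && bitmap_empty_p (lanes);

This replaces both the qsort and the continuity/fill checks for each
candidate.]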
> +	  if (opcode == PLUS_EXPR || opcode == BIT_XOR_EXPR
> +	      || opcode == BIT_IOR_EXPR)
> +	    (*ops)[idx]->op = build_zero_cst (TREE_TYPE ((*ops)[idx]->op));
> +	  else if (opcode == MULT_EXPR)
> +	    (*ops)[idx]->op = build_one_cst (TREE_TYPE ((*ops)[idx]->op));
> +	  else
> +	    {
> +	      gcc_assert (opcode == BIT_AND_EXPR);
> +	      (*ops)[idx]->op
> +		= build_all_ones_cst (TREE_TYPE ((*ops)[idx]->op));
> +	    }
> +	  (*ops)[idx]->rank = 0;
> +	}
> +      sum_vec = gimple_get_lhs (sum);
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +	{
> +	  fprintf (dump_file, "Generating addition -> ");
> +	  print_gimple_stmt (dump_file, sum, 0);
> +	}
> +    }
> +
> +  /* Referring to any VECTOR of good shape (here using ref_vec), generate
> +     the BIT_FIELD_REF statements accordingly.  */
> +  info = *(v_info_map.get (ref_vec));
> +  gcc_assert (sum);
> +  FOR_EACH_VEC_ELT (info->ops_indexes, i, idx)
> +    {
> +      tree dst = make_ssa_name (elem_type);
> +      gimple *gs
> +	= gimple_build_assign (dst, BIT_FIELD_REF,
> +			       build3 (BIT_FIELD_REF, elem_type, sum_vec,
> +				       TYPE_SIZE (elem_type),
> +				       bitsize_int (info->offsets[i])));
> +      insert_stmt_after (gs, sum);
> +      update_stmt (gs);

The update_stmt is redundant.

> +      gimple *def = SSA_NAME_DEF_STMT ((*ops)[idx]->op);
> +      gimple_set_visited (def, true);

Same here - for DCE?

Otherwise this looks OK to me.

> +      (*ops)[idx]->op = gimple_assign_lhs (gs);
> +      (*ops)[idx]->rank = get_rank ((*ops)[idx]->op);
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +	{
> +	  fprintf (dump_file, "Generating bit_field_ref -> ");
> +	  print_gimple_stmt (dump_file, gs, 0);
> +	}
> +    }
> +
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +    {
> +      fprintf (dump_file, "undistributing bit_field_ref for vector done.\n");
> +    }
> +
> +  cleanup_vinfo_map (v_info_map);
> +
> +  return true;
> +}
> +
>  /* If OPCODE is BIT_IOR_EXPR or BIT_AND_EXPR and CURR is a comparison
>     expression, examine the other OPS to see if any of them are comparisons
>     of the same values, which we may be able to combine or eliminate.
> @@ -5880,11 +6169,6 @@ reassociate_bb (basic_block bb)
>  	  tree lhs, rhs1, rhs2;
>  	  enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
>
> -	  /* If this is not a gimple binary expression, there is
> -	     nothing for us to do with it.  */
> -	  if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS)
> -	    continue;
> -
>  	  /* If this was part of an already processed statement,
>  	     we don't need to touch it again.  */
>  	  if (gimple_visited_p (stmt))
> @@ -5911,6 +6195,11 @@ reassociate_bb (basic_block bb)
>  	      continue;
>  	    }
>
> +	  /* If this is not a gimple binary expression, there is
> +	     nothing for us to do with it.  */
> +	  if (get_gimple_rhs_class (rhs_code) != GIMPLE_BINARY_RHS)
> +	    continue;
> +
>  	  lhs = gimple_assign_lhs (stmt);
>  	  rhs1 = gimple_assign_rhs1 (stmt);
>  	  rhs2 = gimple_assign_rhs2 (stmt);
> @@ -5950,6 +6239,13 @@ reassociate_bb (basic_block bb)
>  	      optimize_ops_list (rhs_code, &ops);
>  	    }
>
> +	  if (undistribute_bitref_for_vector (rhs_code, &ops,
> +					      loop_containing_stmt (stmt)))
> +	    {
> +	      ops.qsort (sort_by_operand_rank);
> +	      optimize_ops_list (rhs_code, &ops);
> +	    }
> +
>  	  if (rhs_code == PLUS_EXPR
>  	      && transform_add_to_multiply (&ops))
>  	    ops.qsort (sort_by_operand_rank);
> --

--
Richard Biener
SUSE Linux GmbH, Maxfeldstrasse 5, 90409 Nuernberg, Germany;
GF: Felix Imendörffer, Mary Higgins, Sri Rasiah; HRB 21284 (AG Nürnberg)