diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c
index d689e21dc11..3df2431be38 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_2.c
@@ -32,4 +32,4 @@ TEST_ALL (DEF_LOOP)
 /* { dg-final { scan-assembler-not {\tmov\tz} } } */
 /* { dg-final { scan-assembler-not {\tmovprfx\t} } } */
 /* Currently we canonicalize the ?: so that !b[i] is the "false" value.  */
-/* { dg-final { scan-assembler-not {\tsel\t} { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-not {\tsel\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c
index dcc30768f88..86064ebfcba 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_1.c
@@ -11,7 +11,10 @@
 	   INT_TYPE *__restrict pred, int n)			\
   {								\
     for (int i = 0; i < n; ++i)					\
-      r[i] = pred[i] ? (FLOAT_TYPE) a[i] : b[i];		\
+      {								\
+	FLOAT_TYPE bi = b[i];					\
+	r[i] = pred[i] ? (FLOAT_TYPE) a[i] : bi;		\
+      }								\
   }
 
 #define TEST_ALL(T) \
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c
index 7e5f2a73ed9..e3a947b2698 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_convert_4.c
@@ -11,7 +11,10 @@
 	   INT_TYPE *__restrict pred, int n)			\
   {								\
     for (int i = 0; i < n; ++i)					\
-      r[i] = pred[i] ? (INT_TYPE) a[i] : b[i];			\
+      {								\
+	INT_TYPE bi = b[i];					\
+	r[i] = pred[i] ? (INT_TYPE) a[i] : bi;			\
+      }								\
   }
 
 #define TEST_ALL(T) \
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c
index 991ccf016d1..97d1b8f5d45 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_unary_2.c
@@ -13,7 +13,10 @@
 	   TYPE *__restrict pred, int n)			\
   {								\
     for (int i = 0; i < n; ++i)					\
-      r[i] = pred[i] ? OP (a[i]) : b[i];			\
+      {								\
+	TYPE bi = b[i];						\
+	r[i] = pred[i] ? OP (a[i]) : bi;			\
+      }								\
   }
 
 #define TEST_INT_TYPE(T, TYPE) \
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c b/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c
index 5c04bcdb3f5..a1b0667dab5 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fmla_2.c
@@ -15,5 +15,9 @@ f (double *restrict a, double *restrict b, double *restrict c,
     }
 }
 
-/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 } } */
+/* See https://gcc.gnu.org/ml/gcc-patches/2019-08/msg01644.html
+   for XFAILing the below test.  */
+
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 2 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfmla\tz[0-9]+\.d, p[0-7]/m, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
 /* { dg-final { scan-assembler-not {\tfmad\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c
index 00d84760a19..b38f23e87ba 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_4.c
@@ -98,24 +98,24 @@ TEST_CMP (nugt)
 /* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
 
 /* 5 for lt, 5 for ult and 5 for nult.  */
-/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */
 
 /* 5 for le, 5 for ule and 5 for nule.  */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */
 
 /* 5 for gt, 5 for ugt and 5 for nugt.  */
-/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */
 
 /* 5 for ge, 5 for uge and 5 for nuge.  */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 30 } } */
 
 /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0\n} } } */
 /* 3 loops * 5 invocations for all 12 unordered comparisons.  */
-/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 180 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s\n} 180 } } */
 
 /* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 7 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times {\tfcmeq\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 14 { xfail *-*-* } } } */
@@ -123,19 +123,19 @@ TEST_CMP (nugt)
 /* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times {\tfcmne\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
 
-/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */
 
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */
 
-/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */
 
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 42 } } */
 
 /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0\n} } } */
 /* 3 loops * 5 invocations, with 2 invocations having ncopies == 2,
    for all 12 unordered comparisons.  */
-/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 252 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d\n} 252 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c b/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c
index 23bfb7b2649..2f16fbff522 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/vcond_5.c
@@ -19,16 +19,16 @@
 /* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 40 { xfail *-*-* } } } */
 
 /* 5 for le, 5 for ule and 5 for nule.  */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 } } */
 
 /* 5 for gt, 5 for ugt, 5 for nueq and 5 for nugt.  */
 /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 20 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 40 { xfail *-*-* } } } */
 
 /* 5 for ge, 5 for uge and 5 for nuge.  */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} 15 } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, z[0-9]+\.s} 30 } } */
 
 /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.s, p[0-7]/z, z[0-9]+\.s, #0\.0} } } */
 /* 3 loops * 5 invocations for ordered, unordered amd ueq.  */
@@ -43,14 +43,14 @@
 /* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 28 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times {\tfcmlt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 56 { xfail *-*-* } } } */
 
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmle\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 } } */
 
 /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 28 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 56 { xfail *-*-* } } } */
 
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} 21 } } */
+/* { dg-final { scan-assembler-times {\tfcmge\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, z[0-9]+\.d} 42 } } */
 
 /* { dg-final { scan-assembler-not {\tfcmuo\tp[0-9]+\.d, p[0-7]/z, z[0-9]+\.d, #0\.0} } } */
 /* 3 loops * 5 invocations, with 2 invocations having ncopies == 2,
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 3db4a5cdf78..da952645759 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -6603,7 +6603,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	}
       else
 	vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
-			       vectype_in);
+			       vectype_in, NULL);
     }
   if (dump_enabled_p ()
       && reduction_type == FOLD_LEFT_REDUCTION)
@@ -8005,7 +8005,7 @@ vectorizable_live_operation (stmt_vec_info stmt_info,
 	  gcc_assert (ncopies == 1 && !slp_node);
 	  vect_record_loop_mask (loop_vinfo,
 				 &LOOP_VINFO_MASKS (loop_vinfo),
-				 1, vectype);
+				 1, vectype, NULL);
 	}
     }
   return true;
@@ -8204,11 +8204,12 @@ vect_double_mask_nunits (tree type)
 
 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
    contain a sequence of NVECTORS masks that each control a vector of type
-   VECTYPE.  */
+   VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
+   these vector masks with the vector version of SCALAR_MASK.  */
 
 void
 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
-		       unsigned int nvectors, tree vectype)
+		       unsigned int nvectors, tree vectype, tree scalar_mask)
 {
   gcc_assert (nvectors != 0);
   if (masks->length () < nvectors)
@@ -8219,6 +8220,13 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
       unsigned int nscalars_per_iter
 	= exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
 		     LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
+
+      if (scalar_mask)
+	{
+	  scalar_cond_masked_key cond (scalar_mask, nvectors);
+	  loop_vinfo->scalar_cond_masked_set.add (cond);
+	}
+
       if (rgm->max_nscalars_per_iter < nscalars_per_iter)
 	{
 	  rgm->max_nscalars_per_iter = nscalars_per_iter;
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index cac7410387b..4db8e24ccd1 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1879,7 +1879,8 @@ static tree permute_vec_elements (tree, tree, tree, stmt_vec_info,
    says how the load or store is going to be implemented and GROUP_SIZE
    is the number of load or store statements in the containing group.
    If the access is a gather load or scatter store, GS_INFO describes
-   its arguments.
+   its arguments.  If the load or store is conditional, SCALAR_MASK is the
+   condition under which it occurs.
 
    Clear LOOP_VINFO_CAN_FULLY_MASK_P if a fully-masked loop is not supported,
    otherwise record the required mask types.  */
@@ -1888,7 +1889,7 @@ static void
 check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
 			  vec_load_store_type vls_type, int group_size,
 			  vect_memory_access_type memory_access_type,
-			  gather_scatter_info *gs_info)
+			  gather_scatter_info *gs_info, tree scalar_mask)
 {
   /* Invariant loads need no special support.  */
   if (memory_access_type == VMAT_INVARIANT)
@@ -1912,7 +1913,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
 	  return;
 	}
       unsigned int ncopies = vect_get_num_copies (loop_vinfo, vectype);
-      vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype);
+      vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask);
       return;
     }
 
@@ -1936,7 +1937,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
 	  return;
 	}
       unsigned int ncopies = vect_get_num_copies (loop_vinfo, vectype);
-      vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype);
+      vect_record_loop_mask (loop_vinfo, masks, ncopies, vectype, scalar_mask);
       return;
     }
 
@@ -1974,7 +1975,7 @@ check_load_store_masking (loop_vec_info loop_vinfo, tree vectype,
   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   unsigned int nvectors;
   if (can_div_away_from_zero_p (group_size * vf, nunits, &nvectors))
-    vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype);
+    vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype, scalar_mask);
   else
     gcc_unreachable ();
 }
@@ -3436,7 +3437,9 @@ vectorizable_call (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	  unsigned int nvectors = (slp_node
 				   ? SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)
 				   : ncopies);
-	  vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_out);
+	  tree scalar_mask = gimple_call_arg (stmt_info->stmt, mask_opno);
+	  vect_record_loop_mask (loop_vinfo, masks, nvectors,
+				 vectype_out, scalar_mask);
 	}
       return true;
     }
@@ -7390,7 +7393,7 @@ vectorizable_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
       if (loop_vinfo
 	  && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
 	check_load_store_masking (loop_vinfo, vectype, vls_type, group_size,
-				  memory_access_type, &gs_info);
+				  memory_access_type, &gs_info, mask);
 
       STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
       vect_model_store_cost (stmt_info, ncopies, rhs_dt, memory_access_type,
@@ -8637,7 +8640,7 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
       if (loop_vinfo
 	  && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
 	check_load_store_masking (loop_vinfo, vectype, VLS_LOAD, group_size,
-				  memory_access_type, &gs_info);
+				  memory_access_type, &gs_info, mask);
 
       STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
       vect_model_load_cost (stmt_info, ncopies, memory_access_type,
@@ -9774,6 +9777,10 @@ vect_is_simple_cond (tree cond, vec_info *vinfo,
    When STMT_INFO is vectorized as a nested cycle, for_reduction is true.
 
+   For COND_EXPR<C, T, E>, if T comes from a masked load and is conditional
+   on C, we apply the loop mask, if present, to the result of the vector
+   comparison.  Similarly for E if it is conditional on !C.
+
    Return true if STMT_INFO is vectorizable in this way.  */
 
 bool
@@ -9999,6 +10006,35 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
   /* Handle cond expr.  */
   for (j = 0; j < ncopies; j++)
     {
+      tree loop_mask = NULL_TREE;
+      bool swap_cond_operands = false;
+
+      /* Look up whether there is a loop mask associated with the
+	 scalar cond, or its inverse.  */
+
+      if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+	{
+	  scalar_cond_masked_key cond (cond_expr, ncopies);
+	  if (loop_vinfo->scalar_cond_masked_set.contains (cond))
+	    {
+	      vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+	      loop_mask = vect_get_loop_mask (gsi, masks, ncopies, vectype, j);
+	    }
+	  else
+	    {
+	      bool honor_nans = HONOR_NANS (TREE_TYPE (cond.op0));
+	      cond.code = invert_tree_comparison (cond.code, honor_nans);
+	      if (loop_vinfo->scalar_cond_masked_set.contains (cond))
+		{
+		  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+		  loop_mask = vect_get_loop_mask (gsi, masks, ncopies,
+						  vectype, j);
+		  cond_code = cond.code;
+		  swap_cond_operands = true;
+		}
+	    }
+	}
+
       stmt_vec_info new_stmt_info = NULL;
       if (j == 0)
 	{
@@ -10076,6 +10112,9 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 	  vec_then_clause = vec_oprnds2[i];
 	  vec_else_clause = vec_oprnds3[i];
 
+	  if (swap_cond_operands)
+	    std::swap (vec_then_clause, vec_else_clause);
+
 	  if (masked)
 	    vec_compare = vec_cond_lhs;
 	  else
@@ -10114,6 +10153,47 @@ vectorizable_condition (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 		    }
 		}
 	    }
+
+	  /* If a loop mask is present, AND it with the result of the
+	     vector comparison, so that later passes (fre4) can reuse
+	     the same condition that was used in the masked load.
+
+	     For example:
+	       for (int i = 0; i < 100; ++i)
+		 x[i] = y[i] ? z[i] : 10;
+
+	     results in the following optimized GIMPLE:
+
+	       mask__35.8_43 = vect__4.7_41 != { 0, ... };
+	       vec_mask_and_46 = loop_mask_40 & mask__35.8_43;
+	       _19 = &MEM[base: z_12(D), index: ivtmp_56, step: 4, offset: 0B];
+	       vect_iftmp.11_47 = .MASK_LOAD (_19, 4B, vec_mask_and_46);
+	       vect_iftmp.12_52 = VEC_COND_EXPR <vec_mask_and_46,
+						 vect_iftmp.11_47, { 10, ... }>;
+
+	     instead of recomputing vec != { 0, ... } in the VEC_COND_EXPR.  */
+
+	  if (loop_mask)
+	    {
+	      if (COMPARISON_CLASS_P (vec_compare))
+		{
+		  tree tmp = make_ssa_name (vec_cmp_type);
+		  tree op0 = TREE_OPERAND (vec_compare, 0);
+		  tree op1 = TREE_OPERAND (vec_compare, 1);
+		  gassign *g = gimple_build_assign (tmp,
+						    TREE_CODE (vec_compare),
+						    op0, op1);
+		  vect_finish_stmt_generation (stmt_info, g, gsi);
+		  vec_compare = tmp;
+		}
+
+	      tree tmp2 = make_ssa_name (vec_cmp_type);
+	      gassign *g = gimple_build_assign (tmp2, BIT_AND_EXPR,
+						vec_compare, loop_mask);
+	      vect_finish_stmt_generation (stmt_info, g, gsi);
+	      vec_compare = tmp2;
+	    }
+
 	  if (reduction_type == EXTRACT_LAST_REDUCTION)
 	    {
 	      if (!is_gimple_val (vec_compare))
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 800c99fea26..20945a39c84 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -1516,3 +1516,36 @@ make_pass_ipa_increase_alignment (gcc::context *ctxt)
 {
   return new pass_ipa_increase_alignment (ctxt);
 }
+
+/* If code(T) is a comparison, or T is the result of a comparison stmt,
+   extract its operands.
+   Else return <NE_EXPR, T, 0>.  */
+
+void
+scalar_cond_masked_key::get_cond_ops_from_tree (tree t)
+{
+  if (TREE_CODE_CLASS (TREE_CODE (t)) == tcc_comparison)
+    {
+      this->code = TREE_CODE (t);
+      this->op0 = TREE_OPERAND (t, 0);
+      this->op1 = TREE_OPERAND (t, 1);
+      return;
+    }
+
+  if (TREE_CODE (t) == SSA_NAME)
+    if (gassign *stmt = dyn_cast <gassign *> (SSA_NAME_DEF_STMT (t)))
+      {
+	tree_code code = gimple_assign_rhs_code (stmt);
+	if (TREE_CODE_CLASS (code) == tcc_comparison)
+	  {
+	    this->code = code;
+	    this->op0 = gimple_assign_rhs1 (stmt);
+	    this->op1 = gimple_assign_rhs2 (stmt);
+	    return;
+	  }
+      }
+
+  this->code = NE_EXPR;
+  this->op0 = t;
+  this->op1 = build_zero_cst (TREE_TYPE (t));
+}
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 837fb5ab525..632f12a30dc 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -26,6 +26,7 @@ typedef class _stmt_vec_info *stmt_vec_info;
 #include "tree-data-ref.h"
 #include "tree-hash-traits.h"
 #include "target.h"
+#include "hash-set.h"
 
 /* Used for naming of new temporaries.  */
 enum vect_var_kind {
@@ -177,7 +178,78 @@ public:
 #define SLP_TREE_TWO_OPERATORS(S)	 (S)->two_operators
 #define SLP_TREE_DEF_TYPE(S)		 (S)->def_type
 
+/* Key for the set that records the association between scalar conditions
+   used in masked loads and the corresponding loop mask.  The set is
+   populated by vect_record_loop_mask.  vectorizable_condition uses it
+   to check whether a scalar cond (or its inverse) has a loop mask
+   associated with it and, if so, applies the loop mask to the result
+   of the vector comparison.  */
+
+struct scalar_cond_masked_key
+{
+  scalar_cond_masked_key (tree t, unsigned ncopies_)
+    : ncopies (ncopies_)
+  {
+    get_cond_ops_from_tree (t);
+  }
+
+  void get_cond_ops_from_tree (tree);
+
+  unsigned ncopies;
+  tree_code code;
+  tree op0;
+  tree op1;
+};
+
+template<>
+struct default_hash_traits<scalar_cond_masked_key>
+{
+  typedef scalar_cond_masked_key compare_type;
+  typedef scalar_cond_masked_key value_type;
+
+  static inline hashval_t
+  hash (value_type v)
+  {
+    inchash::hash h;
+    h.add_int (v.code);
+    inchash::add_expr (v.op0, h, 0);
+    inchash::add_expr (v.op1, h, 0);
+    h.add_int (v.ncopies);
+    return h.end ();
+  }
+
+  static inline bool
+  equal (value_type existing, value_type candidate)
+  {
+    return (existing.ncopies == candidate.ncopies
+	    && existing.code == candidate.code
+	    && operand_equal_p (existing.op0, candidate.op0, 0)
+	    && operand_equal_p (existing.op1, candidate.op1, 0));
+  }
+
+  static inline void
+  mark_empty (value_type &v)
+  {
+    v.ncopies = 0;
+  }
+
+  static inline bool
+  is_empty (value_type v)
+  {
+    return v.ncopies == 0;
+  }
+
+  static inline void mark_deleted (value_type &) {}
+
+  static inline bool is_deleted (const value_type &)
+  {
+    return false;
+  }
+
+  static inline void remove (value_type &) {}
+};
+
+typedef hash_set<scalar_cond_masked_key> scalar_cond_masked_set_type;
 
 /* Describes two objects whose addresses must be unequal for the vectorized
    loop to be valid.  */
@@ -258,6 +330,9 @@ public:
   /* Cost data used by the target cost model.  */
   void *target_cost_data;
 
+  /* Set of scalar conditions that have loop mask applied.  */
+  scalar_cond_masked_set_type scalar_cond_masked_set;
+
 private:
   stmt_vec_info new_stmt_vec_info (gimple *stmt);
   void set_vinfo_for_stmt (gimple *, stmt_vec_info);
@@ -1641,7 +1716,7 @@ extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *,
 extern tree vect_halve_mask_nunits (tree);
 extern tree vect_double_mask_nunits (tree);
 extern void vect_record_loop_mask (loop_vec_info, vec_loop_masks *,
-				   unsigned int, tree);
+				   unsigned int, tree, tree);
 extern tree vect_get_loop_mask (gimple_stmt_iterator *, vec_loop_masks *,
 				unsigned int, tree, unsigned int);