PR111754: Rework encoding of result for VEC_PERM_EXPR with constant input vectors.

gcc/ChangeLog:
        PR middle-end/111754
        * fold-const.cc (fold_vec_perm_cst): Set the result's encoding to
        sel's encoding, and set res_nelts_per_pattern to 2 if sel contains
        a stepped sequence but the input vectors do not.
        (test_nunits_min_2): New test Case 8.
        (test_nunits_min_4): New tests Case 8 and Case 9.

gcc/testsuite/ChangeLog:
        PR middle-end/111754
        * gcc.target/aarch64/sve/slp_3.c: Adjust code-gen.
        * gcc.target/aarch64/sve/slp_4.c: Likewise.
        * gcc.dg/vect/pr111754.c: New test.

Co-authored-by: Richard Sandiford
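For illustration, the new encoding rule can be written as a small standalone
program (the struct and helper below are hypothetical stand-ins for GCC's
VECTOR_CST/vec_perm_indices encoding accessors, not code from this patch):

#include <stdio.h>

/* (npatterns, nelts_per_pattern) in the VECTOR_CST sense:
   nelts_per_pattern 1 = dup, 2 = base + dup, 3 = stepped.  */
struct encoding { unsigned npatterns, nelts_per_pattern; };

/* Mirror of the patched logic: the result takes sel's encoding, except
   that a stepped selector over non-stepped inputs only needs two
   elements per pattern, since after the first two elements each series
   keeps selecting equal values.  */
static struct encoding
result_encoding (struct encoding sel, unsigned arg0_nelts_per_pattern,
                 unsigned arg1_nelts_per_pattern)
{
  struct encoding res = sel;
  if (res.nelts_per_pattern == 3
      && arg0_nelts_per_pattern < 3
      && arg1_nelts_per_pattern < 3)
    res.nelts_per_pattern = 2;
  return res;
}

int main (void)
{
  /* PR111754: sel = { 0, 1, 2, ... } is stepped, encoding (1, 3), but
     both inputs are non-stepped (1, 2) vectors, so the result must be
     encoded as (1, 2) rather than as a stepped sequence.  */
  struct encoding res = result_encoding ((struct encoding) { 1, 3 }, 2, 2);
  printf ("(%u, %u)\n", res.npatterns, res.nelts_per_pattern);
  return 0;
}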
diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
index 332bc8aead2..dff09b81f7b 100644
--- a/gcc/fold-const.cc
+++ b/gcc/fold-const.cc
@@ -10803,27 +10803,38 @@ fold_vec_perm_cst (tree type, tree arg0, tree arg1, const vec_perm_indices &sel,
   unsigned res_npatterns, res_nelts_per_pattern;
   unsigned HOST_WIDE_INT res_nelts;
 
-  /* (1) If SEL is a suitable mask as determined by
-     valid_mask_for_fold_vec_perm_cst_p, then:
-     res_npatterns = max of npatterns between ARG0, ARG1, and SEL
-     res_nelts_per_pattern = max of nelts_per_pattern between
-                             ARG0, ARG1 and SEL.
-     (2) If SEL is not a suitable mask, and TYPE is VLS then:
-     res_npatterns = nelts in result vector.
-     res_nelts_per_pattern = 1.
-     This exception is made so that VLS ARG0, ARG1 and SEL work as before.  */
-  if (valid_mask_for_fold_vec_perm_cst_p (arg0, arg1, sel, reason))
-    {
-      res_npatterns
-        = std::max (VECTOR_CST_NPATTERNS (arg0),
-                    std::max (VECTOR_CST_NPATTERNS (arg1),
-                              sel.encoding ().npatterns ()));
+  /* First try to implement the fold in a VLA-friendly way.
+
+     (1) If the selector is simply a duplication of N elements, the
+         result is likewise a duplication of N elements.
+
+     (2) If the selector is N elements followed by a duplication
+         of N elements, the result is too.
+
+     (3) If the selector is N elements followed by an interleaving
+         of N linear series, the situation is more complex.
+
+         valid_mask_for_fold_vec_perm_cst_p detects whether we
+         can handle this case.  If we can, then each of the N linear
+         series either (a) selects the same element each time or
+         (b) selects a linear series from one of the input patterns.
 
-      res_nelts_per_pattern
-        = std::max (VECTOR_CST_NELTS_PER_PATTERN (arg0),
-                    std::max (VECTOR_CST_NELTS_PER_PATTERN (arg1),
-                              sel.encoding ().nelts_per_pattern ()));
+         If (b) holds for one of the linear series, the result
+         will contain a linear series, and so the result will have
+         the same shape as the selector.  If (a) holds for all of
+         the linear series, the result will be the same as (2) above.
+         (b) can only hold if one of the input patterns has a
+         stepped encoding.  */
+
+  if (valid_mask_for_fold_vec_perm_cst_p (arg0, arg1, sel, reason))
+    {
+      res_npatterns = sel.encoding ().npatterns ();
+      res_nelts_per_pattern = sel.encoding ().nelts_per_pattern ();
+      if (res_nelts_per_pattern == 3
+          && VECTOR_CST_NELTS_PER_PATTERN (arg0) < 3
+          && VECTOR_CST_NELTS_PER_PATTERN (arg1) < 3)
+        res_nelts_per_pattern = 2;
       res_nelts = res_npatterns * res_nelts_per_pattern;
     }
   else if (TYPE_VECTOR_SUBPARTS (type).is_constant (&res_nelts))
@@ -17622,6 +17633,29 @@ test_nunits_min_2 (machine_mode vmode)
           tree expected_res[] = { ARG0(0), ARG1(0), ARG1(1) };
           validate_res (1, 3, res, expected_res);
         }
+
+      /* Case 8: Same as aarch64/sve/slp_3.c:
+         arg0, arg1 are dup vectors.
+         sel = { 0, len, 1, len+1, 2, len+2, ... } // (2, 3)
+         So res = { arg0[0], arg1[0], ... } // (2, 1)
+
+         In this case, since the input vectors are dup, only the first two
+         elements per pattern in sel are considered significant.  */
+      {
+        tree arg0 = build_vec_cst_rand (vmode, 1, 1);
+        tree arg1 = build_vec_cst_rand (vmode, 1, 1);
+        poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+
+        vec_perm_builder builder (len, 2, 3);
+        poly_uint64 mask_elems[] = { 0, len, 1, len + 1, 2, len + 2 };
+        builder_push_elems (builder, mask_elems);
+
+        vec_perm_indices sel (builder, 2, len);
+        tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
+
+        tree expected_res[] = { ARG0(0), ARG1(0) };
+        validate_res (2, 1, res, expected_res);
+      }
     }
 }
 
@@ -17790,6 +17824,44 @@ test_nunits_min_4 (machine_mode vmode)
           ASSERT_TRUE (res == NULL_TREE);
           ASSERT_TRUE (!strcmp (reason, "step is not multiple of npatterns"));
         }
+
+      /* Case 8: PR111754: When the input vector is not a stepped sequence,
+         check that the result is not a stepped sequence either, even
+         if sel has a stepped sequence.  */
+      {
+        tree arg0 = build_vec_cst_rand (vmode, 1, 2);
+        poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+
+        vec_perm_builder builder (len, 1, 3);
+        poly_uint64 mask_elems[] = { 0, 1, 2 };
+        builder_push_elems (builder, mask_elems);
+
+        vec_perm_indices sel (builder, 1, len);
+        tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg0, sel);
+
+        tree expected_res[] = { ARG0(0), ARG0(1) };
+        validate_res (sel.encoding ().npatterns (), 2, res, expected_res);
+      }
+
+      /* Case 9: If sel doesn't contain a stepped sequence,
+         check that the result has the same encoding as sel, irrespective
+         of the shape of the input vectors.  */
+      {
+        tree arg0 = build_vec_cst_rand (vmode, 1, 3, 1);
+        tree arg1 = build_vec_cst_rand (vmode, 1, 3, 1);
+        poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
+
+        vec_perm_builder builder (len, 1, 2);
+        poly_uint64 mask_elems[] = { 0, len };
+        builder_push_elems (builder, mask_elems);
+
+        vec_perm_indices sel (builder, 2, len);
+        tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
+
+        tree expected_res[] = { ARG0(0), ARG1(0) };
+        validate_res (sel.encoding ().npatterns (),
+                      sel.encoding ().nelts_per_pattern (), res, expected_res);
+      }
     }
 }
 
diff --git a/gcc/testsuite/gcc.dg/vect/pr111754.c b/gcc/testsuite/gcc.dg/vect/pr111754.c
new file mode 100644
index 00000000000..7c1c16875c7
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr111754.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+typedef float __attribute__((__vector_size__ (16))) F;
+
+F foo (F a, F b)
+{
+  F v = (F) { 9 };
+  return __builtin_shufflevector (v, v, 1, 0, 1, 2);
+}
+
+/* { dg-final { scan-tree-dump-not "VEC_PERM_EXPR" "optimized" } } */
+/* { dg-final { scan-tree-dump "return \{ 0.0, 9.0e\\+0, 0.0, 0.0 \}" "optimized" } } */
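For reference, here is the semantics the new test pins down, written as a
runnable example (a standalone illustration, not part of the testsuite):

#include <stdio.h>

typedef float __attribute__((__vector_size__ (16))) F;

int main (void)
{
  F v = (F) { 9 };  /* Partial initializer: v = { 9, 0, 0, 0 }.  */
  /* Element by element, result[i] = v[sel[i]] with sel = { 1, 0, 1, 2 },
     so the folded constant must be { 0, 9, 0, 0 }; the stepped selector
     must not leak a stepped sequence into the result.  */
  F r = __builtin_shufflevector (v, v, 1, 0, 1, 2);
  for (int i = 0; i < 4; ++i)
    printf ("%g ", r[i]);  /* Prints: 0 9 0 0 */
  printf ("\n");
  return 0;
}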
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_3.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_3.c
index 82dd43a4d98..775c1e1d530 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_3.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_3.c
@@ -33,21 +33,14 @@ TEST_ALL (VEC_PERM)
 /* 1 for each 8-bit type.  */
 /* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 2 } } */
-/* 1 for each 16-bit type plus 1 for double.  */
-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 4 } } */
+/* 1 for each 16-bit type.  */
+/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 3 } } */
 /* 1 for each 32-bit type.  */
 /* { dg-final { scan-assembler-times {\tld1rqw\tz[0-9]+\.s, } 3 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #41\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #25\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #31\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #62\n} 2 } } */
-/* 3 for double.  */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, x[0-9]+\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1rqd\tz[0-9]+\.d, } 6 } } */
 
 /* The 64-bit types need:
-
-   ZIP1 ZIP1 (2 ZIP2s optimized away)
    ZIP1 ZIP2.  */
-/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
 /* { dg-final { scan-assembler-times {\tzip2\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
 
 /* The loop should be fully-masked.  The 64-bit types need two loads
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_4.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_4.c
index b1fa5e3cf68..5a9fc8ff750 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/slp_4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_4.c
@@ -35,31 +35,20 @@ vec_slp_##TYPE (TYPE *restrict a, int n)			\
 
 TEST_ALL (VEC_PERM)
 
-/* 1 for each 8-bit type, 4 for each 32-bit type and 4 for double.  */
-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 18 } } */
+/* 1 for each 8-bit type.  */
+/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 2 } } */
 /* 1 for each 16-bit type.  */
 /* { dg-final { scan-assembler-times {\tld1rqh\tz[0-9]+\.h, } 3 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #99\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #11\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #17\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #80\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #63\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #37\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #24\n} 2 } } */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #81\n} 2 } } */
-/* 4 for double.  */
-/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, x[0-9]+\n} 4 } } */
+/* { dg-final { scan-assembler-times {\tld1rqd\tz[0-9]+\.d, } 18 } } */
 
 /* The 32-bit types need:
-   ZIP1 ZIP1 (2 ZIP2s optimized away)
    ZIP1 ZIP2
 
    and the 64-bit types need:
-   ZIP1 ZIP1 ZIP1 ZIP1 (4 ZIP2s optimized away)
    ZIP1 ZIP2 ZIP1 ZIP2
    ZIP1 ZIP2 ZIP1 ZIP2.  */
-/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 33 } } */
+/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 15 } } */
 /* { dg-final { scan-assembler-times {\tzip2\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 15 } } */
 
 /* The loop should be fully-masked.  The 32-bit types need two loads
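As an aside, the scan adjustments above reflect the code-gen this enables:
judging by the removed and added patterns, the constant vectors are now
materialized with replicating quadword loads (LD1RQ) combined by ZIP1/ZIP2
instead of per-element MOV immediates zipped together.  Below is a reduced
sketch of the kind of kernel slp_3.c exercises (the real test instantiates
a TEST_ALL macro over many element types; the function name here is
hypothetical, but the constants match the scan patterns removed above):

#include <stdint.h>

/* An SLP group of four constant additions per iteration.  Vectorized
   for SVE, the constants form the repeating vector { 41, 25, 31, 62, ... },
   whose construction is what the ld1rq/zip scan counts measure.  */
void
vec_slp_int64 (int64_t *restrict a, int n)
{
  for (int i = 0; i < n; ++i)
    {
      a[i * 4] += 41;
      a[i * 4 + 1] += 25;
      a[i * 4 + 2] += 31;
      a[i * 4 + 3] += 62;
    }
}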