diff --git a/gcc/genopinit.c b/gcc/genopinit.c
index 195ddf74fa2b7d89760622073dcec9d5d339a097..2bc7cdbf53337beae181afd7bb05b366ab068c6a 100644
--- a/gcc/genopinit.c
+++ b/gcc/genopinit.c
@@ -321,6 +321,7 @@ main (int argc, const char **argv)
            " bool supports_vec_scatter_store_cached;\n"
            "};\n"
            "extern void init_all_optabs (struct target_optabs *);\n"
+           "extern bool partial_vectors_supported_p (void);\n"
            "\n"
            "extern struct target_optabs default_target_optabs;\n"
            "extern struct target_optabs *this_fn_optabs;\n"
@@ -373,6 +374,33 @@ main (int argc, const char **argv)
     fprintf (s_file, " ena[%u] = HAVE_%s;\n", i, p->name);
   fprintf (s_file, "}\n\n");
 
+  fprintf (s_file,
+           "/* Returns TRUE if the target supports any of the partial vector\n"
+           " optabs: while_ult_optab, len_load_optab or len_store_optab,\n"
+           " for any mode. */\n"
+           "bool\npartial_vectors_supported_p (void)\n{\n");
+  fprintf (s_file, "\treturn");
+  /* Prefix match of the current pattern's name against optab name N.  */
+#define CMP_NAME(N) !strncmp (p->name, (N), strlen ((N)))
+  bool any_match = false;
+  for (i = 0; patterns.iterate (i, &p); ++i)
+    if (CMP_NAME ("while_ult") || CMP_NAME ("len_load")
+        || CMP_NAME ("len_store"))
+      {
+        /* Chain every matching HAVE_* macro into a single disjunction.  */
+        if (any_match)
+          fprintf (s_file, " || HAVE_%s", p->name);
+        else
+          fprintf (s_file, " HAVE_%s", p->name);
+        any_match = true;
+      }
+#undef CMP_NAME
+  /* Without any match the emitted return still needs an operand.  */
+  if (!any_match)
+    fprintf (s_file, " false");
+  fprintf (s_file, ";\n}\n");
+
+
   /* Perform a binary search on a pre-encoded optab+mode*2.  */
   /* ??? Perhaps even better to generate a minimal perfect hash.
      Using gperf directly is awkward since it's so geared to working
diff --git a/gcc/testsuite/gcc.target/aarch64/masked_epilogue.c b/gcc/testsuite/gcc.target/aarch64/masked_epilogue.c
new file mode 100644
index 0000000000000000000000000000000000000000..286a7be236f337fee4c4650f42da72000855c5e6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/masked_epilogue.c
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fdump-tree-vect-details -march=armv8-a+sve -msve-vector-bits=scalable" } */
+
+void f(unsigned char y[restrict],
+       unsigned char x[restrict], int n) {
+  for (int i = 0; i < n; ++i)
+    y[i] = (y[i] + x[i] + 1) >> 1;
+}
+
+/* { dg-final { scan-tree-dump {LOOP EPILOGUE VECTORIZED \(MODE=VNx} "vect" } } */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index a28bb6321d76b8222bc8cfdade151ca9b4dca406..5af98a36678ae61e99f93beb90920e2d0940c53a 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -2824,11 +2824,13 @@ vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
     {
       unsigned HOST_WIDE_INT main_vf_max
         = estimated_poly_value (main_poly_vf, POLY_VALUE_MAX);
+      unsigned HOST_WIDE_INT old_vf_max
+        = estimated_poly_value (old_vf, POLY_VALUE_MAX);
+      unsigned HOST_WIDE_INT new_vf_max
+        = estimated_poly_value (new_vf, POLY_VALUE_MAX);
 
-      old_factor = main_vf_max / estimated_poly_value (old_vf,
-                                                       POLY_VALUE_MAX);
-      new_factor = main_vf_max / estimated_poly_value (new_vf,
-                                                       POLY_VALUE_MAX);
+      old_factor = CEIL (main_vf_max, old_vf_max);
+      new_factor = CEIL (main_vf_max, new_vf_max);
 
       /* If the loop is not using partial vectors then it will iterate one
          time less than one that does.  It is safe to subtract one here,
@@ -3069,8 +3071,6 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
   machine_mode autodetected_vector_mode = VOIDmode;
   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
   unsigned int mode_i = 0;
-  unsigned int first_loop_i = 0;
-  unsigned int first_loop_next_i = 0;
   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
 
   /* First determine the main loop vectorization mode, either the first
@@ -3079,7 +3079,6 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
      lowest cost if pick_lowest_cost_p.  */
   while (1)
     {
-      unsigned int loop_vinfo_i = mode_i;
       bool fatal;
       opt_loop_vec_info loop_vinfo
         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
@@ -3108,11 +3107,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
               first_loop_vinfo = opt_loop_vec_info::success (NULL);
             }
           if (first_loop_vinfo == NULL)
-            {
-              first_loop_vinfo = loop_vinfo;
-              first_loop_i = loop_vinfo_i;
-              first_loop_next_i = mode_i;
-            }
+            first_loop_vinfo = loop_vinfo;
           else
             {
               delete loop_vinfo;
@@ -3158,32 +3153,42 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
   /* Now analyze first_loop_vinfo for epilogue vectorization.  */
   poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
 
-  /* Handle the case that the original loop can use partial
-     vectorization, but want to only adopt it for the epilogue.
-     The retry should be in the same mode as original.  */
-  if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo))
+  /* For epilogues restart the analysis from the beginning of VECTOR_MODES
+     (index 1 onwards) rather than from the mode after the main loop's.  The
+     motivation comes from cases where the VECTOR_MODES array may contain
+     length-agnostic and length-specific modes.  Their ordering is not
+     guaranteed, so we could end up picking a mode for the main loop that is
+     after the epilogue's optimal mode.  */
+  mode_i = 1;
+  /* The loop below indexes VECTOR_MODES before it checks MODE_I against the
+     length, so bail out here if there is no mode at index 1.  */
+  if (mode_i >= vector_modes.length ())
+    return first_loop_vinfo;
+  bool supports_partial_vectors = partial_vectors_supported_p ();
+  poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
+
+  while (1)
     {
-      gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (first_loop_vinfo)
-                  && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (first_loop_vinfo));
+      /* If the target does not support partial vectors we can shorten the
+         number of modes to analyze for the epilogue as we know we can't pick a
+         mode that has at least as many NUNITS as the main loop's vectorization
+         factor, since that would imply the epilogue's vectorization factor
+         would be at least as high as the main loop's and we would be
+         vectorizing for more scalar iterations than there would be left.  */
+      if (!supports_partial_vectors
+          && maybe_ge (GET_MODE_NUNITS (vector_modes[mode_i]), first_vinfo_vf))
+        {
+          mode_i++;
+          if (mode_i == vector_modes.length ())
+            break;
+          continue;
+        }
+
       if (dump_enabled_p ())
         dump_printf_loc (MSG_NOTE, vect_location,
-                         "***** Re-trying analysis with same vector mode"
-                         " %s for epilogue with partial vectors.\n",
-                         GET_MODE_NAME (first_loop_vinfo->vector_mode));
-      mode_i = first_loop_i;
-    }
-  else
-    {
-      mode_i = first_loop_next_i;
-      if (mode_i == vector_modes.length ())
-        return first_loop_vinfo;
-    }
-
-  /* ??? If first_loop_vinfo was using VOIDmode then we probably
-     want to instead search for the corresponding mode in vector_modes[].  */
+                         "***** Re-trying epilogue analysis with vector "
+                         "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
 
-  while (1)
-    {
       bool fatal;
       opt_loop_vec_info loop_vinfo
         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
@@ -3235,11 +3240,6 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
       if (mode_i == vector_modes.length ())
         break;
 
-      /* Try the next biggest vector size.  */
-      if (dump_enabled_p ())
-        dump_printf_loc (MSG_NOTE, vect_location,
-                         "***** Re-trying epilogue analysis with vector "
-                         "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
     }
 
   if (!first_loop_vinfo->epilogue_vinfos.is_empty ())