diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 5da3d14dc357ba01351bca961af4f100a89665e1..d66626748e5edc46e46edf10e243114a9f74be97 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -15637,11 +15637,16 @@ private:
   unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
                                  unsigned int);
   bool prefer_unrolled_loop () const;
+  unsigned int determine_suggested_unroll_factor ();
 
   /* True if we have performed one-time initialization based on the
      vec_info.  */
   bool m_analyzed_vinfo = false;
 
+  /* This loop uses an average operation that is not supported by SVE, but is
+     supported by Advanced SIMD and SVE2.  */
+  bool m_has_avg = false;
+
   /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
      - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
        SIMD code.
@@ -16642,6 +16647,21 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
          as one iteration of the SVE loop.  */
       if (where == vect_body && m_unrolled_advsimd_niters)
        m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
+
+      /* Detect the use of an averaging operation.  */
+      gimple *stmt = stmt_info->stmt;
+      if (is_gimple_call (stmt)
+         && gimple_call_internal_p (stmt))
+       {
+         switch (gimple_call_internal_fn (stmt))
+           {
+           case IFN_AVG_FLOOR:
+           case IFN_AVG_CEIL:
+             m_has_avg = true;
+           default:
+             break;
+           }
+       }
     }
   return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
 }
@@ -16725,6 +16745,68 @@ adjust_body_cost_sve (const aarch64_vec_op_count *ops,
   return sve_cycles_per_iter;
 }
 
+unsigned int
+aarch64_vector_costs::determine_suggested_unroll_factor ()
+{
+  bool sve = m_vec_flags & VEC_ANY_SVE;
+  /* If we are trying to unroll an Advanced SIMD main loop that contains
+     an averaging operation that we do not support with SVE and we might use a
+     predicated epilogue, we need to be conservative and block unrolling as
+     this might lead to a less optimal loop for the first and only epilogue
+     using the original loop's vectorization factor.
+     TODO: Remove this constraint when we add support for multiple epilogue
+     vectorization.  */
+  if (!sve && !TARGET_SVE2 && m_has_avg)
+    return 1;
+
+  unsigned int max_unroll_factor = 1;
+  for (auto vec_ops : m_ops)
+    {
+      aarch64_simd_vec_issue_info const *vec_issue
+       = vec_ops.simd_issue_info ();
+      if (!vec_issue)
+       return 1;
+      /* Limit the unroll factor to a value adjustable by the user; the
+        default value is 4.  */
+      unsigned int unroll_factor = aarch64_vect_unroll_limit;
+      unsigned int factor
+       = vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
+      unsigned int temp;
+
+      /* Sanity check; this should never happen.  */
+      if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
+       return 1;
+
+      /* Check stores.  */
+      if (vec_ops.stores > 0)
+       {
+         temp = CEIL (factor * vec_issue->stores_per_cycle,
+                      vec_ops.stores);
+         unroll_factor = MIN (unroll_factor, temp);
+       }
+
+      /* Check loads + stores.  */
+      if (vec_ops.loads > 0)
+       {
+         temp = CEIL (factor * vec_issue->loads_stores_per_cycle,
+                      vec_ops.loads + vec_ops.stores);
+         unroll_factor = MIN (unroll_factor, temp);
+       }
+
+      /* Check general ops.  */
+      if (vec_ops.general_ops > 0)
+       {
+         temp = CEIL (factor * vec_issue->general_ops_per_cycle,
+                      vec_ops.general_ops);
+         unroll_factor = MIN (unroll_factor, temp);
+       }
+      max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
+    }
+
+  /* Make sure the unroll factor is a power of 2.  */
+  return 1 << ceil_log2 (max_unroll_factor);
+}
+
 /* BODY_COST is the cost of a vector loop body.  Adjust the cost as necessary
    and return the new cost.  */
 unsigned int
@@ -16861,8 +16943,11 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
   if (loop_vinfo
       && m_vec_flags
       && aarch64_use_new_vector_costs_p ())
-    m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
-                                          m_costs[vect_body]);
+    {
+      m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
+                                            m_costs[vect_body]);
+      m_suggested_unroll_factor = determine_suggested_unroll_factor ();
+    }
 
   /* Apply the heuristic described above m_stp_sequence_cost.  Prefer
      the scalar code in the event of a tie, since there is more chance
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index 98ce9c0ab61709b29bd29f3853f025e3a5a1bef2..92220b26ee2bf9f95c9a387c3155779596ee5ad5 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -292,3 +292,7 @@ Constant memmove size in bytes above which to start using MOPS sequence.
 -param=aarch64-mops-memset-size-threshold=
 Target Joined UInteger Var(aarch64_mops_memset_size_threshold) Init(256) Param
 Constant memset size in bytes from which to start using MOPS sequence.
+
+-param=aarch64-vect-unroll-limit=
+Target Joined UInteger Var(aarch64_vect_unroll_limit) Init(4) Param
+Limit how much the autovectorizer may unroll a loop.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index fb3dec4ab0c37810d4409fa6da6a82f0154b0a3a..8c27d55d1edf8bc0f4df6fed7e1df3523bc9686c 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -15174,6 +15174,12 @@ If this parameter is set to @var{n}, GCC will not use this heuristic
 for loops that are known to execute in fewer than @var{n}
 Advanced SIMD iterations.
 
+@item aarch64-vect-unroll-limit
+The vectorizer will use available tuning information to determine whether it
+would be beneficial to unroll the main vectorized loop and by how much.  This
+parameter sets the upper bound of how much the vectorizer will unroll the main
+loop.  The default value is four.
+
 @end table
 
 @end table
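
For reference while reviewing determine_suggested_unroll_factor, the standalone
C++ sketch below walks through the same per-operation-class arithmetic with
made-up operation counts and issue rates; it is illustrative only and not part
of the patch.  ceil_div and next_pow2 are stand-ins for GCC's CEIL macro and
1 << ceil_log2 (), and every numeric value here is a hypothetical example.

#include <algorithm>
#include <cstdio>

/* Divide and round up, like GCC's CEIL macro.  */
static unsigned ceil_div (unsigned a, unsigned b) { return (a + b - 1) / b; }

/* Round up to the next power of two, like 1 << ceil_log2 (x).  */
static unsigned next_pow2 (unsigned x)
{
  unsigned p = 1;
  while (p < x)
    p <<= 1;
  return p;
}

int main ()
{
  /* Hypothetical per-iteration operation counts for one candidate body.  */
  unsigned stores = 1, loads = 2, general_ops = 4, reduction_latency = 2;
  /* Hypothetical issue rates (operations the core can issue per cycle).  */
  unsigned stores_per_cycle = 2, loads_stores_per_cycle = 3,
	   general_ops_per_cycle = 4;

  unsigned limit = 4;	/* aarch64-vect-unroll-limit default.  */
  unsigned factor = std::max (reduction_latency, 1u);
  unsigned unroll = limit;

  /* Each operation class bounds the unroll factor by how many copies of the
     body its issue rate can absorb per 'factor' cycles.  */
  unroll = std::min (unroll, ceil_div (factor * stores_per_cycle, stores));
  unroll = std::min (unroll, ceil_div (factor * loads_stores_per_cycle,
				       loads + stores));
  unroll = std::min (unroll, ceil_div (factor * general_ops_per_cycle,
				       general_ops));

  printf ("suggested unroll factor: %u\n", next_pow2 (unroll));
  return 0;
}

With these example numbers the load/store and general-op classes are the
binding constraints, so the suggested factor comes out as 2; raising
--param=aarch64-vect-unroll-limit only lifts the cap, it cannot push the
factor past what the issue rates allow.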