commit bf792239150
Author: Stefan Schulze Frielinghaus
Date:   Mon Feb 8 10:35:30 2021 +0100

    ldist: Recognize rawmemchr loop patterns

    This patch adds support for recognizing loops which mimic the behaviour
    of the function rawmemchr, and replaces them with a call to the internal
    function IFN_RAWMEMCHR in case the target provides a corresponding
    optab implementation.  In contrast to the original rawmemchr function,
    this patch also supports variants where the memory pointed to and the
    pattern are interpreted as 8-, 16-, or 32-bit wide elements,
    respectively.  An illustrative example of such a loop is given after
    the diff.

diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c
index dd7173126fb..18e12b863c6 100644
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -2917,6 +2917,31 @@ expand_VEC_CONVERT (internal_fn, gcall *)
   gcc_unreachable ();
 }
 
+void
+expand_RAWMEMCHR (internal_fn, gcall *stmt)
+{
+  expand_operand ops[3];
+
+  tree lhs = gimple_call_lhs (stmt);
+  tree lhs_type = TREE_TYPE (lhs);
+  rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE);
+  create_output_operand (&ops[0], lhs_rtx, TYPE_MODE (lhs_type));
+
+  for (unsigned int i = 0; i < 2; ++i)
+    {
+      tree rhs = gimple_call_arg (stmt, i);
+      tree rhs_type = TREE_TYPE (rhs);
+      rtx rhs_rtx = expand_normal (rhs);
+      create_input_operand (&ops[i + 1], rhs_rtx, TYPE_MODE (rhs_type));
+    }
+
+  insn_code icode = direct_optab_handler (rawmemchr_optab, ops[2].mode);
+
+  expand_insn (icode, 3, ops);
+  if (!rtx_equal_p (lhs_rtx, ops[0].value))
+    emit_move_insn (lhs_rtx, ops[0].value);
+}
+
 /* Expand the IFN_UNIQUE function according to its first argument.  */
 
 static void
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index daeace7a34e..95c76795648 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -348,6 +348,7 @@ DEF_INTERNAL_FN (MUL_OVERFLOW, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL)
 DEF_INTERNAL_FN (TSAN_FUNC_EXIT, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL)
 DEF_INTERNAL_FN (VA_ARG, ECF_NOTHROW | ECF_LEAF, NULL)
 DEF_INTERNAL_FN (VEC_CONVERT, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL)
+DEF_INTERNAL_FN (RAWMEMCHR, ECF_PURE | ECF_LEAF | ECF_NOTHROW, NULL)
 
 /* An unduplicable, uncombinable function.  Generally used to preserve
    a CFG property in the face of jump threading, tail merging or
diff --git a/gcc/optabs.def b/gcc/optabs.def
index b192a9d070b..f7c69f914ce 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -267,6 +267,7 @@ OPTAB_D (cpymem_optab, "cpymem$a")
 OPTAB_D (movmem_optab, "movmem$a")
 OPTAB_D (setmem_optab, "setmem$a")
 OPTAB_D (strlen_optab, "strlen$a")
+OPTAB_D (rawmemchr_optab, "rawmemchr$I$a")
 
 OPTAB_DC(fma_optab, "fma$a4", FMA)
 OPTAB_D (fms_optab, "fms$a4")
diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c
index 7ee19fc8677..09f200da61f 100644
--- a/gcc/tree-loop-distribution.c
+++ b/gcc/tree-loop-distribution.c
@@ -115,6 +115,10 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-vectorizer.h"
 #include "tree-eh.h"
 #include "gimple-fold.h"
+#include "rtl.h"
+#include "memmodel.h"
+#include "insn-codes.h"
+#include "optabs.h"
 
 #define MAX_DATAREFS_NUM \
@@ -218,7 +222,7 @@ enum partition_kind {
      be unnecessary and removed once distributed memset can be understood
      and analyzed in data reference analysis.  See PR82604 for more.  */
   PKIND_PARTIAL_MEMSET,
-  PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
+  PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE, PKIND_RAWMEMCHR
 };
 
 /* Type of distributed loop.  */
@@ -244,6 +248,8 @@ struct builtin_info
      is only used in memset builtin distribution for now.  */
   tree dst_base_base;
   unsigned HOST_WIDE_INT dst_base_offset;
+  /* Pattern is used only in rawmemchr builtin distribution for now.  */
+  tree pattern;
 };
 
 /* Partition for loop distribution.  */
@@ -1232,6 +1238,66 @@ generate_memcpy_builtin (class loop *loop, partition *partition)
     }
 }
 
+/* Generate a call to rawmemchr for PARTITION in LOOP.  */
+
+static void
+generate_rawmemchr_builtin (class loop *loop, partition *partition)
+{
+  gimple_stmt_iterator gsi;
+  tree mem, pattern;
+  struct builtin_info *builtin = partition->builtin;
+  gimple *fn_call;
+
+  data_reference_p dr = builtin->src_dr;
+  tree base = builtin->src_base;
+
+  tree result_old = build_fold_addr_expr (DR_REF (dr));
+  tree result_new = copy_ssa_name (result_old);
+
+  /* The new statements will be placed before LOOP.  */
+  gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
+
+  mem = force_gimple_operand_gsi (&gsi, base, true, NULL_TREE, false,
+                                  GSI_CONTINUE_LINKING);
+  pattern = builtin->pattern;
+  fn_call = gimple_build_call_internal (IFN_RAWMEMCHR, 2, mem, pattern);
+  gimple_call_set_lhs (fn_call, result_new);
+  gimple_set_location (fn_call, partition->loc);
+  gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
+
+  imm_use_iterator iter;
+  gimple *stmt;
+  use_operand_p use_p;
+  FOR_EACH_IMM_USE_STMT (stmt, iter, result_old)
+    {
+      FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
+        SET_USE (use_p, result_new);
+
+      update_stmt (stmt);
+    }
+
+  fold_stmt (&gsi);
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    switch (TYPE_MODE (TREE_TYPE (pattern)))
+      {
+      case E_QImode:
+        fprintf (dump_file, "generated rawmemchrqi\n");
+        break;
+
+      case E_HImode:
+        fprintf (dump_file, "generated rawmemchrhi\n");
+        break;
+
+      case E_SImode:
+        fprintf (dump_file, "generated rawmemchrsi\n");
+        break;
+
+      default:
+        gcc_unreachable ();
+      }
+}
+
 /* Remove and destroy the loop LOOP.  */
 
 static void
@@ -1334,6 +1400,10 @@ generate_code_for_partition (class loop *loop,
       generate_memcpy_builtin (loop, partition);
       break;
 
+    case PKIND_RAWMEMCHR:
+      generate_rawmemchr_builtin (loop, partition);
+      break;
+
     default:
       gcc_unreachable ();
     }
@@ -1525,44 +1595,53 @@ find_single_drs (class loop *loop, struct graph *rdg, partition *partition,
         }
     }
 
-  if (!single_st)
+  if (!single_ld && !single_st)
     return false;
 
-  /* Bail out if this is a bitfield memory reference.  */
-  if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
-      && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
-    return false;
-
-  /* Data reference must be executed exactly once per iteration of each
-     loop in the loop nest.  We only need to check dominance information
-     against the outermost one in a perfect loop nest because a bb can't
-     dominate outermost loop's latch without dominating inner loop's.  */
-  basic_block bb_st = gimple_bb (DR_STMT (single_st));
-  if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st))
-    return false;
+  basic_block bb_ld = NULL;
+  basic_block bb_st = NULL;
 
   if (single_ld)
     {
-      gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
-      /* Direct aggregate copy or via an SSA name temporary.  */
-      if (load != store
-          && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
-        return false;
-
       /* Bail out if this is a bitfield memory reference.  */
       if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF
           && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1)))
        return false;
 
-      /* Load and store must be in the same loop nest.  */
-      basic_block bb_ld = gimple_bb (DR_STMT (single_ld));
-      if (bb_st->loop_father != bb_ld->loop_father)
+      /* Data reference must be executed exactly once per iteration of each
+         loop in the loop nest.  We only need to check dominance information
+         against the outermost one in a perfect loop nest because a bb can't
+         dominate outermost loop's latch without dominating inner loop's.  */
+      bb_ld = gimple_bb (DR_STMT (single_ld));
+      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld))
+        return false;
+    }
+
+  if (single_st)
+    {
+      /* Bail out if this is a bitfield memory reference.  */
+      if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
+          && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
        return false;
 
       /* Data reference must be executed exactly once per iteration.
-         Same as single_st, we only need to check against the outermost
+         Same as single_ld, we only need to check against the outermost
         loop.  */
-      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld))
+      bb_st = gimple_bb (DR_STMT (single_st));
+      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st))
+        return false;
+    }
+
+  if (single_ld && single_st)
+    {
+      gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
+      /* Direct aggregate copy or via an SSA name temporary.  */
+      if (load != store
+          && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
+        return false;
+
+      /* Load and store must be in the same loop nest.  */
+      if (bb_st->loop_father != bb_ld->loop_father)
        return false;
 
       edge e = single_exit (bb_st->loop_father);
@@ -1681,6 +1760,68 @@ alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr,
   return builtin;
 }
 
+/* Given data reference DR in loop nest LOOP, classify if it forms builtin
+   rawmemchr call.  */
+
+static bool
+classify_builtin_rawmemchr (loop_p loop, partition *partition,
+                            data_reference_p dr, tree loop_result)
+{
+  tree dr_ref = DR_REF (dr);
+  tree dr_access_base = build_fold_addr_expr (dr_ref);
+  tree dr_access_size = TYPE_SIZE_UNIT (TREE_TYPE (dr_ref));
+  gimple *dr_stmt = DR_STMT (dr);
+  affine_iv iv;
+  tree pattern;
+
+  if (dr_access_base != loop_result)
+    return false;
+
+  /* A limitation of the current implementation is that we only support
+     constant patterns.  */
+  gcond *cond_stmt = as_a <gcond *> (last_stmt (loop->header));
+  pattern = gimple_cond_rhs (cond_stmt);
+  if (gimple_cond_code (cond_stmt) != NE_EXPR
+      || gimple_cond_lhs (cond_stmt) != gimple_assign_lhs (dr_stmt)
+      || TREE_CODE (pattern) != INTEGER_CST)
+    return false;
+
+  /* Bail out if no affine induction variable with constant step can be
+     determined.  */
+  if (!simple_iv (loop, loop, dr_access_base, &iv, false))
+    return false;
+
+  /* Bail out if memory accesses are not consecutive.  */
+  if (!operand_equal_p (iv.step, dr_access_size, 0))
+    return false;
+
+  /* Bail out if direction of memory accesses is not growing.  */
+  if (get_range_pos_neg (iv.step) != 1)
+    return false;
+
+  /* Bail out if target does not provide rawmemchr for a certain mode.  */
+  machine_mode mode;
+  switch (TREE_INT_CST_LOW (iv.step))
+    {
+    case 1: mode = QImode; break;
+    case 2: mode = HImode; break;
+    case 4: mode = SImode; break;
+    default: return false;
+    }
+  if (direct_optab_handler (rawmemchr_optab, mode) == CODE_FOR_nothing)
+    return false;
+
+  struct builtin_info *builtin;
+  builtin = alloc_builtin (NULL, dr, NULL_TREE, iv.base, NULL_TREE);
+  builtin->pattern = pattern;
+
+  partition->loc = gimple_location (dr_stmt);
+  partition->builtin = builtin;
+  partition->kind = PKIND_RAWMEMCHR;
+
+  return true;
+}
+
 /* Given data reference DR in loop nest LOOP, classify if it forms builtin
    memset call.  */
 
@@ -1798,6 +1939,8 @@ loop_distribution::classify_partition (loop_p loop,
   unsigned i;
   data_reference_p single_ld = NULL, single_st = NULL;
   bool volatiles_p = false, has_reduction = false;
+  unsigned nreductions = 0;
+  gimple *reduction_stmt = NULL;
 
   EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
     {
@@ -1821,6 +1964,10 @@ loop_distribution::classify_partition (loop_p loop,
            partition->reduction_p = true;
          else
            has_reduction = true;
+
+         /* Determine whether STMT is the only reduction statement or not.  */
+         reduction_stmt = stmt;
+         ++nreductions;
        }
     }
 
@@ -1840,6 +1987,27 @@ loop_distribution::classify_partition (loop_p loop,
   if (!find_single_drs (loop, rdg, partition, &single_st, &single_ld))
     return has_reduction;
 
+  /* If we determined a single load and a single reduction statement, then try
+     to classify this partition as a rawmemchr builtin.  */
+  if (single_ld != NULL
+      && single_st == NULL
+      && nreductions == 1
+      && is_gimple_assign (reduction_stmt))
+    {
+      /* If we classified the partition as a builtin, then ignoring the single
+         reduction is safe, since the whole partition is replaced by a call.  */
+      tree reduction_var = gimple_assign_lhs (reduction_stmt);
+      return !classify_builtin_rawmemchr (loop, partition, single_ld, reduction_var);
+    }
+
+  if (single_st == NULL)
+    return has_reduction;
+
+  /* Don't distribute loop if niters is unknown.  */
+  tree niters = number_of_latch_executions (loop);
+  if (niters == NULL_TREE || niters == chrec_dont_know)
+    return has_reduction;
+
   partition->loc = gimple_location (DR_STMT (single_st));
 
   /* Classify the builtin kind.  */
@@ -3290,11 +3458,6 @@ loop_distribution::execute (function *fun)
          && !optimize_loop_for_speed_p (loop)))
        continue;
 
-      /* Don't distribute loop if niters is unknown.  */
-      tree niters = number_of_latch_executions (loop);
-      if (niters == NULL_TREE || niters == chrec_dont_know)
-        continue;
-
       /* Get the perfect loop nest for distribution.  */
      loop = prepare_perfect_loop_nest (loop);
      for (; loop; loop = loop->inner)
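
Illustration (not part of the patch itself): a minimal sketch of the kind of
loop classify_builtin_rawmemchr is intended to match.  The function names and
the constant pattern 42 are hypothetical; the constraints come from the
classification above: the loop header compares a loaded value against an
integer constant with NE_EXPR, the pointer advances by exactly one element
per iteration in growing direction, and the only value live after the loop is
the pointer itself.

    #include <stdint.h>

    /* Byte variant: a candidate for replacement by a rawmemchrqi call on
       targets providing that optab.  */
    char *
    find_byte (char *s)
    {
      while (*s != 42)
        ++s;
      return s;
    }

    /* 16-bit variant: a candidate for rawmemchrhi; a 32-bit loop over
       uint32_t is analogous (rawmemchrsi).  */
    uint16_t *
    find_hword (uint16_t *s)
    {
      while (*s != 42)
        ++s;
      return s;
    }

On a target that implements the corresponding optab entry, such a loop is
replaced at the GIMPLE level by a single internal function call, roughly of
the form s = .RAWMEMCHR (s, 42);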