commit d24639c895ce3c0f9539570bab7b6510e98b1ffa Author: Stefan Schulze Frielinghaus Date: Wed Mar 17 09:00:06 2021 +0100 foo diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c index d209a52f823..633f41d9e6d 100644 --- a/gcc/internal-fn.c +++ b/gcc/internal-fn.c @@ -2929,6 +2929,35 @@ expand_VEC_CONVERT (internal_fn, gcall *) gcc_unreachable (); } +/* Expand IFN_RAWMEMCHAR internal function. */ + +void +expand_RAWMEMCHR (internal_fn, gcall *stmt) +{ + expand_operand ops[3]; + + tree lhs = gimple_call_lhs (stmt); + if (!lhs) + return; + tree lhs_type = TREE_TYPE (lhs); + rtx lhs_rtx = expand_expr (lhs, NULL_RTX, VOIDmode, EXPAND_WRITE); + create_output_operand (&ops[0], lhs_rtx, TYPE_MODE (lhs_type)); + + for (unsigned int i = 0; i < 2; ++i) + { + tree rhs = gimple_call_arg (stmt, i); + tree rhs_type = TREE_TYPE (rhs); + rtx rhs_rtx = expand_normal (rhs); + create_input_operand (&ops[i + 1], rhs_rtx, TYPE_MODE (rhs_type)); + } + + insn_code icode = direct_optab_handler (rawmemchr_optab, ops[2].mode); + + expand_insn (icode, 3, ops); + if (!rtx_equal_p (lhs_rtx, ops[0].value)) + emit_move_insn (lhs_rtx, ops[0].value); +} + /* Expand the IFN_UNIQUE function according to its first argument. */ static void diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index daeace7a34e..95c76795648 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -348,6 +348,7 @@ DEF_INTERNAL_FN (MUL_OVERFLOW, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL) DEF_INTERNAL_FN (TSAN_FUNC_EXIT, ECF_NOVOPS | ECF_LEAF | ECF_NOTHROW, NULL) DEF_INTERNAL_FN (VA_ARG, ECF_NOTHROW | ECF_LEAF, NULL) DEF_INTERNAL_FN (VEC_CONVERT, ECF_CONST | ECF_LEAF | ECF_NOTHROW, NULL) +DEF_INTERNAL_FN (RAWMEMCHR, ECF_PURE | ECF_LEAF | ECF_NOTHROW, NULL) /* An unduplicable, uncombinable function. 
Generally used to preserve a CFG property in the face of jump threading, tail merging or diff --git a/gcc/optabs.def b/gcc/optabs.def index b192a9d070b..f7c69f914ce 100644 --- a/gcc/optabs.def +++ b/gcc/optabs.def @@ -267,6 +267,7 @@ OPTAB_D (cpymem_optab, "cpymem$a") OPTAB_D (movmem_optab, "movmem$a") OPTAB_D (setmem_optab, "setmem$a") OPTAB_D (strlen_optab, "strlen$a") +OPTAB_D (rawmemchr_optab, "rawmemchr$I$a") OPTAB_DC(fma_optab, "fma$a4", FMA) OPTAB_D (fms_optab, "fms$a4") diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-1.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-1.c new file mode 100644 index 00000000000..6db62d7644d --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-1.c @@ -0,0 +1,72 @@ +/* { dg-do run { target s390x-*-* } } */ +/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */ +/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { target s390x-*-* } } } */ +/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { target s390x-*-* } } } */ +/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { target s390x-*-* } } } */ + +/* Rawmemchr pattern: reduction stmt but no store */ + +#include +#include + +typedef __SIZE_TYPE__ size_t; +extern void* malloc (size_t); +extern void* memset (void*, int, size_t); + +#define test(T, pattern) \ +__attribute__((noinline)) \ +T *test_##T (T *p) \ +{ \ + while (*p != (T)pattern) \ + ++p; \ + return p; \ +} + +test (uint8_t, 0xab) +test (uint16_t, 0xabcd) +test (uint32_t, 0xabcdef15) + +test (int8_t, 0xab) +test (int16_t, 0xabcd) +test (int32_t, 0xabcdef15) + +#define run(T, pattern, i) \ +{ \ +T *q = p; \ +q[i] = (T)pattern; \ +assert (test_##T (p) == &q[i]); \ +q[i] = 0; \ +} + +int main(void) +{ + void *p = malloc (1024); + assert (p); + memset (p, 0, 1024); + + run (uint8_t, 0xab, 0); + run (uint8_t, 0xab, 1); + run (uint8_t, 0xab, 13); + + run (uint16_t, 0xabcd, 0); + run (uint16_t, 0xabcd, 1); + run 
(uint16_t, 0xabcd, 13); + + run (uint32_t, 0xabcdef15, 0); + run (uint32_t, 0xabcdef15, 1); + run (uint32_t, 0xabcdef15, 13); + + run (int8_t, 0xab, 0); + run (int8_t, 0xab, 1); + run (int8_t, 0xab, 13); + + run (int16_t, 0xabcd, 0); + run (int16_t, 0xabcd, 1); + run (int16_t, 0xabcd, 13); + + run (int32_t, 0xabcdef15, 0); + run (int32_t, 0xabcdef15, 1); + run (int32_t, 0xabcdef15, 13); + + return 0; +} diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-2.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-2.c new file mode 100644 index 00000000000..00d6ea0f8e9 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-rawmemchr-2.c @@ -0,0 +1,83 @@ +/* { dg-do run { target s390x-*-* } } */ +/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */ +/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" { target s390x-*-* } } } */ +/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" { target s390x-*-* } } } */ +/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" { target s390x-*-* } } } */ + +/* Rawmemchr pattern: reduction stmt and store */ + +#include +#include + +typedef __SIZE_TYPE__ size_t; +extern void* malloc (size_t); +extern void* memset (void*, int, size_t); + +uint8_t *p_uint8_t; +uint16_t *p_uint16_t; +uint32_t *p_uint32_t; + +int8_t *p_int8_t; +int16_t *p_int16_t; +int32_t *p_int32_t; + +#define test(T, pattern) \ +__attribute__((noinline)) \ +T *test_##T (void) \ +{ \ + while (*p_##T != pattern) \ + ++p_##T; \ + return p_##T; \ +} + +test (uint8_t, 0xab) +test (uint16_t, 0xabcd) +test (uint32_t, 0xabcdef15) + +test (int8_t, (int8_t)0xab) +test (int16_t, (int16_t)0xabcd) +test (int32_t, (int32_t)0xabcdef15) + +#define run(T, pattern, i) \ +{ \ +T *q = p; \ +q[i] = pattern; \ +p_##T = p; \ +T *r = test_##T (); \ +assert (r == p_##T); \ +assert (r == &q[i]); \ +q[i] = 0; \ +} + +int main(void) +{ + void *p = malloc (1024); + assert (p); + memset (p, '\0', 1024); + + run 
(uint8_t, 0xab, 0); + run (uint8_t, 0xab, 1); + run (uint8_t, 0xab, 13); + + run (uint16_t, 0xabcd, 0); + run (uint16_t, 0xabcd, 1); + run (uint16_t, 0xabcd, 13); + + run (uint32_t, 0xabcdef15, 0); + run (uint32_t, 0xabcdef15, 1); + run (uint32_t, 0xabcdef15, 13); + + run (int8_t, (int8_t)0xab, 0); + run (int8_t, (int8_t)0xab, 1); + run (int8_t, (int8_t)0xab, 13); + + run (int16_t, (int16_t)0xabcd, 0); + run (int16_t, (int16_t)0xabcd, 1); + run (int16_t, (int16_t)0xabcd, 13); + + run (int32_t, (int32_t)0xabcdef15, 0); + run (int32_t, (int32_t)0xabcdef15, 1); + run (int32_t, (int32_t)0xabcdef15, 13); + + return 0; +} diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-strlen-1.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-strlen-1.c new file mode 100644 index 00000000000..918b60099e4 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-strlen-1.c @@ -0,0 +1,100 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */ +/* { dg-final { scan-tree-dump-times "generated strlenQI\n" 4 "ldist" } } */ +/* { dg-final { scan-tree-dump-times "generated strlenHI\n" 4 "ldist" { target s390x-*-* } } } */ +/* { dg-final { scan-tree-dump-times "generated strlenSI\n" 4 "ldist" { target s390x-*-* } } } */ + +#include +#include + +typedef __SIZE_TYPE__ size_t; +extern void* malloc (size_t); +extern void* memset (void*, int, size_t); + +#define test(T, U) \ +__attribute__((noinline)) \ +U test_##T##U (T *s) \ +{ \ + U i; \ + for (i=0; s[i]; ++i); \ + return i; \ +} + +test (uint8_t, size_t) +test (uint16_t, size_t) +test (uint32_t, size_t) +test (uint8_t, int) +test (uint16_t, int) +test (uint32_t, int) + +test (int8_t, size_t) +test (int16_t, size_t) +test (int32_t, size_t) +test (int8_t, int) +test (int16_t, int) +test (int32_t, int) + +#define run(T, U, i) \ +{ \ +T *q = p; \ +q[i] = 0; \ +assert (test_##T##U (p) == i); \ +memset (&q[i], 0xf, sizeof (T)); \ +} + +int main(void) +{ + void *p = malloc (1024); + assert (p); + memset (p, 0xf, 
1024); + + run (uint8_t, size_t, 0); + run (uint8_t, size_t, 1); + run (uint8_t, size_t, 13); + + run (int8_t, size_t, 0); + run (int8_t, size_t, 1); + run (int8_t, size_t, 13); + + run (uint8_t, int, 0); + run (uint8_t, int, 1); + run (uint8_t, int, 13); + + run (int8_t, int, 0); + run (int8_t, int, 1); + run (int8_t, int, 13); + + run (uint16_t, size_t, 0); + run (uint16_t, size_t, 1); + run (uint16_t, size_t, 13); + + run (int16_t, size_t, 0); + run (int16_t, size_t, 1); + run (int16_t, size_t, 13); + + run (uint16_t, int, 0); + run (uint16_t, int, 1); + run (uint16_t, int, 13); + + run (int16_t, int, 0); + run (int16_t, int, 1); + run (int16_t, int, 13); + + run (uint32_t, size_t, 0); + run (uint32_t, size_t, 1); + run (uint32_t, size_t, 13); + + run (int32_t, size_t, 0); + run (int32_t, size_t, 1); + run (int32_t, size_t, 13); + + run (uint32_t, int, 0); + run (uint32_t, int, 1); + run (uint32_t, int, 13); + + run (int32_t, int, 0); + run (int32_t, int, 1); + run (int32_t, int, 13); + + return 0; +} diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-strlen-2.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-strlen-2.c new file mode 100644 index 00000000000..e25d6ea5b56 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-strlen-2.c @@ -0,0 +1,58 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details" } */ +/* { dg-final { scan-tree-dump-times "generated strlenQI\n" 3 "ldist" } } */ + +#include + +typedef __SIZE_TYPE__ size_t; +extern void* malloc (size_t); +extern void* memset (void*, int, size_t); + +__attribute__((noinline)) +int test_pos (char *s) +{ + int i; + for (i=42; s[i]; ++i); + return i; +} + +__attribute__((noinline)) +int test_neg (char *s) +{ + int i; + for (i=-42; s[i]; ++i); + return i; +} + +__attribute__((noinline)) +int test_including_null_char (char *s) +{ + int i; + for (i=1; s[i-1]; ++i); + return i; +} + +int main(void) +{ + void *p = malloc (1024); + assert (p); + memset (p, 0xf, 1024); + char *s = (char 
*)p + 100; + + s[42+13] = 0; + assert (test_pos (s) == 42+13); + s[42+13] = 0xf; + + s[13] = 0; + assert (test_neg (s) == 13); + s[13] = 0xf; + + s[-13] = 0; + assert (test_neg (s) == -13); + s[-13] = 0xf; + + s[13] = 0; + assert (test_including_null_char (s) == 13+1); + + return 0; +} diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c index 65aa1df4aba..6bd4bc8588a 100644 --- a/gcc/tree-loop-distribution.c +++ b/gcc/tree-loop-distribution.c @@ -116,6 +116,10 @@ along with GCC; see the file COPYING3. If not see #include "tree-eh.h" #include "gimple-fold.h" #include "tree-affine.h" +#include "intl.h" +#include "rtl.h" +#include "memmodel.h" +#include "optabs.h" #define MAX_DATAREFS_NUM \ @@ -650,6 +654,10 @@ class loop_distribution control_dependences *cd, int *nb_calls, bool *destroy_p, bool only_patterns_p); + /* Transform loops which mimic the effects of builtins rawmemchr or strlen and + replace them accordingly. */ + bool transform_reduction_loop (loop_p loop); + /* Compute topological order for basic blocks. Topological order is needed because data dependence is computed for data references in lexicographical order. */ @@ -1490,14 +1498,14 @@ loop_distribution::build_rdg_partition_for_vertex (struct graph *rdg, int v) data references. */ static bool -find_single_drs (class loop *loop, struct graph *rdg, partition *partition, +find_single_drs (class loop *loop, struct graph *rdg, const bitmap &partition_stmts, data_reference_p *dst_dr, data_reference_p *src_dr) { unsigned i; data_reference_p single_ld = NULL, single_st = NULL; bitmap_iterator bi; - EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi) + EXECUTE_IF_SET_IN_BITMAP (partition_stmts, 0, i, bi) { gimple *stmt = RDG_STMT (rdg, i); data_reference_p dr; @@ -1538,44 +1546,47 @@ find_single_drs (class loop *loop, struct graph *rdg, partition *partition, } } - if (!single_st) - return false; - - /* Bail out if this is a bitfield memory reference. 
*/ - if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF - && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1))) + if (!single_ld && !single_st) return false; - /* Data reference must be executed exactly once per iteration of each - loop in the loop nest. We only need to check dominance information - against the outermost one in a perfect loop nest because a bb can't - dominate outermost loop's latch without dominating inner loop's. */ - basic_block bb_st = gimple_bb (DR_STMT (single_st)); - if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st)) - return false; + basic_block bb_ld = NULL; + basic_block bb_st = NULL; if (single_ld) { - gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld); - /* Direct aggregate copy or via an SSA name temporary. */ - if (load != store - && gimple_assign_lhs (load) != gimple_assign_rhs1 (store)) - return false; - /* Bail out if this is a bitfield memory reference. */ if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1))) return false; - /* Load and store must be in the same loop nest. */ - basic_block bb_ld = gimple_bb (DR_STMT (single_ld)); - if (bb_st->loop_father != bb_ld->loop_father) + /* Data reference must be executed exactly once per iteration of each + loop in the loop nest. We only need to check dominance information + against the outermost one in a perfect loop nest because a bb can't + dominate outermost loop's latch without dominating inner loop's. */ + bb_ld = gimple_bb (DR_STMT (single_ld)); + if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld)) + return false; + } + + if (single_st) + { + /* Bail out if this is a bitfield memory reference. */ + if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF + && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1))) return false; /* Data reference must be executed exactly once per iteration. 
- Same as single_st, we only need to check against the outermost + Same as single_ld, we only need to check against the outermost loop. */ - if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld)) + bb_st = gimple_bb (DR_STMT (single_st)); + if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st)) + return false; + } + + if (single_ld && single_st) + { + /* Load and store must be in the same loop nest. */ + if (bb_st->loop_father != bb_ld->loop_father) return false; edge e = single_exit (bb_st->loop_father); @@ -1850,9 +1861,19 @@ loop_distribution::classify_partition (loop_p loop, return has_reduction; /* Find single load/store data references for builtin partition. */ - if (!find_single_drs (loop, rdg, partition, &single_st, &single_ld)) + if (!find_single_drs (loop, rdg, partition->stmts, &single_st, &single_ld) + || !single_st) return has_reduction; + if (single_ld && single_st) + { + gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld); + /* Direct aggregate copy or via an SSA name temporary. */ + if (load != store + && gimple_assign_lhs (load) != gimple_assign_rhs1 (store)) + return has_reduction; + } + partition->loc = gimple_location (DR_STMT (single_st)); /* Classify the builtin kind. */ @@ -3257,6 +3278,379 @@ find_seed_stmts_for_distribution (class loop *loop, vec *work_list) return work_list->length () > 0; } +/* A helper function for generate_{rawmemchr,strlen}_builtin functions in order + to place new statements SEQ before LOOP and replace the old reduction + variable with the new one. */ + +static void +generate_reduction_builtin_1 (loop_p loop, gimple_seq &seq, + tree reduction_var_old, tree reduction_var_new, + const char *info, machine_mode load_mode) +{ + /* Place new statements before LOOP. */ + gimple_stmt_iterator gsi = gsi_last_bb (loop_preheader_edge (loop)->src); + gsi_insert_seq_after (&gsi, seq, GSI_CONTINUE_LINKING); + + /* Replace old reduction variable with new one. 
*/ + imm_use_iterator iter; + gimple *stmt; + use_operand_p use_p; + FOR_EACH_IMM_USE_STMT (stmt, iter, reduction_var_old) + { + FOR_EACH_IMM_USE_ON_STMT (use_p, iter) + SET_USE (use_p, reduction_var_new); + + update_stmt (stmt); + } + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, info, GET_MODE_NAME (load_mode)); +} + +/* Generate a call to rawmemchr and place it before LOOP. REDUCTION_VAR is + replaced with a fresh SSA name representing the result of the call. */ + +static void +generate_rawmemchr_builtin (loop_p loop, tree reduction_var, + data_reference_p store_dr, tree base, tree pattern, + location_t loc) +{ + gimple_seq seq = NULL; + + tree mem = force_gimple_operand (base, &seq, true, NULL_TREE); + gimple *fn_call = gimple_build_call_internal (IFN_RAWMEMCHR, 2, mem, pattern); + tree reduction_var_new = copy_ssa_name (reduction_var); + gimple_call_set_lhs (fn_call, reduction_var_new); + gimple_set_location (fn_call, loc); + gimple_seq_add_stmt (&seq, fn_call); + + if (store_dr) + { + gassign *g = gimple_build_assign (DR_REF (store_dr), reduction_var_new); + gimple_seq_add_stmt (&seq, g); + } + + generate_reduction_builtin_1 (loop, seq, reduction_var, reduction_var_new, + "generated rawmemchr%s\n", + TYPE_MODE (TREE_TYPE (TREE_TYPE (base)))); +} + +/* Helper function for generate_strlen_builtin(,_using_rawmemchr) */ + +static void +generate_strlen_builtin_1 (loop_p loop, gimple_seq &seq, + tree reduction_var_old, tree reduction_var_new, + machine_mode mode, tree start_len) +{ + /* REDUCTION_VAR_NEW has either size type or ptrdiff type and must be + converted if types of old and new reduction variable are not compatible. */ + reduction_var_new = gimple_convert (&seq, TREE_TYPE (reduction_var_old), + reduction_var_new); + + /* Loops of the form `for (i=42; s[i]; ++i);` have an additional start + length. 
*/ + if (!integer_zerop (start_len)) + { + tree lhs = make_ssa_name (TREE_TYPE (reduction_var_new)); + gimple *g = gimple_build_assign (lhs, PLUS_EXPR, reduction_var_new, + start_len); + gimple_seq_add_stmt (&seq, g); + reduction_var_new = lhs; + } + + generate_reduction_builtin_1 (loop, seq, reduction_var_old, reduction_var_new, + "generated strlen%s\n", mode); +} + +/* Generate a call to strlen and place it before LOOP. REDUCTION_VAR is + replaced with a fresh SSA name representing the result of the call. */ + +static void +generate_strlen_builtin (loop_p loop, tree reduction_var, tree base, + tree start_len, location_t loc) +{ + gimple_seq seq = NULL; + + tree reduction_var_new = make_ssa_name (size_type_node); + + tree mem = force_gimple_operand (base, &seq, true, NULL_TREE); + tree fn = build_fold_addr_expr (builtin_decl_explicit (BUILT_IN_STRLEN)); + gimple *fn_call = gimple_build_call (fn, 1, mem); + gimple_call_set_lhs (fn_call, reduction_var_new); + gimple_set_location (fn_call, loc); + gimple_seq_add_stmt (&seq, fn_call); + + generate_strlen_builtin_1 (loop, seq, reduction_var, reduction_var_new, + QImode, start_len); +} + +/* Generate code in order to mimic the behaviour of strlen but this time over + an array of elements with mode different than QI. REDUCTION_VAR is replaced + with a fresh SSA name representing the result, i.e., the length. 
*/ + +static void +generate_strlen_builtin_using_rawmemchr (loop_p loop, tree reduction_var, + tree base, tree start_len, + location_t loc) +{ + gimple_seq seq = NULL; + + tree start = force_gimple_operand (base, &seq, true, NULL_TREE); + tree zero = build_zero_cst (TREE_TYPE (TREE_TYPE (start))); + gimple *fn_call = gimple_build_call_internal (IFN_RAWMEMCHR, 2, start, zero); + tree end = make_ssa_name (TREE_TYPE (base)); + gimple_call_set_lhs (fn_call, end); + gimple_set_location (fn_call, loc); + gimple_seq_add_stmt (&seq, fn_call); + + /* Determine the number of elements between START and END by + evaluating (END - START) / sizeof (*START). */ + tree diff = make_ssa_name (ptrdiff_type_node); + gimple *diff_stmt = gimple_build_assign (diff, POINTER_DIFF_EXPR, end, start); + gimple_seq_add_stmt (&seq, diff_stmt); + /* Let SIZE be the size of the pointed-to type of START. */ + tree size = gimple_convert (&seq, ptrdiff_type_node, + TYPE_SIZE_UNIT (TREE_TYPE (TREE_TYPE (start)))); + tree count = make_ssa_name (ptrdiff_type_node); + gimple *count_stmt = gimple_build_assign (count, TRUNC_DIV_EXPR, diff, size); + gimple_seq_add_stmt (&seq, count_stmt); + + generate_strlen_builtin_1 (loop, seq, reduction_var, count, + TYPE_MODE (TREE_TYPE (TREE_TYPE (base))), + start_len); +} + +static bool +reduction_var_overflows_first (tree reduction_var, tree load_type) +{ + unsigned precision_ptrdiff = TYPE_PRECISION (ptrdiff_type_node); + unsigned precision_reduction_var = TYPE_PRECISION (TREE_TYPE (reduction_var)); + unsigned size_exponent = wi::exact_log2 (wi::to_wide (TYPE_SIZE_UNIT (load_type))); + return wi::ltu_p (precision_reduction_var, precision_ptrdiff - 1 - size_exponent); +} + +/* Transform loops which mimic the effects of builtins rawmemchr or strlen and + replace them accordingly. For example, a loop of the form + + for (; *p != 42; ++p); + + is replaced by + + p = rawmemchr (p, 42); + + under the assumption that rawmemchr is available for a particular MODE. 
+ Another example is + + int i; + for (i = 42; s[i]; ++i); + + which is replaced by + + i = (int)strlen (&s[42]) + 42; + + for some character array S. In case array S is not of type character array + we end up with + + i = (int)(rawmemchr (&s[42], 0) - &s[42]) + 42; + + assuming that rawmemchr is available for a particular MODE. */ + +bool +loop_distribution::transform_reduction_loop (loop_p loop) +{ + gimple *reduction_stmt = NULL; + data_reference_p load_dr = NULL, store_dr = NULL; + + edge e = single_exit (loop); + gcond *cond = safe_dyn_cast (last_stmt (e->src)); + if (!cond) + return false; + /* Ensure loop condition is an (in)equality test and loop is exited either if + the inequality test fails or the equality test succeeds. */ + if (!(e->flags & EDGE_FALSE_VALUE && gimple_cond_code (cond) == NE_EXPR) + && !(e->flags & EDGE_TRUE_VALUE && gimple_cond_code (cond) == EQ_EXPR)) + return false; + /* A limitation of the current implementation is that we only support + constant patterns in (in)equality tests. */ + tree pattern = gimple_cond_rhs (cond); + if (TREE_CODE (pattern) != INTEGER_CST) + return false; + + basic_block *bbs = get_loop_body (loop); + + for (unsigned i = 0, ninsns = 0; i < loop->num_nodes; ++i) + { + basic_block bb = bbs[i]; + + for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi); + gsi_next_nondebug (&bsi)) + { + gphi *phi = bsi.phi (); + if (virtual_operand_p (gimple_phi_result (phi))) + continue; + if (stmt_has_scalar_dependences_outside_loop (loop, phi)) + { + if (reduction_stmt) + return false; + reduction_stmt = phi; + } + } + + for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); + gsi_next_nondebug (&bsi), ++ninsns) + { + /* Bail out early for loops which are unlikely to match. 
*/ + if (ninsns > 16) + return false; + gimple *stmt = gsi_stmt (bsi); + if (gimple_clobber_p (stmt)) + continue; + if (gimple_code (stmt) == GIMPLE_LABEL) + continue; + if (gimple_has_volatile_ops (stmt)) + return false; + if (stmt_has_scalar_dependences_outside_loop (loop, stmt)) + { + if (reduction_stmt) + return false; + reduction_stmt = stmt; + } + } + } + + /* A limitation of the current implementation is that we require a reduction + statement. Therefore, loops without a reduction statement as in the + following are not recognized: + int *p; + void foo (void) { for (; *p; ++p); } */ + if (reduction_stmt == NULL) + return false; + + /* Reduction variables are guaranteed to be SSA names. */ + tree reduction_var; + switch (gimple_code (reduction_stmt)) + { + case GIMPLE_ASSIGN: + case GIMPLE_PHI: + reduction_var = gimple_get_lhs (reduction_stmt); + break; + default: + /* Bail out e.g. for GIMPLE_CALL. */ + return false; + } + + struct graph *rdg = build_rdg (loop, NULL); + if (rdg == NULL) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, + "Loop %d not transformed: failed to build the RDG.\n", + loop->num); + + return false; + } + auto_bitmap partition_stmts; + bitmap_set_range (partition_stmts, 0, rdg->n_vertices); + find_single_drs (loop, rdg, partition_stmts, &store_dr, &load_dr); + free_rdg (rdg); + + /* Bail out if there is no single load. */ + if (load_dr == NULL) + return false; + + /* Reaching this point we have a loop with a single reduction variable, + a single load, and an optional single store. */ + + tree load_ref = DR_REF (load_dr); + tree load_type = TREE_TYPE (load_ref); + tree load_access_base = build_fold_addr_expr (load_ref); + tree load_access_size = TYPE_SIZE_UNIT (load_type); + affine_iv load_iv, reduction_iv; + + if (!INTEGRAL_TYPE_P (load_type) + || !type_has_mode_precision_p (load_type)) + return false; + + /* We already ensured that the loop condition tests for (in)equality where the + rhs is a constant pattern. 
Now ensure that the lhs is the result of the + load. */ + if (gimple_cond_lhs (cond) != gimple_assign_lhs (DR_STMT (load_dr))) + return false; + + /* Bail out if no affine induction variable with constant step can be + determined. */ + if (!simple_iv (loop, loop, load_access_base, &load_iv, false)) + return false; + + /* Bail out if memory accesses are not consecutive or not growing. */ + if (!operand_equal_p (load_iv.step, load_access_size, 0)) + return false; + + if (!simple_iv (loop, loop, reduction_var, &reduction_iv, false)) + return false; + + /* Handle rawmemchr like loops. */ + if (operand_equal_p (load_iv.base, reduction_iv.base) + && operand_equal_p (load_iv.step, reduction_iv.step)) + { + if (store_dr) + { + /* Ensure that we store to X and load from X+I where I>0. */ + if (TREE_CODE (load_iv.base) != POINTER_PLUS_EXPR + || !integer_onep (TREE_OPERAND (load_iv.base, 1))) + return false; + tree ptr_base = TREE_OPERAND (load_iv.base, 0); + if (TREE_CODE (ptr_base) != SSA_NAME) + return false; + gimple *def = SSA_NAME_DEF_STMT (ptr_base); + if (!gimple_assign_single_p (def) + || gimple_assign_rhs1 (def) != DR_REF (store_dr)) + return false; + /* Ensure that the reduction value is stored. */ + if (gimple_assign_rhs1 (DR_STMT (store_dr)) != reduction_var) + return false; + } + /* Bail out if target does not provide rawmemchr for a certain mode. */ + machine_mode mode = TYPE_MODE (load_type); + if (direct_optab_handler (rawmemchr_optab, mode) == CODE_FOR_nothing) + return false; + location_t loc = gimple_location (DR_STMT (load_dr)); + generate_rawmemchr_builtin (loop, reduction_var, store_dr, load_iv.base, + pattern, loc); + return true; + } + + /* Handle strlen like loops. 
*/ + if (store_dr == NULL + && integer_zerop (pattern) + && TREE_CODE (reduction_iv.base) == INTEGER_CST + && TREE_CODE (reduction_iv.step) == INTEGER_CST + && integer_onep (reduction_iv.step)) + { + location_t loc = gimple_location (DR_STMT (load_dr)); + if (TYPE_MODE (load_type) == TYPE_MODE (char_type_node) + && TYPE_PRECISION (load_type) == TYPE_PRECISION (char_type_node) + && (TYPE_PRECISION (size_type_node) == 64 + || (TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (reduction_var)) + && TYPE_PRECISION (reduction_var) <= TYPE_PRECISION (size_type_node)))) + generate_strlen_builtin (loop, reduction_var, load_iv.base, + reduction_iv.base, loc); + else if (direct_optab_handler (rawmemchr_optab, TYPE_MODE (load_type)) + != CODE_FOR_nothing + && (TYPE_PRECISION (ptrdiff_type_node) == 64 + || (TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (reduction_var)) + && reduction_var_overflows_first (reduction_var, load_type)))) + generate_strlen_builtin_using_rawmemchr (loop, reduction_var, + load_iv.base, + reduction_iv.base, loc); + else + return false; + return true; + } + + return false; +} + /* Given innermost LOOP, return the outermost enclosing loop that forms a perfect loop nest. */ @@ -3321,10 +3715,27 @@ loop_distribution::execute (function *fun) && !optimize_loop_for_speed_p (loop))) continue; - /* Don't distribute loop if niters is unknown. */ + /* If niters is unknown don't distribute loop but rather try to transform + it to a call to a builtin. */ tree niters = number_of_latch_executions (loop); if (niters == NULL_TREE || niters == chrec_dont_know) - continue; + { + datarefs_vec.create (20); + if (transform_reduction_loop (loop)) + { + changed = true; + loops_to_be_destroyed.safe_push (loop); + if (dump_enabled_p ()) + { + dump_user_location_t loc = find_loop_location (loop); + dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, + loc, "Loop %d transformed into a builtin.\n", + loop->num); + } + } + free_data_refs (datarefs_vec); + continue; + } /* Get the perfect loop nest for distribution. 
*/ loop = prepare_perfect_loop_nest (loop);