commit 74e3e742e446f6a892c8ee755f640756efc816cd Author: Kyrylo Tkachov Date: Tue Jul 12 12:30:47 2016 +0100 GIMPLE store merging pass diff --git a/gcc/Makefile.in b/gcc/Makefile.in index 9e08fbf..d83b2f1 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -1300,6 +1300,7 @@ OBJS = \ gimple-ssa-isolate-paths.o \ gimple-ssa-nonnull-compare.o \ gimple-ssa-split-paths.o \ + gimple-ssa-store-merging.o \ gimple-ssa-strength-reduction.o \ gimple-ssa-sprintf.o \ gimple-streamer-in.o \ diff --git a/gcc/common.opt b/gcc/common.opt index 168735e..4d73852 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -1460,6 +1460,10 @@ fstrict-volatile-bitfields Common Report Var(flag_strict_volatile_bitfields) Init(-1) Optimization Force bitfield accesses to match their type width. +fstore-merging +Common Var(flag_store_merging) Optimization +Use the tree store merging pass. + fguess-branch-probability Common Report Var(flag_guess_branch_prob) Optimization Enable guessing of branch probabilities. diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 6767462..83acd3b 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -403,7 +403,7 @@ Objective-C and Objective-C++ Dialects}. -fsingle-precision-constant -fsplit-ivs-in-unroller @gol -fsplit-paths @gol -fsplit-wide-types -fssa-backprop -fssa-phiopt @gol --fstdarg-opt -fstrict-aliasing @gol +-fstdarg-opt -fstore-merging -fstrict-aliasing @gol -fstrict-overflow -fthread-jumps -ftracer -ftree-bit-ccp @gol -ftree-builtin-call-dce -ftree-ccp -ftree-ch @gol -ftree-coalesce-vars -ftree-copy-prop -ftree-dce -ftree-dominator-opts @gol @@ -414,8 +414,8 @@ Objective-C and Objective-C++ Dialects}. -ftree-loop-vectorize @gol -ftree-parallelize-loops=@var{n} -ftree-pre -ftree-partial-pre -ftree-pta @gol -ftree-reassoc -ftree-sink -ftree-slsr -ftree-sra @gol --ftree-switch-conversion -ftree-tail-merge -ftree-ter @gol --ftree-vectorize -ftree-vrp -funconstrained-commons @gol +-ftree-switch-conversion -ftree-tail-merge @gol +-ftree-ter -ftree-vectorize -ftree-vrp -funconstrained-commons @gol -funit-at-a-time -funroll-all-loops -funroll-loops @gol -funsafe-math-optimizations -funswitch-loops @gol -fipa-ra -fvariable-expansion-in-unroller -fvect-cost-model -fvpt @gol @@ -6593,6 +6593,7 @@ compilation time. -fsplit-wide-types @gol -fssa-backprop @gol -fssa-phiopt @gol +-fstore-merging @gol -ftree-bit-ccp @gol -ftree-ccp @gol -ftree-ch @gol @@ -7927,6 +7928,13 @@ Perform scalar replacement of aggregates. This pass replaces structure references with scalars to prevent committing structures to memory too early. This flag is enabled by default at @option{-O} and higher. +@item -fstore-merging +@opindex fstore-merging +Perform merging of narrow stores to consecutive memory addresses. This pass +merges contiguous stores of immediate values narrower than a word into fewer +wider stores to reduce the number of instructions. This is enabled by default +at @option{-O} and higher. + @item -ftree-ter @opindex ftree-ter Perform temporary expression replacement during the SSA->normal phase. Single diff --git a/gcc/gimple-ssa-store-merging.c b/gcc/gimple-ssa-store-merging.c new file mode 100644 index 0000000..c77933d --- /dev/null +++ b/gcc/gimple-ssa-store-merging.c @@ -0,0 +1,1113 @@ +/* GIMPLE store merging pass. + Copyright (C) 2016 Free Software Foundation, Inc. + Contributed by ARM Ltd. + + This file is part of GCC. 
+
+   GCC is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with GCC; see the file COPYING3.  If not see
+   <http://www.gnu.org/licenses/>.  */
+
+/* The purpose of this pass is to combine multiple memory stores of
+   constant values to consecutive memory locations into fewer wider stores.
+   For example, if we have a sequence performing four byte stores to
+   consecutive memory locations:
+   [p     ] := imm1;
+   [p + 1B] := imm2;
+   [p + 2B] := imm3;
+   [p + 3B] := imm4;
+   we can transform this into a single 4-byte store if the target supports it:
+   [p] := imm1:imm2:imm3:imm4 //concatenated immediates according to endianness.
+
+   The algorithm is applied to each basic block in three phases:
+
+   1) Scan through the basic block recording constant assignments to
+   destinations that can be expressed as a store to memory of a certain size
+   at a certain bit offset.  Record store chains to different bases in a
+   hash_map (m_stores) and make sure to terminate such chains when appropriate
+   (for example when the stored values get used subsequently).
+   These stores can be a result of structure element initializers, array stores
+   etc.  A store_immediate_info object is recorded for every such store.
+   Record as many such assignments to a single base as possible until a
+   statement that interferes with the store sequence is encountered.
+
+   2) Analyze the chain of stores recorded in phase 1) (i.e. the vector of
+   store_immediate_info objects) and coalesce contiguous stores into
+   merged_store_group objects.
+
+   For example, given the stores:
+   [p     ] := 0;
+   [p + 1B] := 1;
+   [p + 3B] := 0;
+   [p + 4B] := 1;
+   [p + 5B] := 0;
+   [p + 6B] := 0;
+   This phase would produce two merged_store_group objects, one recording the
+   two bytes stored in the memory region [p : p + 1] and another
+   recording the four bytes stored in the memory region [p + 3 : p + 6].
+
+   3) The merged_store_group objects produced in phase 2) are processed
+   to generate the sequence of wider stores that set the contiguous memory
+   regions to the sequence of bytes that corresponds to them.  This may emit
+   multiple stores per store group to handle contiguous stores that are not
+   of a size that is a power of 2.  For example it can try to emit a 40-bit
+   store as a 32-bit store followed by an 8-bit store.
+   We try to emit stores as wide as we can while respecting STRICT_ALIGNMENT
+   and SLOW_UNALIGNED_ACCESS rules.
+
+   Note on endianness and example:
+   Consider 2 contiguous 16-bit stores followed by 2 contiguous 8-bit stores:
+   [p     ] := 0x1234;
+   [p + 2B] := 0x5678;
+   [p + 4B] := 0xab;
+   [p + 5B] := 0xcd;
+
+   The memory layout for little-endian (LE) and big-endian (BE) must be:
+   p |LE|BE|
+   ---------
+   0 |34|12|
+   1 |12|34|
+   2 |78|56|
+   3 |56|78|
+   4 |ab|ab|
+   5 |cd|cd|
+
+   To merge these into a single 48-bit merged value 'val' in phase 2)
+   on little-endian we insert stores to higher (consecutive) bitpositions
+   into the most significant bits of the merged value.
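+   Building the example value up insertion by insertion on little-endian:
+     0x1234          after  [p     ] := 0x1234;
+     0x56781234      after  [p + 2B] := 0x5678;
+     0xab56781234    after  [p + 4B] := 0xab;
+     0xcdab56781234  after  [p + 5B] := 0xcd;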
+   The final merged value would be: 0xcdab56781234
+
+   For big-endian we insert stores to higher bitpositions into the least
+   significant bits of the merged value.
+   The final merged value would be: 0x12345678abcd
+
+   Then, in phase 3), we want to emit this 48-bit value as a 32-bit store
+   followed by a 16-bit store.  Again, we must consider endianness when
+   breaking down the 48-bit value 'val' computed above.
+   For little endian we emit:
+   [p]      (32-bit) := 0x56781234; //  val & 0x0000ffffffff;
+   [p + 4B] (16-bit) := 0xcdab;     // (val & 0xffff00000000) >> 32;
+
+   Whereas for big-endian we emit:
+   [p]      (32-bit) := 0x12345678; // (val & 0xffffffff0000) >> 16;
+   [p + 4B] (16-bit) := 0xabcd;     //  val & 0x00000000ffff;  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "backend.h"
+#include "tree.h"
+#include "gimple.h"
+#include "builtins.h"
+#include "fold-const.h"
+#include "tree-pass.h"
+#include "ssa.h"
+#include "gimple-pretty-print.h"
+#include "alias.h"
+#include "params.h"
+#include "print-tree.h"
+#include "tree-hash-traits.h"
+#include "gimple-iterator.h"
+#include "gimplify.h"
+#include "stor-layout.h"
+#include "tree-cfg.h"
+#include "predict.h"
+#include "target.h"
+
+/* The maximum size (in bits) of the stores this pass should generate.  */
+#define MAX_STORE_BITSIZE (BITS_PER_WORD)
+#define MAX_STORE_BYTES (MAX_STORE_BITSIZE / BITS_PER_UNIT)
+
+namespace {
+
+/* Struct recording the information about a single store of an immediate
+   to memory.  These are created in the first phase and coalesced into
+   merged_store_group objects in the second phase.  */
+
+struct store_immediate_info
+{
+  unsigned HOST_WIDE_INT bitsize;
+  unsigned HOST_WIDE_INT bitpos;
+  tree val;
+  tree dest;
+  gimple *stmt;
+  unsigned int order;
+  store_immediate_info (unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT, tree,
+                        tree, gimple *, unsigned int);
+};
+
+store_immediate_info::store_immediate_info (unsigned HOST_WIDE_INT bs,
+                                            unsigned HOST_WIDE_INT bp, tree v,
+                                            tree d, gimple *st,
+                                            unsigned int ord)
+  : bitsize (bs), bitpos (bp), val (v), dest (d), stmt (st), order (ord)
+{
+}
+
+/* Struct representing a group of stores to contiguous memory locations.
+   These are produced by the second phase (coalescing) and consumed in the
+   third phase that outputs the widened stores.  */
+
+struct merged_store_group
+{
+  unsigned HOST_WIDE_INT start;
+  unsigned HOST_WIDE_INT width;
+  /* The size of the allocated memory for val.  */
+  unsigned HOST_WIDE_INT buf_size;
+  unsigned char *val;
+  unsigned int align;
+  auto_vec<store_immediate_info *> stores;
+  /* We record the first and last original statements in the sequence because
+     we'll need their vuse/vdef and replacement position.  */
+  gimple *last_stmt;
+  unsigned int last_order;
+  gimple *first_stmt;
+  unsigned int first_order;
+
+  merged_store_group (store_immediate_info *);
+  ~merged_store_group ();
+  void merge_into (store_immediate_info *);
+  void merge_overlapping (store_immediate_info *);
+  void apply_stores ();
+};
+
+/* Debug helper.  Dump LEN elements of byte array PTR to FD in hex.  */
+
+static void
+dump_char_array (FILE *fd, unsigned char *ptr, unsigned int len)
+{
+  if (!fd)
+    return;
+
+  for (unsigned int i = 0; i < len; i++)
+    fprintf (fd, "%x ", ptr[i]);
+  fprintf (fd, "\n");
+}
+
+/* Write BITLEN bits of EXPR to the byte array PTR at
+   bit position BITPOS.  PTR should contain TOTAL_BYTES elements.
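+   A hypothetical little-endian example: writing the 12-bit constant 0xabc
+   at bit position 4 into a zeroed buffer goes through the wide_int path
+   below, forms the 16-bit container value 0xabc << 4 = 0xabc0 and stores it
+   back as the bytes 0xc0, 0xab at PTR[0] and PTR[1].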
+   */
+
+static void
+encode_tree_to_bitpos (tree expr, unsigned char *ptr, int bitlen, int bitpos,
+                       unsigned int total_bytes)
+{
+  unsigned int last_byte = (bitpos + bitlen - 1) / BITS_PER_UNIT;
+  unsigned int first_byte = bitpos / BITS_PER_UNIT;
+  tree tmp_int = expr;
+  bool sub_byte_op_p = (bitlen % BITS_PER_UNIT) || (bitpos % BITS_PER_UNIT);
+  /* If the expression is not an integer, encode it into the temporary buffer
+     and read it back as an integer.  */
+  if (TREE_CODE (expr) != INTEGER_CST && sub_byte_op_p)
+    {
+      unsigned char *tmpbuf = XALLOCAVEC (unsigned char, total_bytes);
+      memset (tmpbuf, 0, total_bytes);
+      native_encode_expr (expr, tmpbuf, total_bytes, 0);
+      tree read_back_type
+        = build_nonstandard_integer_type (tree_to_shwi (
+            TYPE_SIZE (TREE_TYPE (expr))), UNSIGNED);
+      tmp_int
+        = native_interpret_expr (read_back_type, tmpbuf, total_bytes);
+    }
+
+  gcc_assert (tmp_int);
+
+  /* If we're inserting a non-byte-sized width or not at a byte boundary,
+     use an intermediate wide_int to perform the bit-insertion correctly.  */
+  if (sub_byte_op_p
+      || (TREE_CODE (expr) == INTEGER_CST
+          && mode_for_size (bitlen, MODE_INT, 0) == BLKmode))
+    {
+      unsigned int byte_size = last_byte - first_byte + 1;
+
+      /* The functions native_encode_expr/native_interpret_expr use the
+         TYPE_MODE of the type to determine the number of bytes to write/read,
+         so if we want to process a number of bytes that does not have a
+         TYPE_MODE of equal size we need to use a type that has a valid mode
+         for it.  */
+
+      machine_mode mode
+        = smallest_mode_for_size (byte_size * BITS_PER_UNIT, MODE_INT);
+      tree dest_int_type
+        = build_nonstandard_integer_type (GET_MODE_BITSIZE (mode), UNSIGNED);
+      byte_size = GET_MODE_SIZE (mode);
+      /* The region from the byte array that we're inserting into.  */
+      tree ptr_wide_int
+        = native_interpret_expr (dest_int_type, ptr + first_byte,
+                                 total_bytes);
+
+      gcc_assert (ptr_wide_int);
+      wide_int dest_wide_int
+        = wi::to_wide (ptr_wide_int, TYPE_PRECISION (dest_int_type));
+
+      wide_int expr_wide_int
+        = wi::to_wide (tmp_int, byte_size * BITS_PER_UNIT);
+      if (BYTES_BIG_ENDIAN)
+        {
+          unsigned int insert_pos
+            = byte_size * BITS_PER_UNIT - bitlen - (bitpos % BITS_PER_UNIT);
+          dest_wide_int
+            = wi::insert (dest_wide_int, expr_wide_int, insert_pos, bitlen);
+        }
+      else
+        dest_wide_int = wi::insert (dest_wide_int, expr_wide_int,
+                                    bitpos % BITS_PER_UNIT, bitlen);
+
+      tree res = wide_int_to_tree (dest_int_type, dest_wide_int);
+      native_encode_expr (res, ptr + first_byte, total_bytes, 0);
+    }
+  /* If we're inserting a "well-behaved" value at a normal position, just
+     call native_encode_expr directly.  */
+  else
+    native_encode_expr (tmp_int, ptr + first_byte,
+                        total_bytes, 0);
+}
+
+/* Sorting function for store_immediate_info objects.
+   Sorts them by bitposition.  */
+
+static int
+sort_by_bitpos (const void *x, const void *y)
+{
+  store_immediate_info *const *tmp = (store_immediate_info * const *) x;
+  store_immediate_info *const *tmp2 = (store_immediate_info * const *) y;
+
+  if ((*tmp)->bitpos <= (*tmp2)->bitpos)
+    return -1;
+  else if ((*tmp)->bitpos > (*tmp2)->bitpos)
+    return 1;
+
+  gcc_unreachable ();
+  return 0;
+}
+
+/* Sorting function for store_immediate_info objects.
+   Sorts them by the order field.
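+   The order field records the position of the statement within the recorded
+   chain, i.e. program order; apply_stores relies on it so that a later
+   overlapping store correctly overwrites the bytes of an earlier one.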
+   */
+
+static int
+sort_by_order (const void *x, const void *y)
+{
+  store_immediate_info *const *tmp = (store_immediate_info * const *) x;
+  store_immediate_info *const *tmp2 = (store_immediate_info * const *) y;
+
+  if ((*tmp)->order < (*tmp2)->order)
+    return -1;
+  else if ((*tmp)->order > (*tmp2)->order)
+    return 1;
+
+  gcc_unreachable ();
+  return 0;
+}
+
+/* Initialize a merged_store_group object from a store_immediate_info
+   object.  */
+
+merged_store_group::merged_store_group (store_immediate_info *info)
+{
+  start = info->bitpos;
+  width = info->bitsize;
+  /* VAL has memory allocated for it in apply_stores once the group
+     width has been finalized.  */
+  val = NULL;
+  align = get_object_alignment (info->dest);
+  stores.create (1);
+  stores.safe_push (info);
+  last_stmt = info->stmt;
+  last_order = info->order;
+  first_stmt = last_stmt;
+  first_order = last_order;
+  buf_size = 0;
+}
+
+merged_store_group::~merged_store_group ()
+{
+  if (val)
+    XDELETEVEC (val);
+}
+
+/* Merge a store recorded by INFO into this merged store.
+   The store is not overlapping with the existing recorded
+   stores.  */
+
+void
+merged_store_group::merge_into (store_immediate_info *info)
+{
+  unsigned HOST_WIDE_INT wid = info->bitsize;
+  /* Make sure we're inserting in the position we think we're inserting.  */
+  gcc_assert (info->bitpos == start + width);
+
+  width += wid;
+  gimple *stmt = info->stmt;
+  stores.safe_push (info);
+  if (info->order > last_order)
+    {
+      last_order = info->order;
+      last_stmt = stmt;
+    }
+  else if (info->order < first_order)
+    {
+      first_order = info->order;
+      first_stmt = stmt;
+    }
+}
+
+/* Merge a store described by INFO into this merged store.
+   INFO overlaps in some way with the current store (i.e. it's not contiguous,
+   which is the case handled by merged_store_group::merge_into).  */
+
+void
+merged_store_group::merge_overlapping (store_immediate_info *info)
+{
+  gimple *stmt = info->stmt;
+  stores.safe_push (info);
+
+  /* If the store extends the size of the group, extend the width.  */
+  if ((info->bitpos + info->bitsize) > (start + width))
+    width += info->bitpos + info->bitsize - (start + width);
+
+  if (info->order > last_order)
+    {
+      last_order = info->order;
+      last_stmt = stmt;
+    }
+  else if (info->order < first_order)
+    {
+      first_order = info->order;
+      first_stmt = stmt;
+    }
+}
+
+/* Go through all the recorded stores in this group in program order and
+   apply their values to the VAL byte array to create the final merged
+   value.  */
+
+void
+merged_store_group::apply_stores ()
+{
+  stores.qsort (sort_by_order);
+  struct store_immediate_info *info;
+  unsigned int i;
+  /* Create a buffer of a size that is 2 times the number of bytes we're
+     storing.  That way native_encode_expr can write power-of-2-sized
+     chunks without overrunning.  */
+  buf_size
+    = 2 * (ROUND_UP (width, BITS_PER_UNIT) / BITS_PER_UNIT);
+  val = XNEWVEC (unsigned char, buf_size);
+  memset (val, 0, buf_size);
+
+  FOR_EACH_VEC_ELT (stores, i, info)
+    {
+      unsigned int pos_in_buffer = info->bitpos - start;
+      encode_tree_to_bitpos (info->val, val, info->bitsize, pos_in_buffer,
+                             buf_size);
+      if (dump_file && (dump_flags & TDF_DETAILS))
+        {
+          fprintf (dump_file, "After writing ");
+          print_generic_expr (dump_file, info->val, 0);
+          fprintf (dump_file, " of size " HOST_WIDE_INT_PRINT_DEC
+                   " at position %u the merged region contains:\n",
+                   info->bitsize, pos_in_buffer);
+          dump_char_array (dump_file, val, buf_size);
+        }
+    }
+}
+
+/* Structure describing the store chain.
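+   m_store_info holds the store_immediate_info objects recorded against one
+   base object in phase 1, and m_merged_store_groups holds the
+   merged_store_group objects that coalescing (phase 2) produces from them.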
+   */
+
+struct imm_store_chain_info
+{
+  auto_vec<store_immediate_info *> m_store_info;
+  auto_vec<merged_store_group *> m_merged_store_groups;
+
+  bool terminate_and_process_chain (tree);
+  bool coalesce_immediate_stores ();
+  bool output_merged_store (tree, merged_store_group *);
+  bool output_merged_stores (tree);
+};
+
+const pass_data pass_data_tree_store_merging = {
+  GIMPLE_PASS,     /* type */
+  "store-merging", /* name */
+  OPTGROUP_NONE,   /* optinfo_flags */
+  TV_NONE,         /* tv_id */
+  PROP_ssa,        /* properties_required */
+  0,               /* properties_provided */
+  0,               /* properties_destroyed */
+  0,               /* todo_flags_start */
+  TODO_update_ssa, /* todo_flags_finish */
+};
+
+class pass_store_merging : public gimple_opt_pass
+{
+public:
+  pass_store_merging (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_tree_store_merging, ctxt)
+  {
+  }
+
+  /* Pass not supported for PDP-endianness.  */
+  virtual bool
+  gate (function *)
+  {
+    return flag_store_merging && optimize
+           && (WORDS_BIG_ENDIAN == BYTES_BIG_ENDIAN);
+  }
+
+  virtual unsigned int execute (function *);
+
+private:
+  hash_map<tree_operand_hash, struct imm_store_chain_info *> m_stores;
+
+  bool terminate_and_process_all_chains (basic_block);
+  bool terminate_all_aliasing_chains (tree, tree, gimple *);
+  bool terminate_and_release_chain (tree);
+}; // class pass_store_merging
+
+/* Terminate and process all recorded chains.  Return true if any changes
+   were made.  The chains occur in basic block BB, so purge potentially dead
+   EH edges from there.  */
+
+bool
+pass_store_merging::terminate_and_process_all_chains (basic_block bb)
+{
+  hash_map<tree_operand_hash, struct imm_store_chain_info *>::iterator iter
+    = m_stores.begin ();
+  bool ret = false;
+  for (; iter != m_stores.end (); ++iter)
+    ret |= terminate_and_release_chain ((*iter).first);
+
+  if (ret)
+    gimple_purge_dead_eh_edges (bb);
+  return ret;
+}
+
+/* Terminate all chains that are affected by the assignment to DEST, which
+   appears in statement STMT and ultimately points to the object BASE.
+   Return true if at least one aliasing chain was terminated.  BASE and DEST
+   are allowed to be NULL_TREE.  In that case the aliasing checks are
+   performed on the whole statement rather than a particular operand in it.  */
+
+bool
+pass_store_merging::terminate_all_aliasing_chains (tree dest, tree base,
+                                                   gimple *stmt)
+{
+  bool ret = false;
+
+  /* If the statement doesn't touch memory it can't alias.  */
+  if (!gimple_vuse (stmt))
+    return false;
+
+  struct imm_store_chain_info **chain_info = NULL;
+
+  basic_block bb = gimple_bb (stmt);
+  if (base)
+    {
+      chain_info = m_stores.get (base);
+      if (chain_info)
+        {
+          struct store_immediate_info *info;
+          unsigned int i;
+          FOR_EACH_VEC_ELT ((*chain_info)->m_store_info, i, info)
+            {
+              if (refs_may_alias_p (info->dest, dest))
+                {
+                  if (dump_file)
+                    {
+                      fprintf (dump_file, "stmt causes chain termination:\n");
+                      print_gimple_stmt (dump_file, stmt, 0, 0);
+                    }
+                  terminate_and_release_chain (base);
+                  ret = true;
+                  break;
+                }
+            }
+        }
+    }
+
+  hash_map<tree_operand_hash, struct imm_store_chain_info *>::iterator iter
+    = m_stores.begin ();
+
+  for (; iter != m_stores.end (); ++iter)
+    {
+      if (chain_info && (*chain_info) == (*iter).second)
+        continue;
+
+      tree key = (*iter).first;
+      if (ref_maybe_used_by_stmt_p (stmt, key)
+          || stmt_may_clobber_ref_p (stmt, key))
+        {
+          terminate_and_release_chain (key);
+          ret = true;
+        }
+    }
+  if (ret)
+    gimple_purge_dead_eh_edges (bb);
+
+  return ret;
+}
+
+/* Helper function.  Terminate the recorded chain storing to base object
+   BASE.  Return true if the merging and output was successful.  The m_stores
+   entry is removed after the processing in any case.
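+   Callers must not dereference a cached chain_info pointer for BASE after
+   this returns: both the imm_store_chain_info object and its m_stores entry
+   are destroyed here.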
*/ + +bool +pass_store_merging::terminate_and_release_chain (tree base) +{ + struct imm_store_chain_info **chain_info = m_stores.get (base); + + if (!chain_info) + return false; + + gcc_assert (*chain_info); + + bool ret = (*chain_info)->terminate_and_process_chain (base); + delete *chain_info; + m_stores.remove (base); + + return ret; +} + +/* Go through the candidate stores recorded in m_store_info and merge them + into merged_store_group objects recorded into m_merged_store_groups + representing the widened stores. Return true if coalescing was successful + and the number of widened stores is fewer than the original number + of stores. */ + +bool +imm_store_chain_info::coalesce_immediate_stores () +{ + /* Anything less can't be processed. */ + if (m_store_info.length () < 2) + return false; + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Attempting to coalesce %u stores in chain.\n", + m_store_info.length ()); + + store_immediate_info *info; + unsigned int i; + + /* Order the stores by the bitposition they write to. */ + m_store_info.qsort (sort_by_bitpos); + + info = m_store_info[0]; + merged_store_group *merged_store = new merged_store_group (info); + m_merged_store_groups.safe_push (merged_store); + + FOR_EACH_VEC_ELT (m_store_info, i, info) + { + if (dump_file) + { + fprintf (dump_file, "Store %u:\nbitsize:" HOST_WIDE_INT_PRINT_DEC + " bitpos:" HOST_WIDE_INT_PRINT_DEC " val:\n", + i, info->bitsize, info->bitpos); + print_generic_expr (dump_file, info->val, 0); + fprintf (dump_file, "\n------------\n"); + } + + if (i == 0) + continue; + + /* |---store 1---| + |---store 2---| + Overlapping stores. */ + unsigned HOST_WIDE_INT start = info->bitpos; + if (IN_RANGE (start, merged_store->start, + merged_store->start + merged_store->width - 1)) + { + merged_store->merge_overlapping (info); + continue; + } + + /* |---store 1---| |---store 2---|. + Gap between stores. Start a new group. */ + if (start != merged_store->start + merged_store->width) + { + merged_store = new merged_store_group (info); + m_merged_store_groups.safe_push (merged_store); + continue; + } + + /* |---store 1---||---store 2---| + This store is consecutive to the previous one. + Merge it into the current store group. */ + merged_store->merge_into (info); + } + + gcc_assert (m_merged_store_groups.length () <= m_store_info.length ()); + bool success = m_merged_store_groups.length () < m_store_info.length (); + if (dump_file) + { + if (success) + fprintf (dump_file, "Coalescing successful!\n" + "Merged into %u stores\n", + m_merged_store_groups.length ()); + } + + if (success) + { + FOR_EACH_VEC_ELT (m_merged_store_groups, i, merged_store) + { + merged_store->apply_stores (); + } + } + return success; +} + +/* Return the type to use for the merged stores described by GROUP. + This is needed to get the alias sets right. */ + +static tree +get_type_for_merged_store (merged_store_group *group) +{ + struct store_immediate_info *store; + unsigned int i; + tree lhs = gimple_assign_lhs (group->stores[0]->stmt); + tree type = reference_alias_ptr_type (lhs); + FOR_EACH_VEC_ELT (group->stores, i, store) + { + gimple *stmt = store->stmt; + if (i == 0) + continue; + + lhs = gimple_assign_lhs (stmt); + tree type1 = reference_alias_ptr_type (lhs); + if (!alias_ptr_types_compatible_p (type, type1)) + return ptr_type_node; + } + return type; +} + +/* Return the location_t information we can find among the statements + in GROUP. 
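+   The location of the first statement in GROUP that has one is used to
+   annotate the whole replacement sequence emitted in output_merged_store.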
*/ + +static location_t +get_merged_store_location (merged_store_group *group) +{ + struct store_immediate_info *store; + unsigned int i; + + FOR_EACH_VEC_ELT (group->stores, i, store) + { + gimple *stmt = store->stmt; + if (gimple_has_location (stmt)) + return gimple_location (stmt); + } + return UNKNOWN_LOCATION; +} + +/* Return true if storing an integer of bitsize SIZE using an unaligned + access is prohibitively slow. */ + +static bool +slow_unaligned_size_access (unsigned int size) +{ + machine_mode mode = mode_for_size (size, MODE_INT, 0); + gcc_assert (mode != BLKmode); + return SLOW_UNALIGNED_ACCESS (mode, GET_MODE_ALIGNMENT (mode)); +} + +/* Given a merged store group GROUP output the widened version of it. + The store chain is against the base object BASE. + Try store sizes of at most MAX_STORE_BITSIZE bits wide and don't output + unaligned stores for STRICT_ALIGNMENT targets or if it's too expensive. + Make sure that the number of statements output is less than the number of + original statements. If a better sequence is possible emit it and + return true. */ + +bool +imm_store_chain_info::output_merged_store (tree base, merged_store_group *group) +{ + unsigned HOST_WIDE_INT pos = group->start; + unsigned HOST_WIDE_INT size = group->width; + unsigned HOST_WIDE_INT bytepos = pos / BITS_PER_UNIT; + unsigned int align = group->align; + + unsigned int orig_num_stmts = group->stores.length (); + if (orig_num_stmts < 2) + return false; + + /* We don't handle partial bitfields for now. */ + if ((size % BITS_PER_UNIT != 0) || (pos % BITS_PER_UNIT != 0)) + return false; + + gimple_stmt_iterator last_gsi = gsi_for_stmt (group->last_stmt); + basic_block bb = gsi_bb (last_gsi); + + /* Allow unaligned accesses in cold blocks. This depends on the predict.c + machinery getting the estimates right. 
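+     As an illustration of the size-selection loop below: for a 48-bit
+     group with 16-bit alignment on a 64-bit target where unaligned stores
+     are not allowed, try_size starts at 64, is halved to 32 (since 64 > 48)
+     and halved again to 16 (since 32 exceeds the 16-bit alignment).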
*/ + bool allow_unaligned + = !STRICT_ALIGNMENT && PARAM_VALUE (PARAM_STORE_MERGING_ALLOW_UNALIGNED) + && optimize_bb_for_size_p (bb); + + unsigned int try_size = MAX_STORE_BITSIZE; + while (try_size > size + || ((!allow_unaligned || slow_unaligned_size_access (try_size)) + && try_size > align)) + { + try_size /= 2; + if (try_size < BITS_PER_UNIT) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, "Failed to output merged store.\n" + "Failed to find starting size meeting" + " alignment requirements.\n"); + } + return false; + } + } + + unsigned HOST_WIDE_INT try_pos = bytepos; + int wi_offset = 0; + if (dump_file) + { + fprintf (dump_file, + "Trying to output merged store at pos " HOST_WIDE_INT_PRINT_DEC + ", of size " HOST_WIDE_INT_PRINT_DEC ", " + "starting size %u and alignment %u\n", + pos, size, try_size, align); + } + + gimple_seq seq = NULL; + unsigned int num_stmts = 0; + tree offset_type = get_type_for_merged_store (group); + tree last_vdef, new_vuse; + last_vdef = gimple_vdef (group->last_stmt); + new_vuse = gimple_vuse (group->last_stmt); + location_t loc = get_merged_store_location (group); + gimple *stmt = NULL; + + while (size > 0) + { + tree int_type = build_nonstandard_integer_type (try_size, UNSIGNED); + SET_TYPE_ALIGN (int_type, align); + tree addr = build_fold_addr_expr (base); + tree dest = fold_build2 (MEM_REF, int_type, addr, + build_int_cst (offset_type, try_pos)); + + tree src = native_interpret_expr (int_type, + group->val + wi_offset / BITS_PER_UNIT, + group->buf_size); + + if (dump_file) + { + fprintf (dump_file, "Output src is:\n"); + print_generic_expr (dump_file, src, 0); + fprintf (dump_file, "\n"); + } + stmt = gimple_build_assign (dest, src); + gimple_set_vuse (stmt, new_vuse); + gimple_seq_add_stmt_without_update (&seq, stmt); + gimple_set_bb (stmt, gsi_bb (last_gsi)); + + num_stmts++; + + try_pos += try_size / BITS_PER_UNIT; + + wi_offset += try_size; + size -= try_size; + align = try_size; + while (size < try_size) + try_size /= 2; + + if (num_stmts >= orig_num_stmts) + { + if (dump_file) + { + fprintf (dump_file, "Exceeded original number of stmts (%u)." + " Not profitable to emit new sequence.\n", + orig_num_stmts); + } + return false; + } + + tree new_vdef + = size ? make_ssa_name (gimple_vop (cfun), stmt) : last_vdef; + gimple_set_vdef (stmt, new_vdef); + SSA_NAME_DEF_STMT (new_vdef) = stmt; + new_vuse = new_vdef; + } + gcc_assert (size == 0); + gcc_assert (seq); + annotate_all_with_location (seq, loc); + if (dump_file) + { + fprintf (dump_file, + "New sequence of %u stmts to replace old one of %u stmts\n", + num_stmts, orig_num_stmts); + if (dump_flags & TDF_DETAILS) + print_gimple_seq (dump_file, seq, 0, TDF_VOPS | TDF_MEMSYMS); + } + gsi_insert_seq_after (&last_gsi, seq, GSI_SAME_STMT); + + return true; +} + +/* Process the merged_store_group objects created in the coalescing phase. + The stores are all against the base object BASE. + Try to output the widened stores and delete the original statements if + successful. Return true iff any changes were made. 
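+   Note that output_merged_store has already inserted the replacement
+   sequence after the last original statement of a group; this function
+   only removes the original stores that are now dead.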
+   */
+
+bool
+imm_store_chain_info::output_merged_stores (tree base)
+{
+  unsigned int i;
+  merged_store_group *merged_store;
+  bool ret = false;
+  FOR_EACH_VEC_ELT (m_merged_store_groups, i, merged_store)
+    {
+      if (output_merged_store (base, merged_store))
+        {
+          unsigned int j;
+          store_immediate_info *store;
+          FOR_EACH_VEC_ELT (merged_store->stores, j, store)
+            {
+              gimple *stmt = store->stmt;
+              gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
+              gsi_remove (&gsi, true);
+              if (stmt != merged_store->last_stmt)
+                {
+                  unlink_stmt_vdef (stmt);
+                  release_defs (stmt);
+                }
+            }
+          ret = true;
+        }
+    }
+  if (ret && dump_file)
+    fprintf (dump_file, "Merging successful!\n");
+
+  return ret;
+}
+
+/* Coalesce the store_immediate_info objects recorded against the base object
+   BASE in the first phase and output them.
+   Delete the allocated structures.
+   Return true if any changes were made.  */
+
+bool
+imm_store_chain_info::terminate_and_process_chain (tree base)
+{
+  /* Process store chain.  */
+  bool ret = false;
+  if (m_store_info.length () > 1)
+    {
+      ret = coalesce_immediate_stores ();
+      if (ret)
+        ret = output_merged_stores (base);
+    }
+
+  /* Delete all the entries we allocated ourselves.  */
+  store_immediate_info *info;
+  unsigned int i;
+  FOR_EACH_VEC_ELT (m_store_info, i, info)
+    delete info;
+
+  merged_store_group *merged_info;
+  FOR_EACH_VEC_ELT (m_merged_store_groups, i, merged_info)
+    delete merged_info;
+
+  return ret;
+}
+
+/* Return true iff LHS is a destination potentially interesting for
+   store merging.  */
+
+static bool
+lhs_code_valid_for_store_merging_p (tree lhs)
+{
+  tree_code code = TREE_CODE (lhs);
+  tree type = TREE_TYPE (lhs);
+
+  /* If the target requests a promotion to a wider type get_inner_reference
+     may give bad results for the store destination (for example __fp16 to
+     float promotion on aarch64).  Be conservative here.  */
+  tree prom_type = targetm.promoted_type (type);
+  if (prom_type && TYPE_PRECISION (type) != TYPE_PRECISION (prom_type))
+    return false;
+
+  if (code == ARRAY_REF || code == ARRAY_RANGE_REF || code == MEM_REF
+      || code == COMPONENT_REF || code == BIT_FIELD_REF)
+    return true;
+
+  return false;
+}
+
+/* Return true if the tree RHS is a constant we want to consider
+   during store merging.  In practice accept all codes that
+   native_encode_expr accepts.  */
+
+static bool
+rhs_valid_for_store_merging_p (tree rhs)
+{
+  tree_code code = TREE_CODE (rhs);
+  if (TREE_CODE_CLASS (code) != tcc_constant
+      || !tree_fits_shwi_p (TYPE_SIZE (TREE_TYPE (rhs))))
+    return false;
+
+  return code != VOID_CST;
+}
+
+/* Entry point for the pass.  Go over each basic block recording chains of
+   immediate stores.  Upon encountering a terminating statement (one with
+   volatile operands, or one that may use or clobber a recorded chain)
+   process the recorded stores and emit the widened variants.  */
+
+unsigned int
+pass_store_merging::execute (function *fun)
+{
+  basic_block bb;
+  hash_set<gimple *> orig_stmts;
+
+  FOR_EACH_BB_FN (bb, fun)
+    {
+      gimple_stmt_iterator gsi;
+      unsigned HOST_WIDE_INT num_statements = 0;
+      /* Record the original statements so that we can keep track of
+         statements emitted in this pass and not re-process new
+         statements.
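+         A block with fewer than two real (non-debug) statements cannot
+         contain a mergeable chain and is skipped outright.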
*/ + for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple_set_visited (gsi_stmt (gsi), false); + if (is_gimple_debug (gsi_stmt (gsi))) + continue; + + num_statements++; + } + + if (num_statements < 2) + continue; + + if (dump_file) + fprintf (dump_file, "Processing basic block <%d>:\n", bb->index); + + for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + + if (gimple_has_volatile_ops (stmt)) + { + /* Terminate all chains. */ + if (dump_file) + fprintf (dump_file, "Volatile access terminates " + "all chains\n"); + terminate_and_process_all_chains (bb); + continue; + } + + if (is_gimple_debug (stmt)) + continue; + + if (gimple_assign_single_p (stmt) && gimple_vdef (stmt) + && lhs_code_valid_for_store_merging_p (gimple_assign_lhs (stmt))) + { + tree lhs = gimple_assign_lhs (stmt); + tree rhs = gimple_assign_rhs1 (stmt); + + HOST_WIDE_INT bitsize, bitpos; + machine_mode mode; + int unsignedp = 0, reversep = 0, volatilep = 0; + tree offset, base_addr; + base_addr + = get_inner_reference (lhs, &bitsize, &bitpos, &offset, &mode, + &unsignedp, &reversep, &volatilep); + bool invalid = offset || reversep + || ((bitsize > MAX_BITSIZE_MODE_ANY_INT) + && (bitpos % BITS_PER_UNIT) + && (TREE_CODE (rhs) != INTEGER_CST)) + || !rhs_valid_for_store_merging_p (rhs) + || TREE_THIS_VOLATILE (base_addr); + + /* In some cases get_inner_reference may return a + MEM_REF [ptr + byteoffset]. For the purposes of this pass + canonicalize the base_addr to MEM_REF [ptr] and take + byteoffset into account in the bitpos. This occurs in + PR 23684 and this way we can catch more chains. */ + if (TREE_CODE (base_addr) == MEM_REF + && POINTER_TYPE_P (TREE_TYPE (TREE_OPERAND (base_addr, 0))) + && TREE_CODE (TREE_OPERAND (base_addr, 1)) == INTEGER_CST + && tree_fits_shwi_p (TREE_OPERAND (base_addr, 1))) + { + bitpos += tree_to_shwi (TREE_OPERAND (base_addr, 1)) + * BITS_PER_UNIT; + + base_addr = fold_build2 (MEM_REF, TREE_TYPE (base_addr), + TREE_OPERAND (base_addr, 0), + build_zero_cst (TREE_TYPE ( + TREE_OPERAND (base_addr, 1)))); + } + + struct imm_store_chain_info **chain_info + = m_stores.get (base_addr); + + if (!invalid) + { + store_immediate_info *info; + if (chain_info) + { + info = new store_immediate_info ( + bitsize, bitpos, rhs, lhs, stmt, + (*chain_info)->m_store_info.length ()); + if (dump_file) + { + fprintf (dump_file, + "Recording immediate store from stmt:\n"); + print_gimple_stmt (dump_file, stmt, 0, 0); + } + (*chain_info)->m_store_info.safe_push (info); + continue; + } + + /* Store aliases any existing chain? */ + terminate_all_aliasing_chains (lhs, base_addr, stmt); + /* Start a new chain. */ + struct imm_store_chain_info *new_chain + = new imm_store_chain_info; + info = new store_immediate_info (bitsize, bitpos, rhs, lhs, + stmt, 0); + new_chain->m_store_info.safe_push (info); + m_stores.put (base_addr, new_chain); + if (dump_file) + { + fprintf (dump_file, + "Starting new chain with statement:\n"); + print_gimple_stmt (dump_file, stmt, 0, 0); + fprintf (dump_file, "The base object is:\n"); + print_generic_expr (dump_file, base_addr, 0); + fprintf (dump_file, "\n"); + } + } + else + terminate_all_aliasing_chains (lhs, base_addr, stmt); + + continue; + } + + terminate_all_aliasing_chains (NULL_TREE, NULL_TREE, stmt); + } + terminate_and_process_all_chains (bb); + } + return 0; +} + +} // anon namespace + +/* Construct and return a store merging pass object. 
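+   This is referenced from passes.def, which schedules the pass after
+   pass_optimize_widening_mul.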
*/ + +gimple_opt_pass * +make_pass_store_merging (gcc::context *ctxt) +{ + return new pass_store_merging (ctxt); +} diff --git a/gcc/opts.c b/gcc/opts.c index 45f1f89c..e63d7e4 100644 --- a/gcc/opts.c +++ b/gcc/opts.c @@ -463,6 +463,7 @@ static const struct default_options default_options_table[] = { OPT_LEVELS_1_PLUS, OPT_ftree_dse, NULL, 1 }, { OPT_LEVELS_1_PLUS, OPT_ftree_ter, NULL, 1 }, { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_ftree_sra, NULL, 1 }, + { OPT_LEVELS_1_PLUS_NOT_DEBUG, OPT_fstore_merging, NULL, 1 }, { OPT_LEVELS_1_PLUS, OPT_ftree_fre, NULL, 1 }, { OPT_LEVELS_1_PLUS, OPT_ftree_copy_prop, NULL, 1 }, { OPT_LEVELS_1_PLUS, OPT_ftree_sink, NULL, 1 }, diff --git a/gcc/params.def b/gcc/params.def index 8907aa4..e63e594 100644 --- a/gcc/params.def +++ b/gcc/params.def @@ -1100,6 +1100,12 @@ DEFPARAM (PARAM_MAX_TAIL_MERGE_COMPARISONS, "Maximum amount of similar bbs to compare a bb with.", 10, 0, 0) +DEFPARAM (PARAM_STORE_MERGING_ALLOW_UNALIGNED, + "store-merging-allow-unaligned", + "Allow the store merging pass to introduce unaligned stores " + "if it is legal to do so", + 1, 0, 1) + DEFPARAM (PARAM_MAX_TAIL_MERGE_ITERATIONS, "max-tail-merge-iterations", "Maximum amount of iterations of the pass over a function.", diff --git a/gcc/passes.def b/gcc/passes.def index 2830421..ee7dd50 100644 --- a/gcc/passes.def +++ b/gcc/passes.def @@ -329,6 +329,7 @@ along with GCC; see the file COPYING3. If not see NEXT_PASS (pass_phiopt); NEXT_PASS (pass_fold_builtins); NEXT_PASS (pass_optimize_widening_mul); + NEXT_PASS (pass_store_merging); NEXT_PASS (pass_tail_calls); /* If DCE is not run before checking for uninitialized uses, we may get false warnings (e.g., testsuite/gcc.dg/uninit-5.c). diff --git a/gcc/testsuite/gcc.c-torture/execute/pr22141-1.c b/gcc/testsuite/gcc.c-torture/execute/pr22141-1.c new file mode 100644 index 0000000..7c888b4 --- /dev/null +++ b/gcc/testsuite/gcc.c-torture/execute/pr22141-1.c @@ -0,0 +1,122 @@ +/* PR middle-end/22141 */ + +extern void abort (void); + +struct S +{ + struct T + { + char a; + char b; + char c; + char d; + } t; +} u; + +struct U +{ + struct S s[4]; +}; + +void __attribute__((noinline)) +c1 (struct T *p) +{ + if (p->a != 1 || p->b != 2 || p->c != 3 || p->d != 4) + abort (); + __builtin_memset (p, 0xaa, sizeof (*p)); +} + +void __attribute__((noinline)) +c2 (struct S *p) +{ + c1 (&p->t); +} + +void __attribute__((noinline)) +c3 (struct U *p) +{ + c2 (&p->s[2]); +} + +void __attribute__((noinline)) +f1 (void) +{ + u = (struct S) { { 1, 2, 3, 4 } }; +} + +void __attribute__((noinline)) +f2 (void) +{ + u.t.a = 1; + u.t.b = 2; + u.t.c = 3; + u.t.d = 4; +} + +void __attribute__((noinline)) +f3 (void) +{ + u.t.d = 4; + u.t.b = 2; + u.t.a = 1; + u.t.c = 3; +} + +void __attribute__((noinline)) +f4 (void) +{ + struct S v; + v.t.a = 1; + v.t.b = 2; + v.t.c = 3; + v.t.d = 4; + c2 (&v); +} + +void __attribute__((noinline)) +f5 (struct S *p) +{ + p->t.a = 1; + p->t.c = 3; + p->t.d = 4; + p->t.b = 2; +} + +void __attribute__((noinline)) +f6 (void) +{ + struct U v; + v.s[2].t.a = 1; + v.s[2].t.b = 2; + v.s[2].t.c = 3; + v.s[2].t.d = 4; + c3 (&v); +} + +void __attribute__((noinline)) +f7 (struct U *p) +{ + p->s[2].t.a = 1; + p->s[2].t.c = 3; + p->s[2].t.d = 4; + p->s[2].t.b = 2; +} + +int +main (void) +{ + struct U w; + f1 (); + c2 (&u); + f2 (); + c1 (&u.t); + f3 (); + c2 (&u); + f4 (); + f5 (&u); + c2 (&u); + f6 (); + f7 (&w); + c3 (&w); + return 0; +} diff --git a/gcc/testsuite/gcc.c-torture/execute/pr22141-2.c b/gcc/testsuite/gcc.c-torture/execute/pr22141-2.c new file 
mode 100644 index 0000000..cb9cc79 --- /dev/null +++ b/gcc/testsuite/gcc.c-torture/execute/pr22141-2.c @@ -0,0 +1,122 @@ +/* PR middle-end/22141 */ + +extern void abort (void); + +struct S +{ + struct T + { + char a; + char b; + char c; + char d; + } t; +} u __attribute__((aligned)); + +struct U +{ + struct S s[4]; +}; + +void __attribute__((noinline)) +c1 (struct T *p) +{ + if (p->a != 1 || p->b != 2 || p->c != 3 || p->d != 4) + abort (); + __builtin_memset (p, 0xaa, sizeof (*p)); +} + +void __attribute__((noinline)) +c2 (struct S *p) +{ + c1 (&p->t); +} + +void __attribute__((noinline)) +c3 (struct U *p) +{ + c2 (&p->s[2]); +} + +void __attribute__((noinline)) +f1 (void) +{ + u = (struct S) { { 1, 2, 3, 4 } }; +} + +void __attribute__((noinline)) +f2 (void) +{ + u.t.a = 1; + u.t.b = 2; + u.t.c = 3; + u.t.d = 4; +} + +void __attribute__((noinline)) +f3 (void) +{ + u.t.d = 4; + u.t.b = 2; + u.t.a = 1; + u.t.c = 3; +} + +void __attribute__((noinline)) +f4 (void) +{ + struct S v __attribute__((aligned)); + v.t.a = 1; + v.t.b = 2; + v.t.c = 3; + v.t.d = 4; + c2 (&v); +} + +void __attribute__((noinline)) +f5 (struct S *p) +{ + p->t.a = 1; + p->t.c = 3; + p->t.d = 4; + p->t.b = 2; +} + +void __attribute__((noinline)) +f6 (void) +{ + struct U v __attribute__((aligned)); + v.s[2].t.a = 1; + v.s[2].t.b = 2; + v.s[2].t.c = 3; + v.s[2].t.d = 4; + c3 (&v); +} + +void __attribute__((noinline)) +f7 (struct U *p) +{ + p->s[2].t.a = 1; + p->s[2].t.c = 3; + p->s[2].t.d = 4; + p->s[2].t.b = 2; +} + +int +main (void) +{ + struct U w __attribute__((aligned)); + f1 (); + c2 (&u); + f2 (); + c1 (&u.t); + f3 (); + c2 (&u); + f4 (); + f5 (&u); + c2 (&u); + f6 (); + f7 (&w); + c3 (&w); + return 0; +} diff --git a/gcc/testsuite/gcc.dg/store_merging_1.c b/gcc/testsuite/gcc.dg/store_merging_1.c new file mode 100644 index 0000000..09a4d14 --- /dev/null +++ b/gcc/testsuite/gcc.dg/store_merging_1.c @@ -0,0 +1,35 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target non_strict_align } */ +/* { dg-options "-O -fdump-tree-store-merging" } */ + +struct bar { + int a; + char b; + char c; + char d; + char e; + char f; + char g; +}; + +void +foo1 (struct bar *p) +{ + p->b = 0; + p->a = 0; + p->c = 0; + p->d = 0; + p->e = 0; +} + +void +foo2 (struct bar *p) +{ + p->b = 0; + p->a = 0; + p->c = 1; + p->d = 0; + p->e = 0; +} + +/* { dg-final { scan-tree-dump-times "Merging successful" 2 "store-merging" } } */ diff --git a/gcc/testsuite/gcc.dg/store_merging_2.c b/gcc/testsuite/gcc.dg/store_merging_2.c new file mode 100644 index 0000000..d3acc2d --- /dev/null +++ b/gcc/testsuite/gcc.dg/store_merging_2.c @@ -0,0 +1,80 @@ +/* { dg-do run } */ +/* { dg-require-effective-target non_strict_align } */ +/* { dg-options "-O -fdump-tree-store-merging" } */ + +struct bar +{ + int a; + unsigned char b; + unsigned char c; + short d; + unsigned char e; + unsigned char f; + unsigned char g; +}; + +__attribute__ ((noinline)) void +foozero (struct bar *p) +{ + p->b = 0; + p->a = 0; + p->c = 0; + p->d = 0; + p->e = 0; + p->f = 0; + p->g = 0; +} + +__attribute__ ((noinline)) void +foo1 (struct bar *p) +{ + p->b = 1; + p->a = 2; + p->c = 3; + p->d = 4; + p->e = 5; + p->f = 0; + p->g = 0xff; +} + +__attribute__ ((noinline)) void +foo2 (struct bar *p, struct bar *p2) +{ + p->b = 0xff; + p2->b = 0xa; + p->a = 0xfffff; + p2->c = 0xc; + p->c = 0xff; + p2->d = 0xbf; + p->d = 0xfff; +} + +int +main (void) +{ + struct bar b1, b2; + foozero (&b1); + foozero (&b2); + + foo1 (&b1); + if (b1.b != 1 || b1.a != 2 || b1.c != 3 || b1.d != 4 || b1.e != 5 + || 
b1.f != 0 || b1.g != 0xff) + __builtin_abort (); + + foozero (&b1); + /* Make sure writes to aliasing struct pointers preserve the + correct order. */ + foo2 (&b1, &b1); + if (b1.b != 0xa || b1.a != 0xfffff || b1.c != 0xff || b1.d != 0xfff) + __builtin_abort (); + + foozero (&b1); + foo2 (&b1, &b2); + if (b1.a != 0xfffff || b1.b != 0xff || b1.c != 0xff || b1.d != 0xfff + || b2.b != 0xa || b2.c != 0xc || b2.d != 0xbf) + __builtin_abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "Merging successful" 2 "store-merging" } } */ diff --git a/gcc/testsuite/gcc.dg/store_merging_3.c b/gcc/testsuite/gcc.dg/store_merging_3.c new file mode 100644 index 0000000..cd756c1 --- /dev/null +++ b/gcc/testsuite/gcc.dg/store_merging_3.c @@ -0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target non_strict_align } */ +/* { dg-options "-O -fdump-tree-store-merging" } */ + +/* Make sure stores to volatile addresses don't get combined with + other accesses. */ + +struct bar +{ + int a; + char b; + char c; + volatile short d; + char e; + char f; + char g; +}; + +void +foozero (struct bar *p) +{ + p->b = 0xa; + p->a = 0xb; + p->c = 0xc; + p->d = 0; + p->e = 0xd; + p->f = 0xe; + p->g = 0xf; +} + +/* { dg-final { scan-tree-dump "Volatile access terminates all chains" "store-merging" } } */ +/* { dg-final { scan-tree-dump-times "=\{v\} 0;" 1 "store-merging" } } */ diff --git a/gcc/testsuite/gcc.dg/store_merging_4.c b/gcc/testsuite/gcc.dg/store_merging_4.c new file mode 100644 index 0000000..4bf9025 --- /dev/null +++ b/gcc/testsuite/gcc.dg/store_merging_4.c @@ -0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target non_strict_align } */ +/* { dg-options "-O -fdump-tree-store-merging" } */ + +/* Check that we can merge interleaving stores that are guaranteed + to be non-aliasing. */ + +struct bar +{ + int a; + char b; + char c; + short d; + char e; + char f; + char g; +}; + +void +foozero (struct bar *restrict p, struct bar *restrict p2) +{ + p->b = 0xff; + p2->b = 0xa; + p->a = 0xfffff; + p2->a = 0xab; + p2->c = 0xc; + p->c = 0xff; + p2->d = 0xbf; + p->d = 0xfff; +} + +/* { dg-final { scan-tree-dump-times "Merging successful" 2 "store-merging" } } */ diff --git a/gcc/testsuite/gcc.dg/store_merging_5.c b/gcc/testsuite/gcc.dg/store_merging_5.c new file mode 100644 index 0000000..3b82420 --- /dev/null +++ b/gcc/testsuite/gcc.dg/store_merging_5.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target non_strict_align } */ +/* { dg-options "-O -fdump-tree-store-merging" } */ + +/* Make sure that non-aliasing non-constant interspersed stores do not + stop chains. */ + +struct bar { + int a; + char b; + char c; + char d; + char e; + char g; +}; + +void +foo1 (struct bar *p, char tmp) +{ + p->a = 0; + p->b = 0; + p->g = tmp; + p->c = 0; + p->d = 0; + p->e = 0; +} + + +/* { dg-final { scan-tree-dump-times "Merging successful" 1 "store-merging" } } */ +/* { dg-final { scan-tree-dump-times "MEM\\\[.*\\\]" 1 "store-merging" } } */ diff --git a/gcc/testsuite/gcc.dg/store_merging_6.c b/gcc/testsuite/gcc.dg/store_merging_6.c new file mode 100644 index 0000000..7d89baf --- /dev/null +++ b/gcc/testsuite/gcc.dg/store_merging_6.c @@ -0,0 +1,53 @@ +/* { dg-do run } */ +/* { dg-require-effective-target non_strict_align } */ +/* { dg-options "-O -fdump-tree-store-merging" } */ + +/* Check that we can widen accesses to bitfields. 
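+   The bitfields a, b and c below pack into the first byte of struct bar, so
+   together with the following char members they form one contiguous region
+   that the stores in foozero and foo1 can cover with wider stores.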
*/ + +struct bar { + int a : 3; + unsigned char b : 4; + unsigned char c : 1; + char d; + char e; + char f; + char g; +}; + +__attribute__ ((noinline)) void +foozero (struct bar *p) +{ + p->b = 0; + p->a = 0; + p->c = 0; + p->d = 0; + p->e = 0; + p->f = 0; + p->g = 0; +} + +__attribute__ ((noinline)) void +foo1 (struct bar *p) +{ + p->b = 3; + p->a = 2; + p->c = 1; + p->d = 4; + p->e = 5; +} + +int +main (void) +{ + struct bar p; + foozero (&p); + foo1 (&p); + if (p.a != 2 || p.b != 3 || p.c != 1 || p.d != 4 || p.e != 5 + || p.f != 0 || p.g != 0) + __builtin_abort (); + + return 0; +} + + +/* { dg-final { scan-tree-dump-times "Merging successful" 2 "store-merging" } } */ diff --git a/gcc/testsuite/gcc.dg/store_merging_7.c b/gcc/testsuite/gcc.dg/store_merging_7.c new file mode 100644 index 0000000..02008f7 --- /dev/null +++ b/gcc/testsuite/gcc.dg/store_merging_7.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target non_strict_align } */ +/* { dg-options "-O -fdump-tree-store-merging" } */ + +/* Check that we can merge consecutive array members through the pointer. + PR rtl-optimization/23684. */ + +void +foo (char *input) +{ + input = __builtin_assume_aligned (input, 8); + input[0] = 'H'; + input[1] = 'e'; + input[2] = 'l'; + input[3] = 'l'; + input[4] = 'o'; + input[5] = ' '; + input[6] = 'w'; + input[7] = 'o'; + input[8] = 'r'; + input[9] = 'l'; + input[10] = 'd'; + input[11] = '\0'; +} + +/* { dg-final { scan-tree-dump-times "Merging successful" 1 "store-merging" } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_1.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_1.c index f02e55f..9de4e77 100644 --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_1.c +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_1.c @@ -3,22 +3,22 @@ int arr[4][4]; void -foo () +foo (int x, int y) { - arr[0][1] = 1; - arr[1][0] = -1; - arr[2][0] = 1; - arr[1][1] = -1; - arr[0][2] = 1; - arr[0][3] = -1; - arr[1][2] = 1; - arr[2][1] = -1; - arr[3][0] = 1; - arr[3][1] = -1; - arr[2][2] = 1; - arr[1][3] = -1; - arr[2][3] = 1; - arr[3][2] = -1; + arr[0][1] = x; + arr[1][0] = y; + arr[2][0] = x; + arr[1][1] = y; + arr[0][2] = x; + arr[0][3] = y; + arr[1][2] = x; + arr[2][1] = y; + arr[3][0] = x; + arr[3][1] = y; + arr[2][2] = x; + arr[1][3] = y; + arr[2][3] = x; + arr[3][2] = y; } /* { dg-final { scan-assembler-times "stp\tw\[0-9\]+, w\[0-9\]" 7 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_4.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_4.c index 40056b1..824f0d2 100644 --- a/gcc/testsuite/gcc.target/aarch64/ldp_stp_4.c +++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_4.c @@ -3,22 +3,22 @@ float arr[4][4]; void -foo () +foo (float x, float y) { - arr[0][1] = 1; - arr[1][0] = -1; - arr[2][0] = 1; - arr[1][1] = -1; - arr[0][2] = 1; - arr[0][3] = -1; - arr[1][2] = 1; - arr[2][1] = -1; - arr[3][0] = 1; - arr[3][1] = -1; - arr[2][2] = 1; - arr[1][3] = -1; - arr[2][3] = 1; - arr[3][2] = -1; + arr[0][1] = x; + arr[1][0] = y; + arr[2][0] = x; + arr[1][1] = y; + arr[0][2] = x; + arr[0][3] = y; + arr[1][2] = x; + arr[2][1] = y; + arr[3][0] = x; + arr[3][1] = y; + arr[2][2] = x; + arr[1][3] = y; + arr[2][3] = x; + arr[3][2] = y; } /* { dg-final { scan-assembler-times "stp\ts\[0-9\]+, s\[0-9\]" 7 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr22141.c b/gcc/testsuite/gcc.target/i386/pr22141.c new file mode 100644 index 0000000..036422e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr22141.c @@ -0,0 +1,126 @@ +/* PR middle-end/22141 */ +/* { dg-do compile } */ +/* { dg-options "-Os" } */ + 
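+
+/* Each of the seven f1-f7 functions below stores the bytes 1, 2, 3, 4 to
+   four consecutive char members.  The scan-assembler check at the end
+   counts the merged 32-bit constant 0x04030201, matched in decimal
+   (67305985) or hex (4030201) form.  */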
+extern void abort (void); + +struct S +{ + struct T + { + char a; + char b; + char c; + char d; + } t; +} u; + +struct U +{ + struct S s[4]; +}; + +void __attribute__((noinline)) +c1 (struct T *p) +{ + if (p->a != 1 || p->b != 2 || p->c != 3 || p->d != 4) + abort (); + __builtin_memset (p, 0xaa, sizeof (*p)); +} + +void __attribute__((noinline)) +c2 (struct S *p) +{ + c1 (&p->t); +} + +void __attribute__((noinline)) +c3 (struct U *p) +{ + c2 (&p->s[2]); +} + +void __attribute__((noinline)) +f1 (void) +{ + u = (struct S) { { 1, 2, 3, 4 } }; +} + +void __attribute__((noinline)) +f2 (void) +{ + u.t.a = 1; + u.t.b = 2; + u.t.c = 3; + u.t.d = 4; +} + +void __attribute__((noinline)) +f3 (void) +{ + u.t.d = 4; + u.t.b = 2; + u.t.a = 1; + u.t.c = 3; +} + +void __attribute__((noinline)) +f4 (void) +{ + struct S v; + v.t.a = 1; + v.t.b = 2; + v.t.c = 3; + v.t.d = 4; + c2 (&v); +} + +void __attribute__((noinline)) +f5 (struct S *p) +{ + p->t.a = 1; + p->t.c = 3; + p->t.d = 4; + p->t.b = 2; +} + +void __attribute__((noinline)) +f6 (void) +{ + struct U v; + v.s[2].t.a = 1; + v.s[2].t.b = 2; + v.s[2].t.c = 3; + v.s[2].t.d = 4; + c3 (&v); +} + +void __attribute__((noinline)) +f7 (struct U *p) +{ + p->s[2].t.a = 1; + p->s[2].t.c = 3; + p->s[2].t.d = 4; + p->s[2].t.b = 2; +} + +int +main (void) +{ + struct U w; + f1 (); + c2 (&u); + f2 (); + c1 (&u.t); + f3 (); + c2 (&u); + f4 (); + f5 (&u); + c2 (&u); + f6 (); + f7 (&w); + c3 (&w); + return 0; +} + +/* { dg-final { scan-assembler-times "67305985\|4030201" 7 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr34012.c b/gcc/testsuite/gcc.target/i386/pr34012.c index 00b1240..d0cffa0 100644 --- a/gcc/testsuite/gcc.target/i386/pr34012.c +++ b/gcc/testsuite/gcc.target/i386/pr34012.c @@ -1,7 +1,7 @@ /* PR rtl-optimization/34012 */ /* { dg-do compile } */ /* { dg-require-effective-target lp64 } */ -/* { dg-options "-O2" } */ +/* { dg-options "-O2 -fno-store-merging" } */ void bar (long int *); void diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h index a706729..b5373a3 100644 --- a/gcc/tree-pass.h +++ b/gcc/tree-pass.h @@ -425,6 +425,7 @@ extern gimple_opt_pass *make_pass_late_warn_uninitialized (gcc::context *ctxt); extern gimple_opt_pass *make_pass_cse_reciprocals (gcc::context *ctxt); extern gimple_opt_pass *make_pass_cse_sincos (gcc::context *ctxt); extern gimple_opt_pass *make_pass_optimize_bswap (gcc::context *ctxt); +extern gimple_opt_pass *make_pass_store_merging (gcc::context *ctxt); extern gimple_opt_pass *make_pass_optimize_widening_mul (gcc::context *ctxt); extern gimple_opt_pass *make_pass_warn_function_return (gcc::context *ctxt); extern gimple_opt_pass *make_pass_warn_function_noreturn (gcc::context *ctxt);