* New parameters to control stringop expansion libcall strategy @ 2013-08-03 4:22 Xinliang David Li 2013-08-03 8:07 ` Backend specific params.def? (Was Re: New parameters to control stringop expansion libcall strategy) Jan Hubicka 2013-08-05 3:01 ` New parameters to control stringop expansion libcall strategy Xinliang David Li 0 siblings, 2 replies; 23+ messages in thread From: Xinliang David Li @ 2013-08-03 4:22 UTC (permalink / raw) To: GCC Patches; +Cc: Jan Hubicka, Teresa Johnson [-- Attachment #1: Type: text/plain, Size: 669 bytes --] On x86_64, when the expected size of memcpy/memset is known (e.g, with FDO), libcall strategy is used with the size is > 8192. This value is hard coded, which makes it hard to do performance tuning. This patch adds two new parameters to do that. Potential usage includes per-application libcall strategy min-size tuning based on summary data with FDO (e.g, instruction workset size). Bootstrap and tested on x86_64/linux. Ok for trunk? thanks, David 2013-08-02 Xinliang David Li <davidxl@google.com> * params.def: New parameters. * config/i386/i386.c (ix86_option_override_internal): Override default libcall size limit with parameters. [-- Attachment #2: stringop_inl_param.txt --] [-- Type: text/plain, Size: 6543 bytes --] Index: config/i386/i386.c =================================================================== --- config/i386/i386.c (revision 201458) +++ config/i386/i386.c (working copy) @@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost = }; /* Processor costs (relative to an add) */ -static const +static struct processor_costs i386_cost = { /* 386 specific costs */ COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -226,7 +226,7 @@ struct processor_costs i386_cost = { /* 1, /* cond_not_taken_branch_cost. 
*/ }; -static const +static struct processor_costs i486_cost = { /* 486 specific costs */ COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -298,7 +298,7 @@ struct processor_costs i486_cost = { /* 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs pentium_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -368,7 +368,7 @@ struct processor_costs pentium_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs pentiumpro_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost = 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs geode_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -518,7 +518,7 @@ struct processor_costs geode_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs k6_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -591,7 +591,7 @@ struct processor_costs k6_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs athlon_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -664,7 +664,7 @@ struct processor_costs athlon_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs k8_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = { 1, /* cond_not_taken_branch_cost. 
*/ }; -static const +static struct processor_costs pentium4_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (3), /* cost of a lea instruction */ @@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs nocona_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs atom_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ @@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = { }; /* Generic64 should produce code tuned for Nocona and K8. */ -static const +static struct processor_costs generic64_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ /* On all chips taken into consideration lea is 2 cycles and more. With @@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost = }; /* core_cost should produce code tuned for Core familly of CPUs. */ -static const +static struct processor_costs core_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ /* On all chips taken into consideration lea is 2 cycles and more. With @@ -1717,7 +1717,7 @@ struct processor_costs core_cost = { /* Generic32 should produce code tuned for PPro, Pentium4, Nocona, Athlon and K8. */ -static const +static struct processor_costs generic32_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ @@ -4021,6 +4021,34 @@ ix86_option_override_internal (bool main /* Handle stack protector */ if (!global_options_set.x_ix86_stack_protector_guard) ix86_stack_protector_guard = TARGET_HAS_BIONIC ? 
SSP_GLOBAL : SSP_TLS; + + /* Now override the memcpy/memset inline strategy parameters */ + if (PARAM_VALUE (PARAM_MEMCPY_LIBCALL_MIN_SIZE) != -1 + || PARAM_VALUE (PARAM_MEMSET_LIBCALL_MIN_SIZE) != -1) + { + const struct stringop_algs *algs[2]; + int k; + int min_sizes[2]; + + algs[0] = &ix86_cost->memset[TARGET_64BIT != 0]; + algs[1] = &ix86_cost->memcpy[TARGET_64BIT != 0]; + + min_sizes[0] = PARAM_VALUE (PARAM_MEMSET_LIBCALL_MIN_SIZE); + min_sizes[1] = PARAM_VALUE (PARAM_MEMCPY_LIBCALL_MIN_SIZE); + + for (k = 0; k < 2; k++) + { + if (min_sizes[k] == -1) + continue; + + for (i = 0; i < MAX_STRINGOP_ALGS - 1; i++) + { + if (algs[k]->size[i].max >= min_sizes[k] + || algs[k]->size[i + 1].alg == libcall) + *const_cast<int *>(&algs[k]->size[i].max) = min_sizes[k] - 1; + } + } + } } /* Implement the TARGET_OPTION_OVERRIDE hook. */ Index: params.def =================================================================== --- params.def (revision 201458) +++ params.def (working copy) @@ -117,6 +117,18 @@ DEFPARAM (PARAM_COMDAT_SHARING_PROBABILI "Probability that COMDAT function will be shared with different compilation unit", 20, 0, 0) +/* Use libcall strategy when the expected size is no less than this parameter for memcpy. */ +DEFPARAM (PARAM_MEMCPY_LIBCALL_MIN_SIZE, + "memcpy-libcall-min-size", + "The minimal expected size to force libcall expansion strategy for memcpy", + -1, 1, 0) + +/* Use libcall strategy when the expected size is no less than this parameter for memset. */ +DEFPARAM (PARAM_MEMSET_LIBCALL_MIN_SIZE, + "memset-libcall-min-size", + "The minimal expected size to force libcall expansion strategy for memset", + -1, 1, 0) + /* Limit on probability of entry BB. */ DEFPARAM (PARAM_PARTIAL_INLINING_ENTRY_PROBABILITY, "partial-inlining-entry-probability", ^ permalink raw reply [flat|nested] 23+ messages in thread
* Backend specific params.def? (Was Re: New parameters to control stringop expansion libcall strategy) 2013-08-03 4:22 New parameters to control stringop expansion libcall strategy Xinliang David Li @ 2013-08-03 8:07 ` Jan Hubicka 2013-08-03 15:40 ` Xinliang David Li 2013-08-05 3:01 ` New parameters to control stringop expansion libcall strategy Xinliang David Li 1 sibling, 1 reply; 23+ messages in thread From: Jan Hubicka @ 2013-08-03 8:07 UTC (permalink / raw) To: Xinliang David Li; +Cc: GCC Patches, Jan Hubicka, Teresa Johnson > On x86_64, when the expected size of memcpy/memset is known (e.g, with > FDO), libcall strategy is used with the size is > 8192. This value is > hard coded, which makes it hard to do performance tuning. This patch > adds two new parameters to do that. Potential usage includes > per-application libcall strategy min-size tuning based on summary data > with FDO (e.g, instruction workset size). > > Bootstrap and tested on x86_64/linux. Ok for trunk? > > thanks, > > David > > > 2013-08-02 Xinliang David Li <davidxl@google.com> > > * params.def: New parameters. > * config/i386/i386.c (ix86_option_override_internal): > Override default libcall size limit with parameters. Hi, the problem with this is that we introduce a generic --param that is used only by the x86 backend. I am not really a guru on the command-line options, but I think this is the first time we have tried to do such a thing. I wonder if 1) We want to introduce a target-specific params.def 2) We want to use the usual -msomething= options 3) We want to go this way? Honza ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: Backend specific params.def? (Was Re: New parameters to control stringop expansion libcall strategy) 2013-08-03 8:07 ` Backend specific params.def? (Was Re: New parameters to control stringop expansion libcall strategy) Jan Hubicka @ 2013-08-03 15:40 ` Xinliang David Li 0 siblings, 0 replies; 23+ messages in thread From: Xinliang David Li @ 2013-08-03 15:40 UTC (permalink / raw) To: Jan Hubicka; +Cc: GCC Patches, Teresa Johnson On Sat, Aug 3, 2013 at 1:06 AM, Jan Hubicka <hubicka@ucw.cz> wrote: >> On x86_64, when the expected size of memcpy/memset is known (e.g, with >> FDO), libcall strategy is used with the size is > 8192. This value is >> hard coded, which makes it hard to do performance tuning. This patch >> adds two new parameters to do that. Potential usage includes >> per-application libcall strategy min-size tuning based on summary data >> with FDO (e.g, instruction workset size). >> >> Bootstrap and tested on x86_64/linux. Ok for trunk? >> >> thanks, >> >> David >> >> >> 2013-08-02 Xinliang David Li <davidxl@google.com> >> >> * params.def: New parameters. >> * config/i386/i386.c (ix86_option_override_internal): >> Override default libcall size limit with parameters. > > Hi, > problem with this is that we introduce generic --param that is used only > by x86 backend. I am not really guru on the command line options, but I think > this is first time we try to do such thing. I wonder if > 1) We want to introduce target specific params.def We do have target-specific tuning code for parameters, though -- the backend overrides the default value -- so I think this is essentially target-specific params. > 2) We want to use usual -msomething= options > 3) We want to go this way? I don't have a strong opinion either way. To avoid controversy, let me work on a -mxxx= version of the patch -- and hopefully it will be more powerful. thanks, David > > Honza ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: New parameters to control stringop expansion libcall strategy 2013-08-03 4:22 New parameters to control stringop expansion libcall strategy Xinliang David Li 2013-08-03 8:07 ` Backend specific params.def? (Was Re: New parameters to control stringop expansion libcall strategy) Jan Hubicka @ 2013-08-05 3:01 ` Xinliang David Li 2013-08-05 10:57 ` Michael V. Zolotukhin 1 sibling, 1 reply; 23+ messages in thread From: Xinliang David Li @ 2013-08-05 3:01 UTC (permalink / raw) To: GCC Patches; +Cc: Jan Hubicka, Teresa Johnson [-- Attachment #1: Type: text/plain, Size: 2205 bytes --] Attached is a new patch implementing the stringop inline strategy control using two new -m options: -mmemcpy-strategy= -mmemset-strategy= See changes in doc/invoke.texi for description of the new options. Example: -mmemcpy-strategy=rep_8byte:64:noalign,unrolled_loop:2048:noalign,libcall:-1:noalign tells the compiler to inline memcpy using rep_8byte when the size is no larger than 64 bytes, using unrolled_loop when the size is no larger than 2048, and for size > 2048, using a library call. In all cases, destination alignment adjustment is not done. Tested on x86-64/linux. Ok for trunk? thanks, David 2013-08-02 Xinliang David Li <davidxl@google.com> * config/i386/stringop.def: New file. * config/i386/stringop.opt: New file. * config/i386/i386-opts.h: Include stringop.def. * config/i386/i386.opt: Include stringop.opt. * config/i386/i386.c (ix86_option_override_internal): Override default size based stringop inline strategies with options. * config/i386/i386.c (ix86_parse_stringop_strategy_string): New function. 2013-08-04 Xinliang David Li <davidxl@google.com> * testsuite/gcc.target/i386/memcpy-strategy-1.c: New test. * testsuite/gcc.target/i386/memcpy-strategy-2.c: Ditto. * testsuite/gcc.target/i386/memset-strategy-1.c: Ditto. * testsuite/gcc.target/i386/memcpy-strategy-3.c: Ditto. 
On Fri, Aug 2, 2013 at 9:21 PM, Xinliang David Li <davidxl@google.com> wrote: > On x86_64, when the expected size of memcpy/memset is known (e.g, with > FDO), libcall strategy is used with the size is > 8192. This value is > hard coded, which makes it hard to do performance tuning. This patch > adds two new parameters to do that. Potential usage includes > per-application libcall strategy min-size tuning based on summary data > with FDO (e.g, instruction workset size). > > Bootstrap and tested on x86_64/linux. Ok for trunk? > > thanks, > > David > > > 2013-08-02 Xinliang David Li <davidxl@google.com> > > * params.def: New parameters. > * config/i386/i386.c (ix86_option_override_internal): > Override default libcall size limit with parameters. [-- Attachment #2: stringop_inl_option.p.txt --] [-- Type: text/plain, Size: 18835 bytes --] Index: config/i386/stringop.def =================================================================== --- config/i386/stringop.def (revision 0) +++ config/i386/stringop.def (revision 0) @@ -0,0 +1,42 @@ +/* Definitions for option handling for IA-32. + Copyright (C) 2013 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. 
+ +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + +DEF_ENUM +DEF_ALG (no_stringop, no_stringop) +DEF_ENUM +DEF_ALG (libcall, libcall) +DEF_ENUM +DEF_ALG (rep_prefix_1_byte, rep_byte) +DEF_ENUM +DEF_ALG (rep_prefix_4_byte, rep_4byte) +DEF_ENUM +DEF_ALG (rep_prefix_8_byte, rep_8byte) +DEF_ENUM +DEF_ALG (loop_1_byte, byte_loop) +DEF_ENUM +DEF_ALG (loop, loop) +DEF_ENUM +DEF_ALG (unrolled_loop, unrolled_loop) +DEF_ENUM +DEF_ALG (vector_loop, vector_loop) Index: config/i386/i386.opt =================================================================== --- config/i386/i386.opt (revision 201458) +++ config/i386/i386.opt (working copy) @@ -316,6 +316,14 @@ mstack-arg-probe Target Report Mask(STACK_PROBE) Save Enable stack probing +mmemcpy-strategy= +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy) +Specify memcpy expansion strategy when expected size is known + +mmemset-strategy= +Target RejectNegative Joined Var(ix86_tune_memset_strategy) +Specify memset expansion strategy when expected size is known + mstringop-strategy= Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop) Chose strategy to generate stringop using Index: config/i386/stringop.opt =================================================================== --- config/i386/stringop.opt (revision 0) +++ config/i386/stringop.opt (revision 0) @@ -0,0 +1,36 @@ +/* Definitions for option handling for IA-32. + Copyright (C) 2013 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. 
+ +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte) + +#undef DEF_ENUM +#define DEF_ENUM EnumValue + +#undef DEF_ALG +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg) + +#include "stringop.def" + +#undef DEF_ENUM +#undef DEF_ALG Index: config/i386/i386.c =================================================================== --- config/i386/i386.c (revision 201458) +++ config/i386/i386.c (working copy) @@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost = }; /* Processor costs (relative to an add) */ -static const +static struct processor_costs i386_cost = { /* 386 specific costs */ COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -226,7 +226,7 @@ struct processor_costs i386_cost = { /* 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs i486_cost = { /* 486 specific costs */ COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -298,7 +298,7 @@ struct processor_costs i486_cost = { /* 1, /* cond_not_taken_branch_cost. 
*/ }; -static const +static struct processor_costs pentium_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -368,7 +368,7 @@ struct processor_costs pentium_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs pentiumpro_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost = 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs geode_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -518,7 +518,7 @@ struct processor_costs geode_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs k6_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -591,7 +591,7 @@ struct processor_costs k6_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs athlon_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -664,7 +664,7 @@ struct processor_costs athlon_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs k8_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs pentium4_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (3), /* cost of a lea instruction */ @@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = { 1, /* cond_not_taken_branch_cost. 
*/ }; -static const +static struct processor_costs nocona_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs atom_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ @@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = { }; /* Generic64 should produce code tuned for Nocona and K8. */ -static const +static struct processor_costs generic64_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ /* On all chips taken into consideration lea is 2 cycles and more. With @@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost = }; /* core_cost should produce code tuned for Core familly of CPUs. */ -static const +static struct processor_costs core_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ /* On all chips taken into consideration lea is 2 cycles and more. With @@ -1717,7 +1717,7 @@ struct processor_costs core_cost = { /* Generic32 should produce code tuned for PPro, Pentium4, Nocona, Athlon and K8. */ -static const +static struct processor_costs generic32_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ @@ -2900,6 +2900,150 @@ ix86_debug_options (void) return; } + +static const char *stringop_alg_names[] = { +#define DEF_ENUM +#define DEF_ALG(alg, name) #name, +#include "stringop.def" +#undef DEF_ENUM +#undef DEF_ALG +}; + +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=. + The string is of the following form (or comma separated list of it): + + strategy_alg:max_size:[align|noalign] + + where the full size range for the strategy is either [0, max_size] or + [min_size, max_size], in which min_size is the max_size + 1 of the + preceding range. 
The last size range must have max_size == -1. + + Examples: + + 1. + -mmemcpy-strategy=libcall:-1:noalign + + this is equivalent to (for known size memcpy) -mstringop-strategy=libcall + + + 2. + -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign + + This is to tell the compiler to use the following strategy for memset + 1) when the expected size is between [1, 16], use rep_8byte strategy; + 2) when the size is between [17, 2048], use vector_loop; + 3) when the size is > 2048, use libcall. + +*/ + +struct stringop_size_range +{ + int min; + int max; + stringop_alg alg; + bool noalign; +}; + +static void +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) +{ + const struct stringop_algs *default_algs; + stringop_size_range input_ranges[MAX_STRINGOP_ALGS]; + char *curr_range_str, *next_range_str; + int i = 0, n = 0; + + if (is_memset) + default_algs = &ix86_cost->memset[TARGET_64BIT != 0]; + else + default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; + + curr_range_str = strategy_str; + + do { + + int mins, maxs; + stringop_alg alg; + char alg_name[128]; + char align[16]; + + next_range_str = strchr (curr_range_str, ','); + if (next_range_str) + *next_range_str++ = '\0'; + + if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align)) + { + warning (0, "Wrong arg %s to option %s", curr_range_str, + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1)) + { + warning (0, "Size ranges of option %s should be increasing", + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + for (i = 0; i < last_alg; i++) + { + if (!strcmp (alg_name, stringop_alg_names[i])) + { + alg = (stringop_alg) i; + break; + } + } + + if (i == last_alg) + { + warning (0, "Wrong stringop strategy name %s specified for option %s", + alg_name, + is_memset ? 
"-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + input_ranges[n].min = mins; + input_ranges[n].max = maxs; + input_ranges[n].alg = alg; + if (!strcmp (align, "align")) + input_ranges[n].noalign = false; + else if (!strcmp (align, "noalign")) + input_ranges[n].noalign = true; + else + { + warning (0, "Unknown alignment %s specified for option %s", + align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + n++; + curr_range_str = next_range_str; + } while (curr_range_str); + + if (input_ranges[n - 1].max != -1) + { + warning (0, "The max value for the last size range should be -1" + " for option %s", + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + if (n > MAX_STRINGOP_ALGS) + { + warning (0, "Too many size ranges specified in option %s", + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + /* Now override the default algs array */ + for (i = 0; i < n; i++) + { + *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max; + *const_cast<stringop_alg *>(&default_algs->size[i].alg) + = input_ranges[i].alg; + *const_cast<int *>(&default_algs->size[i].noalign) + = input_ranges[i].noalign; + } +} + \f /* Override various settings based on options. If MAIN_ARGS_P, the options are from the command line, otherwise they are from @@ -4021,6 +4165,21 @@ ix86_option_override_internal (bool main /* Handle stack protector */ if (!global_options_set.x_ix86_stack_protector_guard) ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS; + + /* Handle -mmemcpy-strategy= and -mmemset-strategy= */ + if (ix86_tune_memcpy_strategy) + { + char *str = xstrdup (ix86_tune_memcpy_strategy); + ix86_parse_stringop_strategy_string (str, false); + free (str); + } + + if (ix86_tune_memset_strategy) + { + char *str = xstrdup (ix86_tune_memset_strategy); + ix86_parse_stringop_strategy_string (str, true); + free (str); + } } /* Implement the TARGET_OPTION_OVERRIDE hook. 
*/ @@ -22903,6 +23062,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop_1_byte: need_zero_guard = true; @@ -23093,6 +23253,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop_1_byte: case loop: @@ -23304,6 +23465,7 @@ ix86_expand_setmem (rtx dst, rtx count_e { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop: need_zero_guard = true; @@ -23481,6 +23643,7 @@ ix86_expand_setmem (rtx dst, rtx count_e { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop_1_byte: case loop: Index: config/i386/i386-opts.h =================================================================== --- config/i386/i386-opts.h (revision 201458) +++ config/i386/i386-opts.h (working copy) @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI /* Algorithm to expand string function with. */ enum stringop_alg { - no_stringop, - libcall, - rep_prefix_1_byte, - rep_prefix_4_byte, - rep_prefix_8_byte, - loop_1_byte, - loop, - unrolled_loop, - vector_loop +#undef DEF_ENUM +#define DEF_ENUM + +#undef DEF_ALG +#define DEF_ALG(alg, name) alg, + +#include "stringop.def" +last_alg + +#undef DEF_ENUM +#undef DEF_ALG }; /* Available call abi. */ Index: doc/invoke.texi =================================================================== --- doc/invoke.texi (revision 201458) +++ doc/invoke.texi (working copy) @@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}. -mbmi2 -mrtm -mlwp -mthreads @gol -mno-align-stringops -minline-all-stringops @gol -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy} -mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol -mregparm=@var{num} -msseregparm @gol @@ -14598,6 +14599,24 @@ Expand into an inline loop. 
Always use a library call. @end table +@item -mmemcpy-strategy=@var{strategy} +@opindex mmemcpy-strategy=@var{strategy} +Override the internal decision heuristic to decide if @code{__builtin_memcpy} +should be inlined and what inline algorithm to use when the expected size +of the copy operation is known. @var{strategy} +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets. +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies +the max byte size with which inline algorithm @var{alg} is allowed. For the last +triplet, the @var{max_size} must be @code{-1}. The @var{max_size} of the triplets +in the list must be specified in increasing order. The minimal byte size for +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the +preceding range. + +@item -mmemset-strategy=@var{strategy} +@opindex mmemset-strategy=@var{strategy} +The option is similar to @option{-mmemcpy-strategy=} except that it is to control +@code{__builtin_memset} expansion. + @item -momit-leaf-frame-pointer @opindex momit-leaf-frame-pointer Don't keep the frame pointer in a register for leaf functions. This Index: testsuite/gcc.target/i386/memcpy-strategy-1.c =================================================================== --- testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) +++ testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */ +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! 
{ ia32 } } } } } */ +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} + Index: testsuite/gcc.target/i386/memcpy-strategy-2.c =================================================================== --- testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) +++ testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */ +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */ +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} + Index: testsuite/gcc.target/i386/memset-strategy-1.c =================================================================== --- testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) +++ testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */ +/* { dg-final { scan-assembler-times "memset" 2 } } */ + +char a[2048]; +void t (void) +{ + __builtin_memset (a, 1, 2048); +} + Index: testsuite/gcc.target/i386/memcpy-strategy-3.c =================================================================== --- testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) +++ testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */ +/* { dg-final { scan-assembler-times "memcpy" 2 } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} + ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: New parameters to control stringop expansion libcall strategy 2013-08-05 3:01 ` New parameters to control stringop expansion libcall strategy Xinliang David Li @ 2013-08-05 10:57 ` Michael V. Zolotukhin 2013-08-05 16:44 ` Xinliang David Li 0 siblings, 1 reply; 23+ messages in thread From: Michael V. Zolotukhin @ 2013-08-05 10:57 UTC (permalink / raw) To: Xinliang David Li; +Cc: GCC Patches, Jan Hubicka, Teresa Johnson Hi, This is a really convenient option, thanks for working on it. I can't approve it as I'm not a maintainer, but it looks ok to me, except fot a small nitpicking: afair, comments should end with dot-space-space. Michael On 04 Aug 20:01, Xinliang David Li wrote: > The attached is a new patch implementing the stringop inline strategy > control using two new -m options: > > -mmemcpy-strategy= > -mmemset-strategy= > > See changes in doc/invoke.texi for description of the new options. Example: > -mmemcpy-strategy=rep_8byte:64:unaligned,unrolled_loop:2048:unaligned,libcall:-1:unaligned > > tells compiler to inline memcpy using rep_8byte when the size is no > larger than 64 byte, using unrolled_loop when size is no larger than > 2048, and for size > 2048, using library call. In all cases, > destination alignment adjustment is not done. > > Tested on x86-64/linux. Ok for trunk? > > thanks, > > David > > 2013-08-02 Xinliang David Li <davidxl@google.com> > > * config/i386/stringop.def: New file. > * config/i386/stringop.opt: New file. > * config/i386/i386-opts.h: Include stringopt.def. > * config/i386/i386.opt: Include stringopt.opt. > * config/i386/i386.c (ix86_option_override_internal): > Override default size based stringop inline strategies > with options. > * config/i386/i386.c (ix86_parse_stringop_strategy_string): > New function. > > 2013-08-04 Xinliang David Li <davidxl@google.com> > > * testsuite/gcc.target/i386/memcpy-strategy-1.c: New test. > * testsuite/gcc.target/i386/memcpy-strategy-2.c: Ditto. 
> * testsuite/gcc.target/i386/memset-strategy-1.c: Ditto. > * testsuite/gcc.target/i386/memcpy-strategy-3.c: Ditto. > > > > > On Fri, Aug 2, 2013 at 9:21 PM, Xinliang David Li <davidxl@google.com> wrote: > > On x86_64, when the expected size of memcpy/memset is known (e.g, with > > FDO), libcall strategy is used with the size is > 8192. This value is > > hard coded, which makes it hard to do performance tuning. This patch > > adds two new parameters to do that. Potential usage includes > > per-application libcall strategy min-size tuning based on summary data > > with FDO (e.g, instruction workset size). > > > > Bootstrap and tested on x86_64/linux. Ok for trunk? > > > > thanks, > > > > David > > > > > > 2013-08-02 Xinliang David Li <davidxl@google.com> > > > > * params.def: New parameters. > > * config/i386/i386.c (ix86_option_override_internal): > > Override default libcall size limit with parameters. > Index: config/i386/stringop.def > =================================================================== > --- config/i386/stringop.def (revision 0) > +++ config/i386/stringop.def (revision 0) > @@ -0,0 +1,42 @@ > +/* Definitions for option handling for IA-32. > + Copyright (C) 2013 Free Software Foundation, Inc. > + > +This file is part of GCC. > + > +GCC is free software; you can redistribute it and/or modify > +it under the terms of the GNU General Public License as published by > +the Free Software Foundation; either version 3, or (at your option) > +any later version. > + > +GCC is distributed in the hope that it will be useful, > +but WITHOUT ANY WARRANTY; without even the implied warranty of > +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > +GNU General Public License for more details. > + > +Under Section 7 of GPL version 3, you are granted additional > +permissions described in the GCC Runtime Library Exception, version > +3.1, as published by the Free Software Foundation. 
> + > +You should have received a copy of the GNU General Public License and > +a copy of the GCC Runtime Library Exception along with this program; > +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see > +<http://www.gnu.org/licenses/>. */ > + > +DEF_ENUM > +DEF_ALG (no_stringop, no_stringop) > +DEF_ENUM > +DEF_ALG (libcall, libcall) > +DEF_ENUM > +DEF_ALG (rep_prefix_1_byte, rep_byte) > +DEF_ENUM > +DEF_ALG (rep_prefix_4_byte, rep_4byte) > +DEF_ENUM > +DEF_ALG (rep_prefix_8_byte, rep_8byte) > +DEF_ENUM > +DEF_ALG (loop_1_byte, byte_loop) > +DEF_ENUM > +DEF_ALG (loop, loop) > +DEF_ENUM > +DEF_ALG (unrolled_loop, unrolled_loop) > +DEF_ENUM > +DEF_ALG (vector_loop, vector_loop) > Index: config/i386/i386.opt > =================================================================== > --- config/i386/i386.opt (revision 201458) > +++ config/i386/i386.opt (working copy) > @@ -316,6 +316,14 @@ mstack-arg-probe > Target Report Mask(STACK_PROBE) Save > Enable stack probing > > +mmemcpy-strategy= > +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy) > +Specify memcpy expansion strategy when expected size is known > + > +mmemset-strategy= > +Target RejectNegative Joined Var(ix86_tune_memset_strategy) > +Specify memset expansion strategy when expected size is known > + > mstringop-strategy= > Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop) > Chose strategy to generate stringop using > Index: config/i386/stringop.opt > =================================================================== > --- config/i386/stringop.opt (revision 0) > +++ config/i386/stringop.opt (revision 0) > @@ -0,0 +1,36 @@ > +/* Definitions for option handling for IA-32. > + Copyright (C) 2013 Free Software Foundation, Inc. > + > +This file is part of GCC. 
> + > +GCC is free software; you can redistribute it and/or modify > +it under the terms of the GNU General Public License as published by > +the Free Software Foundation; either version 3, or (at your option) > +any later version. > + > +GCC is distributed in the hope that it will be useful, > +but WITHOUT ANY WARRANTY; without even the implied warranty of > +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > +GNU General Public License for more details. > + > +Under Section 7 of GPL version 3, you are granted additional > +permissions described in the GCC Runtime Library Exception, version > +3.1, as published by the Free Software Foundation. > + > +You should have received a copy of the GNU General Public License and > +a copy of the GCC Runtime Library Exception along with this program; > +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see > +<http://www.gnu.org/licenses/>. */ > + > +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte) > + > +#undef DEF_ENUM > +#define DEF_ENUM EnumValue > + > +#undef DEF_ALG > +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg) > + > +#include "stringop.def" > + > +#undef DEF_ENUM > +#undef DEF_ALG > Index: config/i386/i386.c > =================================================================== > --- config/i386/i386.c (revision 201458) > +++ config/i386/i386.c (working copy) > @@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost = > }; > > /* Processor costs (relative to an add) */ > -static const > +static > struct processor_costs i386_cost = { /* 386 specific costs */ > COSTS_N_INSNS (1), /* cost of an add instruction */ > COSTS_N_INSNS (1), /* cost of a lea instruction */ > @@ -226,7 +226,7 @@ struct processor_costs i386_cost = { /* > 1, /* cond_not_taken_branch_cost. 
*/ > }; > > -static const > +static > struct processor_costs i486_cost = { /* 486 specific costs */ > COSTS_N_INSNS (1), /* cost of an add instruction */ > COSTS_N_INSNS (1), /* cost of a lea instruction */ > @@ -298,7 +298,7 @@ struct processor_costs i486_cost = { /* > 1, /* cond_not_taken_branch_cost. */ > }; > > -static const > +static > struct processor_costs pentium_cost = { > COSTS_N_INSNS (1), /* cost of an add instruction */ > COSTS_N_INSNS (1), /* cost of a lea instruction */ > @@ -368,7 +368,7 @@ struct processor_costs pentium_cost = { > 1, /* cond_not_taken_branch_cost. */ > }; > > -static const > +static > struct processor_costs pentiumpro_cost = { > COSTS_N_INSNS (1), /* cost of an add instruction */ > COSTS_N_INSNS (1), /* cost of a lea instruction */ > @@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost = > 1, /* cond_not_taken_branch_cost. */ > }; > > -static const > +static > struct processor_costs geode_cost = { > COSTS_N_INSNS (1), /* cost of an add instruction */ > COSTS_N_INSNS (1), /* cost of a lea instruction */ > @@ -518,7 +518,7 @@ struct processor_costs geode_cost = { > 1, /* cond_not_taken_branch_cost. */ > }; > > -static const > +static > struct processor_costs k6_cost = { > COSTS_N_INSNS (1), /* cost of an add instruction */ > COSTS_N_INSNS (2), /* cost of a lea instruction */ > @@ -591,7 +591,7 @@ struct processor_costs k6_cost = { > 1, /* cond_not_taken_branch_cost. */ > }; > > -static const > +static > struct processor_costs athlon_cost = { > COSTS_N_INSNS (1), /* cost of an add instruction */ > COSTS_N_INSNS (2), /* cost of a lea instruction */ > @@ -664,7 +664,7 @@ struct processor_costs athlon_cost = { > 1, /* cond_not_taken_branch_cost. */ > }; > > -static const > +static > struct processor_costs k8_cost = { > COSTS_N_INSNS (1), /* cost of an add instruction */ > COSTS_N_INSNS (2), /* cost of a lea instruction */ > @@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = { > 1, /* cond_not_taken_branch_cost. 
*/ > }; > > -static const > +static > struct processor_costs pentium4_cost = { > COSTS_N_INSNS (1), /* cost of an add instruction */ > COSTS_N_INSNS (3), /* cost of a lea instruction */ > @@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = { > 1, /* cond_not_taken_branch_cost. */ > }; > > -static const > +static > struct processor_costs nocona_cost = { > COSTS_N_INSNS (1), /* cost of an add instruction */ > COSTS_N_INSNS (1), /* cost of a lea instruction */ > @@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = { > 1, /* cond_not_taken_branch_cost. */ > }; > > -static const > +static > struct processor_costs atom_cost = { > COSTS_N_INSNS (1), /* cost of an add instruction */ > COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ > @@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = { > }; > > /* Generic64 should produce code tuned for Nocona and K8. */ > -static const > +static > struct processor_costs generic64_cost = { > COSTS_N_INSNS (1), /* cost of an add instruction */ > /* On all chips taken into consideration lea is 2 cycles and more. With > @@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost = > }; > > /* core_cost should produce code tuned for Core familly of CPUs. */ > -static const > +static > struct processor_costs core_cost = { > COSTS_N_INSNS (1), /* cost of an add instruction */ > /* On all chips taken into consideration lea is 2 cycles and more. With > @@ -1717,7 +1717,7 @@ struct processor_costs core_cost = { > > /* Generic32 should produce code tuned for PPro, Pentium4, Nocona, > Athlon and K8. 
*/ > -static const > +static > struct processor_costs generic32_cost = { > COSTS_N_INSNS (1), /* cost of an add instruction */ > COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ > @@ -2900,6 +2900,150 @@ ix86_debug_options (void) > > return; > } > + > +static const char *stringop_alg_names[] = { > +#define DEF_ENUM > +#define DEF_ALG(alg, name) #name, > +#include "stringop.def" > +#undef DEF_ENUM > +#undef DEF_ALG > +}; > + > +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=. > + The string is of the following form (or comma separated list of it): > + > + strategy_alg:max_size:[align|noalign] > + > + where the full size range for the strategy is either [0, max_size] or > + [min_size, max_size], in which min_size is the max_size + 1 of the > + preceding range. The last size range must have max_size == -1. > + > + Examples: > + > + 1. > + -mmemcpy-strategy=libcall:-1:noalign > + > + this is equivalent to (for known size memcpy) -mstringop-strategy=libcall > + > + > + 2. > + -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign > + > + This is to tell the compiler to use the following strategy for memset > + 1) when the expected size is between [1, 16], use rep_8byte strategy; > + 2) when the size is between [17, 2048], use vector_loop; > + 3) when the size is > 2048, use libcall. 
> + > +*/ > + > +struct stringop_size_range > +{ > + int min; > + int max; > + stringop_alg alg; > + bool noalign; > +}; > + > +static void > +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) > +{ > + const struct stringop_algs *default_algs; > + stringop_size_range input_ranges[MAX_STRINGOP_ALGS]; > + char *curr_range_str, *next_range_str; > + int i = 0, n = 0; > + > + if (is_memset) > + default_algs = &ix86_cost->memset[TARGET_64BIT != 0]; > + else > + default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; > + > + curr_range_str = strategy_str; > + > + do { > + > + int mins, maxs; > + stringop_alg alg; > + char alg_name[128]; > + char align[16]; > + > + next_range_str = strchr (curr_range_str, ','); > + if (next_range_str) > + *next_range_str++ = '\0'; > + > + if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align)) > + { > + warning (0, "Wrong arg %s to option %s", curr_range_str, > + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); > + return; > + } > + > + if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1)) > + { > + warning (0, "Size ranges of option %s should be increasing", > + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); > + return; > + } > + > + for (i = 0; i < last_alg; i++) > + { > + if (!strcmp (alg_name, stringop_alg_names[i])) > + { > + alg = (stringop_alg) i; > + break; > + } > + } > + > + if (i == last_alg) > + { > + warning (0, "Wrong stringop strategy name %s specified for option %s", > + alg_name, > + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); > + return; > + } > + > + input_ranges[n].min = mins; > + input_ranges[n].max = maxs; > + input_ranges[n].alg = alg; > + if (!strcmp (align, "align")) > + input_ranges[n].noalign = false; > + else if (!strcmp (align, "noalign")) > + input_ranges[n].noalign = true; > + else > + { > + warning (0, "Unknown alignment %s specified for option %s", > + align, is_memset ? 
"-mmemset_strategy=" : "-mmemcpy_strategy="); > + return; > + } > + n++; > + curr_range_str = next_range_str; > + } while (curr_range_str); > + > + if (input_ranges[n - 1].max != -1) > + { > + warning (0, "The max value for the last size range should be -1" > + " for option %s", > + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); > + return; > + } > + > + if (n > MAX_STRINGOP_ALGS) > + { > + warning (0, "Too many size ranges specified in option %s", > + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); > + return; > + } > + > + /* Now override the default algs array */ > + for (i = 0; i < n; i++) > + { > + *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max; > + *const_cast<stringop_alg *>(&default_algs->size[i].alg) > + = input_ranges[i].alg; > + *const_cast<int *>(&default_algs->size[i].noalign) > + = input_ranges[i].noalign; > + } > +} > + > \f > /* Override various settings based on options. If MAIN_ARGS_P, the > options are from the command line, otherwise they are from > @@ -4021,6 +4165,21 @@ ix86_option_override_internal (bool main > /* Handle stack protector */ > if (!global_options_set.x_ix86_stack_protector_guard) > ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS; > + > + /* Handle -mmemcpy-strategy= and -mmemset-strategy= */ > + if (ix86_tune_memcpy_strategy) > + { > + char *str = xstrdup (ix86_tune_memcpy_strategy); > + ix86_parse_stringop_strategy_string (str, false); > + free (str); > + } > + > + if (ix86_tune_memset_strategy) > + { > + char *str = xstrdup (ix86_tune_memset_strategy); > + ix86_parse_stringop_strategy_string (str, true); > + free (str); > + } > } > > /* Implement the TARGET_OPTION_OVERRIDE hook. 
*/ > @@ -22903,6 +23062,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt > { > case libcall: > case no_stringop: > + case last_alg: > gcc_unreachable (); > case loop_1_byte: > need_zero_guard = true; > @@ -23093,6 +23253,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt > { > case libcall: > case no_stringop: > + case last_alg: > gcc_unreachable (); > case loop_1_byte: > case loop: > @@ -23304,6 +23465,7 @@ ix86_expand_setmem (rtx dst, rtx count_e > { > case libcall: > case no_stringop: > + case last_alg: > gcc_unreachable (); > case loop: > need_zero_guard = true; > @@ -23481,6 +23643,7 @@ ix86_expand_setmem (rtx dst, rtx count_e > { > case libcall: > case no_stringop: > + case last_alg: > gcc_unreachable (); > case loop_1_byte: > case loop: > Index: config/i386/i386-opts.h > =================================================================== > --- config/i386/i386-opts.h (revision 201458) > +++ config/i386/i386-opts.h (working copy) > @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI > /* Algorithm to expand string function with. */ > enum stringop_alg > { > - no_stringop, > - libcall, > - rep_prefix_1_byte, > - rep_prefix_4_byte, > - rep_prefix_8_byte, > - loop_1_byte, > - loop, > - unrolled_loop, > - vector_loop > +#undef DEF_ENUM > +#define DEF_ENUM > + > +#undef DEF_ALG > +#define DEF_ALG(alg, name) alg, > + > +#include "stringop.def" > +last_alg > + > +#undef DEF_ENUM > +#undef DEF_ALG > }; > > /* Available call abi. */ > Index: doc/invoke.texi > =================================================================== > --- doc/invoke.texi (revision 201458) > +++ doc/invoke.texi (working copy) > @@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}. 
> -mbmi2 -mrtm -mlwp -mthreads @gol > -mno-align-stringops -minline-all-stringops @gol > -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol > +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy} @gol > -mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol > -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol > -mregparm=@var{num} -msseregparm @gol > @@ -14598,6 +14599,24 @@ Expand into an inline loop. > Always use a library call. > @end table > > +@item -mmemcpy-strategy=@var{strategy} > +@opindex mmemcpy-strategy=@var{strategy} > +Override the internal decision heuristic to decide if @code{__builtin_memcpy} > +should be inlined and what inline algorithm to use when the expected size > +of the copy operation is known. @var{strategy} > +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets. > +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies > +the max byte size with which inline algorithm @var{alg} is allowed. For the last > +triplet, the @var{max_size} must be @code{-1}. The @var{max_size} of the triplets > +in the list must be specified in increasing order. The minimal byte size for > +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the > +preceding triplet otherwise. > + > +@item -mmemset-strategy=@var{strategy} > +@opindex mmemset-strategy=@var{strategy} > +The option is similar to @option{-mmemcpy-strategy=} except that it is to control > +@code{__builtin_memset} expansion. > + > @item -momit-leaf-frame-pointer > @opindex momit-leaf-frame-pointer > Don't keep the frame pointer in a register for leaf functions.
This > Index: testsuite/gcc.target/i386/memcpy-strategy-1.c > =================================================================== > --- testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) > +++ testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) > @@ -0,0 +1,12 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */ > +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */ > +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ > + > +char a[2048]; > +char b[2048]; > +void t (void) > +{ > + __builtin_memcpy (a, b, 2048); > +} > + > Index: testsuite/gcc.target/i386/memcpy-strategy-2.c > =================================================================== > --- testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) > +++ testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) > @@ -0,0 +1,12 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */ > +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! 
{ ia32 } } } } } */ > +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ > + > +char a[2048]; > +char b[2048]; > +void t (void) > +{ > + __builtin_memcpy (a, b, 2048); > +} > + > Index: testsuite/gcc.target/i386/memset-strategy-1.c > =================================================================== > --- testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) > +++ testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) > @@ -0,0 +1,10 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */ > +/* { dg-final { scan-assembler-times "memset" 2 } } */ > + > +char a[2048]; > +void t (void) > +{ > + __builtin_memset (a, 1, 2048); > +} > + > Index: testsuite/gcc.target/i386/memcpy-strategy-3.c > =================================================================== > --- testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) > +++ testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) > @@ -0,0 +1,11 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */ > +/* { dg-final { scan-assembler-times "memcpy" 2 } } */ > + > +char a[2048]; > +char b[2048]; > +void t (void) > +{ > + __builtin_memcpy (a, b, 2048); > +} > + ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: New parameters to control stringop expansion libcall strategy 2013-08-05 10:57 ` Michael V. Zolotukhin @ 2013-08-05 16:44 ` Xinliang David Li 2013-08-06 8:46 ` Michael Zolotukhin 0 siblings, 1 reply; 23+ messages in thread From: Xinliang David Li @ 2013-08-05 16:44 UTC (permalink / raw) To: Michael V. Zolotukhin; +Cc: GCC Patches, Jan Hubicka, Teresa Johnson [-- Attachment #1: Type: text/plain, Size: 24045 bytes --] Thanks. Updated patch attached. David On Mon, Aug 5, 2013 at 3:57 AM, Michael V. Zolotukhin <michael.v.zolotukhin@gmail.com> wrote: > Hi, > This is a really convenient option, thanks for working on it. > I can't approve it as I'm not a maintainer, but it looks ok to me, > except for a small nitpick: AFAIR, comments should end with > dot-space-space. > > Michael > > On 04 Aug 20:01, Xinliang David Li wrote: >> The attached is a new patch implementing the stringop inline strategy >> control using two new -m options: >> >> -mmemcpy-strategy= >> -mmemset-strategy= >> >> See changes in doc/invoke.texi for description of the new options. Example: >> -mmemcpy-strategy=rep_8byte:64:noalign,unrolled_loop:2048:noalign,libcall:-1:noalign >> >> tells the compiler to inline memcpy using rep_8byte when the size is no >> larger than 64 bytes, using unrolled_loop when size is no larger than >> 2048, and for size > 2048, using library call. In all cases, >> destination alignment adjustment is not done. >> >> Tested on x86-64/linux. Ok for trunk? >> >> thanks, >> >> David >> >> 2013-08-02 Xinliang David Li <davidxl@google.com> >> >> * config/i386/stringop.def: New file. >> * config/i386/stringop.opt: New file. >> * config/i386/i386-opts.h: Include stringop.def. >> * config/i386/i386.opt: Include stringop.opt. >> * config/i386/i386.c (ix86_option_override_internal): >> Override default size based stringop inline strategies >> with options. >> * config/i386/i386.c (ix86_parse_stringop_strategy_string): >> New function.
>> >> 2013-08-04 Xinliang David Li <davidxl@google.com> >> >> * testsuite/gcc.target/i386/memcpy-strategy-1.c: New test. >> * testsuite/gcc.target/i386/memcpy-strategy-2.c: Ditto. >> * testsuite/gcc.target/i386/memset-strategy-1.c: Ditto. >> * testsuite/gcc.target/i386/memcpy-strategy-3.c: Ditto. >> >> >> >> >> On Fri, Aug 2, 2013 at 9:21 PM, Xinliang David Li <davidxl@google.com> wrote: >> > On x86_64, when the expected size of memcpy/memset is known (e.g, with >> > FDO), libcall strategy is used with the size is > 8192. This value is >> > hard coded, which makes it hard to do performance tuning. This patch >> > adds two new parameters to do that. Potential usage includes >> > per-application libcall strategy min-size tuning based on summary data >> > with FDO (e.g, instruction workset size). >> > >> > Bootstrap and tested on x86_64/linux. Ok for trunk? >> > >> > thanks, >> > >> > David >> > >> > >> > 2013-08-02 Xinliang David Li <davidxl@google.com> >> > >> > * params.def: New parameters. >> > * config/i386/i386.c (ix86_option_override_internal): >> > Override default libcall size limit with parameters. > >> Index: config/i386/stringop.def >> =================================================================== >> --- config/i386/stringop.def (revision 0) >> +++ config/i386/stringop.def (revision 0) >> @@ -0,0 +1,42 @@ >> +/* Definitions for option handling for IA-32. >> + Copyright (C) 2013 Free Software Foundation, Inc. >> + >> +This file is part of GCC. >> + >> +GCC is free software; you can redistribute it and/or modify >> +it under the terms of the GNU General Public License as published by >> +the Free Software Foundation; either version 3, or (at your option) >> +any later version. >> + >> +GCC is distributed in the hope that it will be useful, >> +but WITHOUT ANY WARRANTY; without even the implied warranty of >> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >> +GNU General Public License for more details. 
>> + >> +Under Section 7 of GPL version 3, you are granted additional >> +permissions described in the GCC Runtime Library Exception, version >> +3.1, as published by the Free Software Foundation. >> + >> +You should have received a copy of the GNU General Public License and >> +a copy of the GCC Runtime Library Exception along with this program; >> +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see >> +<http://www.gnu.org/licenses/>. */ >> + >> +DEF_ENUM >> +DEF_ALG (no_stringop, no_stringop) >> +DEF_ENUM >> +DEF_ALG (libcall, libcall) >> +DEF_ENUM >> +DEF_ALG (rep_prefix_1_byte, rep_byte) >> +DEF_ENUM >> +DEF_ALG (rep_prefix_4_byte, rep_4byte) >> +DEF_ENUM >> +DEF_ALG (rep_prefix_8_byte, rep_8byte) >> +DEF_ENUM >> +DEF_ALG (loop_1_byte, byte_loop) >> +DEF_ENUM >> +DEF_ALG (loop, loop) >> +DEF_ENUM >> +DEF_ALG (unrolled_loop, unrolled_loop) >> +DEF_ENUM >> +DEF_ALG (vector_loop, vector_loop) >> Index: config/i386/i386.opt >> =================================================================== >> --- config/i386/i386.opt (revision 201458) >> +++ config/i386/i386.opt (working copy) >> @@ -316,6 +316,14 @@ mstack-arg-probe >> Target Report Mask(STACK_PROBE) Save >> Enable stack probing >> >> +mmemcpy-strategy= >> +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy) >> +Specify memcpy expansion strategy when expected size is known >> + >> +mmemset-strategy= >> +Target RejectNegative Joined Var(ix86_tune_memset_strategy) >> +Specify memset expansion strategy when expected size is known >> + >> mstringop-strategy= >> Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop) >> Chose strategy to generate stringop using >> Index: config/i386/stringop.opt >> =================================================================== >> --- config/i386/stringop.opt (revision 0) >> +++ config/i386/stringop.opt (revision 0) >> @@ -0,0 +1,36 @@ >> +/* Definitions for option handling for IA-32. 
>> + Copyright (C) 2013 Free Software Foundation, Inc. >> + >> +This file is part of GCC. >> + >> +GCC is free software; you can redistribute it and/or modify >> +it under the terms of the GNU General Public License as published by >> +the Free Software Foundation; either version 3, or (at your option) >> +any later version. >> + >> +GCC is distributed in the hope that it will be useful, >> +but WITHOUT ANY WARRANTY; without even the implied warranty of >> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >> +GNU General Public License for more details. >> + >> +Under Section 7 of GPL version 3, you are granted additional >> +permissions described in the GCC Runtime Library Exception, version >> +3.1, as published by the Free Software Foundation. >> + >> +You should have received a copy of the GNU General Public License and >> +a copy of the GCC Runtime Library Exception along with this program; >> +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see >> +<http://www.gnu.org/licenses/>. */ >> + >> +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte) >> + >> +#undef DEF_ENUM >> +#define DEF_ENUM EnumValue >> + >> +#undef DEF_ALG >> +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg) >> + >> +#include "stringop.def" >> + >> +#undef DEF_ENUM >> +#undef DEF_ALG >> Index: config/i386/i386.c >> =================================================================== >> --- config/i386/i386.c (revision 201458) >> +++ config/i386/i386.c (working copy) >> @@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost = >> }; >> >> /* Processor costs (relative to an add) */ >> -static const >> +static >> struct processor_costs i386_cost = { /* 386 specific costs */ >> COSTS_N_INSNS (1), /* cost of an add instruction */ >> COSTS_N_INSNS (1), /* cost of a lea instruction */ >> @@ -226,7 +226,7 @@ struct processor_costs i386_cost = { /* >> 1, /* cond_not_taken_branch_cost. 
*/ >> }; >> >> -static const >> +static >> struct processor_costs i486_cost = { /* 486 specific costs */ >> COSTS_N_INSNS (1), /* cost of an add instruction */ >> COSTS_N_INSNS (1), /* cost of a lea instruction */ >> @@ -298,7 +298,7 @@ struct processor_costs i486_cost = { /* >> 1, /* cond_not_taken_branch_cost. */ >> }; >> >> -static const >> +static >> struct processor_costs pentium_cost = { >> COSTS_N_INSNS (1), /* cost of an add instruction */ >> COSTS_N_INSNS (1), /* cost of a lea instruction */ >> @@ -368,7 +368,7 @@ struct processor_costs pentium_cost = { >> 1, /* cond_not_taken_branch_cost. */ >> }; >> >> -static const >> +static >> struct processor_costs pentiumpro_cost = { >> COSTS_N_INSNS (1), /* cost of an add instruction */ >> COSTS_N_INSNS (1), /* cost of a lea instruction */ >> @@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost = >> 1, /* cond_not_taken_branch_cost. */ >> }; >> >> -static const >> +static >> struct processor_costs geode_cost = { >> COSTS_N_INSNS (1), /* cost of an add instruction */ >> COSTS_N_INSNS (1), /* cost of a lea instruction */ >> @@ -518,7 +518,7 @@ struct processor_costs geode_cost = { >> 1, /* cond_not_taken_branch_cost. */ >> }; >> >> -static const >> +static >> struct processor_costs k6_cost = { >> COSTS_N_INSNS (1), /* cost of an add instruction */ >> COSTS_N_INSNS (2), /* cost of a lea instruction */ >> @@ -591,7 +591,7 @@ struct processor_costs k6_cost = { >> 1, /* cond_not_taken_branch_cost. */ >> }; >> >> -static const >> +static >> struct processor_costs athlon_cost = { >> COSTS_N_INSNS (1), /* cost of an add instruction */ >> COSTS_N_INSNS (2), /* cost of a lea instruction */ >> @@ -664,7 +664,7 @@ struct processor_costs athlon_cost = { >> 1, /* cond_not_taken_branch_cost. 
*/ >> }; >> >> -static const >> +static >> struct processor_costs k8_cost = { >> COSTS_N_INSNS (1), /* cost of an add instruction */ >> COSTS_N_INSNS (2), /* cost of a lea instruction */ >> @@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = { >> 1, /* cond_not_taken_branch_cost. */ >> }; >> >> -static const >> +static >> struct processor_costs pentium4_cost = { >> COSTS_N_INSNS (1), /* cost of an add instruction */ >> COSTS_N_INSNS (3), /* cost of a lea instruction */ >> @@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = { >> 1, /* cond_not_taken_branch_cost. */ >> }; >> >> -static const >> +static >> struct processor_costs nocona_cost = { >> COSTS_N_INSNS (1), /* cost of an add instruction */ >> COSTS_N_INSNS (1), /* cost of a lea instruction */ >> @@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = { >> 1, /* cond_not_taken_branch_cost. */ >> }; >> >> -static const >> +static >> struct processor_costs atom_cost = { >> COSTS_N_INSNS (1), /* cost of an add instruction */ >> COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ >> @@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = { >> }; >> >> /* Generic64 should produce code tuned for Nocona and K8. */ >> -static const >> +static >> struct processor_costs generic64_cost = { >> COSTS_N_INSNS (1), /* cost of an add instruction */ >> /* On all chips taken into consideration lea is 2 cycles and more. With >> @@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost = >> }; >> >> /* core_cost should produce code tuned for Core familly of CPUs. */ >> -static const >> +static >> struct processor_costs core_cost = { >> COSTS_N_INSNS (1), /* cost of an add instruction */ >> /* On all chips taken into consideration lea is 2 cycles and more. With >> @@ -1717,7 +1717,7 @@ struct processor_costs core_cost = { >> >> /* Generic32 should produce code tuned for PPro, Pentium4, Nocona, >> Athlon and K8. 
*/ >> -static const >> +static >> struct processor_costs generic32_cost = { >> COSTS_N_INSNS (1), /* cost of an add instruction */ >> COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ >> @@ -2900,6 +2900,150 @@ ix86_debug_options (void) >> >> return; >> } >> + >> +static const char *stringop_alg_names[] = { >> +#define DEF_ENUM >> +#define DEF_ALG(alg, name) #name, >> +#include "stringop.def" >> +#undef DEF_ENUM >> +#undef DEF_ALG >> +}; >> + >> +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=. >> + The string is of the following form (or comma separated list of it): >> + >> + strategy_alg:max_size:[align|noalign] >> + >> + where the full size range for the strategy is either [0, max_size] or >> + [min_size, max_size], in which min_size is the max_size + 1 of the >> + preceding range. The last size range must have max_size == -1. >> + >> + Examples: >> + >> + 1. >> + -mmemcpy-strategy=libcall:-1:noalign >> + >> + this is equivalent to (for known size memcpy) -mstringop-strategy=libcall >> + >> + >> + 2. >> + -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign >> + >> + This is to tell the compiler to use the following strategy for memset >> + 1) when the expected size is between [1, 16], use rep_8byte strategy; >> + 2) when the size is between [17, 2048], use vector_loop; >> + 3) when the size is > 2048, use libcall. 
>> + >> +*/ >> + >> +struct stringop_size_range >> +{ >> + int min; >> + int max; >> + stringop_alg alg; >> + bool noalign; >> +}; >> + >> +static void >> +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) >> +{ >> + const struct stringop_algs *default_algs; >> + stringop_size_range input_ranges[MAX_STRINGOP_ALGS]; >> + char *curr_range_str, *next_range_str; >> + int i = 0, n = 0; >> + >> + if (is_memset) >> + default_algs = &ix86_cost->memset[TARGET_64BIT != 0]; >> + else >> + default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; >> + >> + curr_range_str = strategy_str; >> + >> + do { >> + >> + int mins, maxs; >> + stringop_alg alg; >> + char alg_name[128]; >> + char align[16]; >> + >> + next_range_str = strchr (curr_range_str, ','); >> + if (next_range_str) >> + *next_range_str++ = '\0'; >> + >> + if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align)) >> + { >> + warning (0, "Wrong arg %s to option %s", curr_range_str, >> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >> + return; >> + } >> + >> + if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1)) >> + { >> + warning (0, "Size ranges of option %s should be increasing", >> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >> + return; >> + } >> + >> + for (i = 0; i < last_alg; i++) >> + { >> + if (!strcmp (alg_name, stringop_alg_names[i])) >> + { >> + alg = (stringop_alg) i; >> + break; >> + } >> + } >> + >> + if (i == last_alg) >> + { >> + warning (0, "Wrong stringop strategy name %s specified for option %s", >> + alg_name, >> + is_memset ? 
"-mmemset_strategy=" : "-mmemcpy_strategy="); >> + return; >> + } >> + >> + input_ranges[n].min = mins; >> + input_ranges[n].max = maxs; >> + input_ranges[n].alg = alg; >> + if (!strcmp (align, "align")) >> + input_ranges[n].noalign = false; >> + else if (!strcmp (align, "noalign")) >> + input_ranges[n].noalign = true; >> + else >> + { >> + warning (0, "Unknown alignment %s specified for option %s", >> + align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >> + return; >> + } >> + n++; >> + curr_range_str = next_range_str; >> + } while (curr_range_str); >> + >> + if (input_ranges[n - 1].max != -1) >> + { >> + warning (0, "The max value for the last size range should be -1" >> + " for option %s", >> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >> + return; >> + } >> + >> + if (n > MAX_STRINGOP_ALGS) >> + { >> + warning (0, "Too many size ranges specified in option %s", >> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >> + return; >> + } >> + >> + /* Now override the default algs array */ >> + for (i = 0; i < n; i++) >> + { >> + *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max; >> + *const_cast<stringop_alg *>(&default_algs->size[i].alg) >> + = input_ranges[i].alg; >> + *const_cast<int *>(&default_algs->size[i].noalign) >> + = input_ranges[i].noalign; >> + } >> +} >> + >> >> /* Override various settings based on options. If MAIN_ARGS_P, the >> options are from the command line, otherwise they are from >> @@ -4021,6 +4165,21 @@ ix86_option_override_internal (bool main >> /* Handle stack protector */ >> if (!global_options_set.x_ix86_stack_protector_guard) >> ix86_stack_protector_guard = TARGET_HAS_BIONIC ? 
SSP_GLOBAL : SSP_TLS; >> + >> + /* Handle -mmemcpy-strategy= and -mmemset-strategy= */ >> + if (ix86_tune_memcpy_strategy) >> + { >> + char *str = xstrdup (ix86_tune_memcpy_strategy); >> + ix86_parse_stringop_strategy_string (str, false); >> + free (str); >> + } >> + >> + if (ix86_tune_memset_strategy) >> + { >> + char *str = xstrdup (ix86_tune_memset_strategy); >> + ix86_parse_stringop_strategy_string (str, true); >> + free (str); >> + } >> } >> >> /* Implement the TARGET_OPTION_OVERRIDE hook. */ >> @@ -22903,6 +23062,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt >> { >> case libcall: >> case no_stringop: >> + case last_alg: >> gcc_unreachable (); >> case loop_1_byte: >> need_zero_guard = true; >> @@ -23093,6 +23253,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt >> { >> case libcall: >> case no_stringop: >> + case last_alg: >> gcc_unreachable (); >> case loop_1_byte: >> case loop: >> @@ -23304,6 +23465,7 @@ ix86_expand_setmem (rtx dst, rtx count_e >> { >> case libcall: >> case no_stringop: >> + case last_alg: >> gcc_unreachable (); >> case loop: >> need_zero_guard = true; >> @@ -23481,6 +23643,7 @@ ix86_expand_setmem (rtx dst, rtx count_e >> { >> case libcall: >> case no_stringop: >> + case last_alg: >> gcc_unreachable (); >> case loop_1_byte: >> case loop: >> Index: config/i386/i386-opts.h >> =================================================================== >> --- config/i386/i386-opts.h (revision 201458) >> +++ config/i386/i386-opts.h (working copy) >> @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI >> /* Algorithm to expand string function with. 
*/ >> enum stringop_alg >> { >> - no_stringop, >> - libcall, >> - rep_prefix_1_byte, >> - rep_prefix_4_byte, >> - rep_prefix_8_byte, >> - loop_1_byte, >> - loop, >> - unrolled_loop, >> - vector_loop >> +#undef DEF_ENUM >> +#define DEF_ENUM >> + >> +#undef DEF_ALG >> +#define DEF_ALG(alg, name) alg, >> + >> +#include "stringop.def" >> +last_alg >> + >> +#undef DEF_ENUM >> +#undef DEF_ALG >> }; >> >> /* Available call abi. */ >> Index: doc/invoke.texi >> =================================================================== >> --- doc/invoke.texi (revision 201458) >> +++ doc/invoke.texi (working copy) >> @@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}. >> -mbmi2 -mrtm -mlwp -mthreads @gol >> -mno-align-stringops -minline-all-stringops @gol >> -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol >> +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy} >> -mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol >> -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol >> -mregparm=@var{num} -msseregparm @gol >> @@ -14598,6 +14599,24 @@ Expand into an inline loop. >> Always use a library call. >> @end table >> >> +@item -mmemcpy-strategy=@var{strategy} >> +@opindex mmemcpy-strategy=@var{strategy} >> +Override the internal decision heuristic to decide if @code{__builtin_memcpy} >> +should be inlined and what inline algorithm to use when the expected size >> +of the copy operation is known. @var{strategy} >> +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets. >> +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies >> +the max byte size with which inline algorithm @var{alg} is allowed. For the last >> +triplet, the @var{max_size} must be @code{-1}. The @var{max_size} of the triplets >> +in the list must be specified in increasing order. The minimal byte size for >> +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the >> +preceding range. 
>> + >> +@item -mmemset-strategy=@var{strategy} >> +@opindex mmemset-strategy=@var{strategy} >> +The option is similar to @option{-mmemcpy-strategy=} except that it is to control >> +@code{__builtin_memset} expansion. >> + >> @item -momit-leaf-frame-pointer >> @opindex momit-leaf-frame-pointer >> Don't keep the frame pointer in a register for leaf functions. This >> Index: testsuite/gcc.target/i386/memcpy-strategy-1.c >> =================================================================== >> --- testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) >> +++ testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) >> @@ -0,0 +1,12 @@ >> +/* { dg-do compile } */ >> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */ >> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */ >> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ >> + >> +char a[2048]; >> +char b[2048]; >> +void t (void) >> +{ >> + __builtin_memcpy (a, b, 2048); >> +} >> + >> Index: testsuite/gcc.target/i386/memcpy-strategy-2.c >> =================================================================== >> --- testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) >> +++ testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) >> @@ -0,0 +1,12 @@ >> +/* { dg-do compile } */ >> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */ >> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! 
{ ia32 } } } } } */ >> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ >> + >> +char a[2048]; >> +char b[2048]; >> +void t (void) >> +{ >> + __builtin_memcpy (a, b, 2048); >> +} >> + >> Index: testsuite/gcc.target/i386/memset-strategy-1.c >> =================================================================== >> --- testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) >> +++ testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) >> @@ -0,0 +1,10 @@ >> +/* { dg-do compile } */ >> +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */ >> +/* { dg-final { scan-assembler-times "memset" 2 } } */ >> + >> +char a[2048]; >> +void t (void) >> +{ >> + __builtin_memset (a, 1, 2048); >> +} >> + >> Index: testsuite/gcc.target/i386/memcpy-strategy-3.c >> =================================================================== >> --- testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) >> +++ testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) >> @@ -0,0 +1,11 @@ >> +/* { dg-do compile } */ >> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */ >> +/* { dg-final { scan-assembler-times "memcpy" 2 } } */ >> + >> +char a[2048]; >> +char b[2048]; >> +void t (void) >> +{ >> + __builtin_memcpy (a, b, 2048); >> +} >> + > [-- Attachment #2: stringop_inl_option.p.txt --] [-- Type: text/plain, Size: 18833 bytes --] Index: doc/invoke.texi =================================================================== --- doc/invoke.texi (revision 201458) +++ doc/invoke.texi (working copy) @@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}. 
-mbmi2 -mrtm -mlwp -mthreads @gol -mno-align-stringops -minline-all-stringops @gol -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy} -mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol -mregparm=@var{num} -msseregparm @gol @@ -14598,6 +14599,24 @@ Expand into an inline loop. Always use a library call. @end table +@item -mmemcpy-strategy=@var{strategy} +@opindex mmemcpy-strategy=@var{strategy} +Override the internal decision heuristic to decide if @code{__builtin_memcpy} +should be inlined and what inline algorithm to use when the expected size +of the copy operation is known. @var{strategy} +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets. +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies +the max byte size with which inline algorithm @var{alg} is allowed. For the last +triplet, the @var{max_size} must be @code{-1}. The @var{max_size} of the triplets +in the list must be specified in increasing order. The minimal byte size for +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the +preceding range. + +@item -mmemset-strategy=@var{strategy} +@opindex mmemset-strategy=@var{strategy} +The option is similar to @option{-mmemcpy-strategy=} except that it is to control +@code{__builtin_memset} expansion. + @item -momit-leaf-frame-pointer @opindex momit-leaf-frame-pointer Don't keep the frame pointer in a register for leaf functions. 
This Index: testsuite/gcc.target/i386/memcpy-strategy-2.c =================================================================== --- testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) +++ testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */ +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */ +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} + Index: testsuite/gcc.target/i386/memset-strategy-1.c =================================================================== --- testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) +++ testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */ +/* { dg-final { scan-assembler-times "memset" 2 } } */ + +char a[2048]; +void t (void) +{ + __builtin_memset (a, 1, 2048); +} + Index: testsuite/gcc.target/i386/memcpy-strategy-3.c =================================================================== --- testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) +++ testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */ +/* { dg-final { scan-assembler-times "memcpy" 2 } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} + Index: testsuite/gcc.target/i386/memcpy-strategy-1.c =================================================================== --- testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) +++ testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom 
-mmemcpy-strategy=vector_loop:-1:align" } */ +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */ +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} + Index: config/i386/stringop.def =================================================================== --- config/i386/stringop.def (revision 0) +++ config/i386/stringop.def (revision 0) @@ -0,0 +1,42 @@ +/* Definitions for option handling for IA-32. + Copyright (C) 2013 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. 
*/ + +DEF_ENUM +DEF_ALG (no_stringop, no_stringop) +DEF_ENUM +DEF_ALG (libcall, libcall) +DEF_ENUM +DEF_ALG (rep_prefix_1_byte, rep_byte) +DEF_ENUM +DEF_ALG (rep_prefix_4_byte, rep_4byte) +DEF_ENUM +DEF_ALG (rep_prefix_8_byte, rep_8byte) +DEF_ENUM +DEF_ALG (loop_1_byte, byte_loop) +DEF_ENUM +DEF_ALG (loop, loop) +DEF_ENUM +DEF_ALG (unrolled_loop, unrolled_loop) +DEF_ENUM +DEF_ALG (vector_loop, vector_loop) Index: config/i386/i386.c =================================================================== --- config/i386/i386.c (revision 201458) +++ config/i386/i386.c (working copy) @@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost = }; /* Processor costs (relative to an add) */ -static const +static struct processor_costs i386_cost = { /* 386 specific costs */ COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -226,7 +226,7 @@ struct processor_costs i386_cost = { /* 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs i486_cost = { /* 486 specific costs */ COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -298,7 +298,7 @@ struct processor_costs i486_cost = { /* 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs pentium_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -368,7 +368,7 @@ struct processor_costs pentium_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs pentiumpro_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost = 1, /* cond_not_taken_branch_cost. 
*/ }; -static const +static struct processor_costs geode_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -518,7 +518,7 @@ struct processor_costs geode_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs k6_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -591,7 +591,7 @@ struct processor_costs k6_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs athlon_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -664,7 +664,7 @@ struct processor_costs athlon_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs k8_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs pentium4_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (3), /* cost of a lea instruction */ @@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs nocona_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs atom_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ @@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = { }; /* Generic64 should produce code tuned for Nocona and K8. 
*/ -static const +static struct processor_costs generic64_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ /* On all chips taken into consideration lea is 2 cycles and more. With @@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost = }; /* core_cost should produce code tuned for Core familly of CPUs. */ -static const +static struct processor_costs core_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ /* On all chips taken into consideration lea is 2 cycles and more. With @@ -1717,7 +1717,7 @@ struct processor_costs core_cost = { /* Generic32 should produce code tuned for PPro, Pentium4, Nocona, Athlon and K8. */ -static const +static struct processor_costs generic32_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ @@ -2900,6 +2900,148 @@ ix86_debug_options (void) return; } + +static const char *stringop_alg_names[] = { +#define DEF_ENUM +#define DEF_ALG(alg, name) #name, +#include "stringop.def" +#undef DEF_ENUM +#undef DEF_ALG +}; + +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=. + The string is of the following form (or comma separated list of it): + + strategy_alg:max_size:[align|noalign] + + where the full size range for the strategy is either [0, max_size] or + [min_size, max_size], in which min_size is the max_size + 1 of the + preceding range. The last size range must have max_size == -1. + + Examples: + + 1. + -mmemcpy-strategy=libcall:-1:noalign + + this is equivalent to (for known size memcpy) -mstringop-strategy=libcall + + + 2. + -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign + + This is to tell the compiler to use the following strategy for memset + 1) when the expected size is between [1, 16], use rep_8byte strategy; + 2) when the size is between [17, 2048], use vector_loop; + 3) when the size is > 2048, use libcall. 
*/ + +struct stringop_size_range +{ + int min; + int max; + stringop_alg alg; + bool noalign; +}; + +static void +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) +{ + const struct stringop_algs *default_algs; + stringop_size_range input_ranges[MAX_STRINGOP_ALGS]; + char *curr_range_str, *next_range_str; + int i = 0, n = 0; + + if (is_memset) + default_algs = &ix86_cost->memset[TARGET_64BIT != 0]; + else + default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; + + curr_range_str = strategy_str; + + do { + + int mins, maxs; + stringop_alg alg; + char alg_name[128]; + char align[16]; + + next_range_str = strchr (curr_range_str, ','); + if (next_range_str) + *next_range_str++ = '\0'; + + if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align)) + { + warning (0, "Wrong arg %s to option %s", curr_range_str, + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1)) + { + warning (0, "Size ranges of option %s should be increasing", + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + for (i = 0; i < last_alg; i++) + { + if (!strcmp (alg_name, stringop_alg_names[i])) + { + alg = (stringop_alg) i; + break; + } + } + + if (i == last_alg) + { + warning (0, "Wrong stringop strategy name %s specified for option %s", + alg_name, + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + input_ranges[n].min = mins; + input_ranges[n].max = maxs; + input_ranges[n].alg = alg; + if (!strcmp (align, "align")) + input_ranges[n].noalign = false; + else if (!strcmp (align, "noalign")) + input_ranges[n].noalign = true; + else + { + warning (0, "Unknown alignment %s specified for option %s", + align, is_memset ? 
"-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + n++; + curr_range_str = next_range_str; + } while (curr_range_str); + + if (input_ranges[n - 1].max != -1) + { + warning (0, "The max value for the last size range should be -1" + " for option %s", + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + if (n > MAX_STRINGOP_ALGS) + { + warning (0, "Too many size ranges specified in option %s", + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + /* Now override the default algs array */ + for (i = 0; i < n; i++) + { + *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max; + *const_cast<stringop_alg *>(&default_algs->size[i].alg) + = input_ranges[i].alg; + *const_cast<int *>(&default_algs->size[i].noalign) + = input_ranges[i].noalign; + } +} + \f /* Override various settings based on options. If MAIN_ARGS_P, the options are from the command line, otherwise they are from @@ -4021,6 +4163,21 @@ ix86_option_override_internal (bool main /* Handle stack protector */ if (!global_options_set.x_ix86_stack_protector_guard) ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS; + + /* Handle -mmemcpy-strategy= and -mmemset-strategy= */ + if (ix86_tune_memcpy_strategy) + { + char *str = xstrdup (ix86_tune_memcpy_strategy); + ix86_parse_stringop_strategy_string (str, false); + free (str); + } + + if (ix86_tune_memset_strategy) + { + char *str = xstrdup (ix86_tune_memset_strategy); + ix86_parse_stringop_strategy_string (str, true); + free (str); + } } /* Implement the TARGET_OPTION_OVERRIDE hook. 
*/ @@ -22903,6 +23060,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop_1_byte: need_zero_guard = true; @@ -23093,6 +23251,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop_1_byte: case loop: @@ -23304,6 +23463,7 @@ ix86_expand_setmem (rtx dst, rtx count_e { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop: need_zero_guard = true; @@ -23481,6 +23641,7 @@ ix86_expand_setmem (rtx dst, rtx count_e { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop_1_byte: case loop: Index: config/i386/stringop.opt =================================================================== --- config/i386/stringop.opt (revision 0) +++ config/i386/stringop.opt (revision 0) @@ -0,0 +1,36 @@ +/* Definitions for option handling for IA-32. + Copyright (C) 2013 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. 
*/ + +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte) + +#undef DEF_ENUM +#define DEF_ENUM EnumValue + +#undef DEF_ALG +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg) + +#include "stringop.def" + +#undef DEF_ENUM +#undef DEF_ALG Index: config/i386/i386-opts.h =================================================================== --- config/i386/i386-opts.h (revision 201458) +++ config/i386/i386-opts.h (working copy) @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI /* Algorithm to expand string function with. */ enum stringop_alg { - no_stringop, - libcall, - rep_prefix_1_byte, - rep_prefix_4_byte, - rep_prefix_8_byte, - loop_1_byte, - loop, - unrolled_loop, - vector_loop +#undef DEF_ENUM +#define DEF_ENUM + +#undef DEF_ALG +#define DEF_ALG(alg, name) alg, + +#include "stringop.def" +last_alg + +#undef DEF_ENUM +#undef DEF_ALG }; /* Available call abi. */ Index: config/i386/i386.opt =================================================================== --- config/i386/i386.opt (revision 201458) +++ config/i386/i386.opt (working copy) @@ -316,6 +316,14 @@ mstack-arg-probe Target Report Mask(STACK_PROBE) Save Enable stack probing +mmemcpy-strategy= +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy) +Specify memcpy expansion strategy when expected size is known + +mmemset-strategy= +Target RejectNegative Joined Var(ix86_tune_memset_strategy) +Specify memset expansion strategy when expected size is known + mstringop-strategy= Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop) Chose strategy to generate stringop using ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: New parameters to control stringop expansion libcall strategy 2013-08-05 16:44 ` Xinliang David Li @ 2013-08-06 8:46 ` Michael Zolotukhin 2013-08-06 9:42 ` Jan Hubicka 2013-08-06 16:42 ` Xinliang David Li 0 siblings, 2 replies; 23+ messages in thread From: Michael Zolotukhin @ 2013-08-06 8:46 UTC (permalink / raw) To: Xinliang David Li; +Cc: GCC Patches, Jan Hubicka, Teresa Johnson There are still some formatting issues (like 8 spaces instead of a tab, wrong indentation of the do-loop and some other places) - to reveal some of them you could use the contrib/check_GNU_style.sh script. But that was nitpicking again :) Actually I wanted to ask whether you're going to use this option for some performance experiments involving memcpy/memset - if so, probably you could tune the existing cost models as well? Is it possible? Michael On 5 August 2013 20:44, Xinliang David Li <davidxl@google.com> wrote: > thanks. Updated patch attached. > > David > > On Mon, Aug 5, 2013 at 3:57 AM, Michael V. Zolotukhin > <michael.v.zolotukhin@gmail.com> wrote: >> Hi, >> This is a really convenient option, thanks for working on it. >> I can't approve it as I'm not a maintainer, but it looks ok to me, >> except for a small nitpick: afair, comments should end with >> dot-space-space. >> >> Michael >> >> On 04 Aug 20:01, Xinliang David Li wrote: >>> The attached is a new patch implementing the stringop inline strategy >>> control using two new -m options: >>> >>> -mmemcpy-strategy= >>> -mmemset-strategy= >>> >>> See changes in doc/invoke.texi for description of the new options. Example: >>> -mmemcpy-strategy=rep_8byte:64:noalign,unrolled_loop:2048:noalign,libcall:-1:noalign >>> >>> tells the compiler to inline memcpy using rep_8byte when the size is no >>> larger than 64 bytes, using unrolled_loop when size is no larger than >>> 2048, and for size > 2048, using library call. In all cases, >>> destination alignment adjustment is not done. >>> >>> Tested on x86-64/linux. Ok for trunk?
>>> >>> thanks, >>> >>> David >>> >>> 2013-08-02 Xinliang David Li <davidxl@google.com> >>> >>> * config/i386/stringop.def: New file. >>> * config/i386/stringop.opt: New file. >>> * config/i386/i386-opts.h: Include stringop.def. >>> * config/i386/i386.opt: Include stringop.opt. >>> * config/i386/i386.c (ix86_option_override_internal): >>> Override default size based stringop inline strategies >>> with options. >>> * config/i386/i386.c (ix86_parse_stringop_strategy_string): >>> New function. >>> >>> 2013-08-04 Xinliang David Li <davidxl@google.com> >>> >>> * testsuite/gcc.target/i386/memcpy-strategy-1.c: New test. >>> * testsuite/gcc.target/i386/memcpy-strategy-2.c: Ditto. >>> * testsuite/gcc.target/i386/memset-strategy-1.c: Ditto. >>> * testsuite/gcc.target/i386/memcpy-strategy-3.c: Ditto. >>> >>> >>> >>> >>> On Fri, Aug 2, 2013 at 9:21 PM, Xinliang David Li <davidxl@google.com> wrote: >>> > On x86_64, when the expected size of memcpy/memset is known (e.g., with >>> > FDO), libcall strategy is used when the size is > 8192. This value is >>> > hard coded, which makes it hard to do performance tuning. This patch >>> > adds two new parameters to do that. Potential usage includes >>> > per-application libcall strategy min-size tuning based on summary data >>> > with FDO (e.g., instruction workset size). >>> > >>> > Bootstrapped and tested on x86_64/linux. Ok for trunk? >>> > >>> > thanks, >>> > >>> > David >>> > >>> > >>> > 2013-08-02 Xinliang David Li <davidxl@google.com> >>> > >>> > * params.def: New parameters. >>> > * config/i386/i386.c (ix86_option_override_internal): >>> > Override default libcall size limit with parameters. >> >>> Index: config/i386/stringop.def >>> =================================================================== >>> --- config/i386/stringop.def (revision 0) >>> +++ config/i386/stringop.def (revision 0) >>> @@ -0,0 +1,42 @@ >>> +/* Definitions for option handling for IA-32. >>> + Copyright (C) 2013 Free Software Foundation, Inc.
>>> + >>> +This file is part of GCC. >>> + >>> +GCC is free software; you can redistribute it and/or modify >>> +it under the terms of the GNU General Public License as published by >>> +the Free Software Foundation; either version 3, or (at your option) >>> +any later version. >>> + >>> +GCC is distributed in the hope that it will be useful, >>> +but WITHOUT ANY WARRANTY; without even the implied warranty of >>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >>> +GNU General Public License for more details. >>> + >>> +Under Section 7 of GPL version 3, you are granted additional >>> +permissions described in the GCC Runtime Library Exception, version >>> +3.1, as published by the Free Software Foundation. >>> + >>> +You should have received a copy of the GNU General Public License and >>> +a copy of the GCC Runtime Library Exception along with this program; >>> +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see >>> +<http://www.gnu.org/licenses/>. */ >>> + >>> +DEF_ENUM >>> +DEF_ALG (no_stringop, no_stringop) >>> +DEF_ENUM >>> +DEF_ALG (libcall, libcall) >>> +DEF_ENUM >>> +DEF_ALG (rep_prefix_1_byte, rep_byte) >>> +DEF_ENUM >>> +DEF_ALG (rep_prefix_4_byte, rep_4byte) >>> +DEF_ENUM >>> +DEF_ALG (rep_prefix_8_byte, rep_8byte) >>> +DEF_ENUM >>> +DEF_ALG (loop_1_byte, byte_loop) >>> +DEF_ENUM >>> +DEF_ALG (loop, loop) >>> +DEF_ENUM >>> +DEF_ALG (unrolled_loop, unrolled_loop) >>> +DEF_ENUM >>> +DEF_ALG (vector_loop, vector_loop) >>> Index: config/i386/i386.opt >>> =================================================================== >>> --- config/i386/i386.opt (revision 201458) >>> +++ config/i386/i386.opt (working copy) >>> @@ -316,6 +316,14 @@ mstack-arg-probe >>> Target Report Mask(STACK_PROBE) Save >>> Enable stack probing >>> >>> +mmemcpy-strategy= >>> +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy) >>> +Specify memcpy expansion strategy when expected size is known >>> + >>> +mmemset-strategy= >>> +Target RejectNegative 
Joined Var(ix86_tune_memset_strategy) >>> +Specify memset expansion strategy when expected size is known >>> + >>> mstringop-strategy= >>> Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop) >>> Chose strategy to generate stringop using >>> Index: config/i386/stringop.opt >>> =================================================================== >>> --- config/i386/stringop.opt (revision 0) >>> +++ config/i386/stringop.opt (revision 0) >>> @@ -0,0 +1,36 @@ >>> +/* Definitions for option handling for IA-32. >>> + Copyright (C) 2013 Free Software Foundation, Inc. >>> + >>> +This file is part of GCC. >>> + >>> +GCC is free software; you can redistribute it and/or modify >>> +it under the terms of the GNU General Public License as published by >>> +the Free Software Foundation; either version 3, or (at your option) >>> +any later version. >>> + >>> +GCC is distributed in the hope that it will be useful, >>> +but WITHOUT ANY WARRANTY; without even the implied warranty of >>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >>> +GNU General Public License for more details. >>> + >>> +Under Section 7 of GPL version 3, you are granted additional >>> +permissions described in the GCC Runtime Library Exception, version >>> +3.1, as published by the Free Software Foundation. >>> + >>> +You should have received a copy of the GNU General Public License and >>> +a copy of the GCC Runtime Library Exception along with this program; >>> +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see >>> +<http://www.gnu.org/licenses/>. 
*/ >>> + >>> +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte) >>> + >>> +#undef DEF_ENUM >>> +#define DEF_ENUM EnumValue >>> + >>> +#undef DEF_ALG >>> +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg) >>> + >>> +#include "stringop.def" >>> + >>> +#undef DEF_ENUM >>> +#undef DEF_ALG >>> Index: config/i386/i386.c >>> =================================================================== >>> --- config/i386/i386.c (revision 201458) >>> +++ config/i386/i386.c (working copy) >>> @@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost = >>> }; >>> >>> /* Processor costs (relative to an add) */ >>> -static const >>> +static >>> struct processor_costs i386_cost = { /* 386 specific costs */ >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >>> @@ -226,7 +226,7 @@ struct processor_costs i386_cost = { /* >>> 1, /* cond_not_taken_branch_cost. */ >>> }; >>> >>> -static const >>> +static >>> struct processor_costs i486_cost = { /* 486 specific costs */ >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >>> @@ -298,7 +298,7 @@ struct processor_costs i486_cost = { /* >>> 1, /* cond_not_taken_branch_cost. */ >>> }; >>> >>> -static const >>> +static >>> struct processor_costs pentium_cost = { >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >>> @@ -368,7 +368,7 @@ struct processor_costs pentium_cost = { >>> 1, /* cond_not_taken_branch_cost. */ >>> }; >>> >>> -static const >>> +static >>> struct processor_costs pentiumpro_cost = { >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >>> @@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost = >>> 1, /* cond_not_taken_branch_cost. 
*/ >>> }; >>> >>> -static const >>> +static >>> struct processor_costs geode_cost = { >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >>> @@ -518,7 +518,7 @@ struct processor_costs geode_cost = { >>> 1, /* cond_not_taken_branch_cost. */ >>> }; >>> >>> -static const >>> +static >>> struct processor_costs k6_cost = { >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>> COSTS_N_INSNS (2), /* cost of a lea instruction */ >>> @@ -591,7 +591,7 @@ struct processor_costs k6_cost = { >>> 1, /* cond_not_taken_branch_cost. */ >>> }; >>> >>> -static const >>> +static >>> struct processor_costs athlon_cost = { >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>> COSTS_N_INSNS (2), /* cost of a lea instruction */ >>> @@ -664,7 +664,7 @@ struct processor_costs athlon_cost = { >>> 1, /* cond_not_taken_branch_cost. */ >>> }; >>> >>> -static const >>> +static >>> struct processor_costs k8_cost = { >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>> COSTS_N_INSNS (2), /* cost of a lea instruction */ >>> @@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = { >>> 1, /* cond_not_taken_branch_cost. */ >>> }; >>> >>> -static const >>> +static >>> struct processor_costs pentium4_cost = { >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>> COSTS_N_INSNS (3), /* cost of a lea instruction */ >>> @@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = { >>> 1, /* cond_not_taken_branch_cost. */ >>> }; >>> >>> -static const >>> +static >>> struct processor_costs nocona_cost = { >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >>> @@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = { >>> 1, /* cond_not_taken_branch_cost. 
*/ >>> }; >>> >>> -static const >>> +static >>> struct processor_costs atom_cost = { >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>> COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ >>> @@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = { >>> }; >>> >>> /* Generic64 should produce code tuned for Nocona and K8. */ >>> -static const >>> +static >>> struct processor_costs generic64_cost = { >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>> /* On all chips taken into consideration lea is 2 cycles and more. With >>> @@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost = >>> }; >>> >>> /* core_cost should produce code tuned for Core familly of CPUs. */ >>> -static const >>> +static >>> struct processor_costs core_cost = { >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>> /* On all chips taken into consideration lea is 2 cycles and more. With >>> @@ -1717,7 +1717,7 @@ struct processor_costs core_cost = { >>> >>> /* Generic32 should produce code tuned for PPro, Pentium4, Nocona, >>> Athlon and K8. */ >>> -static const >>> +static >>> struct processor_costs generic32_cost = { >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>> COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ >>> @@ -2900,6 +2900,150 @@ ix86_debug_options (void) >>> >>> return; >>> } >>> + >>> +static const char *stringop_alg_names[] = { >>> +#define DEF_ENUM >>> +#define DEF_ALG(alg, name) #name, >>> +#include "stringop.def" >>> +#undef DEF_ENUM >>> +#undef DEF_ALG >>> +}; >>> + >>> +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=. >>> + The string is of the following form (or comma separated list of it): >>> + >>> + strategy_alg:max_size:[align|noalign] >>> + >>> + where the full size range for the strategy is either [0, max_size] or >>> + [min_size, max_size], in which min_size is the max_size + 1 of the >>> + preceding range. The last size range must have max_size == -1. >>> + >>> + Examples: >>> + >>> + 1. 
>>> + -mmemcpy-strategy=libcall:-1:noalign >>> + >>> + this is equivalent to (for known size memcpy) -mstringop-strategy=libcall >>> + >>> + >>> + 2. >>> + -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign >>> + >>> + This is to tell the compiler to use the following strategy for memset >>> + 1) when the expected size is between [1, 16], use rep_8byte strategy; >>> + 2) when the size is between [17, 2048], use vector_loop; >>> + 3) when the size is > 2048, use libcall. >>> + >>> +*/ >>> + >>> +struct stringop_size_range >>> +{ >>> + int min; >>> + int max; >>> + stringop_alg alg; >>> + bool noalign; >>> +}; >>> + >>> +static void >>> +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) >>> +{ >>> + const struct stringop_algs *default_algs; >>> + stringop_size_range input_ranges[MAX_STRINGOP_ALGS]; >>> + char *curr_range_str, *next_range_str; >>> + int i = 0, n = 0; >>> + >>> + if (is_memset) >>> + default_algs = &ix86_cost->memset[TARGET_64BIT != 0]; >>> + else >>> + default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; >>> + >>> + curr_range_str = strategy_str; >>> + >>> + do { >>> + >>> + int mins, maxs; >>> + stringop_alg alg; >>> + char alg_name[128]; >>> + char align[16]; >>> + >>> + next_range_str = strchr (curr_range_str, ','); >>> + if (next_range_str) >>> + *next_range_str++ = '\0'; >>> + >>> + if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align)) >>> + { >>> + warning (0, "Wrong arg %s to option %s", curr_range_str, >>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >>> + return; >>> + } >>> + >>> + if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1)) >>> + { >>> + warning (0, "Size ranges of option %s should be increasing", >>> + is_memset ? 
"-mmemset_strategy=" : "-mmemcpy_strategy="); >>> + return; >>> + } >>> + >>> + for (i = 0; i < last_alg; i++) >>> + { >>> + if (!strcmp (alg_name, stringop_alg_names[i])) >>> + { >>> + alg = (stringop_alg) i; >>> + break; >>> + } >>> + } >>> + >>> + if (i == last_alg) >>> + { >>> + warning (0, "Wrong stringop strategy name %s specified for option %s", >>> + alg_name, >>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >>> + return; >>> + } >>> + >>> + input_ranges[n].min = mins; >>> + input_ranges[n].max = maxs; >>> + input_ranges[n].alg = alg; >>> + if (!strcmp (align, "align")) >>> + input_ranges[n].noalign = false; >>> + else if (!strcmp (align, "noalign")) >>> + input_ranges[n].noalign = true; >>> + else >>> + { >>> + warning (0, "Unknown alignment %s specified for option %s", >>> + align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >>> + return; >>> + } >>> + n++; >>> + curr_range_str = next_range_str; >>> + } while (curr_range_str); >>> + >>> + if (input_ranges[n - 1].max != -1) >>> + { >>> + warning (0, "The max value for the last size range should be -1" >>> + " for option %s", >>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >>> + return; >>> + } >>> + >>> + if (n > MAX_STRINGOP_ALGS) >>> + { >>> + warning (0, "Too many size ranges specified in option %s", >>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >>> + return; >>> + } >>> + >>> + /* Now override the default algs array */ >>> + for (i = 0; i < n; i++) >>> + { >>> + *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max; >>> + *const_cast<stringop_alg *>(&default_algs->size[i].alg) >>> + = input_ranges[i].alg; >>> + *const_cast<int *>(&default_algs->size[i].noalign) >>> + = input_ranges[i].noalign; >>> + } >>> +} >>> + >>> >>> /* Override various settings based on options. 
If MAIN_ARGS_P, the >>> options are from the command line, otherwise they are from >>> @@ -4021,6 +4165,21 @@ ix86_option_override_internal (bool main >>> /* Handle stack protector */ >>> if (!global_options_set.x_ix86_stack_protector_guard) >>> ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS; >>> + >>> + /* Handle -mmemcpy-strategy= and -mmemset-strategy= */ >>> + if (ix86_tune_memcpy_strategy) >>> + { >>> + char *str = xstrdup (ix86_tune_memcpy_strategy); >>> + ix86_parse_stringop_strategy_string (str, false); >>> + free (str); >>> + } >>> + >>> + if (ix86_tune_memset_strategy) >>> + { >>> + char *str = xstrdup (ix86_tune_memset_strategy); >>> + ix86_parse_stringop_strategy_string (str, true); >>> + free (str); >>> + } >>> } >>> >>> /* Implement the TARGET_OPTION_OVERRIDE hook. */ >>> @@ -22903,6 +23062,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt >>> { >>> case libcall: >>> case no_stringop: >>> + case last_alg: >>> gcc_unreachable (); >>> case loop_1_byte: >>> need_zero_guard = true; >>> @@ -23093,6 +23253,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt >>> { >>> case libcall: >>> case no_stringop: >>> + case last_alg: >>> gcc_unreachable (); >>> case loop_1_byte: >>> case loop: >>> @@ -23304,6 +23465,7 @@ ix86_expand_setmem (rtx dst, rtx count_e >>> { >>> case libcall: >>> case no_stringop: >>> + case last_alg: >>> gcc_unreachable (); >>> case loop: >>> need_zero_guard = true; >>> @@ -23481,6 +23643,7 @@ ix86_expand_setmem (rtx dst, rtx count_e >>> { >>> case libcall: >>> case no_stringop: >>> + case last_alg: >>> gcc_unreachable (); >>> case loop_1_byte: >>> case loop: >>> Index: config/i386/i386-opts.h >>> =================================================================== >>> --- config/i386/i386-opts.h (revision 201458) >>> +++ config/i386/i386-opts.h (working copy) >>> @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI >>> /* Algorithm to expand string function with. 
*/ >>> enum stringop_alg >>> { >>> - no_stringop, >>> - libcall, >>> - rep_prefix_1_byte, >>> - rep_prefix_4_byte, >>> - rep_prefix_8_byte, >>> - loop_1_byte, >>> - loop, >>> - unrolled_loop, >>> - vector_loop >>> +#undef DEF_ENUM >>> +#define DEF_ENUM >>> + >>> +#undef DEF_ALG >>> +#define DEF_ALG(alg, name) alg, >>> + >>> +#include "stringop.def" >>> +last_alg >>> + >>> +#undef DEF_ENUM >>> +#undef DEF_ALG >>> }; >>> >>> /* Available call abi. */ >>> Index: doc/invoke.texi >>> =================================================================== >>> --- doc/invoke.texi (revision 201458) >>> +++ doc/invoke.texi (working copy) >>> @@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}. >>> -mbmi2 -mrtm -mlwp -mthreads @gol >>> -mno-align-stringops -minline-all-stringops @gol >>> -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol >>> +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy} >>> -mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol >>> -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol >>> -mregparm=@var{num} -msseregparm @gol >>> @@ -14598,6 +14599,24 @@ Expand into an inline loop. >>> Always use a library call. >>> @end table >>> >>> +@item -mmemcpy-strategy=@var{strategy} >>> +@opindex mmemcpy-strategy=@var{strategy} >>> +Override the internal decision heuristic to decide if @code{__builtin_memcpy} >>> +should be inlined and what inline algorithm to use when the expected size >>> +of the copy operation is known. @var{strategy} >>> +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets. >>> +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies >>> +the max byte size with which inline algorithm @var{alg} is allowed. For the last >>> +triplet, the @var{max_size} must be @code{-1}. The @var{max_size} of the triplets >>> +in the list must be specified in increasing order. 
The minimal byte size for >>> +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the >>> +preceding range. >>> + >>> +@item -mmemset-strategy=@var{strategy} >>> +@opindex mmemset-strategy=@var{strategy} >>> +The option is similar to @option{-mmemcpy-strategy=} except that it is to control >>> +@code{__builtin_memset} expansion. >>> + >>> @item -momit-leaf-frame-pointer >>> @opindex momit-leaf-frame-pointer >>> Don't keep the frame pointer in a register for leaf functions. This >>> Index: testsuite/gcc.target/i386/memcpy-strategy-1.c >>> =================================================================== >>> --- testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) >>> +++ testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) >>> @@ -0,0 +1,12 @@ >>> +/* { dg-do compile } */ >>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */ >>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */ >>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ >>> + >>> +char a[2048]; >>> +char b[2048]; >>> +void t (void) >>> +{ >>> + __builtin_memcpy (a, b, 2048); >>> +} >>> + >>> Index: testsuite/gcc.target/i386/memcpy-strategy-2.c >>> =================================================================== >>> --- testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) >>> +++ testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) >>> @@ -0,0 +1,12 @@ >>> +/* { dg-do compile } */ >>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */ >>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! 
{ ia32 } } } } } */ >>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ >>> + >>> +char a[2048]; >>> +char b[2048]; >>> +void t (void) >>> +{ >>> + __builtin_memcpy (a, b, 2048); >>> +} >>> + >>> Index: testsuite/gcc.target/i386/memset-strategy-1.c >>> =================================================================== >>> --- testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) >>> +++ testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) >>> @@ -0,0 +1,10 @@ >>> +/* { dg-do compile } */ >>> +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */ >>> +/* { dg-final { scan-assembler-times "memset" 2 } } */ >>> + >>> +char a[2048]; >>> +void t (void) >>> +{ >>> + __builtin_memset (a, 1, 2048); >>> +} >>> + >>> Index: testsuite/gcc.target/i386/memcpy-strategy-3.c >>> =================================================================== >>> --- testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) >>> +++ testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) >>> @@ -0,0 +1,11 @@ >>> +/* { dg-do compile } */ >>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */ >>> +/* { dg-final { scan-assembler-times "memcpy" 2 } } */ >>> + >>> +char a[2048]; >>> +char b[2048]; >>> +void t (void) >>> +{ >>> + __builtin_memcpy (a, b, 2048); >>> +} >>> + >> -- --- Best regards, Michael V. Zolotukhin, Software Engineer Intel Corporation. ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: New parameters to control stringop expansion libcall strategy 2013-08-06 8:46 ` Michael Zolotukhin @ 2013-08-06 9:42 ` Jan Hubicka 2013-08-06 16:08 ` Xinliang David Li 2013-08-07 17:06 ` Xinliang David Li 2013-08-06 16:42 ` Xinliang David Li 1 sibling, 2 replies; 23+ messages in thread From: Jan Hubicka @ 2013-08-06 9:42 UTC (permalink / raw) To: Michael Zolotukhin Cc: Xinliang David Li, GCC Patches, Jan Hubicka, Teresa Johnson > >>> 2013-08-02 Xinliang David Li <davidxl@google.com> > >>> > >>> * config/i386/stringop.def: New file. > >>> * config/i386/stringop.opt: New file. > >>> * config/i386/i386-opts.h: Include stringopt.def. > >>> * config/i386/i386.opt: Include stringopt.opt. > >>> * config/i386/i386.c (ix86_option_override_internal): > >>> Override default size based stringop inline strategies > >>> with options. > >>> * config/i386/i386.c (ix86_parse_stringop_strategy_string): > >>> New function. > >>> > >>> 2013-08-04 Xinliang David Li <davidxl@google.com> > >>> > >>> * testsuite/gcc.target/i386/memcpy-strategy-1.c: New test. > >>> * testsuite/gcc.target/i386/memcpy-strategy-2.c: Ditto. > >>> * testsuite/gcc.target/i386/memset-strategy-1.c: Ditto. > >>> * testsuite/gcc.target/i386/memcpy-strategy-3.c: Ditto. The patch looks reasonable to me in general. I wonder why we need to make all the cost tables non-const instead of just having writable storage for the "current strategy" like we do with other flags anyway. Your strings are definitely more readable than the in-memory representation I came up with. Perhaps we can even turn the cost tables into strings for easier maintenance? I guess they are a bit confusing for people not familiar with the code. Honza > >>> > >>> > >>> > >>> > >>> On Fri, Aug 2, 2013 at 9:21 PM, Xinliang David Li <davidxl@google.com> wrote: > >>> > On x86_64, when the expected size of memcpy/memset is known (e.g, with > >>> > FDO), libcall strategy is used with the size is > 8192.
This value is > >>> > hard coded, which makes it hard to do performance tuning. This patch > >>> > adds two new parameters to do that. Potential usage includes > >>> > per-application libcall strategy min-size tuning based on summary data > >>> > with FDO (e.g, instruction workset size). > >>> > > >>> > Bootstrap and tested on x86_64/linux. Ok for trunk? > >>> > > >>> > thanks, > >>> > > >>> > David > >>> > > >>> > > >>> > 2013-08-02 Xinliang David Li <davidxl@google.com> > >>> > > >>> > * params.def: New parameters. > >>> > * config/i386/i386.c (ix86_option_override_internal): > >>> > Override default libcall size limit with parameters. > >> > >>> Index: config/i386/stringop.def > >>> =================================================================== > >>> --- config/i386/stringop.def (revision 0) > >>> +++ config/i386/stringop.def (revision 0) > >>> @@ -0,0 +1,42 @@ > >>> +/* Definitions for option handling for IA-32. > >>> + Copyright (C) 2013 Free Software Foundation, Inc. > >>> + > >>> +This file is part of GCC. > >>> + > >>> +GCC is free software; you can redistribute it and/or modify > >>> +it under the terms of the GNU General Public License as published by > >>> +the Free Software Foundation; either version 3, or (at your option) > >>> +any later version. > >>> + > >>> +GCC is distributed in the hope that it will be useful, > >>> +but WITHOUT ANY WARRANTY; without even the implied warranty of > >>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > >>> +GNU General Public License for more details. > >>> + > >>> +Under Section 7 of GPL version 3, you are granted additional > >>> +permissions described in the GCC Runtime Library Exception, version > >>> +3.1, as published by the Free Software Foundation. > >>> + > >>> +You should have received a copy of the GNU General Public License and > >>> +a copy of the GCC Runtime Library Exception along with this program; > >>> +see the files COPYING3 and COPYING.RUNTIME respectively. 
If not, see > >>> +<http://www.gnu.org/licenses/>. */ > >>> + > >>> +DEF_ENUM > >>> +DEF_ALG (no_stringop, no_stringop) > >>> +DEF_ENUM > >>> +DEF_ALG (libcall, libcall) > >>> +DEF_ENUM > >>> +DEF_ALG (rep_prefix_1_byte, rep_byte) > >>> +DEF_ENUM > >>> +DEF_ALG (rep_prefix_4_byte, rep_4byte) > >>> +DEF_ENUM > >>> +DEF_ALG (rep_prefix_8_byte, rep_8byte) > >>> +DEF_ENUM > >>> +DEF_ALG (loop_1_byte, byte_loop) > >>> +DEF_ENUM > >>> +DEF_ALG (loop, loop) > >>> +DEF_ENUM > >>> +DEF_ALG (unrolled_loop, unrolled_loop) > >>> +DEF_ENUM > >>> +DEF_ALG (vector_loop, vector_loop) > >>> Index: config/i386/i386.opt > >>> =================================================================== > >>> --- config/i386/i386.opt (revision 201458) > >>> +++ config/i386/i386.opt (working copy) > >>> @@ -316,6 +316,14 @@ mstack-arg-probe > >>> Target Report Mask(STACK_PROBE) Save > >>> Enable stack probing > >>> > >>> +mmemcpy-strategy= > >>> +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy) > >>> +Specify memcpy expansion strategy when expected size is known > >>> + > >>> +mmemset-strategy= > >>> +Target RejectNegative Joined Var(ix86_tune_memset_strategy) > >>> +Specify memset expansion strategy when expected size is known > >>> + > >>> mstringop-strategy= > >>> Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop) > >>> Chose strategy to generate stringop using > >>> Index: config/i386/stringop.opt > >>> =================================================================== > >>> --- config/i386/stringop.opt (revision 0) > >>> +++ config/i386/stringop.opt (revision 0) > >>> @@ -0,0 +1,36 @@ > >>> +/* Definitions for option handling for IA-32. > >>> + Copyright (C) 2013 Free Software Foundation, Inc. > >>> + > >>> +This file is part of GCC. 
> >>> + > >>> +GCC is free software; you can redistribute it and/or modify > >>> +it under the terms of the GNU General Public License as published by > >>> +the Free Software Foundation; either version 3, or (at your option) > >>> +any later version. > >>> + > >>> +GCC is distributed in the hope that it will be useful, > >>> +but WITHOUT ANY WARRANTY; without even the implied warranty of > >>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > >>> +GNU General Public License for more details. > >>> + > >>> +Under Section 7 of GPL version 3, you are granted additional > >>> +permissions described in the GCC Runtime Library Exception, version > >>> +3.1, as published by the Free Software Foundation. > >>> + > >>> +You should have received a copy of the GNU General Public License and > >>> +a copy of the GCC Runtime Library Exception along with this program; > >>> +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see > >>> +<http://www.gnu.org/licenses/>. */ > >>> + > >>> +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte) > >>> + > >>> +#undef DEF_ENUM > >>> +#define DEF_ENUM EnumValue > >>> + > >>> +#undef DEF_ALG > >>> +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg) > >>> + > >>> +#include "stringop.def" > >>> + > >>> +#undef DEF_ENUM > >>> +#undef DEF_ALG > >>> Index: config/i386/i386.c > >>> =================================================================== > >>> --- config/i386/i386.c (revision 201458) > >>> +++ config/i386/i386.c (working copy) > >>> @@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost = > >>> }; > >>> > >>> /* Processor costs (relative to an add) */ > >>> -static const > >>> +static > >>> struct processor_costs i386_cost = { /* 386 specific costs */ > >>> COSTS_N_INSNS (1), /* cost of an add instruction */ > >>> COSTS_N_INSNS (1), /* cost of a lea instruction */ > >>> @@ -226,7 +226,7 @@ struct processor_costs i386_cost = { /* > >>> 1, /* cond_not_taken_branch_cost. 
*/ > >>> }; > >>> > >>> -static const > >>> +static > >>> struct processor_costs i486_cost = { /* 486 specific costs */ > >>> COSTS_N_INSNS (1), /* cost of an add instruction */ > >>> COSTS_N_INSNS (1), /* cost of a lea instruction */ > >>> @@ -298,7 +298,7 @@ struct processor_costs i486_cost = { /* > >>> 1, /* cond_not_taken_branch_cost. */ > >>> }; > >>> > >>> -static const > >>> +static > >>> struct processor_costs pentium_cost = { > >>> COSTS_N_INSNS (1), /* cost of an add instruction */ > >>> COSTS_N_INSNS (1), /* cost of a lea instruction */ > >>> @@ -368,7 +368,7 @@ struct processor_costs pentium_cost = { > >>> 1, /* cond_not_taken_branch_cost. */ > >>> }; > >>> > >>> -static const > >>> +static > >>> struct processor_costs pentiumpro_cost = { > >>> COSTS_N_INSNS (1), /* cost of an add instruction */ > >>> COSTS_N_INSNS (1), /* cost of a lea instruction */ > >>> @@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost = > >>> 1, /* cond_not_taken_branch_cost. */ > >>> }; > >>> > >>> -static const > >>> +static > >>> struct processor_costs geode_cost = { > >>> COSTS_N_INSNS (1), /* cost of an add instruction */ > >>> COSTS_N_INSNS (1), /* cost of a lea instruction */ > >>> @@ -518,7 +518,7 @@ struct processor_costs geode_cost = { > >>> 1, /* cond_not_taken_branch_cost. */ > >>> }; > >>> > >>> -static const > >>> +static > >>> struct processor_costs k6_cost = { > >>> COSTS_N_INSNS (1), /* cost of an add instruction */ > >>> COSTS_N_INSNS (2), /* cost of a lea instruction */ > >>> @@ -591,7 +591,7 @@ struct processor_costs k6_cost = { > >>> 1, /* cond_not_taken_branch_cost. */ > >>> }; > >>> > >>> -static const > >>> +static > >>> struct processor_costs athlon_cost = { > >>> COSTS_N_INSNS (1), /* cost of an add instruction */ > >>> COSTS_N_INSNS (2), /* cost of a lea instruction */ > >>> @@ -664,7 +664,7 @@ struct processor_costs athlon_cost = { > >>> 1, /* cond_not_taken_branch_cost. 
*/ > >>> }; > >>> > >>> -static const > >>> +static > >>> struct processor_costs k8_cost = { > >>> COSTS_N_INSNS (1), /* cost of an add instruction */ > >>> COSTS_N_INSNS (2), /* cost of a lea instruction */ > >>> @@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = { > >>> 1, /* cond_not_taken_branch_cost. */ > >>> }; > >>> > >>> -static const > >>> +static > >>> struct processor_costs pentium4_cost = { > >>> COSTS_N_INSNS (1), /* cost of an add instruction */ > >>> COSTS_N_INSNS (3), /* cost of a lea instruction */ > >>> @@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = { > >>> 1, /* cond_not_taken_branch_cost. */ > >>> }; > >>> > >>> -static const > >>> +static > >>> struct processor_costs nocona_cost = { > >>> COSTS_N_INSNS (1), /* cost of an add instruction */ > >>> COSTS_N_INSNS (1), /* cost of a lea instruction */ > >>> @@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = { > >>> 1, /* cond_not_taken_branch_cost. */ > >>> }; > >>> > >>> -static const > >>> +static > >>> struct processor_costs atom_cost = { > >>> COSTS_N_INSNS (1), /* cost of an add instruction */ > >>> COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ > >>> @@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = { > >>> }; > >>> > >>> /* Generic64 should produce code tuned for Nocona and K8. */ > >>> -static const > >>> +static > >>> struct processor_costs generic64_cost = { > >>> COSTS_N_INSNS (1), /* cost of an add instruction */ > >>> /* On all chips taken into consideration lea is 2 cycles and more. With > >>> @@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost = > >>> }; > >>> > >>> /* core_cost should produce code tuned for Core familly of CPUs. */ > >>> -static const > >>> +static > >>> struct processor_costs core_cost = { > >>> COSTS_N_INSNS (1), /* cost of an add instruction */ > >>> /* On all chips taken into consideration lea is 2 cycles and more. 
With > >>> @@ -1717,7 +1717,7 @@ struct processor_costs core_cost = { > >>> > >>> /* Generic32 should produce code tuned for PPro, Pentium4, Nocona, > >>> Athlon and K8. */ > >>> -static const > >>> +static > >>> struct processor_costs generic32_cost = { > >>> COSTS_N_INSNS (1), /* cost of an add instruction */ > >>> COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ > >>> @@ -2900,6 +2900,150 @@ ix86_debug_options (void) > >>> > >>> return; > >>> } > >>> + > >>> +static const char *stringop_alg_names[] = { > >>> +#define DEF_ENUM > >>> +#define DEF_ALG(alg, name) #name, > >>> +#include "stringop.def" > >>> +#undef DEF_ENUM > >>> +#undef DEF_ALG > >>> +}; > >>> + > >>> +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=. > >>> + The string is of the following form (or comma separated list of it): > >>> + > >>> + strategy_alg:max_size:[align|noalign] > >>> + > >>> + where the full size range for the strategy is either [0, max_size] or > >>> + [min_size, max_size], in which min_size is the max_size + 1 of the > >>> + preceding range. The last size range must have max_size == -1. > >>> + > >>> + Examples: > >>> + > >>> + 1. > >>> + -mmemcpy-strategy=libcall:-1:noalign > >>> + > >>> + this is equivalent to (for known size memcpy) -mstringop-strategy=libcall > >>> + > >>> + > >>> + 2. > >>> + -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign > >>> + > >>> + This is to tell the compiler to use the following strategy for memset > >>> + 1) when the expected size is between [1, 16], use rep_8byte strategy; > >>> + 2) when the size is between [17, 2048], use vector_loop; > >>> + 3) when the size is > 2048, use libcall. 
> >>> + > >>> +*/ > >>> + > >>> +struct stringop_size_range > >>> +{ > >>> + int min; > >>> + int max; > >>> + stringop_alg alg; > >>> + bool noalign; > >>> +}; > >>> + > >>> +static void > >>> +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) > >>> +{ > >>> + const struct stringop_algs *default_algs; > >>> + stringop_size_range input_ranges[MAX_STRINGOP_ALGS]; > >>> + char *curr_range_str, *next_range_str; > >>> + int i = 0, n = 0; > >>> + > >>> + if (is_memset) > >>> + default_algs = &ix86_cost->memset[TARGET_64BIT != 0]; > >>> + else > >>> + default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; > >>> + > >>> + curr_range_str = strategy_str; > >>> + > >>> + do { > >>> + > >>> + int mins, maxs; > >>> + stringop_alg alg; > >>> + char alg_name[128]; > >>> + char align[16]; > >>> + > >>> + next_range_str = strchr (curr_range_str, ','); > >>> + if (next_range_str) > >>> + *next_range_str++ = '\0'; > >>> + > >>> + if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align)) > >>> + { > >>> + warning (0, "Wrong arg %s to option %s", curr_range_str, > >>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); > >>> + return; > >>> + } > >>> + > >>> + if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1)) > >>> + { > >>> + warning (0, "Size ranges of option %s should be increasing", > >>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); > >>> + return; > >>> + } > >>> + > >>> + for (i = 0; i < last_alg; i++) > >>> + { > >>> + if (!strcmp (alg_name, stringop_alg_names[i])) > >>> + { > >>> + alg = (stringop_alg) i; > >>> + break; > >>> + } > >>> + } > >>> + > >>> + if (i == last_alg) > >>> + { > >>> + warning (0, "Wrong stringop strategy name %s specified for option %s", > >>> + alg_name, > >>> + is_memset ? 
"-mmemset_strategy=" : "-mmemcpy_strategy="); > >>> + return; > >>> + } > >>> + > >>> + input_ranges[n].min = mins; > >>> + input_ranges[n].max = maxs; > >>> + input_ranges[n].alg = alg; > >>> + if (!strcmp (align, "align")) > >>> + input_ranges[n].noalign = false; > >>> + else if (!strcmp (align, "noalign")) > >>> + input_ranges[n].noalign = true; > >>> + else > >>> + { > >>> + warning (0, "Unknown alignment %s specified for option %s", > >>> + align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); > >>> + return; > >>> + } > >>> + n++; > >>> + curr_range_str = next_range_str; > >>> + } while (curr_range_str); > >>> + > >>> + if (input_ranges[n - 1].max != -1) > >>> + { > >>> + warning (0, "The max value for the last size range should be -1" > >>> + " for option %s", > >>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); > >>> + return; > >>> + } > >>> + > >>> + if (n > MAX_STRINGOP_ALGS) > >>> + { > >>> + warning (0, "Too many size ranges specified in option %s", > >>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); > >>> + return; > >>> + } > >>> + > >>> + /* Now override the default algs array */ > >>> + for (i = 0; i < n; i++) > >>> + { > >>> + *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max; > >>> + *const_cast<stringop_alg *>(&default_algs->size[i].alg) > >>> + = input_ranges[i].alg; > >>> + *const_cast<int *>(&default_algs->size[i].noalign) > >>> + = input_ranges[i].noalign; > >>> + } > >>> +} > >>> + > >>> > >>> /* Override various settings based on options. If MAIN_ARGS_P, the > >>> options are from the command line, otherwise they are from > >>> @@ -4021,6 +4165,21 @@ ix86_option_override_internal (bool main > >>> /* Handle stack protector */ > >>> if (!global_options_set.x_ix86_stack_protector_guard) > >>> ix86_stack_protector_guard = TARGET_HAS_BIONIC ? 
SSP_GLOBAL : SSP_TLS; > >>> + > >>> + /* Handle -mmemcpy-strategy= and -mmemset-strategy= */ > >>> + if (ix86_tune_memcpy_strategy) > >>> + { > >>> + char *str = xstrdup (ix86_tune_memcpy_strategy); > >>> + ix86_parse_stringop_strategy_string (str, false); > >>> + free (str); > >>> + } > >>> + > >>> + if (ix86_tune_memset_strategy) > >>> + { > >>> + char *str = xstrdup (ix86_tune_memset_strategy); > >>> + ix86_parse_stringop_strategy_string (str, true); > >>> + free (str); > >>> + } > >>> } > >>> > >>> /* Implement the TARGET_OPTION_OVERRIDE hook. */ > >>> @@ -22903,6 +23062,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt > >>> { > >>> case libcall: > >>> case no_stringop: > >>> + case last_alg: > >>> gcc_unreachable (); > >>> case loop_1_byte: > >>> need_zero_guard = true; > >>> @@ -23093,6 +23253,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt > >>> { > >>> case libcall: > >>> case no_stringop: > >>> + case last_alg: > >>> gcc_unreachable (); > >>> case loop_1_byte: > >>> case loop: > >>> @@ -23304,6 +23465,7 @@ ix86_expand_setmem (rtx dst, rtx count_e > >>> { > >>> case libcall: > >>> case no_stringop: > >>> + case last_alg: > >>> gcc_unreachable (); > >>> case loop: > >>> need_zero_guard = true; > >>> @@ -23481,6 +23643,7 @@ ix86_expand_setmem (rtx dst, rtx count_e > >>> { > >>> case libcall: > >>> case no_stringop: > >>> + case last_alg: > >>> gcc_unreachable (); > >>> case loop_1_byte: > >>> case loop: > >>> Index: config/i386/i386-opts.h > >>> =================================================================== > >>> --- config/i386/i386-opts.h (revision 201458) > >>> +++ config/i386/i386-opts.h (working copy) > >>> @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI > >>> /* Algorithm to expand string function with. 
*/ > >>> enum stringop_alg > >>> { > >>> - no_stringop, > >>> - libcall, > >>> - rep_prefix_1_byte, > >>> - rep_prefix_4_byte, > >>> - rep_prefix_8_byte, > >>> - loop_1_byte, > >>> - loop, > >>> - unrolled_loop, > >>> - vector_loop > >>> +#undef DEF_ENUM > >>> +#define DEF_ENUM > >>> + > >>> +#undef DEF_ALG > >>> +#define DEF_ALG(alg, name) alg, > >>> + > >>> +#include "stringop.def" > >>> +last_alg > >>> + > >>> +#undef DEF_ENUM > >>> +#undef DEF_ALG > >>> }; > >>> > >>> /* Available call abi. */ > >>> Index: doc/invoke.texi > >>> =================================================================== > >>> --- doc/invoke.texi (revision 201458) > >>> +++ doc/invoke.texi (working copy) > >>> @@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}. > >>> -mbmi2 -mrtm -mlwp -mthreads @gol > >>> -mno-align-stringops -minline-all-stringops @gol > >>> -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol > >>> +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy} > >>> -mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol > >>> -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol > >>> -mregparm=@var{num} -msseregparm @gol > >>> @@ -14598,6 +14599,24 @@ Expand into an inline loop. > >>> Always use a library call. > >>> @end table > >>> > >>> +@item -mmemcpy-strategy=@var{strategy} > >>> +@opindex mmemcpy-strategy=@var{strategy} > >>> +Override the internal decision heuristic to decide if @code{__builtin_memcpy} > >>> +should be inlined and what inline algorithm to use when the expected size > >>> +of the copy operation is known. @var{strategy} > >>> +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets. > >>> +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies > >>> +the max byte size with which inline algorithm @var{alg} is allowed. For the last > >>> +triplet, the @var{max_size} must be @code{-1}. 
The @var{max_size} of the triplets > >>> +in the list must be specified in increasing order. The minimal byte size for > >>> +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the > >>> +preceding range. > >>> + > >>> +@item -mmemset-strategy=@var{strategy} > >>> +@opindex mmemset-strategy=@var{strategy} > >>> +The option is similar to @option{-mmemcpy-strategy=} except that it is to control > >>> +@code{__builtin_memset} expansion. > >>> + > >>> @item -momit-leaf-frame-pointer > >>> @opindex momit-leaf-frame-pointer > >>> Don't keep the frame pointer in a register for leaf functions. This > >>> Index: testsuite/gcc.target/i386/memcpy-strategy-1.c > >>> =================================================================== > >>> --- testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) > >>> +++ testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) > >>> @@ -0,0 +1,12 @@ > >>> +/* { dg-do compile } */ > >>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */ > >>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */ > >>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ > >>> + > >>> +char a[2048]; > >>> +char b[2048]; > >>> +void t (void) > >>> +{ > >>> + __builtin_memcpy (a, b, 2048); > >>> +} > >>> + > >>> Index: testsuite/gcc.target/i386/memcpy-strategy-2.c > >>> =================================================================== > >>> --- testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) > >>> +++ testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) > >>> @@ -0,0 +1,12 @@ > >>> +/* { dg-do compile } */ > >>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */ > >>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! 
{ ia32 } } } } } */ > >>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ > >>> + > >>> +char a[2048]; > >>> +char b[2048]; > >>> +void t (void) > >>> +{ > >>> + __builtin_memcpy (a, b, 2048); > >>> +} > >>> + > >>> Index: testsuite/gcc.target/i386/memset-strategy-1.c > >>> =================================================================== > >>> --- testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) > >>> +++ testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) > >>> @@ -0,0 +1,10 @@ > >>> +/* { dg-do compile } */ > >>> +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */ > >>> +/* { dg-final { scan-assembler-times "memset" 2 } } */ > >>> + > >>> +char a[2048]; > >>> +void t (void) > >>> +{ > >>> + __builtin_memset (a, 1, 2048); > >>> +} > >>> + > >>> Index: testsuite/gcc.target/i386/memcpy-strategy-3.c > >>> =================================================================== > >>> --- testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) > >>> +++ testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) > >>> @@ -0,0 +1,11 @@ > >>> +/* { dg-do compile } */ > >>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */ > >>> +/* { dg-final { scan-assembler-times "memcpy" 2 } } */ > >>> + > >>> +char a[2048]; > >>> +char b[2048]; > >>> +void t (void) > >>> +{ > >>> + __builtin_memcpy (a, b, 2048); > >>> +} > >>> + > >> > > > > -- > --- > Best regards, > Michael V. Zolotukhin, > Software Engineer > Intel Corporation. ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: New parameters to control stringop expansion libcall strategy 2013-08-06 9:42 ` Jan Hubicka @ 2013-08-06 16:08 ` Xinliang David Li 2013-08-07 17:06 ` Xinliang David Li 1 sibling, 0 replies; 23+ messages in thread From: Xinliang David Li @ 2013-08-06 16:08 UTC (permalink / raw) To: Jan Hubicka; +Cc: Michael Zolotukhin, GCC Patches, Teresa Johnson On Tue, Aug 6, 2013 at 2:42 AM, Jan Hubicka <hubicka@ucw.cz> wrote: >> >>> 2013-08-02 Xinliang David Li <davidxl@google.com> >> >>> >> >>> * config/i386/stringop.def: New file. >> >>> * config/i386/stringop.opt: New file. >> >>> * config/i386/i386-opts.h: Include stringopt.def. >> >>> * config/i386/i386.opt: Include stringopt.opt. >> >>> * config/i386/i386.c (ix86_option_override_internal): >> >>> Override default size based stringop inline strategies >> >>> with options. >> >>> * config/i386/i386.c (ix86_parse_stringop_strategy_string): >> >>> New function. >> >>> >> >>> 2013-08-04 Xinliang David Li <davidxl@google.com> >> >>> >> >>> * testsuite/gcc.target/i386/memcpy-strategy-1.c: New test. >> >>> * testsuite/gcc.target/i386/memcpy-strategy-2.c: Ditto. >> >>> * testsuite/gcc.target/i386/memset-strategy-1.c: Ditto. >> >>> * testsuite/gcc.target/i386/memcpy-strategy-3.c: Ditto. > > The patch looks resonable to me in general. I wonder why we need to bring > all the cost tables non-const instead of just having writable storage for > the "current strategy" like we do with other flags anyway. Having const on those arrays does not bring us anything -- those tables will be accessed indirectly so const-prop won't happen anyway. current_strategy is an embedded struct in the cost array so it ends up in RO data when the top-level array is const. > > Your strings are definitely more readable than the in-memory representation > I came up with. Perhaps we can even turn the cost tables into strings > for easier maintenance? I guess they are bit confusing for people > not familiar with a code.
I think the in-memory representation is fine -- if there is a need for internal representation cleanup, it should be done as another patch. WDYT? thanks, David > > Honza >> >>> >> >>> >> >>> >> >>> On Fri, Aug 2, 2013 at 9:21 PM, Xinliang David Li <davidxl@google.com> wrote: >> >>> > On x86_64, when the expected size of memcpy/memset is known (e.g, with >> >>> > FDO), libcall strategy is used with the size is > 8192. This value is >> >>> > hard coded, which makes it hard to do performance tuning. This patch >> >>> > adds two new parameters to do that. Potential usage includes >> >>> > per-application libcall strategy min-size tuning based on summary data >> >>> > with FDO (e.g, instruction workset size). >> >>> > >> >>> > Bootstrap and tested on x86_64/linux. Ok for trunk? >> >>> > >> >>> > thanks, >> >>> > >> >>> > David >> >>> > >> >>> > >> >>> > 2013-08-02 Xinliang David Li <davidxl@google.com> >> >>> > >> >>> > * params.def: New parameters. >> >>> > * config/i386/i386.c (ix86_option_override_internal): >> >>> > Override default libcall size limit with parameters. >> >> >> >>> Index: config/i386/stringop.def >> >>> =================================================================== >> >>> --- config/i386/stringop.def (revision 0) >> >>> +++ config/i386/stringop.def (revision 0) >> >>> @@ -0,0 +1,42 @@ >> >>> +/* Definitions for option handling for IA-32. >> >>> + Copyright (C) 2013 Free Software Foundation, Inc. >> >>> + >> >>> +This file is part of GCC. >> >>> + >> >>> +GCC is free software; you can redistribute it and/or modify >> >>> +it under the terms of the GNU General Public License as published by >> >>> +the Free Software Foundation; either version 3, or (at your option) >> >>> +any later version. >> >>> + >> >>> +GCC is distributed in the hope that it will be useful, >> >>> +but WITHOUT ANY WARRANTY; without even the implied warranty of >> >>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the >> >>> +GNU General Public License for more details. >> >>> + >> >>> +Under Section 7 of GPL version 3, you are granted additional >> >>> +permissions described in the GCC Runtime Library Exception, version >> >>> +3.1, as published by the Free Software Foundation. >> >>> + >> >>> +You should have received a copy of the GNU General Public License and >> >>> +a copy of the GCC Runtime Library Exception along with this program; >> >>> +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see >> >>> +<http://www.gnu.org/licenses/>. */ >> >>> + >> >>> +DEF_ENUM >> >>> +DEF_ALG (no_stringop, no_stringop) >> >>> +DEF_ENUM >> >>> +DEF_ALG (libcall, libcall) >> >>> +DEF_ENUM >> >>> +DEF_ALG (rep_prefix_1_byte, rep_byte) >> >>> +DEF_ENUM >> >>> +DEF_ALG (rep_prefix_4_byte, rep_4byte) >> >>> +DEF_ENUM >> >>> +DEF_ALG (rep_prefix_8_byte, rep_8byte) >> >>> +DEF_ENUM >> >>> +DEF_ALG (loop_1_byte, byte_loop) >> >>> +DEF_ENUM >> >>> +DEF_ALG (loop, loop) >> >>> +DEF_ENUM >> >>> +DEF_ALG (unrolled_loop, unrolled_loop) >> >>> +DEF_ENUM >> >>> +DEF_ALG (vector_loop, vector_loop) >> >>> Index: config/i386/i386.opt >> >>> =================================================================== >> >>> --- config/i386/i386.opt (revision 201458) >> >>> +++ config/i386/i386.opt (working copy) >> >>> @@ -316,6 +316,14 @@ mstack-arg-probe >> >>> Target Report Mask(STACK_PROBE) Save >> >>> Enable stack probing >> >>> >> >>> +mmemcpy-strategy= >> >>> +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy) >> >>> +Specify memcpy expansion strategy when expected size is known >> >>> + >> >>> +mmemset-strategy= >> >>> +Target RejectNegative Joined Var(ix86_tune_memset_strategy) >> >>> +Specify memset expansion strategy when expected size is known >> >>> + >> >>> mstringop-strategy= >> >>> Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop) >> >>> Chose strategy to generate stringop using >> >>> Index: config/i386/stringop.opt >> >>> 
=================================================================== >> >>> --- config/i386/stringop.opt (revision 0) >> >>> +++ config/i386/stringop.opt (revision 0) >> >>> @@ -0,0 +1,36 @@ >> >>> +/* Definitions for option handling for IA-32. >> >>> + Copyright (C) 2013 Free Software Foundation, Inc. >> >>> + >> >>> +This file is part of GCC. >> >>> + >> >>> +GCC is free software; you can redistribute it and/or modify >> >>> +it under the terms of the GNU General Public License as published by >> >>> +the Free Software Foundation; either version 3, or (at your option) >> >>> +any later version. >> >>> + >> >>> +GCC is distributed in the hope that it will be useful, >> >>> +but WITHOUT ANY WARRANTY; without even the implied warranty of >> >>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >> >>> +GNU General Public License for more details. >> >>> + >> >>> +Under Section 7 of GPL version 3, you are granted additional >> >>> +permissions described in the GCC Runtime Library Exception, version >> >>> +3.1, as published by the Free Software Foundation. >> >>> + >> >>> +You should have received a copy of the GNU General Public License and >> >>> +a copy of the GCC Runtime Library Exception along with this program; >> >>> +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see >> >>> +<http://www.gnu.org/licenses/>. 
*/ >> >>> + >> >>> +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte) >> >>> + >> >>> +#undef DEF_ENUM >> >>> +#define DEF_ENUM EnumValue >> >>> + >> >>> +#undef DEF_ALG >> >>> +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg) >> >>> + >> >>> +#include "stringop.def" >> >>> + >> >>> +#undef DEF_ENUM >> >>> +#undef DEF_ALG >> >>> Index: config/i386/i386.c >> >>> =================================================================== >> >>> --- config/i386/i386.c (revision 201458) >> >>> +++ config/i386/i386.c (working copy) >> >>> @@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost = >> >>> }; >> >>> >> >>> /* Processor costs (relative to an add) */ >> >>> -static const >> >>> +static >> >>> struct processor_costs i386_cost = { /* 386 specific costs */ >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >> >>> @@ -226,7 +226,7 @@ struct processor_costs i386_cost = { /* >> >>> 1, /* cond_not_taken_branch_cost. */ >> >>> }; >> >>> >> >>> -static const >> >>> +static >> >>> struct processor_costs i486_cost = { /* 486 specific costs */ >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >> >>> @@ -298,7 +298,7 @@ struct processor_costs i486_cost = { /* >> >>> 1, /* cond_not_taken_branch_cost. */ >> >>> }; >> >>> >> >>> -static const >> >>> +static >> >>> struct processor_costs pentium_cost = { >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >> >>> @@ -368,7 +368,7 @@ struct processor_costs pentium_cost = { >> >>> 1, /* cond_not_taken_branch_cost. 
*/ >> >>> }; >> >>> >> >>> -static const >> >>> +static >> >>> struct processor_costs pentiumpro_cost = { >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >> >>> @@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost = >> >>> 1, /* cond_not_taken_branch_cost. */ >> >>> }; >> >>> >> >>> -static const >> >>> +static >> >>> struct processor_costs geode_cost = { >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >> >>> @@ -518,7 +518,7 @@ struct processor_costs geode_cost = { >> >>> 1, /* cond_not_taken_branch_cost. */ >> >>> }; >> >>> >> >>> -static const >> >>> +static >> >>> struct processor_costs k6_cost = { >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> COSTS_N_INSNS (2), /* cost of a lea instruction */ >> >>> @@ -591,7 +591,7 @@ struct processor_costs k6_cost = { >> >>> 1, /* cond_not_taken_branch_cost. */ >> >>> }; >> >>> >> >>> -static const >> >>> +static >> >>> struct processor_costs athlon_cost = { >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> COSTS_N_INSNS (2), /* cost of a lea instruction */ >> >>> @@ -664,7 +664,7 @@ struct processor_costs athlon_cost = { >> >>> 1, /* cond_not_taken_branch_cost. */ >> >>> }; >> >>> >> >>> -static const >> >>> +static >> >>> struct processor_costs k8_cost = { >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> COSTS_N_INSNS (2), /* cost of a lea instruction */ >> >>> @@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = { >> >>> 1, /* cond_not_taken_branch_cost. */ >> >>> }; >> >>> >> >>> -static const >> >>> +static >> >>> struct processor_costs pentium4_cost = { >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> COSTS_N_INSNS (3), /* cost of a lea instruction */ >> >>> @@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = { >> >>> 1, /* cond_not_taken_branch_cost. 
*/ >> >>> }; >> >>> >> >>> -static const >> >>> +static >> >>> struct processor_costs nocona_cost = { >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >> >>> @@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = { >> >>> 1, /* cond_not_taken_branch_cost. */ >> >>> }; >> >>> >> >>> -static const >> >>> +static >> >>> struct processor_costs atom_cost = { >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ >> >>> @@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = { >> >>> }; >> >>> >> >>> /* Generic64 should produce code tuned for Nocona and K8. */ >> >>> -static const >> >>> +static >> >>> struct processor_costs generic64_cost = { >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> /* On all chips taken into consideration lea is 2 cycles and more. With >> >>> @@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost = >> >>> }; >> >>> >> >>> /* core_cost should produce code tuned for Core familly of CPUs. */ >> >>> -static const >> >>> +static >> >>> struct processor_costs core_cost = { >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> /* On all chips taken into consideration lea is 2 cycles and more. With >> >>> @@ -1717,7 +1717,7 @@ struct processor_costs core_cost = { >> >>> >> >>> /* Generic32 should produce code tuned for PPro, Pentium4, Nocona, >> >>> Athlon and K8. 
*/ >> >>> -static const >> >>> +static >> >>> struct processor_costs generic32_cost = { >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ >> >>> @@ -2900,6 +2900,150 @@ ix86_debug_options (void) >> >>> >> >>> return; >> >>> } >> >>> + >> >>> +static const char *stringop_alg_names[] = { >> >>> +#define DEF_ENUM >> >>> +#define DEF_ALG(alg, name) #name, >> >>> +#include "stringop.def" >> >>> +#undef DEF_ENUM >> >>> +#undef DEF_ALG >> >>> +}; >> >>> + >> >>> +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=. >> >>> + The string is of the following form (or comma separated list of it): >> >>> + >> >>> + strategy_alg:max_size:[align|noalign] >> >>> + >> >>> + where the full size range for the strategy is either [0, max_size] or >> >>> + [min_size, max_size], in which min_size is the max_size + 1 of the >> >>> + preceding range. The last size range must have max_size == -1. >> >>> + >> >>> + Examples: >> >>> + >> >>> + 1. >> >>> + -mmemcpy-strategy=libcall:-1:noalign >> >>> + >> >>> + this is equivalent to (for known size memcpy) -mstringop-strategy=libcall >> >>> + >> >>> + >> >>> + 2. >> >>> + -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign >> >>> + >> >>> + This is to tell the compiler to use the following strategy for memset >> >>> + 1) when the expected size is between [1, 16], use rep_8byte strategy; >> >>> + 2) when the size is between [17, 2048], use vector_loop; >> >>> + 3) when the size is > 2048, use libcall. 
>> >>> + >> >>> +*/ >> >>> + >> >>> +struct stringop_size_range >> >>> +{ >> >>> + int min; >> >>> + int max; >> >>> + stringop_alg alg; >> >>> + bool noalign; >> >>> +}; >> >>> + >> >>> +static void >> >>> +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) >> >>> +{ >> >>> + const struct stringop_algs *default_algs; >> >>> + stringop_size_range input_ranges[MAX_STRINGOP_ALGS]; >> >>> + char *curr_range_str, *next_range_str; >> >>> + int i = 0, n = 0; >> >>> + >> >>> + if (is_memset) >> >>> + default_algs = &ix86_cost->memset[TARGET_64BIT != 0]; >> >>> + else >> >>> + default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; >> >>> + >> >>> + curr_range_str = strategy_str; >> >>> + >> >>> + do { >> >>> + >> >>> + int mins, maxs; >> >>> + stringop_alg alg; >> >>> + char alg_name[128]; >> >>> + char align[16]; >> >>> + >> >>> + next_range_str = strchr (curr_range_str, ','); >> >>> + if (next_range_str) >> >>> + *next_range_str++ = '\0'; >> >>> + >> >>> + if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align)) >> >>> + { >> >>> + warning (0, "Wrong arg %s to option %s", curr_range_str, >> >>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >> >>> + return; >> >>> + } >> >>> + >> >>> + if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1)) >> >>> + { >> >>> + warning (0, "Size ranges of option %s should be increasing", >> >>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >> >>> + return; >> >>> + } >> >>> + >> >>> + for (i = 0; i < last_alg; i++) >> >>> + { >> >>> + if (!strcmp (alg_name, stringop_alg_names[i])) >> >>> + { >> >>> + alg = (stringop_alg) i; >> >>> + break; >> >>> + } >> >>> + } >> >>> + >> >>> + if (i == last_alg) >> >>> + { >> >>> + warning (0, "Wrong stringop strategy name %s specified for option %s", >> >>> + alg_name, >> >>> + is_memset ? 
"-mmemset_strategy=" : "-mmemcpy_strategy="); >> >>> + return; >> >>> + } >> >>> + >> >>> + input_ranges[n].min = mins; >> >>> + input_ranges[n].max = maxs; >> >>> + input_ranges[n].alg = alg; >> >>> + if (!strcmp (align, "align")) >> >>> + input_ranges[n].noalign = false; >> >>> + else if (!strcmp (align, "noalign")) >> >>> + input_ranges[n].noalign = true; >> >>> + else >> >>> + { >> >>> + warning (0, "Unknown alignment %s specified for option %s", >> >>> + align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >> >>> + return; >> >>> + } >> >>> + n++; >> >>> + curr_range_str = next_range_str; >> >>> + } while (curr_range_str); >> >>> + >> >>> + if (input_ranges[n - 1].max != -1) >> >>> + { >> >>> + warning (0, "The max value for the last size range should be -1" >> >>> + " for option %s", >> >>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >> >>> + return; >> >>> + } >> >>> + >> >>> + if (n > MAX_STRINGOP_ALGS) >> >>> + { >> >>> + warning (0, "Too many size ranges specified in option %s", >> >>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >> >>> + return; >> >>> + } >> >>> + >> >>> + /* Now override the default algs array */ >> >>> + for (i = 0; i < n; i++) >> >>> + { >> >>> + *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max; >> >>> + *const_cast<stringop_alg *>(&default_algs->size[i].alg) >> >>> + = input_ranges[i].alg; >> >>> + *const_cast<int *>(&default_algs->size[i].noalign) >> >>> + = input_ranges[i].noalign; >> >>> + } >> >>> +} >> >>> + >> >>> >> >>> /* Override various settings based on options. If MAIN_ARGS_P, the >> >>> options are from the command line, otherwise they are from >> >>> @@ -4021,6 +4165,21 @@ ix86_option_override_internal (bool main >> >>> /* Handle stack protector */ >> >>> if (!global_options_set.x_ix86_stack_protector_guard) >> >>> ix86_stack_protector_guard = TARGET_HAS_BIONIC ? 
SSP_GLOBAL : SSP_TLS; >> >>> + >> >>> + /* Handle -mmemcpy-strategy= and -mmemset-strategy= */ >> >>> + if (ix86_tune_memcpy_strategy) >> >>> + { >> >>> + char *str = xstrdup (ix86_tune_memcpy_strategy); >> >>> + ix86_parse_stringop_strategy_string (str, false); >> >>> + free (str); >> >>> + } >> >>> + >> >>> + if (ix86_tune_memset_strategy) >> >>> + { >> >>> + char *str = xstrdup (ix86_tune_memset_strategy); >> >>> + ix86_parse_stringop_strategy_string (str, true); >> >>> + free (str); >> >>> + } >> >>> } >> >>> >> >>> /* Implement the TARGET_OPTION_OVERRIDE hook. */ >> >>> @@ -22903,6 +23062,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt >> >>> { >> >>> case libcall: >> >>> case no_stringop: >> >>> + case last_alg: >> >>> gcc_unreachable (); >> >>> case loop_1_byte: >> >>> need_zero_guard = true; >> >>> @@ -23093,6 +23253,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt >> >>> { >> >>> case libcall: >> >>> case no_stringop: >> >>> + case last_alg: >> >>> gcc_unreachable (); >> >>> case loop_1_byte: >> >>> case loop: >> >>> @@ -23304,6 +23465,7 @@ ix86_expand_setmem (rtx dst, rtx count_e >> >>> { >> >>> case libcall: >> >>> case no_stringop: >> >>> + case last_alg: >> >>> gcc_unreachable (); >> >>> case loop: >> >>> need_zero_guard = true; >> >>> @@ -23481,6 +23643,7 @@ ix86_expand_setmem (rtx dst, rtx count_e >> >>> { >> >>> case libcall: >> >>> case no_stringop: >> >>> + case last_alg: >> >>> gcc_unreachable (); >> >>> case loop_1_byte: >> >>> case loop: >> >>> Index: config/i386/i386-opts.h >> >>> =================================================================== >> >>> --- config/i386/i386-opts.h (revision 201458) >> >>> +++ config/i386/i386-opts.h (working copy) >> >>> @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI >> >>> /* Algorithm to expand string function with. 
*/ >> >>> enum stringop_alg >> >>> { >> >>> - no_stringop, >> >>> - libcall, >> >>> - rep_prefix_1_byte, >> >>> - rep_prefix_4_byte, >> >>> - rep_prefix_8_byte, >> >>> - loop_1_byte, >> >>> - loop, >> >>> - unrolled_loop, >> >>> - vector_loop >> >>> +#undef DEF_ENUM >> >>> +#define DEF_ENUM >> >>> + >> >>> +#undef DEF_ALG >> >>> +#define DEF_ALG(alg, name) alg, >> >>> + >> >>> +#include "stringop.def" >> >>> +last_alg >> >>> + >> >>> +#undef DEF_ENUM >> >>> +#undef DEF_ALG >> >>> }; >> >>> >> >>> /* Available call abi. */ >> >>> Index: doc/invoke.texi >> >>> =================================================================== >> >>> --- doc/invoke.texi (revision 201458) >> >>> +++ doc/invoke.texi (working copy) >> >>> @@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}. >> >>> -mbmi2 -mrtm -mlwp -mthreads @gol >> >>> -mno-align-stringops -minline-all-stringops @gol >> >>> -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol >> >>> +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy} >> >>> -mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol >> >>> -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol >> >>> -mregparm=@var{num} -msseregparm @gol >> >>> @@ -14598,6 +14599,24 @@ Expand into an inline loop. >> >>> Always use a library call. >> >>> @end table >> >>> >> >>> +@item -mmemcpy-strategy=@var{strategy} >> >>> +@opindex mmemcpy-strategy=@var{strategy} >> >>> +Override the internal decision heuristic to decide if @code{__builtin_memcpy} >> >>> +should be inlined and what inline algorithm to use when the expected size >> >>> +of the copy operation is known. @var{strategy} >> >>> +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets. >> >>> +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies >> >>> +the max byte size with which inline algorithm @var{alg} is allowed. For the last >> >>> +triplet, the @var{max_size} must be @code{-1}. 
The @var{max_size} of the triplets >> >>> +in the list must be specified in increasing order. The minimal byte size for >> >>> +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the >> >>> +preceding range. >> >>> + >> >>> +@item -mmemset-strategy=@var{strategy} >> >>> +@opindex mmemset-strategy=@var{strategy} >> >>> +The option is similar to @option{-mmemcpy-strategy=} except that it is to control >> >>> +@code{__builtin_memset} expansion. >> >>> + >> >>> @item -momit-leaf-frame-pointer >> >>> @opindex momit-leaf-frame-pointer >> >>> Don't keep the frame pointer in a register for leaf functions. This >> >>> Index: testsuite/gcc.target/i386/memcpy-strategy-1.c >> >>> =================================================================== >> >>> --- testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) >> >>> +++ testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) >> >>> @@ -0,0 +1,12 @@ >> >>> +/* { dg-do compile } */ >> >>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */ >> >>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */ >> >>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ >> >>> + >> >>> +char a[2048]; >> >>> +char b[2048]; >> >>> +void t (void) >> >>> +{ >> >>> + __builtin_memcpy (a, b, 2048); >> >>> +} >> >>> + >> >>> Index: testsuite/gcc.target/i386/memcpy-strategy-2.c >> >>> =================================================================== >> >>> --- testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) >> >>> +++ testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) >> >>> @@ -0,0 +1,12 @@ >> >>> +/* { dg-do compile } */ >> >>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */ >> >>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! 
{ ia32 } } } } } */ >> >>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ >> >>> + >> >>> +char a[2048]; >> >>> +char b[2048]; >> >>> +void t (void) >> >>> +{ >> >>> + __builtin_memcpy (a, b, 2048); >> >>> +} >> >>> + >> >>> Index: testsuite/gcc.target/i386/memset-strategy-1.c >> >>> =================================================================== >> >>> --- testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) >> >>> +++ testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) >> >>> @@ -0,0 +1,10 @@ >> >>> +/* { dg-do compile } */ >> >>> +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */ >> >>> +/* { dg-final { scan-assembler-times "memset" 2 } } */ >> >>> + >> >>> +char a[2048]; >> >>> +void t (void) >> >>> +{ >> >>> + __builtin_memset (a, 1, 2048); >> >>> +} >> >>> + >> >>> Index: testsuite/gcc.target/i386/memcpy-strategy-3.c >> >>> =================================================================== >> >>> --- testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) >> >>> +++ testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) >> >>> @@ -0,0 +1,11 @@ >> >>> +/* { dg-do compile } */ >> >>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */ >> >>> +/* { dg-final { scan-assembler-times "memcpy" 2 } } */ >> >>> + >> >>> +char a[2048]; >> >>> +char b[2048]; >> >>> +void t (void) >> >>> +{ >> >>> + __builtin_memcpy (a, b, 2048); >> >>> +} >> >>> + >> >> >> >> >> >> -- >> --- >> Best regards, >> Michael V. Zolotukhin, >> Software Engineer >> Intel Corporation. ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: New parameters to control stringop expansion libcall strategy 2013-08-06 9:42 ` Jan Hubicka 2013-08-06 16:08 ` Xinliang David Li @ 2013-08-07 17:06 ` Xinliang David Li 2013-08-08 0:23 ` Joseph S. Myers 1 sibling, 1 reply; 23+ messages in thread From: Xinliang David Li @ 2013-08-07 17:06 UTC (permalink / raw) To: Jan Hubicka; +Cc: GCC Patches, Teresa Johnson [-- Attachment #1: Type: text/plain, Size: 26171 bytes --] Fixed the do while formatting. Ok for trunk with this version? thanks, David On Tue, Aug 6, 2013 at 2:42 AM, Jan Hubicka <hubicka@ucw.cz> wrote: >> >>> 2013-08-02 Xinliang David Li <davidxl@google.com> >> >>> >> >>> * config/i386/stringop.def: New file. >> >>> * config/i386/stringop.opt: New file. >> >>> * config/i386/i386-opts.h: Include stringop.def. >> >>> * config/i386/i386.opt: Include stringop.opt. >> >>> * config/i386/i386.c (ix86_option_override_internal): >> >>> Override default size based stringop inline strategies >> >>> with options. >> >>> * config/i386/i386.c (ix86_parse_stringop_strategy_string): >> >>> New function. >> >>> >> >>> 2013-08-04 Xinliang David Li <davidxl@google.com> >> >>> >> >>> * testsuite/gcc.target/i386/memcpy-strategy-1.c: New test. >> >>> * testsuite/gcc.target/i386/memcpy-strategy-2.c: Ditto. >> >>> * testsuite/gcc.target/i386/memset-strategy-1.c: Ditto. >> >>> * testsuite/gcc.target/i386/memcpy-strategy-3.c: Ditto. > > The patch looks reasonable to me in general. I wonder why we need to make > all the cost tables non-const instead of just having writable storage for > the "current strategy" like we do with other flags anyway. > > Your strings are definitely more readable than the in-memory representation > I came up with. Perhaps we can even turn the cost tables into strings > for easier maintenance? I guess they are a bit confusing for people > not familiar with the code.
> > Honza >> >>> >> >>> >> >>> >> >>> >> >>> On Fri, Aug 2, 2013 at 9:21 PM, Xinliang David Li <davidxl@google.com> wrote: >> >>> > On x86_64, when the expected size of memcpy/memset is known (e.g., with >> >>> > FDO), libcall strategy is used when the size is > 8192. This value is >> >>> > hard coded, which makes it hard to do performance tuning. This patch >> >>> > adds two new parameters to do that. Potential usage includes >> >>> > per-application libcall strategy min-size tuning based on summary data >> >>> > with FDO (e.g., instruction workset size). >> >>> > >> >>> > Bootstrapped and tested on x86_64/linux. Ok for trunk? >> >>> > >> >>> > thanks, >> >>> > >> >>> > David >> >>> > >> >>> > >> >>> > 2013-08-02 Xinliang David Li <davidxl@google.com> >> >>> > >> >>> > * params.def: New parameters. >> >>> > * config/i386/i386.c (ix86_option_override_internal): >> >>> > Override default libcall size limit with parameters. >> >> >> >>> Index: config/i386/stringop.def >> >>> =================================================================== >> >>> --- config/i386/stringop.def (revision 0) >> >>> +++ config/i386/stringop.def (revision 0) >> >>> @@ -0,0 +1,42 @@ >> >>> +/* Definitions for option handling for IA-32. >> >>> + Copyright (C) 2013 Free Software Foundation, Inc. >> >>> + >> >>> +This file is part of GCC. >> >>> + >> >>> +GCC is free software; you can redistribute it and/or modify >> >>> +it under the terms of the GNU General Public License as published by >> >>> +the Free Software Foundation; either version 3, or (at your option) >> >>> +any later version. >> >>> + >> >>> +GCC is distributed in the hope that it will be useful, >> >>> +but WITHOUT ANY WARRANTY; without even the implied warranty of >> >>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >> >>> +GNU General Public License for more details.
>> >>> + >> >>> +Under Section 7 of GPL version 3, you are granted additional >> >>> +permissions described in the GCC Runtime Library Exception, version >> >>> +3.1, as published by the Free Software Foundation. >> >>> + >> >>> +You should have received a copy of the GNU General Public License and >> >>> +a copy of the GCC Runtime Library Exception along with this program; >> >>> +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see >> >>> +<http://www.gnu.org/licenses/>. */ >> >>> + >> >>> +DEF_ENUM >> >>> +DEF_ALG (no_stringop, no_stringop) >> >>> +DEF_ENUM >> >>> +DEF_ALG (libcall, libcall) >> >>> +DEF_ENUM >> >>> +DEF_ALG (rep_prefix_1_byte, rep_byte) >> >>> +DEF_ENUM >> >>> +DEF_ALG (rep_prefix_4_byte, rep_4byte) >> >>> +DEF_ENUM >> >>> +DEF_ALG (rep_prefix_8_byte, rep_8byte) >> >>> +DEF_ENUM >> >>> +DEF_ALG (loop_1_byte, byte_loop) >> >>> +DEF_ENUM >> >>> +DEF_ALG (loop, loop) >> >>> +DEF_ENUM >> >>> +DEF_ALG (unrolled_loop, unrolled_loop) >> >>> +DEF_ENUM >> >>> +DEF_ALG (vector_loop, vector_loop) >> >>> Index: config/i386/i386.opt >> >>> =================================================================== >> >>> --- config/i386/i386.opt (revision 201458) >> >>> +++ config/i386/i386.opt (working copy) >> >>> @@ -316,6 +316,14 @@ mstack-arg-probe >> >>> Target Report Mask(STACK_PROBE) Save >> >>> Enable stack probing >> >>> >> >>> +mmemcpy-strategy= >> >>> +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy) >> >>> +Specify memcpy expansion strategy when expected size is known >> >>> + >> >>> +mmemset-strategy= >> >>> +Target RejectNegative Joined Var(ix86_tune_memset_strategy) >> >>> +Specify memset expansion strategy when expected size is known >> >>> + >> >>> mstringop-strategy= >> >>> Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop) >> >>> Chose strategy to generate stringop using >> >>> Index: config/i386/stringop.opt >> >>> =================================================================== 
>> >>> --- config/i386/stringop.opt (revision 0) >> >>> +++ config/i386/stringop.opt (revision 0) >> >>> @@ -0,0 +1,36 @@ >> >>> +/* Definitions for option handling for IA-32. >> >>> + Copyright (C) 2013 Free Software Foundation, Inc. >> >>> + >> >>> +This file is part of GCC. >> >>> + >> >>> +GCC is free software; you can redistribute it and/or modify >> >>> +it under the terms of the GNU General Public License as published by >> >>> +the Free Software Foundation; either version 3, or (at your option) >> >>> +any later version. >> >>> + >> >>> +GCC is distributed in the hope that it will be useful, >> >>> +but WITHOUT ANY WARRANTY; without even the implied warranty of >> >>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >> >>> +GNU General Public License for more details. >> >>> + >> >>> +Under Section 7 of GPL version 3, you are granted additional >> >>> +permissions described in the GCC Runtime Library Exception, version >> >>> +3.1, as published by the Free Software Foundation. >> >>> + >> >>> +You should have received a copy of the GNU General Public License and >> >>> +a copy of the GCC Runtime Library Exception along with this program; >> >>> +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see >> >>> +<http://www.gnu.org/licenses/>. 
*/ >> >>> + >> >>> +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte) >> >>> + >> >>> +#undef DEF_ENUM >> >>> +#define DEF_ENUM EnumValue >> >>> + >> >>> +#undef DEF_ALG >> >>> +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg) >> >>> + >> >>> +#include "stringop.def" >> >>> + >> >>> +#undef DEF_ENUM >> >>> +#undef DEF_ALG >> >>> Index: config/i386/i386.c >> >>> =================================================================== >> >>> --- config/i386/i386.c (revision 201458) >> >>> +++ config/i386/i386.c (working copy) >> >>> @@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost = >> >>> }; >> >>> >> >>> /* Processor costs (relative to an add) */ >> >>> -static const >> >>> +static >> >>> struct processor_costs i386_cost = { /* 386 specific costs */ >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >> >>> @@ -226,7 +226,7 @@ struct processor_costs i386_cost = { /* >> >>> 1, /* cond_not_taken_branch_cost. */ >> >>> }; >> >>> >> >>> -static const >> >>> +static >> >>> struct processor_costs i486_cost = { /* 486 specific costs */ >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >> >>> @@ -298,7 +298,7 @@ struct processor_costs i486_cost = { /* >> >>> 1, /* cond_not_taken_branch_cost. */ >> >>> }; >> >>> >> >>> -static const >> >>> +static >> >>> struct processor_costs pentium_cost = { >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >> >>> @@ -368,7 +368,7 @@ struct processor_costs pentium_cost = { >> >>> 1, /* cond_not_taken_branch_cost. 
*/ >> >>> }; >> >>> >> >>> -static const >> >>> +static >> >>> struct processor_costs pentiumpro_cost = { >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >> >>> @@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost = >> >>> 1, /* cond_not_taken_branch_cost. */ >> >>> }; >> >>> >> >>> -static const >> >>> +static >> >>> struct processor_costs geode_cost = { >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >> >>> @@ -518,7 +518,7 @@ struct processor_costs geode_cost = { >> >>> 1, /* cond_not_taken_branch_cost. */ >> >>> }; >> >>> >> >>> -static const >> >>> +static >> >>> struct processor_costs k6_cost = { >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> COSTS_N_INSNS (2), /* cost of a lea instruction */ >> >>> @@ -591,7 +591,7 @@ struct processor_costs k6_cost = { >> >>> 1, /* cond_not_taken_branch_cost. */ >> >>> }; >> >>> >> >>> -static const >> >>> +static >> >>> struct processor_costs athlon_cost = { >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> COSTS_N_INSNS (2), /* cost of a lea instruction */ >> >>> @@ -664,7 +664,7 @@ struct processor_costs athlon_cost = { >> >>> 1, /* cond_not_taken_branch_cost. */ >> >>> }; >> >>> >> >>> -static const >> >>> +static >> >>> struct processor_costs k8_cost = { >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> COSTS_N_INSNS (2), /* cost of a lea instruction */ >> >>> @@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = { >> >>> 1, /* cond_not_taken_branch_cost. */ >> >>> }; >> >>> >> >>> -static const >> >>> +static >> >>> struct processor_costs pentium4_cost = { >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> COSTS_N_INSNS (3), /* cost of a lea instruction */ >> >>> @@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = { >> >>> 1, /* cond_not_taken_branch_cost. 
*/ >> >>> }; >> >>> >> >>> -static const >> >>> +static >> >>> struct processor_costs nocona_cost = { >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >> >>> @@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = { >> >>> 1, /* cond_not_taken_branch_cost. */ >> >>> }; >> >>> >> >>> -static const >> >>> +static >> >>> struct processor_costs atom_cost = { >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ >> >>> @@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = { >> >>> }; >> >>> >> >>> /* Generic64 should produce code tuned for Nocona and K8. */ >> >>> -static const >> >>> +static >> >>> struct processor_costs generic64_cost = { >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> /* On all chips taken into consideration lea is 2 cycles and more. With >> >>> @@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost = >> >>> }; >> >>> >> >>> /* core_cost should produce code tuned for Core familly of CPUs. */ >> >>> -static const >> >>> +static >> >>> struct processor_costs core_cost = { >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> /* On all chips taken into consideration lea is 2 cycles and more. With >> >>> @@ -1717,7 +1717,7 @@ struct processor_costs core_cost = { >> >>> >> >>> /* Generic32 should produce code tuned for PPro, Pentium4, Nocona, >> >>> Athlon and K8. 
*/ >> >>> -static const >> >>> +static >> >>> struct processor_costs generic32_cost = { >> >>> COSTS_N_INSNS (1), /* cost of an add instruction */ >> >>> COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ >> >>> @@ -2900,6 +2900,150 @@ ix86_debug_options (void) >> >>> >> >>> return; >> >>> } >> >>> + >> >>> +static const char *stringop_alg_names[] = { >> >>> +#define DEF_ENUM >> >>> +#define DEF_ALG(alg, name) #name, >> >>> +#include "stringop.def" >> >>> +#undef DEF_ENUM >> >>> +#undef DEF_ALG >> >>> +}; >> >>> + >> >>> +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=. >> >>> + The string is of the following form (or comma separated list of it): >> >>> + >> >>> + strategy_alg:max_size:[align|noalign] >> >>> + >> >>> + where the full size range for the strategy is either [0, max_size] or >> >>> + [min_size, max_size], in which min_size is the max_size + 1 of the >> >>> + preceding range. The last size range must have max_size == -1. >> >>> + >> >>> + Examples: >> >>> + >> >>> + 1. >> >>> + -mmemcpy-strategy=libcall:-1:noalign >> >>> + >> >>> + this is equivalent to (for known size memcpy) -mstringop-strategy=libcall >> >>> + >> >>> + >> >>> + 2. >> >>> + -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign >> >>> + >> >>> + This is to tell the compiler to use the following strategy for memset >> >>> + 1) when the expected size is between [1, 16], use rep_8byte strategy; >> >>> + 2) when the size is between [17, 2048], use vector_loop; >> >>> + 3) when the size is > 2048, use libcall. 
>> >>> + >> >>> +*/ >> >>> + >> >>> +struct stringop_size_range >> >>> +{ >> >>> + int min; >> >>> + int max; >> >>> + stringop_alg alg; >> >>> + bool noalign; >> >>> +}; >> >>> + >> >>> +static void >> >>> +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) >> >>> +{ >> >>> + const struct stringop_algs *default_algs; >> >>> + stringop_size_range input_ranges[MAX_STRINGOP_ALGS]; >> >>> + char *curr_range_str, *next_range_str; >> >>> + int i = 0, n = 0; >> >>> + >> >>> + if (is_memset) >> >>> + default_algs = &ix86_cost->memset[TARGET_64BIT != 0]; >> >>> + else >> >>> + default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; >> >>> + >> >>> + curr_range_str = strategy_str; >> >>> + >> >>> + do { >> >>> + >> >>> + int mins, maxs; >> >>> + stringop_alg alg; >> >>> + char alg_name[128]; >> >>> + char align[16]; >> >>> + >> >>> + next_range_str = strchr (curr_range_str, ','); >> >>> + if (next_range_str) >> >>> + *next_range_str++ = '\0'; >> >>> + >> >>> + if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align)) >> >>> + { >> >>> + warning (0, "Wrong arg %s to option %s", curr_range_str, >> >>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >> >>> + return; >> >>> + } >> >>> + >> >>> + if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1)) >> >>> + { >> >>> + warning (0, "Size ranges of option %s should be increasing", >> >>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >> >>> + return; >> >>> + } >> >>> + >> >>> + for (i = 0; i < last_alg; i++) >> >>> + { >> >>> + if (!strcmp (alg_name, stringop_alg_names[i])) >> >>> + { >> >>> + alg = (stringop_alg) i; >> >>> + break; >> >>> + } >> >>> + } >> >>> + >> >>> + if (i == last_alg) >> >>> + { >> >>> + warning (0, "Wrong stringop strategy name %s specified for option %s", >> >>> + alg_name, >> >>> + is_memset ? 
"-mmemset_strategy=" : "-mmemcpy_strategy="); >> >>> + return; >> >>> + } >> >>> + >> >>> + input_ranges[n].min = mins; >> >>> + input_ranges[n].max = maxs; >> >>> + input_ranges[n].alg = alg; >> >>> + if (!strcmp (align, "align")) >> >>> + input_ranges[n].noalign = false; >> >>> + else if (!strcmp (align, "noalign")) >> >>> + input_ranges[n].noalign = true; >> >>> + else >> >>> + { >> >>> + warning (0, "Unknown alignment %s specified for option %s", >> >>> + align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >> >>> + return; >> >>> + } >> >>> + n++; >> >>> + curr_range_str = next_range_str; >> >>> + } while (curr_range_str); >> >>> + >> >>> + if (input_ranges[n - 1].max != -1) >> >>> + { >> >>> + warning (0, "The max value for the last size range should be -1" >> >>> + " for option %s", >> >>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >> >>> + return; >> >>> + } >> >>> + >> >>> + if (n > MAX_STRINGOP_ALGS) >> >>> + { >> >>> + warning (0, "Too many size ranges specified in option %s", >> >>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >> >>> + return; >> >>> + } >> >>> + >> >>> + /* Now override the default algs array */ >> >>> + for (i = 0; i < n; i++) >> >>> + { >> >>> + *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max; >> >>> + *const_cast<stringop_alg *>(&default_algs->size[i].alg) >> >>> + = input_ranges[i].alg; >> >>> + *const_cast<int *>(&default_algs->size[i].noalign) >> >>> + = input_ranges[i].noalign; >> >>> + } >> >>> +} >> >>> + >> >>> >> >>> /* Override various settings based on options. If MAIN_ARGS_P, the >> >>> options are from the command line, otherwise they are from >> >>> @@ -4021,6 +4165,21 @@ ix86_option_override_internal (bool main >> >>> /* Handle stack protector */ >> >>> if (!global_options_set.x_ix86_stack_protector_guard) >> >>> ix86_stack_protector_guard = TARGET_HAS_BIONIC ? 
SSP_GLOBAL : SSP_TLS; >> >>> + >> >>> + /* Handle -mmemcpy-strategy= and -mmemset-strategy= */ >> >>> + if (ix86_tune_memcpy_strategy) >> >>> + { >> >>> + char *str = xstrdup (ix86_tune_memcpy_strategy); >> >>> + ix86_parse_stringop_strategy_string (str, false); >> >>> + free (str); >> >>> + } >> >>> + >> >>> + if (ix86_tune_memset_strategy) >> >>> + { >> >>> + char *str = xstrdup (ix86_tune_memset_strategy); >> >>> + ix86_parse_stringop_strategy_string (str, true); >> >>> + free (str); >> >>> + } >> >>> } >> >>> >> >>> /* Implement the TARGET_OPTION_OVERRIDE hook. */ >> >>> @@ -22903,6 +23062,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt >> >>> { >> >>> case libcall: >> >>> case no_stringop: >> >>> + case last_alg: >> >>> gcc_unreachable (); >> >>> case loop_1_byte: >> >>> need_zero_guard = true; >> >>> @@ -23093,6 +23253,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt >> >>> { >> >>> case libcall: >> >>> case no_stringop: >> >>> + case last_alg: >> >>> gcc_unreachable (); >> >>> case loop_1_byte: >> >>> case loop: >> >>> @@ -23304,6 +23465,7 @@ ix86_expand_setmem (rtx dst, rtx count_e >> >>> { >> >>> case libcall: >> >>> case no_stringop: >> >>> + case last_alg: >> >>> gcc_unreachable (); >> >>> case loop: >> >>> need_zero_guard = true; >> >>> @@ -23481,6 +23643,7 @@ ix86_expand_setmem (rtx dst, rtx count_e >> >>> { >> >>> case libcall: >> >>> case no_stringop: >> >>> + case last_alg: >> >>> gcc_unreachable (); >> >>> case loop_1_byte: >> >>> case loop: >> >>> Index: config/i386/i386-opts.h >> >>> =================================================================== >> >>> --- config/i386/i386-opts.h (revision 201458) >> >>> +++ config/i386/i386-opts.h (working copy) >> >>> @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI >> >>> /* Algorithm to expand string function with. 
*/ >> >>> enum stringop_alg >> >>> { >> >>> - no_stringop, >> >>> - libcall, >> >>> - rep_prefix_1_byte, >> >>> - rep_prefix_4_byte, >> >>> - rep_prefix_8_byte, >> >>> - loop_1_byte, >> >>> - loop, >> >>> - unrolled_loop, >> >>> - vector_loop >> >>> +#undef DEF_ENUM >> >>> +#define DEF_ENUM >> >>> + >> >>> +#undef DEF_ALG >> >>> +#define DEF_ALG(alg, name) alg, >> >>> + >> >>> +#include "stringop.def" >> >>> +last_alg >> >>> + >> >>> +#undef DEF_ENUM >> >>> +#undef DEF_ALG >> >>> }; >> >>> >> >>> /* Available call abi. */ >> >>> Index: doc/invoke.texi >> >>> =================================================================== >> >>> --- doc/invoke.texi (revision 201458) >> >>> +++ doc/invoke.texi (working copy) >> >>> @@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}. >> >>> -mbmi2 -mrtm -mlwp -mthreads @gol >> >>> -mno-align-stringops -minline-all-stringops @gol >> >>> -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol >> >>> +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy} >> >>> -mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol >> >>> -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol >> >>> -mregparm=@var{num} -msseregparm @gol >> >>> @@ -14598,6 +14599,24 @@ Expand into an inline loop. >> >>> Always use a library call. >> >>> @end table >> >>> >> >>> +@item -mmemcpy-strategy=@var{strategy} >> >>> +@opindex mmemcpy-strategy=@var{strategy} >> >>> +Override the internal decision heuristic to decide if @code{__builtin_memcpy} >> >>> +should be inlined and what inline algorithm to use when the expected size >> >>> +of the copy operation is known. @var{strategy} >> >>> +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets. >> >>> +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies >> >>> +the max byte size with which inline algorithm @var{alg} is allowed. For the last >> >>> +triplet, the @var{max_size} must be @code{-1}. 
The @var{max_size} of the triplets >> >>> +in the list must be specified in increasing order. The minimal byte size for >> >>> +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the >> >>> +preceding range. >> >>> + >> >>> +@item -mmemset-strategy=@var{strategy} >> >>> +@opindex mmemset-strategy=@var{strategy} >> >>> +The option is similar to @option{-mmemcpy-strategy=} except that it is to control >> >>> +@code{__builtin_memset} expansion. >> >>> + >> >>> @item -momit-leaf-frame-pointer >> >>> @opindex momit-leaf-frame-pointer >> >>> Don't keep the frame pointer in a register for leaf functions. This >> >>> Index: testsuite/gcc.target/i386/memcpy-strategy-1.c >> >>> =================================================================== >> >>> --- testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) >> >>> +++ testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) >> >>> @@ -0,0 +1,12 @@ >> >>> +/* { dg-do compile } */ >> >>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */ >> >>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */ >> >>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ >> >>> + >> >>> +char a[2048]; >> >>> +char b[2048]; >> >>> +void t (void) >> >>> +{ >> >>> + __builtin_memcpy (a, b, 2048); >> >>> +} >> >>> + >> >>> Index: testsuite/gcc.target/i386/memcpy-strategy-2.c >> >>> =================================================================== >> >>> --- testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) >> >>> +++ testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) >> >>> @@ -0,0 +1,12 @@ >> >>> +/* { dg-do compile } */ >> >>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */ >> >>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! 
{ ia32 } } } } } */ >> >>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ >> >>> + >> >>> +char a[2048]; >> >>> +char b[2048]; >> >>> +void t (void) >> >>> +{ >> >>> + __builtin_memcpy (a, b, 2048); >> >>> +} >> >>> + >> >>> Index: testsuite/gcc.target/i386/memset-strategy-1.c >> >>> =================================================================== >> >>> --- testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) >> >>> +++ testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) >> >>> @@ -0,0 +1,10 @@ >> >>> +/* { dg-do compile } */ >> >>> +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */ >> >>> +/* { dg-final { scan-assembler-times "memset" 2 } } */ >> >>> + >> >>> +char a[2048]; >> >>> +void t (void) >> >>> +{ >> >>> + __builtin_memset (a, 1, 2048); >> >>> +} >> >>> + >> >>> Index: testsuite/gcc.target/i386/memcpy-strategy-3.c >> >>> =================================================================== >> >>> --- testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) >> >>> +++ testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) >> >>> @@ -0,0 +1,11 @@ >> >>> +/* { dg-do compile } */ >> >>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */ >> >>> +/* { dg-final { scan-assembler-times "memcpy" 2 } } */ >> >>> + >> >>> +char a[2048]; >> >>> +char b[2048]; >> >>> +void t (void) >> >>> +{ >> >>> + __builtin_memcpy (a, b, 2048); >> >>> +} >> >>> + >> >> >> >> >> >> -- >> --- >> Best regards, >> Michael V. Zolotukhin, >> Software Engineer >> Intel Corporation. 
[-- Attachment #2: stringop_inl_option.p.txt --] [-- Type: text/plain, Size: 18977 bytes --] Index: testsuite/gcc.target/i386/memcpy-strategy-3.c =================================================================== --- testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) +++ testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */ +/* { dg-final { scan-assembler-times "memcpy" 2 } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} + Index: testsuite/gcc.target/i386/memcpy-strategy-1.c =================================================================== --- testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) +++ testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */ +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */ +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} + Index: testsuite/gcc.target/i386/memcpy-strategy-2.c =================================================================== --- testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) +++ testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */ +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! 
{ ia32 } } } } } */ +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} + Index: testsuite/gcc.target/i386/memset-strategy-1.c =================================================================== --- testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) +++ testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */ +/* { dg-final { scan-assembler-times "memset" 2 } } */ + +char a[2048]; +void t (void) +{ + __builtin_memset (a, 1, 2048); +} + Index: doc/invoke.texi =================================================================== --- doc/invoke.texi (revision 201540) +++ doc/invoke.texi (working copy) @@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}. -mbmi2 -mrtm -mlwp -mthreads @gol -mno-align-stringops -minline-all-stringops @gol -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy} -mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol -mregparm=@var{num} -msseregparm @gol @@ -14598,6 +14599,24 @@ Expand into an inline loop. Always use a library call. @end table +@item -mmemcpy-strategy=@var{strategy} +@opindex mmemcpy-strategy=@var{strategy} +Override the internal decision heuristic to decide if @code{__builtin_memcpy} +should be inlined and what inline algorithm to use when the expected size +of the copy operation is known. @var{strategy} +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets. +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies +the max byte size with which inline algorithm @var{alg} is allowed. For the last +triplet, the @var{max_size} must be @code{-1}. 
The @var{max_size} of the triplets +in the list must be specified in increasing order. The minimal byte size for +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the +preceding range. + +@item -mmemset-strategy=@var{strategy} +@opindex mmemset-strategy=@var{strategy} +The option is similar to @option{-mmemcpy-strategy=} except that it is to control +@code{__builtin_memset} expansion. + @item -momit-leaf-frame-pointer @opindex momit-leaf-frame-pointer Don't keep the frame pointer in a register for leaf functions. This Index: config/i386/stringop.def =================================================================== --- config/i386/stringop.def (revision 0) +++ config/i386/stringop.def (revision 0) @@ -0,0 +1,42 @@ +/* Definitions for option handling for IA-32. + Copyright (C) 2013 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. 
*/ + +DEF_ENUM +DEF_ALG (no_stringop, no_stringop) +DEF_ENUM +DEF_ALG (libcall, libcall) +DEF_ENUM +DEF_ALG (rep_prefix_1_byte, rep_byte) +DEF_ENUM +DEF_ALG (rep_prefix_4_byte, rep_4byte) +DEF_ENUM +DEF_ALG (rep_prefix_8_byte, rep_8byte) +DEF_ENUM +DEF_ALG (loop_1_byte, byte_loop) +DEF_ENUM +DEF_ALG (loop, loop) +DEF_ENUM +DEF_ALG (unrolled_loop, unrolled_loop) +DEF_ENUM +DEF_ALG (vector_loop, vector_loop) Index: config/i386/i386.c =================================================================== --- config/i386/i386.c (revision 201540) +++ config/i386/i386.c (working copy) @@ -158,7 +158,7 @@ struct processor_costs ix86_size_cost = }; /* Processor costs (relative to an add) */ -static const +static struct processor_costs i386_cost = { /* 386 specific costs */ COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -228,7 +228,7 @@ struct processor_costs i386_cost = { /* 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs i486_cost = { /* 486 specific costs */ COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -300,7 +300,7 @@ struct processor_costs i486_cost = { /* 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs pentium_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -370,7 +370,7 @@ struct processor_costs pentium_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs pentiumpro_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -449,7 +449,7 @@ struct processor_costs pentiumpro_cost = 1, /* cond_not_taken_branch_cost. 
*/ }; -static const +static struct processor_costs geode_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -520,7 +520,7 @@ struct processor_costs geode_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs k6_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -593,7 +593,7 @@ struct processor_costs k6_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs athlon_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -666,7 +666,7 @@ struct processor_costs athlon_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs k8_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -1267,7 +1267,7 @@ struct processor_costs btver2_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs pentium4_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (3), /* cost of a lea instruction */ @@ -1338,7 +1338,7 @@ struct processor_costs pentium4_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs nocona_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -1411,7 +1411,7 @@ struct processor_costs nocona_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs atom_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ @@ -1558,7 +1558,7 @@ struct processor_costs slm_cost = { }; /* Generic64 should produce code tuned for Nocona and K8. 
*/ -static const +static struct processor_costs generic64_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ /* On all chips taken into consideration lea is 2 cycles and more. With @@ -1637,7 +1637,7 @@ struct processor_costs generic64_cost = }; /* core_cost should produce code tuned for Core familly of CPUs. */ -static const +static struct processor_costs core_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ /* On all chips taken into consideration lea is 2 cycles and more. With @@ -1719,7 +1719,7 @@ struct processor_costs core_cost = { /* Generic32 should produce code tuned for PPro, Pentium4, Nocona, Athlon and K8. */ -static const +static struct processor_costs generic32_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ @@ -2919,6 +2919,149 @@ ix86_debug_options (void) return; } + +static const char *stringop_alg_names[] = { +#define DEF_ENUM +#define DEF_ALG(alg, name) #name, +#include "stringop.def" +#undef DEF_ENUM +#undef DEF_ALG +}; + +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=. + The string is of the following form (or comma separated list of it): + + strategy_alg:max_size:[align|noalign] + + where the full size range for the strategy is either [0, max_size] or + [min_size, max_size], in which min_size is the max_size + 1 of the + preceding range. The last size range must have max_size == -1. + + Examples: + + 1. + -mmemcpy-strategy=libcall:-1:noalign + + this is equivalent to (for known size memcpy) -mstringop-strategy=libcall + + + 2. + -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign + + This is to tell the compiler to use the following strategy for memset + 1) when the expected size is between [1, 16], use rep_8byte strategy; + 2) when the size is between [17, 2048], use vector_loop; + 3) when the size is > 2048, use libcall. 
*/ + +struct stringop_size_range +{ + int min; + int max; + stringop_alg alg; + bool noalign; +}; + +static void +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) +{ + const struct stringop_algs *default_algs; + stringop_size_range input_ranges[MAX_STRINGOP_ALGS]; + char *curr_range_str, *next_range_str; + int i = 0, n = 0; + + if (is_memset) + default_algs = &ix86_cost->memset[TARGET_64BIT != 0]; + else + default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; + + curr_range_str = strategy_str; + + do + { + int mins, maxs; + stringop_alg alg; + char alg_name[128]; + char align[16]; + + next_range_str = strchr (curr_range_str, ','); + if (next_range_str) + *next_range_str++ = '\0'; + + if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align)) + { + warning (0, "Wrong arg %s to option %s", curr_range_str, + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1)) + { + warning (0, "Size ranges of option %s should be increasing", + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + for (i = 0; i < last_alg; i++) + { + if (!strcmp (alg_name, stringop_alg_names[i])) + { + alg = (stringop_alg) i; + break; + } + } + + if (i == last_alg) + { + warning (0, "Wrong stringop strategy name %s specified for option %s", + alg_name, + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + input_ranges[n].min = mins; + input_ranges[n].max = maxs; + input_ranges[n].alg = alg; + if (!strcmp (align, "align")) + input_ranges[n].noalign = false; + else if (!strcmp (align, "noalign")) + input_ranges[n].noalign = true; + else + { + warning (0, "Unknown alignment %s specified for option %s", + align, is_memset ? 
"-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + n++; + curr_range_str = next_range_str; + } + while (curr_range_str); + + if (input_ranges[n - 1].max != -1) + { + warning (0, "The max value for the last size range should be -1" + " for option %s", + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + if (n > MAX_STRINGOP_ALGS) + { + warning (0, "Too many size ranges specified in option %s", + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + /* Now override the default algs array. */ + for (i = 0; i < n; i++) + { + *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max; + *const_cast<stringop_alg *>(&default_algs->size[i].alg) + = input_ranges[i].alg; + *const_cast<int *>(&default_algs->size[i].noalign) + = input_ranges[i].noalign; + } +} + \f /* Override various settings based on options. If MAIN_ARGS_P, the options are from the command line, otherwise they are from @@ -4040,6 +4183,21 @@ ix86_option_override_internal (bool main /* Handle stack protector */ if (!global_options_set.x_ix86_stack_protector_guard) ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS; + + /* Handle -mmemcpy-strategy= and -mmemset-strategy= */ + if (ix86_tune_memcpy_strategy) + { + char *str = xstrdup (ix86_tune_memcpy_strategy); + ix86_parse_stringop_strategy_string (str, false); + free (str); + } + + if (ix86_tune_memset_strategy) + { + char *str = xstrdup (ix86_tune_memset_strategy); + ix86_parse_stringop_strategy_string (str, true); + free (str); + } } /* Implement the TARGET_OPTION_OVERRIDE hook. 
*/ @@ -22923,6 +23081,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop_1_byte: need_zero_guard = true; @@ -23113,6 +23272,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop_1_byte: case loop: @@ -23324,6 +23484,7 @@ ix86_expand_setmem (rtx dst, rtx count_e { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop: need_zero_guard = true; @@ -23501,6 +23662,7 @@ ix86_expand_setmem (rtx dst, rtx count_e { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop_1_byte: case loop: Index: config/i386/i386-opts.h =================================================================== --- config/i386/i386-opts.h (revision 201540) +++ config/i386/i386-opts.h (working copy) @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI /* Algorithm to expand string function with. */ enum stringop_alg { - no_stringop, - libcall, - rep_prefix_1_byte, - rep_prefix_4_byte, - rep_prefix_8_byte, - loop_1_byte, - loop, - unrolled_loop, - vector_loop +#undef DEF_ENUM +#define DEF_ENUM + +#undef DEF_ALG +#define DEF_ALG(alg, name) alg, + +#include "stringop.def" +last_alg + +#undef DEF_ENUM +#undef DEF_ALG }; /* Available call abi. */ Index: config/i386/stringop.opt =================================================================== --- config/i386/stringop.opt (revision 0) +++ config/i386/stringop.opt (revision 0) @@ -0,0 +1,36 @@ +/* Definitions for option handling for IA-32. + Copyright (C) 2013 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. 
+ +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte) + +#undef DEF_ENUM +#define DEF_ENUM EnumValue + +#undef DEF_ALG +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg) + +#include "stringop.def" + +#undef DEF_ENUM +#undef DEF_ALG Index: config/i386/i386.opt =================================================================== --- config/i386/i386.opt (revision 201540) +++ config/i386/i386.opt (working copy) @@ -316,6 +316,14 @@ mstack-arg-probe Target Report Mask(STACK_PROBE) Save Enable stack probing +mmemcpy-strategy= +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy) +Specify memcpy expansion strategy when expected size is known + +mmemset-strategy= +Target RejectNegative Joined Var(ix86_tune_memset_strategy) +Specify memset expansion strategy when expected size is known + mstringop-strategy= Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop) Chose strategy to generate stringop using ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: New parameters to control stringop expansion libcall strategy 2013-08-07 17:06 ` Xinliang David Li @ 2013-08-08 0:23 ` Joseph S. Myers 2013-08-08 0:29 ` Xinliang David Li 0 siblings, 1 reply; 23+ messages in thread From: Joseph S. Myers @ 2013-08-08 0:23 UTC (permalink / raw) To: Xinliang David Li; +Cc: Jan Hubicka, GCC Patches, Teresa Johnson On Wed, 7 Aug 2013, Xinliang David Li wrote: > Index: config/i386/stringop.def > =================================================================== > --- config/i386/stringop.def (revision 0) > +++ config/i386/stringop.def (revision 0) > @@ -0,0 +1,42 @@ > +/* Definitions for option handling for IA-32. > + Copyright (C) 2013 Free Software Foundation, Inc. > + > +This file is part of GCC. > + > +GCC is free software; you can redistribute it and/or modify > +it under the terms of the GNU General Public License as published by > +the Free Software Foundation; either version 3, or (at your option) > +any later version. > + > +GCC is distributed in the hope that it will be useful, > +but WITHOUT ANY WARRANTY; without even the implied warranty of > +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > +GNU General Public License for more details. > + > +Under Section 7 of GPL version 3, you are granted additional > +permissions described in the GCC Runtime Library Exception, version > +3.1, as published by the Free Software Foundation. Why the exception? This should only be used on the host, not the target. > + do > + { > + int mins, maxs; > + stringop_alg alg; > + char alg_name[128]; > + char align[16]; > + > + next_range_str = strchr (curr_range_str, ','); > + if (next_range_str) > + *next_range_str++ = '\0'; > + > + if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align)) This appears to introduce buffer overruns, which is never OK - whatever the length of strings in the command-line arguments, you must not overflow fixed-width buffers, so you must specify maximum field widths for the %[] and %s. 
> + { > + warning (0, "Wrong arg %s to option %s", curr_range_str, > + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); > + return; Invalid option arguments should be errors, not warnings, and diagnostics should not start with a capital letter. Same applies to other diagnostics here. > Index: config/i386/stringop.opt > =================================================================== > --- config/i386/stringop.opt (revision 0) > +++ config/i386/stringop.opt (revision 0) > @@ -0,0 +1,36 @@ > +/* Definitions for option handling for IA-32. > + Copyright (C) 2013 Free Software Foundation, Inc. > + > +This file is part of GCC. > + > +GCC is free software; you can redistribute it and/or modify > +it under the terms of the GNU General Public License as published by > +the Free Software Foundation; either version 3, or (at your option) > +any later version. > + > +GCC is distributed in the hope that it will be useful, > +but WITHOUT ANY WARRANTY; without even the implied warranty of > +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > +GNU General Public License for more details. > + > +Under Section 7 of GPL version 3, you are granted additional > +permissions described in the GCC Runtime Library Exception, version > +3.1, as published by the Free Software Foundation. Again, why the exception? -- Joseph S. Myers joseph@codesourcery.com ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: New parameters to control stringop expansion libcall strategy 2013-08-08 0:23 ` Joseph S. Myers @ 2013-08-08 0:29 ` Xinliang David Li 2013-08-08 1:04 ` Joseph S. Myers 0 siblings, 1 reply; 23+ messages in thread From: Xinliang David Li @ 2013-08-08 0:29 UTC (permalink / raw) To: Joseph S. Myers; +Cc: Jan Hubicka, GCC Patches, Teresa Johnson On Wed, Aug 7, 2013 at 5:23 PM, Joseph S. Myers <joseph@codesourcery.com> wrote: > On Wed, 7 Aug 2013, Xinliang David Li wrote: > >> Index: config/i386/stringop.def >> =================================================================== >> --- config/i386/stringop.def (revision 0) >> +++ config/i386/stringop.def (revision 0) >> @@ -0,0 +1,42 @@ >> +/* Definitions for option handling for IA-32. >> + Copyright (C) 2013 Free Software Foundation, Inc. >> + >> +This file is part of GCC. >> + >> +GCC is free software; you can redistribute it and/or modify >> +it under the terms of the GNU General Public License as published by >> +the Free Software Foundation; either version 3, or (at your option) >> +any later version. >> + >> +GCC is distributed in the hope that it will be useful, >> +but WITHOUT ANY WARRANTY; without even the implied warranty of >> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >> +GNU General Public License for more details. >> + >> +Under Section 7 of GPL version 3, you are granted additional >> +permissions described in the GCC Runtime Library Exception, version >> +3.1, as published by the Free Software Foundation. > > Why the exception? This should only be used on the host, not the target. Sorry, I copied the boiler-plate header from i386.h -- is it wrong there too? 
> >> + do >> + { >> + int mins, maxs; >> + stringop_alg alg; >> + char alg_name[128]; >> + char align[16]; >> + >> + next_range_str = strchr (curr_range_str, ','); >> + if (next_range_str) >> + *next_range_str++ = '\0'; >> + >> + if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align)) > > This appears to introduce buffer overruns, which is never OK - whatever > the length of strings in the command-line arguments, you must not overflow > fixed-width buffers, so you must specify maximum field widths for the %[] > and %s. > Ok will fix. >> + { >> + warning (0, "Wrong arg %s to option %s", curr_range_str, >> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >> + return; > > Invalid option arguments should be errors, not warnings, and diagnostics > should not start with a capital letter. Same applies to other diagnostics > here. > Ok will fix. >> Index: config/i386/stringop.opt >> =================================================================== >> --- config/i386/stringop.opt (revision 0) >> +++ config/i386/stringop.opt (revision 0) >> @@ -0,0 +1,36 @@ >> +/* Definitions for option handling for IA-32. >> + Copyright (C) 2013 Free Software Foundation, Inc. >> + >> +This file is part of GCC. >> + >> +GCC is free software; you can redistribute it and/or modify >> +it under the terms of the GNU General Public License as published by >> +the Free Software Foundation; either version 3, or (at your option) >> +any later version. >> + >> +GCC is distributed in the hope that it will be useful, >> +but WITHOUT ANY WARRANTY; without even the implied warranty of >> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >> +GNU General Public License for more details. >> + >> +Under Section 7 of GPL version 3, you are granted additional >> +permissions described in the GCC Runtime Library Exception, version >> +3.1, as published by the Free Software Foundation. > > Again, why the exception? Wrong template used. thanks, David > > -- > Joseph S. 
Myers > joseph@codesourcery.com ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: New parameters to control stringop expansion libcall strategy 2013-08-08 0:29 ` Xinliang David Li @ 2013-08-08 1:04 ` Joseph S. Myers 2013-08-08 6:17 ` Xinliang David Li 0 siblings, 1 reply; 23+ messages in thread From: Joseph S. Myers @ 2013-08-08 1:04 UTC (permalink / raw) To: Xinliang David Li; +Cc: Jan Hubicka, GCC Patches, Teresa Johnson On Wed, 7 Aug 2013, Xinliang David Li wrote: > > Why the exception? This should only be used on the host, not the target. > > Sorry, I copied the boiler-plate header from i386.h -- is it wrong there too? tm.h gets included in target code because we haven't finished separating target macros used on the target from those used on the host. -- Joseph S. Myers joseph@codesourcery.com ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: New parameters to control stringop expansion libcall strategy 2013-08-08 1:04 ` Joseph S. Myers @ 2013-08-08 6:17 ` Xinliang David Li 2013-08-08 15:18 ` Joseph S. Myers 0 siblings, 1 reply; 23+ messages in thread From: Xinliang David Li @ 2013-08-08 6:17 UTC (permalink / raw) To: Joseph S. Myers; +Cc: Jan Hubicka, GCC Patches, Teresa Johnson [-- Attachment #1: Type: text/plain, Size: 605 bytes --] Updated patch attached (fixed header, buffer overflow, and warning --> error problems). Ok for trunk? thanks, David On Wed, Aug 7, 2013 at 6:04 PM, Joseph S. Myers <joseph@codesourcery.com> wrote: > On Wed, 7 Aug 2013, Xinliang David Li wrote: > >> > Why the exception? This should only be used on the host, not the target. >> >> Sorry, I copied the boiler-plate header from i386.h -- is it wrong there too? > > tm.h gets included in target code because we haven't finished separating > target macros used on the target from those used on the host. > > -- > Joseph S. Myers > joseph@codesourcery.com [-- Attachment #2: stringop_inl_option.p.txt --] [-- Type: text/plain, Size: 18415 bytes --] Index: config/i386/stringop.def =================================================================== --- config/i386/stringop.def (revision 0) +++ config/i386/stringop.def (revision 0) @@ -0,0 +1,37 @@ +/* Definitions for stringop strategy for IA-32. + Copyright (C) 2013 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the files COPYING3. 
If not, +see <http://www.gnu.org/licenses/>. */ + +DEF_ENUM +DEF_ALG (no_stringop, no_stringop) +DEF_ENUM +DEF_ALG (libcall, libcall) +DEF_ENUM +DEF_ALG (rep_prefix_1_byte, rep_byte) +DEF_ENUM +DEF_ALG (rep_prefix_4_byte, rep_4byte) +DEF_ENUM +DEF_ALG (rep_prefix_8_byte, rep_8byte) +DEF_ENUM +DEF_ALG (loop_1_byte, byte_loop) +DEF_ENUM +DEF_ALG (loop, loop) +DEF_ENUM +DEF_ALG (unrolled_loop, unrolled_loop) +DEF_ENUM +DEF_ALG (vector_loop, vector_loop) Index: config/i386/stringop.opt =================================================================== --- config/i386/stringop.opt (revision 0) +++ config/i386/stringop.opt (revision 0) @@ -0,0 +1,31 @@ +/* Definitions for stringop option handling for IA-32. + Copyright (C) 2013 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the files COPYING3. If not, +see <http://www.gnu.org/licenses/>. */ + +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte) + +#undef DEF_ENUM +#define DEF_ENUM EnumValue + +#undef DEF_ALG +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg) + +#include "stringop.def" + +#undef DEF_ENUM +#undef DEF_ALG Index: config/i386/i386-opts.h =================================================================== --- config/i386/i386-opts.h (revision 201581) +++ config/i386/i386-opts.h (working copy) @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI /* Algorithm to expand string function with. 
*/ enum stringop_alg { - no_stringop, - libcall, - rep_prefix_1_byte, - rep_prefix_4_byte, - rep_prefix_8_byte, - loop_1_byte, - loop, - unrolled_loop, - vector_loop +#undef DEF_ENUM +#define DEF_ENUM + +#undef DEF_ALG +#define DEF_ALG(alg, name) alg, + +#include "stringop.def" +last_alg + +#undef DEF_ENUM +#undef DEF_ALG }; /* Available call abi. */ Index: config/i386/i386.c =================================================================== --- config/i386/i386.c (revision 201582) +++ config/i386/i386.c (working copy) @@ -158,7 +158,7 @@ struct processor_costs ix86_size_cost = }; /* Processor costs (relative to an add) */ -static const +static struct processor_costs i386_cost = { /* 386 specific costs */ COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -228,7 +228,7 @@ struct processor_costs i386_cost = { /* 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs i486_cost = { /* 486 specific costs */ COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -300,7 +300,7 @@ struct processor_costs i486_cost = { /* 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs pentium_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -370,7 +370,7 @@ struct processor_costs pentium_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs pentiumpro_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -449,7 +449,7 @@ struct processor_costs pentiumpro_cost = 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs geode_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -520,7 +520,7 @@ struct processor_costs geode_cost = { 1, /* cond_not_taken_branch_cost. 
*/ }; -static const +static struct processor_costs k6_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -593,7 +593,7 @@ struct processor_costs k6_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs athlon_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -666,7 +666,7 @@ struct processor_costs athlon_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs k8_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -1267,7 +1267,7 @@ struct processor_costs btver2_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs pentium4_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (3), /* cost of a lea instruction */ @@ -1338,7 +1338,7 @@ struct processor_costs pentium4_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs nocona_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -1411,7 +1411,7 @@ struct processor_costs nocona_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs atom_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ @@ -1558,7 +1558,7 @@ struct processor_costs slm_cost = { }; /* Generic64 should produce code tuned for Nocona and K8. */ -static const +static struct processor_costs generic64_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ /* On all chips taken into consideration lea is 2 cycles and more. With @@ -1637,7 +1637,7 @@ struct processor_costs generic64_cost = }; /* core_cost should produce code tuned for Core familly of CPUs. 
*/ -static const +static struct processor_costs core_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ /* On all chips taken into consideration lea is 2 cycles and more. With @@ -1719,7 +1719,7 @@ struct processor_costs core_cost = { /* Generic32 should produce code tuned for PPro, Pentium4, Nocona, Athlon and K8. */ -static const +static struct processor_costs generic32_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ @@ -2926,6 +2926,149 @@ ix86_debug_options (void) return; } + +static const char *stringop_alg_names[] = { +#define DEF_ENUM +#define DEF_ALG(alg, name) #name, +#include "stringop.def" +#undef DEF_ENUM +#undef DEF_ALG +}; + +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=. + The string is of the following form (or comma separated list of it): + + strategy_alg:max_size:[align|noalign] + + where the full size range for the strategy is either [0, max_size] or + [min_size, max_size], in which min_size is the max_size + 1 of the + preceding range. The last size range must have max_size == -1. + + Examples: + + 1. + -mmemcpy-strategy=libcall:-1:noalign + + this is equivalent to (for known size memcpy) -mstringop-strategy=libcall + + + 2. + -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign + + This is to tell the compiler to use the following strategy for memset + 1) when the expected size is between [1, 16], use rep_8byte strategy; + 2) when the size is between [17, 2048], use vector_loop; + 3) when the size is > 2048, use libcall. 
*/ + +struct stringop_size_range +{ + int min; + int max; + stringop_alg alg; + bool noalign; +}; + +static void +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) +{ + const struct stringop_algs *default_algs; + stringop_size_range input_ranges[MAX_STRINGOP_ALGS]; + char *curr_range_str, *next_range_str; + int i = 0, n = 0; + + if (is_memset) + default_algs = &ix86_cost->memset[TARGET_64BIT != 0]; + else + default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; + + curr_range_str = strategy_str; + + do + { + int mins, maxs; + stringop_alg alg; + char alg_name[128]; + char align[16]; + next_range_str = strchr (curr_range_str, ','); + if (next_range_str) + *next_range_str++ = '\0'; + + if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s", + alg_name, &maxs, align)) + { + error ("Wrong arg %s to option %s", curr_range_str, + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1)) + { + error ("Size ranges of option %s should be increasing", + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + for (i = 0; i < last_alg; i++) + { + if (!strcmp (alg_name, stringop_alg_names[i])) + { + alg = (stringop_alg) i; + break; + } + } + + if (i == last_alg) + { + error ("Wrong stringop strategy name %s specified for option %s", + alg_name, + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + input_ranges[n].min = mins; + input_ranges[n].max = maxs; + input_ranges[n].alg = alg; + if (!strcmp (align, "align")) + input_ranges[n].noalign = false; + else if (!strcmp (align, "noalign")) + input_ranges[n].noalign = true; + else + { + error ("Unknown alignment %s specified for option %s", + align, is_memset ? 
"-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + n++; + curr_range_str = next_range_str; + } + while (curr_range_str); + + if (input_ranges[n - 1].max != -1) + { + error ("The max value for the last size range should be -1" + " for option %s", + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + if (n > MAX_STRINGOP_ALGS) + { + error ("Too many size ranges specified in option %s", + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + /* Now override the default algs array. */ + for (i = 0; i < n; i++) + { + *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max; + *const_cast<stringop_alg *>(&default_algs->size[i].alg) + = input_ranges[i].alg; + *const_cast<int *>(&default_algs->size[i].noalign) + = input_ranges[i].noalign; + } +} + \f /* Override various settings based on options. If MAIN_ARGS_P, the options are from the command line, otherwise they are from @@ -4081,6 +4224,21 @@ ix86_option_override_internal (bool main /* Handle stack protector */ if (!global_options_set.x_ix86_stack_protector_guard) ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS; + + /* Handle -mmemcpy-strategy= and -mmemset-strategy= */ + if (ix86_tune_memcpy_strategy) + { + char *str = xstrdup (ix86_tune_memcpy_strategy); + ix86_parse_stringop_strategy_string (str, false); + free (str); + } + + if (ix86_tune_memset_strategy) + { + char *str = xstrdup (ix86_tune_memset_strategy); + ix86_parse_stringop_strategy_string (str, true); + free (str); + } } /* Implement the TARGET_OPTION_OVERRIDE hook. 
*/ @@ -22964,6 +23122,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop_1_byte: need_zero_guard = true; @@ -23154,6 +23313,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop_1_byte: case loop: @@ -23365,6 +23525,7 @@ ix86_expand_setmem (rtx dst, rtx count_e { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop: need_zero_guard = true; @@ -23542,6 +23703,7 @@ ix86_expand_setmem (rtx dst, rtx count_e { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop_1_byte: case loop: Index: config/i386/i386.opt =================================================================== --- config/i386/i386.opt (revision 201582) +++ config/i386/i386.opt (working copy) @@ -316,6 +316,14 @@ mstack-arg-probe Target Report Mask(STACK_PROBE) Save Enable stack probing +mmemcpy-strategy= +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy) +Specify memcpy expansion strategy when expected size is known + +mmemset-strategy= +Target RejectNegative Joined Var(ix86_tune_memset_strategy) +Specify memset expansion strategy when expected size is known + mstringop-strategy= Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop) Chose strategy to generate stringop using Index: doc/invoke.texi =================================================================== --- doc/invoke.texi (revision 201581) +++ doc/invoke.texi (working copy) @@ -652,6 +652,7 @@ Objective-C and Objective-C++ Dialects}. 
-mbmi2 -mrtm -mlwp -mthreads @gol -mno-align-stringops -minline-all-stringops @gol -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy} -mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol -mregparm=@var{num} -msseregparm @gol @@ -14651,6 +14652,24 @@ Expand into an inline loop. Always use a library call. @end table +@item -mmemcpy-strategy=@var{strategy} +@opindex mmemcpy-strategy=@var{strategy} +Override the internal decision heuristic to decide if @code{__builtin_memcpy} +should be inlined and what inline algorithm to use when the expected size +of the copy operation is known. @var{strategy} +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets. +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies +the max byte size with which inline algorithm @var{alg} is allowed. For the last +triplet, the @var{max_size} must be @code{-1}. The @var{max_size} of the triplets +in the list must be specified in increasing order. The minimal byte size for +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the +preceding range. + +@item -mmemset-strategy=@var{strategy} +@opindex mmemset-strategy=@var{strategy} +The option is similar to @option{-mmemcpy-strategy=} except that it is to control +@code{__builtin_memset} expansion. + @item -momit-leaf-frame-pointer @opindex momit-leaf-frame-pointer Don't keep the frame pointer in a register for leaf functions. 
This Index: testsuite/gcc.target/i386/memcpy-strategy-2.c =================================================================== --- testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) +++ testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */ +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */ +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} + Index: testsuite/gcc.target/i386/memset-strategy-1.c =================================================================== --- testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) +++ testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */ +/* { dg-final { scan-assembler-times "memset" 2 } } */ + +char a[2048]; +void t (void) +{ + __builtin_memset (a, 1, 2048); +} + Index: testsuite/gcc.target/i386/memcpy-strategy-3.c =================================================================== --- testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) +++ testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */ +/* { dg-final { scan-assembler-times "memcpy" 2 } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} Index: testsuite/gcc.target/i386/memcpy-strategy-1.c =================================================================== --- testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) +++ testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom 
-mmemcpy-strategy=vector_loop:-1:align" } */ +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */ +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} + ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: New parameters to control stringop expansion libcall strategy 2013-08-08 6:17 ` Xinliang David Li @ 2013-08-08 15:18 ` Joseph S. Myers 2013-08-08 16:31 ` Xinliang David Li 0 siblings, 1 reply; 23+ messages in thread From: Joseph S. Myers @ 2013-08-08 15:18 UTC (permalink / raw) To: Xinliang David Li; +Cc: Jan Hubicka, GCC Patches, Teresa Johnson On Wed, 7 Aug 2013, Xinliang David Li wrote: > Updated patch attached (fixed header, buffer overflow, and warning --> > error problems). You still have diagnostics starting with a capital letter, contrary to the GNU Coding Standards. -- Joseph S. Myers joseph@codesourcery.com ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: New parameters to control stringop expansion libcall strategy 2013-08-08 15:18 ` Joseph S. Myers @ 2013-08-08 16:31 ` Xinliang David Li 2013-08-09 18:25 ` Xinliang David Li 0 siblings, 1 reply; 23+ messages in thread From: Xinliang David Li @ 2013-08-08 16:31 UTC (permalink / raw) To: Joseph S. Myers; +Cc: Jan Hubicka, GCC Patches, Teresa Johnson [-- Attachment #1: Type: text/plain, Size: 404 bytes --] Updated. thanks, David On Thu, Aug 8, 2013 at 8:18 AM, Joseph S. Myers <joseph@codesourcery.com> wrote: > On Wed, 7 Aug 2013, Xinliang David Li wrote: > >> Updated patch attached (fixed header, buffer overflow, and warning --> >> error problems). > > You still have diagnostics starting with a capital letter, contrary to the > GNU Coding Standards. > > -- > Joseph S. Myers > joseph@codesourcery.com [-- Attachment #2: stringop_inl_option.p.txt --] [-- Type: text/plain, Size: 18415 bytes --] Index: testsuite/gcc.target/i386/memcpy-strategy-1.c =================================================================== --- testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) +++ testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */ +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */ +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} + Index: testsuite/gcc.target/i386/memcpy-strategy-2.c =================================================================== --- testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) +++ testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */ +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! 
{ ia32 } } } } } */ +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} + Index: testsuite/gcc.target/i386/memset-strategy-1.c =================================================================== --- testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) +++ testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */ +/* { dg-final { scan-assembler-times "memset" 2 } } */ + +char a[2048]; +void t (void) +{ + __builtin_memset (a, 1, 2048); +} + Index: testsuite/gcc.target/i386/memcpy-strategy-3.c =================================================================== --- testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) +++ testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */ +/* { dg-final { scan-assembler-times "memcpy" 2 } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} Index: doc/invoke.texi =================================================================== --- doc/invoke.texi (revision 201581) +++ doc/invoke.texi (working copy) @@ -652,6 +652,7 @@ Objective-C and Objective-C++ Dialects}. -mbmi2 -mrtm -mlwp -mthreads @gol -mno-align-stringops -minline-all-stringops @gol -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy} -mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol -mregparm=@var{num} -msseregparm @gol @@ -14651,6 +14652,24 @@ Expand into an inline loop. Always use a library call. 
@end table +@item -mmemcpy-strategy=@var{strategy} +@opindex mmemcpy-strategy=@var{strategy} +Override the internal decision heuristic to decide if @code{__builtin_memcpy} +should be inlined and what inline algorithm to use when the expected size +of the copy operation is known. @var{strategy} +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets. +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies +the max byte size with which inline algorithm @var{alg} is allowed. For the last +triplet, the @var{max_size} must be @code{-1}. The @var{max_size} of the triplets +in the list must be specified in increasing order. The minimal byte size for +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the +preceding range. + +@item -mmemset-strategy=@var{strategy} +@opindex mmemset-strategy=@var{strategy} +The option is similar to @option{-mmemcpy-strategy=} except that it is to control +@code{__builtin_memset} expansion. + @item -momit-leaf-frame-pointer @opindex momit-leaf-frame-pointer Don't keep the frame pointer in a register for leaf functions. This Index: config/i386/stringop.def =================================================================== --- config/i386/stringop.def (revision 0) +++ config/i386/stringop.def (revision 0) @@ -0,0 +1,37 @@ +/* Definitions for stringop strategy for IA-32. + Copyright (C) 2013 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License +along with GCC; see the files COPYING3. If not, +see <http://www.gnu.org/licenses/>. */ + +DEF_ENUM +DEF_ALG (no_stringop, no_stringop) +DEF_ENUM +DEF_ALG (libcall, libcall) +DEF_ENUM +DEF_ALG (rep_prefix_1_byte, rep_byte) +DEF_ENUM +DEF_ALG (rep_prefix_4_byte, rep_4byte) +DEF_ENUM +DEF_ALG (rep_prefix_8_byte, rep_8byte) +DEF_ENUM +DEF_ALG (loop_1_byte, byte_loop) +DEF_ENUM +DEF_ALG (loop, loop) +DEF_ENUM +DEF_ALG (unrolled_loop, unrolled_loop) +DEF_ENUM +DEF_ALG (vector_loop, vector_loop) Index: config/i386/stringop.opt =================================================================== --- config/i386/stringop.opt (revision 0) +++ config/i386/stringop.opt (revision 0) @@ -0,0 +1,31 @@ +/* Definitions for stringop option handling for IA-32. + Copyright (C) 2013 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the files COPYING3. If not, +see <http://www.gnu.org/licenses/>. 
*/ + +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte) + +#undef DEF_ENUM +#define DEF_ENUM EnumValue + +#undef DEF_ALG +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg) + +#include "stringop.def" + +#undef DEF_ENUM +#undef DEF_ALG Index: config/i386/i386-opts.h =================================================================== --- config/i386/i386-opts.h (revision 201581) +++ config/i386/i386-opts.h (working copy) @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI /* Algorithm to expand string function with. */ enum stringop_alg { - no_stringop, - libcall, - rep_prefix_1_byte, - rep_prefix_4_byte, - rep_prefix_8_byte, - loop_1_byte, - loop, - unrolled_loop, - vector_loop +#undef DEF_ENUM +#define DEF_ENUM + +#undef DEF_ALG +#define DEF_ALG(alg, name) alg, + +#include "stringop.def" +last_alg + +#undef DEF_ENUM +#undef DEF_ALG }; /* Available call abi. */ Index: config/i386/i386.c =================================================================== --- config/i386/i386.c (revision 201582) +++ config/i386/i386.c (working copy) @@ -158,7 +158,7 @@ struct processor_costs ix86_size_cost = }; /* Processor costs (relative to an add) */ -static const +static struct processor_costs i386_cost = { /* 386 specific costs */ COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -228,7 +228,7 @@ struct processor_costs i386_cost = { /* 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs i486_cost = { /* 486 specific costs */ COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -300,7 +300,7 @@ struct processor_costs i486_cost = { /* 1, /* cond_not_taken_branch_cost. 
*/ }; -static const +static struct processor_costs pentium_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -370,7 +370,7 @@ struct processor_costs pentium_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs pentiumpro_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -449,7 +449,7 @@ struct processor_costs pentiumpro_cost = 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs geode_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -520,7 +520,7 @@ struct processor_costs geode_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs k6_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -593,7 +593,7 @@ struct processor_costs k6_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs athlon_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -666,7 +666,7 @@ struct processor_costs athlon_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs k8_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -1267,7 +1267,7 @@ struct processor_costs btver2_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs pentium4_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (3), /* cost of a lea instruction */ @@ -1338,7 +1338,7 @@ struct processor_costs pentium4_cost = { 1, /* cond_not_taken_branch_cost. 
*/ }; -static const +static struct processor_costs nocona_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -1411,7 +1411,7 @@ struct processor_costs nocona_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs atom_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ @@ -1558,7 +1558,7 @@ struct processor_costs slm_cost = { }; /* Generic64 should produce code tuned for Nocona and K8. */ -static const +static struct processor_costs generic64_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ /* On all chips taken into consideration lea is 2 cycles and more. With @@ -1637,7 +1637,7 @@ struct processor_costs generic64_cost = }; /* core_cost should produce code tuned for Core familly of CPUs. */ -static const +static struct processor_costs core_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ /* On all chips taken into consideration lea is 2 cycles and more. With @@ -1719,7 +1719,7 @@ struct processor_costs core_cost = { /* Generic32 should produce code tuned for PPro, Pentium4, Nocona, Athlon and K8. */ -static const +static struct processor_costs generic32_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ @@ -2926,6 +2926,149 @@ ix86_debug_options (void) return; } + +static const char *stringop_alg_names[] = { +#define DEF_ENUM +#define DEF_ALG(alg, name) #name, +#include "stringop.def" +#undef DEF_ENUM +#undef DEF_ALG +}; + +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=. + The string is of the following form (or comma separated list of it): + + strategy_alg:max_size:[align|noalign] + + where the full size range for the strategy is either [0, max_size] or + [min_size, max_size], in which min_size is the max_size + 1 of the + preceding range. 
The last size range must have max_size == -1. + + Examples: + + 1. + -mmemcpy-strategy=libcall:-1:noalign + + this is equivalent to (for known size memcpy) -mstringop-strategy=libcall + + + 2. + -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign + + This is to tell the compiler to use the following strategy for memset + 1) when the expected size is between [1, 16], use rep_8byte strategy; + 2) when the size is between [17, 2048], use vector_loop; + 3) when the size is > 2048, use libcall. */ + +struct stringop_size_range +{ + int min; + int max; + stringop_alg alg; + bool noalign; +}; + +static void +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) +{ + const struct stringop_algs *default_algs; + stringop_size_range input_ranges[MAX_STRINGOP_ALGS]; + char *curr_range_str, *next_range_str; + int i = 0, n = 0; + + if (is_memset) + default_algs = &ix86_cost->memset[TARGET_64BIT != 0]; + else + default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; + + curr_range_str = strategy_str; + + do + { + int mins, maxs; + stringop_alg alg; + char alg_name[128]; + char align[16]; + next_range_str = strchr (curr_range_str, ','); + if (next_range_str) + *next_range_str++ = '\0'; + + if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s", + alg_name, &maxs, align)) + { + error ("wrong arg %s to option %s", curr_range_str, + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1)) + { + error ("size ranges of option %s should be increasing", + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + for (i = 0; i < last_alg; i++) + { + if (!strcmp (alg_name, stringop_alg_names[i])) + { + alg = (stringop_alg) i; + break; + } + } + + if (i == last_alg) + { + error ("wrong stringop strategy name %s specified for option %s", + alg_name, + is_memset ? 
"-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + input_ranges[n].min = mins; + input_ranges[n].max = maxs; + input_ranges[n].alg = alg; + if (!strcmp (align, "align")) + input_ranges[n].noalign = false; + else if (!strcmp (align, "noalign")) + input_ranges[n].noalign = true; + else + { + error ("unknown alignment %s specified for option %s", + align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + n++; + curr_range_str = next_range_str; + } + while (curr_range_str); + + if (input_ranges[n - 1].max != -1) + { + error ("the max value for the last size range should be -1" + " for option %s", + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + if (n > MAX_STRINGOP_ALGS) + { + error ("too many size ranges specified in option %s", + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + /* Now override the default algs array. */ + for (i = 0; i < n; i++) + { + *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max; + *const_cast<stringop_alg *>(&default_algs->size[i].alg) + = input_ranges[i].alg; + *const_cast<int *>(&default_algs->size[i].noalign) + = input_ranges[i].noalign; + } +} + \f /* Override various settings based on options. If MAIN_ARGS_P, the options are from the command line, otherwise they are from @@ -4081,6 +4224,21 @@ ix86_option_override_internal (bool main /* Handle stack protector */ if (!global_options_set.x_ix86_stack_protector_guard) ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS; + + /* Handle -mmemcpy-strategy= and -mmemset-strategy= */ + if (ix86_tune_memcpy_strategy) + { + char *str = xstrdup (ix86_tune_memcpy_strategy); + ix86_parse_stringop_strategy_string (str, false); + free (str); + } + + if (ix86_tune_memset_strategy) + { + char *str = xstrdup (ix86_tune_memset_strategy); + ix86_parse_stringop_strategy_string (str, true); + free (str); + } } /* Implement the TARGET_OPTION_OVERRIDE hook. 
*/ @@ -22964,6 +23122,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop_1_byte: need_zero_guard = true; @@ -23154,6 +23313,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop_1_byte: case loop: @@ -23365,6 +23525,7 @@ ix86_expand_setmem (rtx dst, rtx count_e { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop: need_zero_guard = true; @@ -23542,6 +23703,7 @@ ix86_expand_setmem (rtx dst, rtx count_e { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop_1_byte: case loop: Index: config/i386/i386.opt =================================================================== --- config/i386/i386.opt (revision 201582) +++ config/i386/i386.opt (working copy) @@ -316,6 +316,14 @@ mstack-arg-probe Target Report Mask(STACK_PROBE) Save Enable stack probing +mmemcpy-strategy= +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy) +Specify memcpy expansion strategy when expected size is known + +mmemset-strategy= +Target RejectNegative Joined Var(ix86_tune_memset_strategy) +Specify memset expansion strategy when expected size is known + mstringop-strategy= Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop) Chose strategy to generate stringop using ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: New parameters to control stringop expansion libcall strategy 2013-08-08 16:31 ` Xinliang David Li @ 2013-08-09 18:25 ` Xinliang David Li 2013-08-09 18:33 ` Jan Hubicka 0 siblings, 1 reply; 23+ messages in thread From: Xinliang David Li @ 2013-08-09 18:25 UTC (permalink / raw) To: Joseph S. Myers; +Cc: Jan Hubicka, GCC Patches, Teresa Johnson Is this version ok for trunk? thanks, David On Thu, Aug 8, 2013 at 9:31 AM, Xinliang David Li <davidxl@google.com> wrote: > Updated. > > thanks, > > David > > On Thu, Aug 8, 2013 at 8:18 AM, Joseph S. Myers <joseph@codesourcery.com> wrote: >> On Wed, 7 Aug 2013, Xinliang David Li wrote: >> >>> Updated patch attached (fixed header, buffer overflow, and warning --> >>> error problems). >> >> You still have diagnostics starting with a capital letter, contrary to the >> GNU Coding Standards. >> >> -- >> Joseph S. Myers >> joseph@codesourcery.com ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: New parameters to control stringop expansion libcall strategy 2013-08-09 18:25 ` Xinliang David Li @ 2013-08-09 18:33 ` Jan Hubicka 2013-08-09 19:41 ` Xinliang David Li 0 siblings, 1 reply; 23+ messages in thread From: Jan Hubicka @ 2013-08-09 18:33 UTC (permalink / raw) To: Xinliang David Li Cc: Joseph S. Myers, Jan Hubicka, GCC Patches, Teresa Johnson > Is this version ok for trunk? It looks reasonable, but I still do not much like the removal of const for tables. Doing so will push them all into David Malcolm's per-thread global universe. Currently the algorithm is selected based on cost->memset/cost->memcpy. Instead of removing the const of all the CPU tables, I would prefer introducing two readwrite global variables memset_algs/memcpy_algs and feed them with proper table at a time we set up ix86_tune_features. This has a chance to do the right thing with optimize attribute specifying algorithms and with the longer term threading plan. Honza > > thanks, > > David > > On Thu, Aug 8, 2013 at 9:31 AM, Xinliang David Li <davidxl@google.com> wrote: > > Updated. > > > > thanks, > > > > David > > > > On Thu, Aug 8, 2013 at 8:18 AM, Joseph S. Myers <joseph@codesourcery.com> wrote: > >> On Wed, 7 Aug 2013, Xinliang David Li wrote: > >> > >>> Updated patch attached (fixed header, buffer overflow, and warning --> > >>> error problems). > >> > >> You still have diagnostics starting with a capital letter, contrary to the > >> GNU Coding Standards. > >> > >> -- > >> Joseph S. Myers > >> joseph@codesourcery.com ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: New parameters to control stringop expansion libcall strategy 2013-08-09 18:33 ` Jan Hubicka @ 2013-08-09 19:41 ` Xinliang David Li 2013-08-09 19:49 ` Jan Hubicka 0 siblings, 1 reply; 23+ messages in thread From: Xinliang David Li @ 2013-08-09 19:41 UTC (permalink / raw) To: Jan Hubicka; +Cc: Joseph S. Myers, GCC Patches, Teresa Johnson On Fri, Aug 9, 2013 at 11:33 AM, Jan Hubicka <hubicka@ucw.cz> wrote: >> Is this version ok for trunk? > > It looks resonable, but I still do not like much the removal of const for tables. > Doing so will push them all into David Malcom's per-thread global universe. > > Currently the algorithm is selected based on cost->memset/cost->memcpy. > Instead of removing the const of all the CPU tables, I would preffer > introducing two readwrite global variables memset_algs/memcpy_algs and feed > them with proper table at a time we set up ix86_tune_features. > I can do that in this patch. In the future, when we need to do tunings for those constants, we can revisit it. thanks, David > This has chance to do the right thing with optimize attribute specifying algorithms > and with the longer term threading plan. > > Honza >> >> thanks, >> >> David >> >> On Thu, Aug 8, 2013 at 9:31 AM, Xinliang David Li <davidxl@google.com> wrote: >> > Updated. >> > >> > thanks, >> > >> > David >> > >> > On Thu, Aug 8, 2013 at 8:18 AM, Joseph S. Myers <joseph@codesourcery.com> wrote: >> >> On Wed, 7 Aug 2013, Xinliang David Li wrote: >> >> >> >>> Updated patch attached (fixed header, buffer overflow, and warning --> >> >>> error problems). >> >> >> >> You still have diagnostics starting with a capital letter, contrary to the >> >> GNU Coding Standards. >> >> >> >> -- >> >> Joseph S. Myers >> >> joseph@codesourcery.com ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: New parameters to control stringop expansion libcall strategy 2013-08-09 19:41 ` Xinliang David Li @ 2013-08-09 19:49 ` Jan Hubicka 0 siblings, 0 replies; 23+ messages in thread From: Jan Hubicka @ 2013-08-09 19:49 UTC (permalink / raw) To: Xinliang David Li Cc: Jan Hubicka, Joseph S. Myers, GCC Patches, Teresa Johnson > On Fri, Aug 9, 2013 at 11:33 AM, Jan Hubicka <hubicka@ucw.cz> wrote: > >> Is this version ok for trunk? > > > > It looks resonable, but I still do not like much the removal of const for tables. > > Doing so will push them all into David Malcom's per-thread global universe. > > > > Currently the algorithm is selected based on cost->memset/cost->memcpy. > > Instead of removing the const of all the CPU tables, I would preffer > > introducing two readwrite global variables memset_algs/memcpy_algs and feed > > them with proper table at a time we set up ix86_tune_features. > > > > I can do that in this patch. In the future, when we need to do tunings > for those constants, we can revisit it. Yep, I think we can follow the same strategy and just move them to a global constant. Those are part of the context/universe since they will be user-rewritable then. Thanks, the patch is OK with this change. Honza > > thanks, > > David > > > This has chance to do the right thing with optimize attribute specifying algorithms > > and with the longer term threading plan. > > > > Honza > >> > >> thanks, > >> > >> David > >> > >> On Thu, Aug 8, 2013 at 9:31 AM, Xinliang David Li <davidxl@google.com> wrote: > >> > Updated. > >> > > >> > thanks, > >> > > >> > David > >> > > >> > On Thu, Aug 8, 2013 at 8:18 AM, Joseph S. Myers <joseph@codesourcery.com> wrote: > >> >> On Wed, 7 Aug 2013, Xinliang David Li wrote: > >> >> > >> >>> Updated patch attached (fixed header, buffer overflow, and warning --> > >> >>> error problems). > >> >> > >> >> You still have diagnostics starting with a capital letter, contrary to the > >> >> GNU Coding Standards. 
> >> >> > >> >> -- > >> >> Joseph S. Myers > >> >> joseph@codesourcery.com ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: New parameters to control stringop expansion libcall strategy 2013-08-06 8:46 ` Michael Zolotukhin 2013-08-06 9:42 ` Jan Hubicka @ 2013-08-06 16:42 ` Xinliang David Li 2013-08-06 16:45 ` Xinliang David Li 2013-08-07 7:14 ` Michael Zolotukhin 1 sibling, 2 replies; 23+ messages in thread From: Xinliang David Li @ 2013-08-06 16:42 UTC (permalink / raw) To: Michael Zolotukhin; +Cc: GCC Patches, Jan Hubicka, Teresa Johnson Corrected two small problems reported by the style checker (the warnings about the EnumValue for options in stringop.opt are not valid). On Tue, Aug 6, 2013 at 1:46 AM, Michael Zolotukhin <michael.v.zolotukhin@gmail.com> wrote: > There are still some formatting issues (like 8 spaces instead of a > tab, wrong indentation of do-loop and some other places) - to reveal > some of them you could use contrib/check_GNU_style.sh script. > But that was a nitpicking again:) Actually I wanted to ask whether > you're going to use this option for some performance experiments > involving memmov/memset - if so, probably you could tune existing > cost-models as well? Is it possible? The option is designed for purposes like this. thanks, David > > Michael > > On 5 August 2013 20:44, Xinliang David Li <davidxl@google.com> wrote: >> thanks. Updated patch attached. >> >> David >> >> On Mon, Aug 5, 2013 at 3:57 AM, Michael V. Zolotukhin >> <michael.v.zolotukhin@gmail.com> wrote: >>> Hi, >>> This is a really convenient option, thanks for working on it. >>> I can't approve it as I'm not a maintainer, but it looks ok to me, >>> except fot a small nitpicking: afair, comments should end with >>> dot-space-space. >>> >>> Michael >>> >>> On 04 Aug 20:01, Xinliang David Li wrote: >>>> The attached is a new patch implementing the stringop inline strategy >>>> control using two new -m options: >>>> >>>> -mmemcpy-strategy= >>>> -mmemset-strategy= >>>> >>>> See changes in doc/invoke.texi for description of the new options. 
Example: >>>> -mmemcpy-strategy=rep_8byte:64:unaligned,unrolled_loop:2048:unaligned,libcall:-1:unaligned >>>> >>>> tells compiler to inline memcpy using rep_8byte when the size is no >>>> larger than 64 byte, using unrolled_loop when size is no larger than >>>> 2048, and for size > 2048, using library call. In all cases, >>>> destination alignment adjustment is not done. >>>> >>>> Tested on x86-64/linux. Ok for trunk? >>>> >>>> thanks, >>>> >>>> David >>>> >>>> 2013-08-02 Xinliang David Li <davidxl@google.com> >>>> >>>> * config/i386/stringop.def: New file. >>>> * config/i386/stringop.opt: New file. >>>> * config/i386/i386-opts.h: Include stringopt.def. >>>> * config/i386/i386.opt: Include stringopt.opt. >>>> * config/i386/i386.c (ix86_option_override_internal): >>>> Override default size based stringop inline strategies >>>> with options. >>>> * config/i386/i386.c (ix86_parse_stringop_strategy_string): >>>> New function. >>>> >>>> 2013-08-04 Xinliang David Li <davidxl@google.com> >>>> >>>> * testsuite/gcc.target/i386/memcpy-strategy-1.c: New test. >>>> * testsuite/gcc.target/i386/memcpy-strategy-2.c: Ditto. >>>> * testsuite/gcc.target/i386/memset-strategy-1.c: Ditto. >>>> * testsuite/gcc.target/i386/memcpy-strategy-3.c: Ditto. >>>> >>>> >>>> >>>> >>>> On Fri, Aug 2, 2013 at 9:21 PM, Xinliang David Li <davidxl@google.com> wrote: >>>> > On x86_64, when the expected size of memcpy/memset is known (e.g, with >>>> > FDO), libcall strategy is used with the size is > 8192. This value is >>>> > hard coded, which makes it hard to do performance tuning. This patch >>>> > adds two new parameters to do that. Potential usage includes >>>> > per-application libcall strategy min-size tuning based on summary data >>>> > with FDO (e.g, instruction workset size). >>>> > >>>> > Bootstrap and tested on x86_64/linux. Ok for trunk? >>>> > >>>> > thanks, >>>> > >>>> > David >>>> > >>>> > >>>> > 2013-08-02 Xinliang David Li <davidxl@google.com> >>>> > >>>> > * params.def: New parameters. 
>>>> > * config/i386/i386.c (ix86_option_override_internal): >>>> > Override default libcall size limit with parameters. >>> >>>> Index: config/i386/stringop.def >>>> =================================================================== >>>> --- config/i386/stringop.def (revision 0) >>>> +++ config/i386/stringop.def (revision 0) >>>> @@ -0,0 +1,42 @@ >>>> +/* Definitions for option handling for IA-32. >>>> + Copyright (C) 2013 Free Software Foundation, Inc. >>>> + >>>> +This file is part of GCC. >>>> + >>>> +GCC is free software; you can redistribute it and/or modify >>>> +it under the terms of the GNU General Public License as published by >>>> +the Free Software Foundation; either version 3, or (at your option) >>>> +any later version. >>>> + >>>> +GCC is distributed in the hope that it will be useful, >>>> +but WITHOUT ANY WARRANTY; without even the implied warranty of >>>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >>>> +GNU General Public License for more details. >>>> + >>>> +Under Section 7 of GPL version 3, you are granted additional >>>> +permissions described in the GCC Runtime Library Exception, version >>>> +3.1, as published by the Free Software Foundation. >>>> + >>>> +You should have received a copy of the GNU General Public License and >>>> +a copy of the GCC Runtime Library Exception along with this program; >>>> +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see >>>> +<http://www.gnu.org/licenses/>. 
*/ >>>> + >>>> +DEF_ENUM >>>> +DEF_ALG (no_stringop, no_stringop) >>>> +DEF_ENUM >>>> +DEF_ALG (libcall, libcall) >>>> +DEF_ENUM >>>> +DEF_ALG (rep_prefix_1_byte, rep_byte) >>>> +DEF_ENUM >>>> +DEF_ALG (rep_prefix_4_byte, rep_4byte) >>>> +DEF_ENUM >>>> +DEF_ALG (rep_prefix_8_byte, rep_8byte) >>>> +DEF_ENUM >>>> +DEF_ALG (loop_1_byte, byte_loop) >>>> +DEF_ENUM >>>> +DEF_ALG (loop, loop) >>>> +DEF_ENUM >>>> +DEF_ALG (unrolled_loop, unrolled_loop) >>>> +DEF_ENUM >>>> +DEF_ALG (vector_loop, vector_loop) >>>> Index: config/i386/i386.opt >>>> =================================================================== >>>> --- config/i386/i386.opt (revision 201458) >>>> +++ config/i386/i386.opt (working copy) >>>> @@ -316,6 +316,14 @@ mstack-arg-probe >>>> Target Report Mask(STACK_PROBE) Save >>>> Enable stack probing >>>> >>>> +mmemcpy-strategy= >>>> +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy) >>>> +Specify memcpy expansion strategy when expected size is known >>>> + >>>> +mmemset-strategy= >>>> +Target RejectNegative Joined Var(ix86_tune_memset_strategy) >>>> +Specify memset expansion strategy when expected size is known >>>> + >>>> mstringop-strategy= >>>> Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop) >>>> Chose strategy to generate stringop using >>>> Index: config/i386/stringop.opt >>>> =================================================================== >>>> --- config/i386/stringop.opt (revision 0) >>>> +++ config/i386/stringop.opt (revision 0) >>>> @@ -0,0 +1,36 @@ >>>> +/* Definitions for option handling for IA-32. >>>> + Copyright (C) 2013 Free Software Foundation, Inc. >>>> + >>>> +This file is part of GCC. >>>> + >>>> +GCC is free software; you can redistribute it and/or modify >>>> +it under the terms of the GNU General Public License as published by >>>> +the Free Software Foundation; either version 3, or (at your option) >>>> +any later version. 
>>>> + >>>> +GCC is distributed in the hope that it will be useful, >>>> +but WITHOUT ANY WARRANTY; without even the implied warranty of >>>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >>>> +GNU General Public License for more details. >>>> + >>>> +Under Section 7 of GPL version 3, you are granted additional >>>> +permissions described in the GCC Runtime Library Exception, version >>>> +3.1, as published by the Free Software Foundation. >>>> + >>>> +You should have received a copy of the GNU General Public License and >>>> +a copy of the GCC Runtime Library Exception along with this program; >>>> +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see >>>> +<http://www.gnu.org/licenses/>. */ >>>> + >>>> +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte) >>>> + >>>> +#undef DEF_ENUM >>>> +#define DEF_ENUM EnumValue >>>> + >>>> +#undef DEF_ALG >>>> +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg) >>>> + >>>> +#include "stringop.def" >>>> + >>>> +#undef DEF_ENUM >>>> +#undef DEF_ALG >>>> Index: config/i386/i386.c >>>> =================================================================== >>>> --- config/i386/i386.c (revision 201458) >>>> +++ config/i386/i386.c (working copy) >>>> @@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost = >>>> }; >>>> >>>> /* Processor costs (relative to an add) */ >>>> -static const >>>> +static >>>> struct processor_costs i386_cost = { /* 386 specific costs */ >>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >>>> @@ -226,7 +226,7 @@ struct processor_costs i386_cost = { /* >>>> 1, /* cond_not_taken_branch_cost. 
*/ >>>> }; >>>> >>>> -static const >>>> +static >>>> struct processor_costs i486_cost = { /* 486 specific costs */ >>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >>>> @@ -298,7 +298,7 @@ struct processor_costs i486_cost = { /* >>>> 1, /* cond_not_taken_branch_cost. */ >>>> }; >>>> >>>> -static const >>>> +static >>>> struct processor_costs pentium_cost = { >>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >>>> @@ -368,7 +368,7 @@ struct processor_costs pentium_cost = { >>>> 1, /* cond_not_taken_branch_cost. */ >>>> }; >>>> >>>> -static const >>>> +static >>>> struct processor_costs pentiumpro_cost = { >>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >>>> @@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost = >>>> 1, /* cond_not_taken_branch_cost. */ >>>> }; >>>> >>>> -static const >>>> +static >>>> struct processor_costs geode_cost = { >>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >>>> @@ -518,7 +518,7 @@ struct processor_costs geode_cost = { >>>> 1, /* cond_not_taken_branch_cost. */ >>>> }; >>>> >>>> -static const >>>> +static >>>> struct processor_costs k6_cost = { >>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>> COSTS_N_INSNS (2), /* cost of a lea instruction */ >>>> @@ -591,7 +591,7 @@ struct processor_costs k6_cost = { >>>> 1, /* cond_not_taken_branch_cost. */ >>>> }; >>>> >>>> -static const >>>> +static >>>> struct processor_costs athlon_cost = { >>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>> COSTS_N_INSNS (2), /* cost of a lea instruction */ >>>> @@ -664,7 +664,7 @@ struct processor_costs athlon_cost = { >>>> 1, /* cond_not_taken_branch_cost. 
*/ >>>> }; >>>> >>>> -static const >>>> +static >>>> struct processor_costs k8_cost = { >>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>> COSTS_N_INSNS (2), /* cost of a lea instruction */ >>>> @@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = { >>>> 1, /* cond_not_taken_branch_cost. */ >>>> }; >>>> >>>> -static const >>>> +static >>>> struct processor_costs pentium4_cost = { >>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>> COSTS_N_INSNS (3), /* cost of a lea instruction */ >>>> @@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = { >>>> 1, /* cond_not_taken_branch_cost. */ >>>> }; >>>> >>>> -static const >>>> +static >>>> struct processor_costs nocona_cost = { >>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >>>> @@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = { >>>> 1, /* cond_not_taken_branch_cost. */ >>>> }; >>>> >>>> -static const >>>> +static >>>> struct processor_costs atom_cost = { >>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>> COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ >>>> @@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = { >>>> }; >>>> >>>> /* Generic64 should produce code tuned for Nocona and K8. */ >>>> -static const >>>> +static >>>> struct processor_costs generic64_cost = { >>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>> /* On all chips taken into consideration lea is 2 cycles and more. With >>>> @@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost = >>>> }; >>>> >>>> /* core_cost should produce code tuned for Core familly of CPUs. */ >>>> -static const >>>> +static >>>> struct processor_costs core_cost = { >>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>> /* On all chips taken into consideration lea is 2 cycles and more. 
With >>>> @@ -1717,7 +1717,7 @@ struct processor_costs core_cost = { >>>> >>>> /* Generic32 should produce code tuned for PPro, Pentium4, Nocona, >>>> Athlon and K8. */ >>>> -static const >>>> +static >>>> struct processor_costs generic32_cost = { >>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>> COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ >>>> @@ -2900,6 +2900,150 @@ ix86_debug_options (void) >>>> >>>> return; >>>> } >>>> + >>>> +static const char *stringop_alg_names[] = { >>>> +#define DEF_ENUM >>>> +#define DEF_ALG(alg, name) #name, >>>> +#include "stringop.def" >>>> +#undef DEF_ENUM >>>> +#undef DEF_ALG >>>> +}; >>>> + >>>> +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=. >>>> + The string is of the following form (or comma separated list of it): >>>> + >>>> + strategy_alg:max_size:[align|noalign] >>>> + >>>> + where the full size range for the strategy is either [0, max_size] or >>>> + [min_size, max_size], in which min_size is the max_size + 1 of the >>>> + preceding range. The last size range must have max_size == -1. >>>> + >>>> + Examples: >>>> + >>>> + 1. >>>> + -mmemcpy-strategy=libcall:-1:noalign >>>> + >>>> + this is equivalent to (for known size memcpy) -mstringop-strategy=libcall >>>> + >>>> + >>>> + 2. >>>> + -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign >>>> + >>>> + This is to tell the compiler to use the following strategy for memset >>>> + 1) when the expected size is between [1, 16], use rep_8byte strategy; >>>> + 2) when the size is between [17, 2048], use vector_loop; >>>> + 3) when the size is > 2048, use libcall. 
>>>> + >>>> +*/ >>>> + >>>> +struct stringop_size_range >>>> +{ >>>> + int min; >>>> + int max; >>>> + stringop_alg alg; >>>> + bool noalign; >>>> +}; >>>> + >>>> +static void >>>> +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) >>>> +{ >>>> + const struct stringop_algs *default_algs; >>>> + stringop_size_range input_ranges[MAX_STRINGOP_ALGS]; >>>> + char *curr_range_str, *next_range_str; >>>> + int i = 0, n = 0; >>>> + >>>> + if (is_memset) >>>> + default_algs = &ix86_cost->memset[TARGET_64BIT != 0]; >>>> + else >>>> + default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; >>>> + >>>> + curr_range_str = strategy_str; >>>> + >>>> + do { >>>> + >>>> + int mins, maxs; >>>> + stringop_alg alg; >>>> + char alg_name[128]; >>>> + char align[16]; >>>> + >>>> + next_range_str = strchr (curr_range_str, ','); >>>> + if (next_range_str) >>>> + *next_range_str++ = '\0'; >>>> + >>>> + if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align)) >>>> + { >>>> + warning (0, "Wrong arg %s to option %s", curr_range_str, >>>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >>>> + return; >>>> + } >>>> + >>>> + if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1)) >>>> + { >>>> + warning (0, "Size ranges of option %s should be increasing", >>>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >>>> + return; >>>> + } >>>> + >>>> + for (i = 0; i < last_alg; i++) >>>> + { >>>> + if (!strcmp (alg_name, stringop_alg_names[i])) >>>> + { >>>> + alg = (stringop_alg) i; >>>> + break; >>>> + } >>>> + } >>>> + >>>> + if (i == last_alg) >>>> + { >>>> + warning (0, "Wrong stringop strategy name %s specified for option %s", >>>> + alg_name, >>>> + is_memset ? 
"-mmemset_strategy=" : "-mmemcpy_strategy="); >>>> + return; >>>> + } >>>> + >>>> + input_ranges[n].min = mins; >>>> + input_ranges[n].max = maxs; >>>> + input_ranges[n].alg = alg; >>>> + if (!strcmp (align, "align")) >>>> + input_ranges[n].noalign = false; >>>> + else if (!strcmp (align, "noalign")) >>>> + input_ranges[n].noalign = true; >>>> + else >>>> + { >>>> + warning (0, "Unknown alignment %s specified for option %s", >>>> + align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >>>> + return; >>>> + } >>>> + n++; >>>> + curr_range_str = next_range_str; >>>> + } while (curr_range_str); >>>> + >>>> + if (input_ranges[n - 1].max != -1) >>>> + { >>>> + warning (0, "The max value for the last size range should be -1" >>>> + " for option %s", >>>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >>>> + return; >>>> + } >>>> + >>>> + if (n > MAX_STRINGOP_ALGS) >>>> + { >>>> + warning (0, "Too many size ranges specified in option %s", >>>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >>>> + return; >>>> + } >>>> + >>>> + /* Now override the default algs array */ >>>> + for (i = 0; i < n; i++) >>>> + { >>>> + *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max; >>>> + *const_cast<stringop_alg *>(&default_algs->size[i].alg) >>>> + = input_ranges[i].alg; >>>> + *const_cast<int *>(&default_algs->size[i].noalign) >>>> + = input_ranges[i].noalign; >>>> + } >>>> +} >>>> + >>>> >>>> /* Override various settings based on options. If MAIN_ARGS_P, the >>>> options are from the command line, otherwise they are from >>>> @@ -4021,6 +4165,21 @@ ix86_option_override_internal (bool main >>>> /* Handle stack protector */ >>>> if (!global_options_set.x_ix86_stack_protector_guard) >>>> ix86_stack_protector_guard = TARGET_HAS_BIONIC ? 
SSP_GLOBAL : SSP_TLS; >>>> + >>>> + /* Handle -mmemcpy-strategy= and -mmemset-strategy= */ >>>> + if (ix86_tune_memcpy_strategy) >>>> + { >>>> + char *str = xstrdup (ix86_tune_memcpy_strategy); >>>> + ix86_parse_stringop_strategy_string (str, false); >>>> + free (str); >>>> + } >>>> + >>>> + if (ix86_tune_memset_strategy) >>>> + { >>>> + char *str = xstrdup (ix86_tune_memset_strategy); >>>> + ix86_parse_stringop_strategy_string (str, true); >>>> + free (str); >>>> + } >>>> } >>>> >>>> /* Implement the TARGET_OPTION_OVERRIDE hook. */ >>>> @@ -22903,6 +23062,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt >>>> { >>>> case libcall: >>>> case no_stringop: >>>> + case last_alg: >>>> gcc_unreachable (); >>>> case loop_1_byte: >>>> need_zero_guard = true; >>>> @@ -23093,6 +23253,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt >>>> { >>>> case libcall: >>>> case no_stringop: >>>> + case last_alg: >>>> gcc_unreachable (); >>>> case loop_1_byte: >>>> case loop: >>>> @@ -23304,6 +23465,7 @@ ix86_expand_setmem (rtx dst, rtx count_e >>>> { >>>> case libcall: >>>> case no_stringop: >>>> + case last_alg: >>>> gcc_unreachable (); >>>> case loop: >>>> need_zero_guard = true; >>>> @@ -23481,6 +23643,7 @@ ix86_expand_setmem (rtx dst, rtx count_e >>>> { >>>> case libcall: >>>> case no_stringop: >>>> + case last_alg: >>>> gcc_unreachable (); >>>> case loop_1_byte: >>>> case loop: >>>> Index: config/i386/i386-opts.h >>>> =================================================================== >>>> --- config/i386/i386-opts.h (revision 201458) >>>> +++ config/i386/i386-opts.h (working copy) >>>> @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI >>>> /* Algorithm to expand string function with. 
*/ >>>> enum stringop_alg >>>> { >>>> - no_stringop, >>>> - libcall, >>>> - rep_prefix_1_byte, >>>> - rep_prefix_4_byte, >>>> - rep_prefix_8_byte, >>>> - loop_1_byte, >>>> - loop, >>>> - unrolled_loop, >>>> - vector_loop >>>> +#undef DEF_ENUM >>>> +#define DEF_ENUM >>>> + >>>> +#undef DEF_ALG >>>> +#define DEF_ALG(alg, name) alg, >>>> + >>>> +#include "stringop.def" >>>> +last_alg >>>> + >>>> +#undef DEF_ENUM >>>> +#undef DEF_ALG >>>> }; >>>> >>>> /* Available call abi. */ >>>> Index: doc/invoke.texi >>>> =================================================================== >>>> --- doc/invoke.texi (revision 201458) >>>> +++ doc/invoke.texi (working copy) >>>> @@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}. >>>> -mbmi2 -mrtm -mlwp -mthreads @gol >>>> -mno-align-stringops -minline-all-stringops @gol >>>> -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol >>>> +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy} >>>> -mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol >>>> -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol >>>> -mregparm=@var{num} -msseregparm @gol >>>> @@ -14598,6 +14599,24 @@ Expand into an inline loop. >>>> Always use a library call. >>>> @end table >>>> >>>> +@item -mmemcpy-strategy=@var{strategy} >>>> +@opindex mmemcpy-strategy=@var{strategy} >>>> +Override the internal decision heuristic to decide if @code{__builtin_memcpy} >>>> +should be inlined and what inline algorithm to use when the expected size >>>> +of the copy operation is known. @var{strategy} >>>> +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets. >>>> +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies >>>> +the max byte size with which inline algorithm @var{alg} is allowed. For the last >>>> +triplet, the @var{max_size} must be @code{-1}. The @var{max_size} of the triplets >>>> +in the list must be specified in increasing order. 
The minimal byte size for >>>> +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the >>>> +preceding range. >>>> + >>>> +@item -mmemset-strategy=@var{strategy} >>>> +@opindex mmemset-strategy=@var{strategy} >>>> +The option is similar to @option{-mmemcpy-strategy=} except that it is to control >>>> +@code{__builtin_memset} expansion. >>>> + >>>> @item -momit-leaf-frame-pointer >>>> @opindex momit-leaf-frame-pointer >>>> Don't keep the frame pointer in a register for leaf functions. This >>>> Index: testsuite/gcc.target/i386/memcpy-strategy-1.c >>>> =================================================================== >>>> --- testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) >>>> +++ testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) >>>> @@ -0,0 +1,12 @@ >>>> +/* { dg-do compile } */ >>>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */ >>>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */ >>>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ >>>> + >>>> +char a[2048]; >>>> +char b[2048]; >>>> +void t (void) >>>> +{ >>>> + __builtin_memcpy (a, b, 2048); >>>> +} >>>> + >>>> Index: testsuite/gcc.target/i386/memcpy-strategy-2.c >>>> =================================================================== >>>> --- testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) >>>> +++ testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) >>>> @@ -0,0 +1,12 @@ >>>> +/* { dg-do compile } */ >>>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */ >>>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! 
{ ia32 } } } } } */ >>>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ >>>> + >>>> +char a[2048]; >>>> +char b[2048]; >>>> +void t (void) >>>> +{ >>>> + __builtin_memcpy (a, b, 2048); >>>> +} >>>> + >>>> Index: testsuite/gcc.target/i386/memset-strategy-1.c >>>> =================================================================== >>>> --- testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) >>>> +++ testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) >>>> @@ -0,0 +1,10 @@ >>>> +/* { dg-do compile } */ >>>> +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */ >>>> +/* { dg-final { scan-assembler-times "memset" 2 } } */ >>>> + >>>> +char a[2048]; >>>> +void t (void) >>>> +{ >>>> + __builtin_memset (a, 1, 2048); >>>> +} >>>> + >>>> Index: testsuite/gcc.target/i386/memcpy-strategy-3.c >>>> =================================================================== >>>> --- testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) >>>> +++ testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) >>>> @@ -0,0 +1,11 @@ >>>> +/* { dg-do compile } */ >>>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */ >>>> +/* { dg-final { scan-assembler-times "memcpy" 2 } } */ >>>> + >>>> +char a[2048]; >>>> +char b[2048]; >>>> +void t (void) >>>> +{ >>>> + __builtin_memcpy (a, b, 2048); >>>> +} >>>> + >>> > > > > -- > --- > Best regards, > Michael V. Zolotukhin, > Software Engineer > Intel Corporation. ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: New parameters to control stringop expansion libcall strategy 2013-08-06 16:42 ` Xinliang David Li @ 2013-08-06 16:45 ` Xinliang David Li 2013-08-07 7:14 ` Michael Zolotukhin 1 sibling, 0 replies; 23+ messages in thread From: Xinliang David Li @ 2013-08-06 16:45 UTC (permalink / raw) To: Michael Zolotukhin; +Cc: GCC Patches, Jan Hubicka, Teresa Johnson [-- Attachment #1: Type: text/plain, Size: 27083 bytes --] Forgot the patch. David On Tue, Aug 6, 2013 at 9:42 AM, Xinliang David Li <davidxl@google.com> wrote: > Corrected two small problems reported by the style checker (The > warnings about the EnumValue for options in stringop.opt are not > valid). > > On Tue, Aug 6, 2013 at 1:46 AM, Michael Zolotukhin > <michael.v.zolotukhin@gmail.com> wrote: >> There are still some formatting issues (like 8 spaces instead of a >> tab, wrong indentation of do-loop and some other places) - to reveal >> some of them you could use contrib/check_GNU_style.sh script. >> But that was a nitpicking again:) Actually I wanted to ask whether >> you're going to use this option for some performance experiments >> involving memmov/memset - if so, probably you could tune existing >> cost-models as well? Is it possible? > > the option is designed for purposes like this. > > thanks, > > David > >> >> Michael >> >> On 5 August 2013 20:44, Xinliang David Li <davidxl@google.com> wrote: >>> thanks. Updated patch attached. >>> >>> David >>> >>> On Mon, Aug 5, 2013 at 3:57 AM, Michael V. Zolotukhin >>> <michael.v.zolotukhin@gmail.com> wrote: >>>> Hi, >>>> This is a really convenient option, thanks for working on it. >>>> I can't approve it as I'm not a maintainer, but it looks ok to me, >>>> except for a small nitpicking: afair, comments should end with >>>> dot-space-space. 
>>>> >>>> Michael >>>> >>>> On 04 Aug 20:01, Xinliang David Li wrote: >>>>> The attached is a new patch implementing the stringop inline strategy >>>>> control using two new -m options: >>>>> >>>>> -mmemcpy-strategy= >>>>> -mmemset-strategy= >>>>> >>>>> See changes in doc/invoke.texi for description of the new options. Example: >>>>> -mmemcpy-strategy=rep_8byte:64:noalign,unrolled_loop:2048:noalign,libcall:-1:noalign >>>>> >>>>> tells compiler to inline memcpy using rep_8byte when the size is no >>>>> larger than 64 bytes, using unrolled_loop when size is no larger than >>>>> 2048, and for size > 2048, using library call. In all cases, >>>>> destination alignment adjustment is not done. >>>>> >>>>> Tested on x86-64/linux. Ok for trunk? >>>>> >>>>> thanks, >>>>> >>>>> David >>>>> >>>>> 2013-08-02 Xinliang David Li <davidxl@google.com> >>>>> >>>>> * config/i386/stringop.def: New file. >>>>> * config/i386/stringop.opt: New file. >>>>> * config/i386/i386-opts.h: Include stringop.def. >>>>> * config/i386/i386.opt: Include stringop.opt. >>>>> * config/i386/i386.c (ix86_option_override_internal): >>>>> Override default size based stringop inline strategies >>>>> with options. >>>>> * config/i386/i386.c (ix86_parse_stringop_strategy_string): >>>>> New function. >>>>> >>>>> 2013-08-04 Xinliang David Li <davidxl@google.com> >>>>> >>>>> * testsuite/gcc.target/i386/memcpy-strategy-1.c: New test. >>>>> * testsuite/gcc.target/i386/memcpy-strategy-2.c: Ditto. >>>>> * testsuite/gcc.target/i386/memset-strategy-1.c: Ditto. >>>>> * testsuite/gcc.target/i386/memcpy-strategy-3.c: Ditto. >>>>> >>>>> >>>>> >>>>> >>>>> On Fri, Aug 2, 2013 at 9:21 PM, Xinliang David Li <davidxl@google.com> wrote: >>>>> > On x86_64, when the expected size of memcpy/memset is known (e.g., with >>>>> > FDO), libcall strategy is used when the size is > 8192. This value is >>>>> > hard coded, which makes it hard to do performance tuning. This patch >>>>> > adds two new parameters to do that. 
Potential usage includes >>>>> > per-application libcall strategy min-size tuning based on summary data >>>>> > with FDO (e.g, instruction workset size). >>>>> > >>>>> > Bootstrap and tested on x86_64/linux. Ok for trunk? >>>>> > >>>>> > thanks, >>>>> > >>>>> > David >>>>> > >>>>> > >>>>> > 2013-08-02 Xinliang David Li <davidxl@google.com> >>>>> > >>>>> > * params.def: New parameters. >>>>> > * config/i386/i386.c (ix86_option_override_internal): >>>>> > Override default libcall size limit with parameters. >>>> >>>>> Index: config/i386/stringop.def >>>>> =================================================================== >>>>> --- config/i386/stringop.def (revision 0) >>>>> +++ config/i386/stringop.def (revision 0) >>>>> @@ -0,0 +1,42 @@ >>>>> +/* Definitions for option handling for IA-32. >>>>> + Copyright (C) 2013 Free Software Foundation, Inc. >>>>> + >>>>> +This file is part of GCC. >>>>> + >>>>> +GCC is free software; you can redistribute it and/or modify >>>>> +it under the terms of the GNU General Public License as published by >>>>> +the Free Software Foundation; either version 3, or (at your option) >>>>> +any later version. >>>>> + >>>>> +GCC is distributed in the hope that it will be useful, >>>>> +but WITHOUT ANY WARRANTY; without even the implied warranty of >>>>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >>>>> +GNU General Public License for more details. >>>>> + >>>>> +Under Section 7 of GPL version 3, you are granted additional >>>>> +permissions described in the GCC Runtime Library Exception, version >>>>> +3.1, as published by the Free Software Foundation. >>>>> + >>>>> +You should have received a copy of the GNU General Public License and >>>>> +a copy of the GCC Runtime Library Exception along with this program; >>>>> +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see >>>>> +<http://www.gnu.org/licenses/>. 
*/ >>>>> + >>>>> +DEF_ENUM >>>>> +DEF_ALG (no_stringop, no_stringop) >>>>> +DEF_ENUM >>>>> +DEF_ALG (libcall, libcall) >>>>> +DEF_ENUM >>>>> +DEF_ALG (rep_prefix_1_byte, rep_byte) >>>>> +DEF_ENUM >>>>> +DEF_ALG (rep_prefix_4_byte, rep_4byte) >>>>> +DEF_ENUM >>>>> +DEF_ALG (rep_prefix_8_byte, rep_8byte) >>>>> +DEF_ENUM >>>>> +DEF_ALG (loop_1_byte, byte_loop) >>>>> +DEF_ENUM >>>>> +DEF_ALG (loop, loop) >>>>> +DEF_ENUM >>>>> +DEF_ALG (unrolled_loop, unrolled_loop) >>>>> +DEF_ENUM >>>>> +DEF_ALG (vector_loop, vector_loop) >>>>> Index: config/i386/i386.opt >>>>> =================================================================== >>>>> --- config/i386/i386.opt (revision 201458) >>>>> +++ config/i386/i386.opt (working copy) >>>>> @@ -316,6 +316,14 @@ mstack-arg-probe >>>>> Target Report Mask(STACK_PROBE) Save >>>>> Enable stack probing >>>>> >>>>> +mmemcpy-strategy= >>>>> +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy) >>>>> +Specify memcpy expansion strategy when expected size is known >>>>> + >>>>> +mmemset-strategy= >>>>> +Target RejectNegative Joined Var(ix86_tune_memset_strategy) >>>>> +Specify memset expansion strategy when expected size is known >>>>> + >>>>> mstringop-strategy= >>>>> Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop) >>>>> Chose strategy to generate stringop using >>>>> Index: config/i386/stringop.opt >>>>> =================================================================== >>>>> --- config/i386/stringop.opt (revision 0) >>>>> +++ config/i386/stringop.opt (revision 0) >>>>> @@ -0,0 +1,36 @@ >>>>> +/* Definitions for option handling for IA-32. >>>>> + Copyright (C) 2013 Free Software Foundation, Inc. >>>>> + >>>>> +This file is part of GCC. >>>>> + >>>>> +GCC is free software; you can redistribute it and/or modify >>>>> +it under the terms of the GNU General Public License as published by >>>>> +the Free Software Foundation; either version 3, or (at your option) >>>>> +any later version. 
>>>>> + >>>>> +GCC is distributed in the hope that it will be useful, >>>>> +but WITHOUT ANY WARRANTY; without even the implied warranty of >>>>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >>>>> +GNU General Public License for more details. >>>>> + >>>>> +Under Section 7 of GPL version 3, you are granted additional >>>>> +permissions described in the GCC Runtime Library Exception, version >>>>> +3.1, as published by the Free Software Foundation. >>>>> + >>>>> +You should have received a copy of the GNU General Public License and >>>>> +a copy of the GCC Runtime Library Exception along with this program; >>>>> +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see >>>>> +<http://www.gnu.org/licenses/>. */ >>>>> + >>>>> +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte) >>>>> + >>>>> +#undef DEF_ENUM >>>>> +#define DEF_ENUM EnumValue >>>>> + >>>>> +#undef DEF_ALG >>>>> +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg) >>>>> + >>>>> +#include "stringop.def" >>>>> + >>>>> +#undef DEF_ENUM >>>>> +#undef DEF_ALG >>>>> Index: config/i386/i386.c >>>>> =================================================================== >>>>> --- config/i386/i386.c (revision 201458) >>>>> +++ config/i386/i386.c (working copy) >>>>> @@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost = >>>>> }; >>>>> >>>>> /* Processor costs (relative to an add) */ >>>>> -static const >>>>> +static >>>>> struct processor_costs i386_cost = { /* 386 specific costs */ >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >>>>> @@ -226,7 +226,7 @@ struct processor_costs i386_cost = { /* >>>>> 1, /* cond_not_taken_branch_cost. 
*/ >>>>> }; >>>>> >>>>> -static const >>>>> +static >>>>> struct processor_costs i486_cost = { /* 486 specific costs */ >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >>>>> @@ -298,7 +298,7 @@ struct processor_costs i486_cost = { /* >>>>> 1, /* cond_not_taken_branch_cost. */ >>>>> }; >>>>> >>>>> -static const >>>>> +static >>>>> struct processor_costs pentium_cost = { >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >>>>> @@ -368,7 +368,7 @@ struct processor_costs pentium_cost = { >>>>> 1, /* cond_not_taken_branch_cost. */ >>>>> }; >>>>> >>>>> -static const >>>>> +static >>>>> struct processor_costs pentiumpro_cost = { >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >>>>> @@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost = >>>>> 1, /* cond_not_taken_branch_cost. */ >>>>> }; >>>>> >>>>> -static const >>>>> +static >>>>> struct processor_costs geode_cost = { >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >>>>> @@ -518,7 +518,7 @@ struct processor_costs geode_cost = { >>>>> 1, /* cond_not_taken_branch_cost. */ >>>>> }; >>>>> >>>>> -static const >>>>> +static >>>>> struct processor_costs k6_cost = { >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> COSTS_N_INSNS (2), /* cost of a lea instruction */ >>>>> @@ -591,7 +591,7 @@ struct processor_costs k6_cost = { >>>>> 1, /* cond_not_taken_branch_cost. */ >>>>> }; >>>>> >>>>> -static const >>>>> +static >>>>> struct processor_costs athlon_cost = { >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> COSTS_N_INSNS (2), /* cost of a lea instruction */ >>>>> @@ -664,7 +664,7 @@ struct processor_costs athlon_cost = { >>>>> 1, /* cond_not_taken_branch_cost. 
*/ >>>>> }; >>>>> >>>>> -static const >>>>> +static >>>>> struct processor_costs k8_cost = { >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> COSTS_N_INSNS (2), /* cost of a lea instruction */ >>>>> @@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = { >>>>> 1, /* cond_not_taken_branch_cost. */ >>>>> }; >>>>> >>>>> -static const >>>>> +static >>>>> struct processor_costs pentium4_cost = { >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> COSTS_N_INSNS (3), /* cost of a lea instruction */ >>>>> @@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = { >>>>> 1, /* cond_not_taken_branch_cost. */ >>>>> }; >>>>> >>>>> -static const >>>>> +static >>>>> struct processor_costs nocona_cost = { >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >>>>> @@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = { >>>>> 1, /* cond_not_taken_branch_cost. */ >>>>> }; >>>>> >>>>> -static const >>>>> +static >>>>> struct processor_costs atom_cost = { >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ >>>>> @@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = { >>>>> }; >>>>> >>>>> /* Generic64 should produce code tuned for Nocona and K8. */ >>>>> -static const >>>>> +static >>>>> struct processor_costs generic64_cost = { >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> /* On all chips taken into consideration lea is 2 cycles and more. With >>>>> @@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost = >>>>> }; >>>>> >>>>> /* core_cost should produce code tuned for Core familly of CPUs. */ >>>>> -static const >>>>> +static >>>>> struct processor_costs core_cost = { >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> /* On all chips taken into consideration lea is 2 cycles and more. 
With >>>>> @@ -1717,7 +1717,7 @@ struct processor_costs core_cost = { >>>>> >>>>> /* Generic32 should produce code tuned for PPro, Pentium4, Nocona, >>>>> Athlon and K8. */ >>>>> -static const >>>>> +static >>>>> struct processor_costs generic32_cost = { >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ >>>>> @@ -2900,6 +2900,150 @@ ix86_debug_options (void) >>>>> >>>>> return; >>>>> } >>>>> + >>>>> +static const char *stringop_alg_names[] = { >>>>> +#define DEF_ENUM >>>>> +#define DEF_ALG(alg, name) #name, >>>>> +#include "stringop.def" >>>>> +#undef DEF_ENUM >>>>> +#undef DEF_ALG >>>>> +}; >>>>> + >>>>> +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=. >>>>> + The string is of the following form (or comma separated list of it): >>>>> + >>>>> + strategy_alg:max_size:[align|noalign] >>>>> + >>>>> + where the full size range for the strategy is either [0, max_size] or >>>>> + [min_size, max_size], in which min_size is the max_size + 1 of the >>>>> + preceding range. The last size range must have max_size == -1. >>>>> + >>>>> + Examples: >>>>> + >>>>> + 1. >>>>> + -mmemcpy-strategy=libcall:-1:noalign >>>>> + >>>>> + this is equivalent to (for known size memcpy) -mstringop-strategy=libcall >>>>> + >>>>> + >>>>> + 2. >>>>> + -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign >>>>> + >>>>> + This is to tell the compiler to use the following strategy for memset >>>>> + 1) when the expected size is between [1, 16], use rep_8byte strategy; >>>>> + 2) when the size is between [17, 2048], use vector_loop; >>>>> + 3) when the size is > 2048, use libcall. 
>>>>> + >>>>> +*/ >>>>> + >>>>> +struct stringop_size_range >>>>> +{ >>>>> + int min; >>>>> + int max; >>>>> + stringop_alg alg; >>>>> + bool noalign; >>>>> +}; >>>>> + >>>>> +static void >>>>> +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) >>>>> +{ >>>>> + const struct stringop_algs *default_algs; >>>>> + stringop_size_range input_ranges[MAX_STRINGOP_ALGS]; >>>>> + char *curr_range_str, *next_range_str; >>>>> + int i = 0, n = 0; >>>>> + >>>>> + if (is_memset) >>>>> + default_algs = &ix86_cost->memset[TARGET_64BIT != 0]; >>>>> + else >>>>> + default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; >>>>> + >>>>> + curr_range_str = strategy_str; >>>>> + >>>>> + do { >>>>> + >>>>> + int mins, maxs; >>>>> + stringop_alg alg; >>>>> + char alg_name[128]; >>>>> + char align[16]; >>>>> + >>>>> + next_range_str = strchr (curr_range_str, ','); >>>>> + if (next_range_str) >>>>> + *next_range_str++ = '\0'; >>>>> + >>>>> + if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align)) >>>>> + { >>>>> + warning (0, "Wrong arg %s to option %s", curr_range_str, >>>>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >>>>> + return; >>>>> + } >>>>> + >>>>> + if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1)) >>>>> + { >>>>> + warning (0, "Size ranges of option %s should be increasing", >>>>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >>>>> + return; >>>>> + } >>>>> + >>>>> + for (i = 0; i < last_alg; i++) >>>>> + { >>>>> + if (!strcmp (alg_name, stringop_alg_names[i])) >>>>> + { >>>>> + alg = (stringop_alg) i; >>>>> + break; >>>>> + } >>>>> + } >>>>> + >>>>> + if (i == last_alg) >>>>> + { >>>>> + warning (0, "Wrong stringop strategy name %s specified for option %s", >>>>> + alg_name, >>>>> + is_memset ? 
"-mmemset_strategy=" : "-mmemcpy_strategy="); >>>>> + return; >>>>> + } >>>>> + >>>>> + input_ranges[n].min = mins; >>>>> + input_ranges[n].max = maxs; >>>>> + input_ranges[n].alg = alg; >>>>> + if (!strcmp (align, "align")) >>>>> + input_ranges[n].noalign = false; >>>>> + else if (!strcmp (align, "noalign")) >>>>> + input_ranges[n].noalign = true; >>>>> + else >>>>> + { >>>>> + warning (0, "Unknown alignment %s specified for option %s", >>>>> + align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >>>>> + return; >>>>> + } >>>>> + n++; >>>>> + curr_range_str = next_range_str; >>>>> + } while (curr_range_str); >>>>> + >>>>> + if (input_ranges[n - 1].max != -1) >>>>> + { >>>>> + warning (0, "The max value for the last size range should be -1" >>>>> + " for option %s", >>>>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >>>>> + return; >>>>> + } >>>>> + >>>>> + if (n > MAX_STRINGOP_ALGS) >>>>> + { >>>>> + warning (0, "Too many size ranges specified in option %s", >>>>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >>>>> + return; >>>>> + } >>>>> + >>>>> + /* Now override the default algs array */ >>>>> + for (i = 0; i < n; i++) >>>>> + { >>>>> + *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max; >>>>> + *const_cast<stringop_alg *>(&default_algs->size[i].alg) >>>>> + = input_ranges[i].alg; >>>>> + *const_cast<int *>(&default_algs->size[i].noalign) >>>>> + = input_ranges[i].noalign; >>>>> + } >>>>> +} >>>>> + >>>>> >>>>> /* Override various settings based on options. If MAIN_ARGS_P, the >>>>> options are from the command line, otherwise they are from >>>>> @@ -4021,6 +4165,21 @@ ix86_option_override_internal (bool main >>>>> /* Handle stack protector */ >>>>> if (!global_options_set.x_ix86_stack_protector_guard) >>>>> ix86_stack_protector_guard = TARGET_HAS_BIONIC ? 
SSP_GLOBAL : SSP_TLS; >>>>> + >>>>> + /* Handle -mmemcpy-strategy= and -mmemset-strategy= */ >>>>> + if (ix86_tune_memcpy_strategy) >>>>> + { >>>>> + char *str = xstrdup (ix86_tune_memcpy_strategy); >>>>> + ix86_parse_stringop_strategy_string (str, false); >>>>> + free (str); >>>>> + } >>>>> + >>>>> + if (ix86_tune_memset_strategy) >>>>> + { >>>>> + char *str = xstrdup (ix86_tune_memset_strategy); >>>>> + ix86_parse_stringop_strategy_string (str, true); >>>>> + free (str); >>>>> + } >>>>> } >>>>> >>>>> /* Implement the TARGET_OPTION_OVERRIDE hook. */ >>>>> @@ -22903,6 +23062,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt >>>>> { >>>>> case libcall: >>>>> case no_stringop: >>>>> + case last_alg: >>>>> gcc_unreachable (); >>>>> case loop_1_byte: >>>>> need_zero_guard = true; >>>>> @@ -23093,6 +23253,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt >>>>> { >>>>> case libcall: >>>>> case no_stringop: >>>>> + case last_alg: >>>>> gcc_unreachable (); >>>>> case loop_1_byte: >>>>> case loop: >>>>> @@ -23304,6 +23465,7 @@ ix86_expand_setmem (rtx dst, rtx count_e >>>>> { >>>>> case libcall: >>>>> case no_stringop: >>>>> + case last_alg: >>>>> gcc_unreachable (); >>>>> case loop: >>>>> need_zero_guard = true; >>>>> @@ -23481,6 +23643,7 @@ ix86_expand_setmem (rtx dst, rtx count_e >>>>> { >>>>> case libcall: >>>>> case no_stringop: >>>>> + case last_alg: >>>>> gcc_unreachable (); >>>>> case loop_1_byte: >>>>> case loop: >>>>> Index: config/i386/i386-opts.h >>>>> =================================================================== >>>>> --- config/i386/i386-opts.h (revision 201458) >>>>> +++ config/i386/i386-opts.h (working copy) >>>>> @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI >>>>> /* Algorithm to expand string function with. 
*/ >>>>> enum stringop_alg >>>>> { >>>>> - no_stringop, >>>>> - libcall, >>>>> - rep_prefix_1_byte, >>>>> - rep_prefix_4_byte, >>>>> - rep_prefix_8_byte, >>>>> - loop_1_byte, >>>>> - loop, >>>>> - unrolled_loop, >>>>> - vector_loop >>>>> +#undef DEF_ENUM >>>>> +#define DEF_ENUM >>>>> + >>>>> +#undef DEF_ALG >>>>> +#define DEF_ALG(alg, name) alg, >>>>> + >>>>> +#include "stringop.def" >>>>> +last_alg >>>>> + >>>>> +#undef DEF_ENUM >>>>> +#undef DEF_ALG >>>>> }; >>>>> >>>>> /* Available call abi. */ >>>>> Index: doc/invoke.texi >>>>> =================================================================== >>>>> --- doc/invoke.texi (revision 201458) >>>>> +++ doc/invoke.texi (working copy) >>>>> @@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}. >>>>> -mbmi2 -mrtm -mlwp -mthreads @gol >>>>> -mno-align-stringops -minline-all-stringops @gol >>>>> -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol >>>>> +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy} >>>>> -mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol >>>>> -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol >>>>> -mregparm=@var{num} -msseregparm @gol >>>>> @@ -14598,6 +14599,24 @@ Expand into an inline loop. >>>>> Always use a library call. >>>>> @end table >>>>> >>>>> +@item -mmemcpy-strategy=@var{strategy} >>>>> +@opindex mmemcpy-strategy=@var{strategy} >>>>> +Override the internal decision heuristic to decide if @code{__builtin_memcpy} >>>>> +should be inlined and what inline algorithm to use when the expected size >>>>> +of the copy operation is known. @var{strategy} >>>>> +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets. >>>>> +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies >>>>> +the max byte size with which inline algorithm @var{alg} is allowed. For the last >>>>> +triplet, the @var{max_size} must be @code{-1}. 
The @var{max_size} of the triplets >>>>> +in the list must be specified in increasing order. The minimal byte size for >>>>> +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the >>>>> +preceding range. >>>>> + >>>>> +@item -mmemset-strategy=@var{strategy} >>>>> +@opindex mmemset-strategy=@var{strategy} >>>>> +The option is similar to @option{-mmemcpy-strategy=} except that it is to control >>>>> +@code{__builtin_memset} expansion. >>>>> + >>>>> @item -momit-leaf-frame-pointer >>>>> @opindex momit-leaf-frame-pointer >>>>> Don't keep the frame pointer in a register for leaf functions. This >>>>> Index: testsuite/gcc.target/i386/memcpy-strategy-1.c >>>>> =================================================================== >>>>> --- testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) >>>>> +++ testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) >>>>> @@ -0,0 +1,12 @@ >>>>> +/* { dg-do compile } */ >>>>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */ >>>>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */ >>>>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ >>>>> + >>>>> +char a[2048]; >>>>> +char b[2048]; >>>>> +void t (void) >>>>> +{ >>>>> + __builtin_memcpy (a, b, 2048); >>>>> +} >>>>> + >>>>> Index: testsuite/gcc.target/i386/memcpy-strategy-2.c >>>>> =================================================================== >>>>> --- testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) >>>>> +++ testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) >>>>> @@ -0,0 +1,12 @@ >>>>> +/* { dg-do compile } */ >>>>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */ >>>>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! 
{ ia32 } } } } } */ >>>>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ >>>>> + >>>>> +char a[2048]; >>>>> +char b[2048]; >>>>> +void t (void) >>>>> +{ >>>>> + __builtin_memcpy (a, b, 2048); >>>>> +} >>>>> + >>>>> Index: testsuite/gcc.target/i386/memset-strategy-1.c >>>>> =================================================================== >>>>> --- testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) >>>>> +++ testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) >>>>> @@ -0,0 +1,10 @@ >>>>> +/* { dg-do compile } */ >>>>> +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */ >>>>> +/* { dg-final { scan-assembler-times "memset" 2 } } */ >>>>> + >>>>> +char a[2048]; >>>>> +void t (void) >>>>> +{ >>>>> + __builtin_memset (a, 1, 2048); >>>>> +} >>>>> + >>>>> Index: testsuite/gcc.target/i386/memcpy-strategy-3.c >>>>> =================================================================== >>>>> --- testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) >>>>> +++ testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) >>>>> @@ -0,0 +1,11 @@ >>>>> +/* { dg-do compile } */ >>>>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */ >>>>> +/* { dg-final { scan-assembler-times "memcpy" 2 } } */ >>>>> + >>>>> +char a[2048]; >>>>> +char b[2048]; >>>>> +void t (void) >>>>> +{ >>>>> + __builtin_memcpy (a, b, 2048); >>>>> +} >>>>> + >>>> >> >> >> >> -- >> --- >> Best regards, >> Michael V. Zolotukhin, >> Software Engineer >> Intel Corporation. [-- Attachment #2: stringop_inl_option.p.txt --] [-- Type: text/plain, Size: 18836 bytes --] Index: doc/invoke.texi =================================================================== --- doc/invoke.texi (revision 201458) +++ doc/invoke.texi (working copy) @@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}. 
-mbmi2 -mrtm -mlwp -mthreads @gol -mno-align-stringops -minline-all-stringops @gol -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy} -mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol -mregparm=@var{num} -msseregparm @gol @@ -14598,6 +14599,24 @@ Expand into an inline loop. Always use a library call. @end table +@item -mmemcpy-strategy=@var{strategy} +@opindex mmemcpy-strategy=@var{strategy} +Override the internal decision heuristic to decide if @code{__builtin_memcpy} +should be inlined and what inline algorithm to use when the expected size +of the copy operation is known. @var{strategy} +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets. +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies +the max byte size with which inline algorithm @var{alg} is allowed. For the last +triplet, the @var{max_size} must be @code{-1}. The @var{max_size} of the triplets +in the list must be specified in increasing order. The minimal byte size for +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the +preceding range. + +@item -mmemset-strategy=@var{strategy} +@opindex mmemset-strategy=@var{strategy} +The option is similar to @option{-mmemcpy-strategy=} except that it is to control +@code{__builtin_memset} expansion. + @item -momit-leaf-frame-pointer @opindex momit-leaf-frame-pointer Don't keep the frame pointer in a register for leaf functions. 
This Index: testsuite/gcc.target/i386/memcpy-strategy-2.c =================================================================== --- testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) +++ testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */ +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */ +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} + Index: testsuite/gcc.target/i386/memset-strategy-1.c =================================================================== --- testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) +++ testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */ +/* { dg-final { scan-assembler-times "memset" 2 } } */ + +char a[2048]; +void t (void) +{ + __builtin_memset (a, 1, 2048); +} + Index: testsuite/gcc.target/i386/memcpy-strategy-3.c =================================================================== --- testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) +++ testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */ +/* { dg-final { scan-assembler-times "memcpy" 2 } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} + Index: testsuite/gcc.target/i386/memcpy-strategy-1.c =================================================================== --- testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) +++ testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=atom 
-mmemcpy-strategy=vector_loop:-1:align" } */ +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */ +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ + +char a[2048]; +char b[2048]; +void t (void) +{ + __builtin_memcpy (a, b, 2048); +} + Index: config/i386/stringop.opt =================================================================== --- config/i386/stringop.opt (revision 0) +++ config/i386/stringop.opt (revision 0) @@ -0,0 +1,36 @@ +/* Definitions for option handling for IA-32. + Copyright (C) 2013 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. + +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. 
*/ + +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte) + +#undef DEF_ENUM +#define DEF_ENUM EnumValue + +#undef DEF_ALG +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg) + +#include "stringop.def" + +#undef DEF_ENUM +#undef DEF_ALG Index: config/i386/i386-opts.h =================================================================== --- config/i386/i386-opts.h (revision 201458) +++ config/i386/i386-opts.h (working copy) @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI /* Algorithm to expand string function with. */ enum stringop_alg { - no_stringop, - libcall, - rep_prefix_1_byte, - rep_prefix_4_byte, - rep_prefix_8_byte, - loop_1_byte, - loop, - unrolled_loop, - vector_loop +#undef DEF_ENUM +#define DEF_ENUM + +#undef DEF_ALG +#define DEF_ALG(alg, name) alg, + +#include "stringop.def" +last_alg + +#undef DEF_ENUM +#undef DEF_ALG }; /* Available call abi. */ Index: config/i386/i386.c =================================================================== --- config/i386/i386.c (revision 201458) +++ config/i386/i386.c (working copy) @@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost = }; /* Processor costs (relative to an add) */ -static const +static struct processor_costs i386_cost = { /* 386 specific costs */ COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -226,7 +226,7 @@ struct processor_costs i386_cost = { /* 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs i486_cost = { /* 486 specific costs */ COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -298,7 +298,7 @@ struct processor_costs i486_cost = { /* 1, /* cond_not_taken_branch_cost. 
*/ }; -static const +static struct processor_costs pentium_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -368,7 +368,7 @@ struct processor_costs pentium_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs pentiumpro_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost = 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs geode_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -518,7 +518,7 @@ struct processor_costs geode_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs k6_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -591,7 +591,7 @@ struct processor_costs k6_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs athlon_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -664,7 +664,7 @@ struct processor_costs athlon_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs k8_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (2), /* cost of a lea instruction */ @@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs pentium4_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (3), /* cost of a lea instruction */ @@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = { 1, /* cond_not_taken_branch_cost. 
*/ }; -static const +static struct processor_costs nocona_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1), /* cost of a lea instruction */ @@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = { 1, /* cond_not_taken_branch_cost. */ }; -static const +static struct processor_costs atom_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ @@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = { }; /* Generic64 should produce code tuned for Nocona and K8. */ -static const +static struct processor_costs generic64_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ /* On all chips taken into consideration lea is 2 cycles and more. With @@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost = }; /* core_cost should produce code tuned for Core familly of CPUs. */ -static const +static struct processor_costs core_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ /* On all chips taken into consideration lea is 2 cycles and more. With @@ -1717,7 +1717,7 @@ struct processor_costs core_cost = { /* Generic32 should produce code tuned for PPro, Pentium4, Nocona, Athlon and K8. */ -static const +static struct processor_costs generic32_cost = { COSTS_N_INSNS (1), /* cost of an add instruction */ COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ @@ -2900,6 +2900,148 @@ ix86_debug_options (void) return; } + +static const char *stringop_alg_names[] = { +#define DEF_ENUM +#define DEF_ALG(alg, name) #name, +#include "stringop.def" +#undef DEF_ENUM +#undef DEF_ALG +}; + +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=. + The string is of the following form (or comma separated list of it): + + strategy_alg:max_size:[align|noalign] + + where the full size range for the strategy is either [0, max_size] or + [min_size, max_size], in which min_size is the max_size + 1 of the + preceding range. 
The last size range must have max_size == -1. + + Examples: + + 1. + -mmemcpy-strategy=libcall:-1:noalign + + this is equivalent to (for known size memcpy) -mstringop-strategy=libcall + + + 2. + -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign + + This is to tell the compiler to use the following strategy for memset + 1) when the expected size is between [1, 16], use rep_8byte strategy; + 2) when the size is between [17, 2048], use vector_loop; + 3) when the size is > 2048, use libcall. */ + +struct stringop_size_range +{ + int min; + int max; + stringop_alg alg; + bool noalign; +}; + +static void +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) +{ + const struct stringop_algs *default_algs; + stringop_size_range input_ranges[MAX_STRINGOP_ALGS]; + char *curr_range_str, *next_range_str; + int i = 0, n = 0; + + if (is_memset) + default_algs = &ix86_cost->memset[TARGET_64BIT != 0]; + else + default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; + + curr_range_str = strategy_str; + + do { + + int mins, maxs; + stringop_alg alg; + char alg_name[128]; + char align[16]; + + next_range_str = strchr (curr_range_str, ','); + if (next_range_str) + *next_range_str++ = '\0'; + + if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align)) + { + warning (0, "Wrong arg %s to option %s", curr_range_str, + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1)) + { + warning (0, "Size ranges of option %s should be increasing", + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + for (i = 0; i < last_alg; i++) + { + if (!strcmp (alg_name, stringop_alg_names[i])) + { + alg = (stringop_alg) i; + break; + } + } + + if (i == last_alg) + { + warning (0, "Wrong stringop strategy name %s specified for option %s", + alg_name, + is_memset ? 
"-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + input_ranges[n].min = mins; + input_ranges[n].max = maxs; + input_ranges[n].alg = alg; + if (!strcmp (align, "align")) + input_ranges[n].noalign = false; + else if (!strcmp (align, "noalign")) + input_ranges[n].noalign = true; + else + { + warning (0, "Unknown alignment %s specified for option %s", + align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + n++; + curr_range_str = next_range_str; + } while (curr_range_str); + + if (input_ranges[n - 1].max != -1) + { + warning (0, "The max value for the last size range should be -1" + " for option %s", + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + if (n > MAX_STRINGOP_ALGS) + { + warning (0, "Too many size ranges specified in option %s", + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); + return; + } + + /* Now override the default algs array. */ + for (i = 0; i < n; i++) + { + *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max; + *const_cast<stringop_alg *>(&default_algs->size[i].alg) + = input_ranges[i].alg; + *const_cast<int *>(&default_algs->size[i].noalign) + = input_ranges[i].noalign; + } +} + \f /* Override various settings based on options. If MAIN_ARGS_P, the options are from the command line, otherwise they are from @@ -4021,6 +4163,21 @@ ix86_option_override_internal (bool main /* Handle stack protector */ if (!global_options_set.x_ix86_stack_protector_guard) ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS; + + /* Handle -mmemcpy-strategy= and -mmemset-strategy= */ + if (ix86_tune_memcpy_strategy) + { + char *str = xstrdup (ix86_tune_memcpy_strategy); + ix86_parse_stringop_strategy_string (str, false); + free (str); + } + + if (ix86_tune_memset_strategy) + { + char *str = xstrdup (ix86_tune_memset_strategy); + ix86_parse_stringop_strategy_string (str, true); + free (str); + } } /* Implement the TARGET_OPTION_OVERRIDE hook. 
*/ @@ -22903,6 +23060,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop_1_byte: need_zero_guard = true; @@ -23093,6 +23251,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop_1_byte: case loop: @@ -23304,6 +23463,7 @@ ix86_expand_setmem (rtx dst, rtx count_e { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop: need_zero_guard = true; @@ -23481,6 +23641,7 @@ ix86_expand_setmem (rtx dst, rtx count_e { case libcall: case no_stringop: + case last_alg: gcc_unreachable (); case loop_1_byte: case loop: Index: config/i386/i386.opt =================================================================== --- config/i386/i386.opt (revision 201458) +++ config/i386/i386.opt (working copy) @@ -316,6 +316,14 @@ mstack-arg-probe Target Report Mask(STACK_PROBE) Save Enable stack probing +mmemcpy-strategy= +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy) +Specify memcpy expansion strategy when expected size is known + +mmemset-strategy= +Target RejectNegative Joined Var(ix86_tune_memset_strategy) +Specify memset expansion strategy when expected size is known + mstringop-strategy= Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop) Chose strategy to generate stringop using Index: config/i386/stringop.def =================================================================== --- config/i386/stringop.def (revision 0) +++ config/i386/stringop.def (revision 0) @@ -0,0 +1,42 @@ +/* Definitions for option handling for IA-32. + Copyright (C) 2013 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3, or (at your option) +any later version. 
+ +GCC is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +Under Section 7 of GPL version 3, you are granted additional +permissions described in the GCC Runtime Library Exception, version +3.1, as published by the Free Software Foundation. + +You should have received a copy of the GNU General Public License and +a copy of the GCC Runtime Library Exception along with this program; +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +<http://www.gnu.org/licenses/>. */ + +DEF_ENUM +DEF_ALG (no_stringop, no_stringop) +DEF_ENUM +DEF_ALG (libcall, libcall) +DEF_ENUM +DEF_ALG (rep_prefix_1_byte, rep_byte) +DEF_ENUM +DEF_ALG (rep_prefix_4_byte, rep_4byte) +DEF_ENUM +DEF_ALG (rep_prefix_8_byte, rep_8byte) +DEF_ENUM +DEF_ALG (loop_1_byte, byte_loop) +DEF_ENUM +DEF_ALG (loop, loop) +DEF_ENUM +DEF_ALG (unrolled_loop, unrolled_loop) +DEF_ENUM +DEF_ALG (vector_loop, vector_loop) ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: New parameters to control stringop expansion libcall strategy 2013-08-06 16:42 ` Xinliang David Li 2013-08-06 16:45 ` Xinliang David Li @ 2013-08-07 7:14 ` Michael Zolotukhin 1 sibling, 0 replies; 23+ messages in thread From: Michael Zolotukhin @ 2013-08-07 7:14 UTC (permalink / raw) To: Xinliang David Li; +Cc: GCC Patches, Jan Hubicka, Teresa Johnson > the option is designed for purpose like this. That's great, thanks! Michael > David On 6 August 2013 20:42, Xinliang David Li <davidxl@google.com> wrote: > Corrected two small problems reported by the style checker (The > warnings about the EnumValue for options in stringop.opt are not > valid). > > On Tue, Aug 6, 2013 at 1:46 AM, Michael Zolotukhin > <michael.v.zolotukhin@gmail.com> wrote: >> There are still some formatting issues (like 8 spaces instead of a >> tab, wrong indentation of do-loop and some other places) - to reveal >> some of them you could use contrib/check_GNU_style.sh script. >> But that was a nitpicking again:) Actually I wanted to ask whether >> you're going to use this option for some performance experiments >> involving memmov/memset - if so, probably you could tune existing >> cost-models as well? Is it possible? > > the option is designed for purpose like this. > > thanks, > > David > >> >> Michael >> >> On 5 August 2013 20:44, Xinliang David Li <davidxl@google.com> wrote: >>> thanks. Updated patch attached. >>> >>> David >>> >>> On Mon, Aug 5, 2013 at 3:57 AM, Michael V. Zolotukhin >>> <michael.v.zolotukhin@gmail.com> wrote: >>>> Hi, >>>> This is a really convenient option, thanks for working on it. >>>> I can't approve it as I'm not a maintainer, but it looks ok to me, >>>> except for a small nitpicking: afair, comments should end with >>>> dot-space-space.
>>>> >>>> Michael >>>> >>>> On 04 Aug 20:01, Xinliang David Li wrote: >>>>> The attached is a new patch implementing the stringop inline strategy >>>>> control using two new -m options: >>>>> >>>>> -mmemcpy-strategy= >>>>> -mmemset-strategy= >>>>> >>>>> See changes in doc/invoke.texi for description of the new options. Example: >>>>> -mmemcpy-strategy=rep_8byte:64:unaligned,unrolled_loop:2048:unaligned,libcall:-1:unaligned >>>>> >>>>> tells compiler to inline memcpy using rep_8byte when the size is no >>>>> larger than 64 byte, using unrolled_loop when size is no larger than >>>>> 2048, and for size > 2048, using library call. In all cases, >>>>> destination alignment adjustment is not done. >>>>> >>>>> Tested on x86-64/linux. Ok for trunk? >>>>> >>>>> thanks, >>>>> >>>>> David >>>>> >>>>> 2013-08-02 Xinliang David Li <davidxl@google.com> >>>>> >>>>> * config/i386/stringop.def: New file. >>>>> * config/i386/stringop.opt: New file. >>>>> * config/i386/i386-opts.h: Include stringopt.def. >>>>> * config/i386/i386.opt: Include stringopt.opt. >>>>> * config/i386/i386.c (ix86_option_override_internal): >>>>> Override default size based stringop inline strategies >>>>> with options. >>>>> * config/i386/i386.c (ix86_parse_stringop_strategy_string): >>>>> New function. >>>>> >>>>> 2013-08-04 Xinliang David Li <davidxl@google.com> >>>>> >>>>> * testsuite/gcc.target/i386/memcpy-strategy-1.c: New test. >>>>> * testsuite/gcc.target/i386/memcpy-strategy-2.c: Ditto. >>>>> * testsuite/gcc.target/i386/memset-strategy-1.c: Ditto. >>>>> * testsuite/gcc.target/i386/memcpy-strategy-3.c: Ditto. >>>>> >>>>> >>>>> >>>>> >>>>> On Fri, Aug 2, 2013 at 9:21 PM, Xinliang David Li <davidxl@google.com> wrote: >>>>> > On x86_64, when the expected size of memcpy/memset is known (e.g, with >>>>> > FDO), libcall strategy is used with the size is > 8192. This value is >>>>> > hard coded, which makes it hard to do performance tuning. This patch >>>>> > adds two new parameters to do that. 
Potential usage includes >>>>> > per-application libcall strategy min-size tuning based on summary data >>>>> > with FDO (e.g, instruction workset size). >>>>> > >>>>> > Bootstrap and tested on x86_64/linux. Ok for trunk? >>>>> > >>>>> > thanks, >>>>> > >>>>> > David >>>>> > >>>>> > >>>>> > 2013-08-02 Xinliang David Li <davidxl@google.com> >>>>> > >>>>> > * params.def: New parameters. >>>>> > * config/i386/i386.c (ix86_option_override_internal): >>>>> > Override default libcall size limit with parameters. >>>> >>>>> Index: config/i386/stringop.def >>>>> =================================================================== >>>>> --- config/i386/stringop.def (revision 0) >>>>> +++ config/i386/stringop.def (revision 0) >>>>> @@ -0,0 +1,42 @@ >>>>> +/* Definitions for option handling for IA-32. >>>>> + Copyright (C) 2013 Free Software Foundation, Inc. >>>>> + >>>>> +This file is part of GCC. >>>>> + >>>>> +GCC is free software; you can redistribute it and/or modify >>>>> +it under the terms of the GNU General Public License as published by >>>>> +the Free Software Foundation; either version 3, or (at your option) >>>>> +any later version. >>>>> + >>>>> +GCC is distributed in the hope that it will be useful, >>>>> +but WITHOUT ANY WARRANTY; without even the implied warranty of >>>>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >>>>> +GNU General Public License for more details. >>>>> + >>>>> +Under Section 7 of GPL version 3, you are granted additional >>>>> +permissions described in the GCC Runtime Library Exception, version >>>>> +3.1, as published by the Free Software Foundation. >>>>> + >>>>> +You should have received a copy of the GNU General Public License and >>>>> +a copy of the GCC Runtime Library Exception along with this program; >>>>> +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see >>>>> +<http://www.gnu.org/licenses/>. 
*/ >>>>> + >>>>> +DEF_ENUM >>>>> +DEF_ALG (no_stringop, no_stringop) >>>>> +DEF_ENUM >>>>> +DEF_ALG (libcall, libcall) >>>>> +DEF_ENUM >>>>> +DEF_ALG (rep_prefix_1_byte, rep_byte) >>>>> +DEF_ENUM >>>>> +DEF_ALG (rep_prefix_4_byte, rep_4byte) >>>>> +DEF_ENUM >>>>> +DEF_ALG (rep_prefix_8_byte, rep_8byte) >>>>> +DEF_ENUM >>>>> +DEF_ALG (loop_1_byte, byte_loop) >>>>> +DEF_ENUM >>>>> +DEF_ALG (loop, loop) >>>>> +DEF_ENUM >>>>> +DEF_ALG (unrolled_loop, unrolled_loop) >>>>> +DEF_ENUM >>>>> +DEF_ALG (vector_loop, vector_loop) >>>>> Index: config/i386/i386.opt >>>>> =================================================================== >>>>> --- config/i386/i386.opt (revision 201458) >>>>> +++ config/i386/i386.opt (working copy) >>>>> @@ -316,6 +316,14 @@ mstack-arg-probe >>>>> Target Report Mask(STACK_PROBE) Save >>>>> Enable stack probing >>>>> >>>>> +mmemcpy-strategy= >>>>> +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy) >>>>> +Specify memcpy expansion strategy when expected size is known >>>>> + >>>>> +mmemset-strategy= >>>>> +Target RejectNegative Joined Var(ix86_tune_memset_strategy) >>>>> +Specify memset expansion strategy when expected size is known >>>>> + >>>>> mstringop-strategy= >>>>> Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop) >>>>> Chose strategy to generate stringop using >>>>> Index: config/i386/stringop.opt >>>>> =================================================================== >>>>> --- config/i386/stringop.opt (revision 0) >>>>> +++ config/i386/stringop.opt (revision 0) >>>>> @@ -0,0 +1,36 @@ >>>>> +/* Definitions for option handling for IA-32. >>>>> + Copyright (C) 2013 Free Software Foundation, Inc. >>>>> + >>>>> +This file is part of GCC. >>>>> + >>>>> +GCC is free software; you can redistribute it and/or modify >>>>> +it under the terms of the GNU General Public License as published by >>>>> +the Free Software Foundation; either version 3, or (at your option) >>>>> +any later version. 
>>>>> + >>>>> +GCC is distributed in the hope that it will be useful, >>>>> +but WITHOUT ANY WARRANTY; without even the implied warranty of >>>>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >>>>> +GNU General Public License for more details. >>>>> + >>>>> +Under Section 7 of GPL version 3, you are granted additional >>>>> +permissions described in the GCC Runtime Library Exception, version >>>>> +3.1, as published by the Free Software Foundation. >>>>> + >>>>> +You should have received a copy of the GNU General Public License and >>>>> +a copy of the GCC Runtime Library Exception along with this program; >>>>> +see the files COPYING3 and COPYING.RUNTIME respectively. If not, see >>>>> +<http://www.gnu.org/licenses/>. */ >>>>> + >>>>> +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte) >>>>> + >>>>> +#undef DEF_ENUM >>>>> +#define DEF_ENUM EnumValue >>>>> + >>>>> +#undef DEF_ALG >>>>> +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg) >>>>> + >>>>> +#include "stringop.def" >>>>> + >>>>> +#undef DEF_ENUM >>>>> +#undef DEF_ALG >>>>> Index: config/i386/i386.c >>>>> =================================================================== >>>>> --- config/i386/i386.c (revision 201458) >>>>> +++ config/i386/i386.c (working copy) >>>>> @@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost = >>>>> }; >>>>> >>>>> /* Processor costs (relative to an add) */ >>>>> -static const >>>>> +static >>>>> struct processor_costs i386_cost = { /* 386 specific costs */ >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >>>>> @@ -226,7 +226,7 @@ struct processor_costs i386_cost = { /* >>>>> 1, /* cond_not_taken_branch_cost. 
*/ >>>>> }; >>>>> >>>>> -static const >>>>> +static >>>>> struct processor_costs i486_cost = { /* 486 specific costs */ >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >>>>> @@ -298,7 +298,7 @@ struct processor_costs i486_cost = { /* >>>>> 1, /* cond_not_taken_branch_cost. */ >>>>> }; >>>>> >>>>> -static const >>>>> +static >>>>> struct processor_costs pentium_cost = { >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >>>>> @@ -368,7 +368,7 @@ struct processor_costs pentium_cost = { >>>>> 1, /* cond_not_taken_branch_cost. */ >>>>> }; >>>>> >>>>> -static const >>>>> +static >>>>> struct processor_costs pentiumpro_cost = { >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >>>>> @@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost = >>>>> 1, /* cond_not_taken_branch_cost. */ >>>>> }; >>>>> >>>>> -static const >>>>> +static >>>>> struct processor_costs geode_cost = { >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >>>>> @@ -518,7 +518,7 @@ struct processor_costs geode_cost = { >>>>> 1, /* cond_not_taken_branch_cost. */ >>>>> }; >>>>> >>>>> -static const >>>>> +static >>>>> struct processor_costs k6_cost = { >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> COSTS_N_INSNS (2), /* cost of a lea instruction */ >>>>> @@ -591,7 +591,7 @@ struct processor_costs k6_cost = { >>>>> 1, /* cond_not_taken_branch_cost. */ >>>>> }; >>>>> >>>>> -static const >>>>> +static >>>>> struct processor_costs athlon_cost = { >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> COSTS_N_INSNS (2), /* cost of a lea instruction */ >>>>> @@ -664,7 +664,7 @@ struct processor_costs athlon_cost = { >>>>> 1, /* cond_not_taken_branch_cost. 
*/ >>>>> }; >>>>> >>>>> -static const >>>>> +static >>>>> struct processor_costs k8_cost = { >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> COSTS_N_INSNS (2), /* cost of a lea instruction */ >>>>> @@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = { >>>>> 1, /* cond_not_taken_branch_cost. */ >>>>> }; >>>>> >>>>> -static const >>>>> +static >>>>> struct processor_costs pentium4_cost = { >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> COSTS_N_INSNS (3), /* cost of a lea instruction */ >>>>> @@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = { >>>>> 1, /* cond_not_taken_branch_cost. */ >>>>> }; >>>>> >>>>> -static const >>>>> +static >>>>> struct processor_costs nocona_cost = { >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> COSTS_N_INSNS (1), /* cost of a lea instruction */ >>>>> @@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = { >>>>> 1, /* cond_not_taken_branch_cost. */ >>>>> }; >>>>> >>>>> -static const >>>>> +static >>>>> struct processor_costs atom_cost = { >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ >>>>> @@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = { >>>>> }; >>>>> >>>>> /* Generic64 should produce code tuned for Nocona and K8. */ >>>>> -static const >>>>> +static >>>>> struct processor_costs generic64_cost = { >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> /* On all chips taken into consideration lea is 2 cycles and more. With >>>>> @@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost = >>>>> }; >>>>> >>>>> /* core_cost should produce code tuned for Core familly of CPUs. */ >>>>> -static const >>>>> +static >>>>> struct processor_costs core_cost = { >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> /* On all chips taken into consideration lea is 2 cycles and more. 
With >>>>> @@ -1717,7 +1717,7 @@ struct processor_costs core_cost = { >>>>> >>>>> /* Generic32 should produce code tuned for PPro, Pentium4, Nocona, >>>>> Athlon and K8. */ >>>>> -static const >>>>> +static >>>>> struct processor_costs generic32_cost = { >>>>> COSTS_N_INSNS (1), /* cost of an add instruction */ >>>>> COSTS_N_INSNS (1) + 1, /* cost of a lea instruction */ >>>>> @@ -2900,6 +2900,150 @@ ix86_debug_options (void) >>>>> >>>>> return; >>>>> } >>>>> + >>>>> +static const char *stringop_alg_names[] = { >>>>> +#define DEF_ENUM >>>>> +#define DEF_ALG(alg, name) #name, >>>>> +#include "stringop.def" >>>>> +#undef DEF_ENUM >>>>> +#undef DEF_ALG >>>>> +}; >>>>> + >>>>> +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=. >>>>> + The string is of the following form (or comma separated list of it): >>>>> + >>>>> + strategy_alg:max_size:[align|noalign] >>>>> + >>>>> + where the full size range for the strategy is either [0, max_size] or >>>>> + [min_size, max_size], in which min_size is the max_size + 1 of the >>>>> + preceding range. The last size range must have max_size == -1. >>>>> + >>>>> + Examples: >>>>> + >>>>> + 1. >>>>> + -mmemcpy-strategy=libcall:-1:noalign >>>>> + >>>>> + this is equivalent to (for known size memcpy) -mstringop-strategy=libcall >>>>> + >>>>> + >>>>> + 2. >>>>> + -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign >>>>> + >>>>> + This is to tell the compiler to use the following strategy for memset >>>>> + 1) when the expected size is between [1, 16], use rep_8byte strategy; >>>>> + 2) when the size is between [17, 2048], use vector_loop; >>>>> + 3) when the size is > 2048, use libcall. 
>>>>> + >>>>> +*/ >>>>> + >>>>> +struct stringop_size_range >>>>> +{ >>>>> + int min; >>>>> + int max; >>>>> + stringop_alg alg; >>>>> + bool noalign; >>>>> +}; >>>>> + >>>>> +static void >>>>> +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset) >>>>> +{ >>>>> + const struct stringop_algs *default_algs; >>>>> + stringop_size_range input_ranges[MAX_STRINGOP_ALGS]; >>>>> + char *curr_range_str, *next_range_str; >>>>> + int i = 0, n = 0; >>>>> + >>>>> + if (is_memset) >>>>> + default_algs = &ix86_cost->memset[TARGET_64BIT != 0]; >>>>> + else >>>>> + default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0]; >>>>> + >>>>> + curr_range_str = strategy_str; >>>>> + >>>>> + do { >>>>> + >>>>> + int mins, maxs; >>>>> + stringop_alg alg; >>>>> + char alg_name[128]; >>>>> + char align[16]; >>>>> + >>>>> + next_range_str = strchr (curr_range_str, ','); >>>>> + if (next_range_str) >>>>> + *next_range_str++ = '\0'; >>>>> + >>>>> + if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align)) >>>>> + { >>>>> + warning (0, "Wrong arg %s to option %s", curr_range_str, >>>>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >>>>> + return; >>>>> + } >>>>> + >>>>> + if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1)) >>>>> + { >>>>> + warning (0, "Size ranges of option %s should be increasing", >>>>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >>>>> + return; >>>>> + } >>>>> + >>>>> + for (i = 0; i < last_alg; i++) >>>>> + { >>>>> + if (!strcmp (alg_name, stringop_alg_names[i])) >>>>> + { >>>>> + alg = (stringop_alg) i; >>>>> + break; >>>>> + } >>>>> + } >>>>> + >>>>> + if (i == last_alg) >>>>> + { >>>>> + warning (0, "Wrong stringop strategy name %s specified for option %s", >>>>> + alg_name, >>>>> + is_memset ? 
"-mmemset_strategy=" : "-mmemcpy_strategy="); >>>>> + return; >>>>> + } >>>>> + >>>>> + input_ranges[n].min = mins; >>>>> + input_ranges[n].max = maxs; >>>>> + input_ranges[n].alg = alg; >>>>> + if (!strcmp (align, "align")) >>>>> + input_ranges[n].noalign = false; >>>>> + else if (!strcmp (align, "noalign")) >>>>> + input_ranges[n].noalign = true; >>>>> + else >>>>> + { >>>>> + warning (0, "Unknown alignment %s specified for option %s", >>>>> + align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >>>>> + return; >>>>> + } >>>>> + n++; >>>>> + curr_range_str = next_range_str; >>>>> + } while (curr_range_str); >>>>> + >>>>> + if (input_ranges[n - 1].max != -1) >>>>> + { >>>>> + warning (0, "The max value for the last size range should be -1" >>>>> + " for option %s", >>>>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >>>>> + return; >>>>> + } >>>>> + >>>>> + if (n > MAX_STRINGOP_ALGS) >>>>> + { >>>>> + warning (0, "Too many size ranges specified in option %s", >>>>> + is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy="); >>>>> + return; >>>>> + } >>>>> + >>>>> + /* Now override the default algs array */ >>>>> + for (i = 0; i < n; i++) >>>>> + { >>>>> + *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max; >>>>> + *const_cast<stringop_alg *>(&default_algs->size[i].alg) >>>>> + = input_ranges[i].alg; >>>>> + *const_cast<int *>(&default_algs->size[i].noalign) >>>>> + = input_ranges[i].noalign; >>>>> + } >>>>> +} >>>>> + >>>>> >>>>> /* Override various settings based on options. If MAIN_ARGS_P, the >>>>> options are from the command line, otherwise they are from >>>>> @@ -4021,6 +4165,21 @@ ix86_option_override_internal (bool main >>>>> /* Handle stack protector */ >>>>> if (!global_options_set.x_ix86_stack_protector_guard) >>>>> ix86_stack_protector_guard = TARGET_HAS_BIONIC ? 
SSP_GLOBAL : SSP_TLS; >>>>> + >>>>> + /* Handle -mmemcpy-strategy= and -mmemset-strategy= */ >>>>> + if (ix86_tune_memcpy_strategy) >>>>> + { >>>>> + char *str = xstrdup (ix86_tune_memcpy_strategy); >>>>> + ix86_parse_stringop_strategy_string (str, false); >>>>> + free (str); >>>>> + } >>>>> + >>>>> + if (ix86_tune_memset_strategy) >>>>> + { >>>>> + char *str = xstrdup (ix86_tune_memset_strategy); >>>>> + ix86_parse_stringop_strategy_string (str, true); >>>>> + free (str); >>>>> + } >>>>> } >>>>> >>>>> /* Implement the TARGET_OPTION_OVERRIDE hook. */ >>>>> @@ -22903,6 +23062,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt >>>>> { >>>>> case libcall: >>>>> case no_stringop: >>>>> + case last_alg: >>>>> gcc_unreachable (); >>>>> case loop_1_byte: >>>>> need_zero_guard = true; >>>>> @@ -23093,6 +23253,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt >>>>> { >>>>> case libcall: >>>>> case no_stringop: >>>>> + case last_alg: >>>>> gcc_unreachable (); >>>>> case loop_1_byte: >>>>> case loop: >>>>> @@ -23304,6 +23465,7 @@ ix86_expand_setmem (rtx dst, rtx count_e >>>>> { >>>>> case libcall: >>>>> case no_stringop: >>>>> + case last_alg: >>>>> gcc_unreachable (); >>>>> case loop: >>>>> need_zero_guard = true; >>>>> @@ -23481,6 +23643,7 @@ ix86_expand_setmem (rtx dst, rtx count_e >>>>> { >>>>> case libcall: >>>>> case no_stringop: >>>>> + case last_alg: >>>>> gcc_unreachable (); >>>>> case loop_1_byte: >>>>> case loop: >>>>> Index: config/i386/i386-opts.h >>>>> =================================================================== >>>>> --- config/i386/i386-opts.h (revision 201458) >>>>> +++ config/i386/i386-opts.h (working copy) >>>>> @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI >>>>> /* Algorithm to expand string function with. 
*/ >>>>> enum stringop_alg >>>>> { >>>>> - no_stringop, >>>>> - libcall, >>>>> - rep_prefix_1_byte, >>>>> - rep_prefix_4_byte, >>>>> - rep_prefix_8_byte, >>>>> - loop_1_byte, >>>>> - loop, >>>>> - unrolled_loop, >>>>> - vector_loop >>>>> +#undef DEF_ENUM >>>>> +#define DEF_ENUM >>>>> + >>>>> +#undef DEF_ALG >>>>> +#define DEF_ALG(alg, name) alg, >>>>> + >>>>> +#include "stringop.def" >>>>> +last_alg >>>>> + >>>>> +#undef DEF_ENUM >>>>> +#undef DEF_ALG >>>>> }; >>>>> >>>>> /* Available call abi. */ >>>>> Index: doc/invoke.texi >>>>> =================================================================== >>>>> --- doc/invoke.texi (revision 201458) >>>>> +++ doc/invoke.texi (working copy) >>>>> @@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}. >>>>> -mbmi2 -mrtm -mlwp -mthreads @gol >>>>> -mno-align-stringops -minline-all-stringops @gol >>>>> -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol >>>>> +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy} >>>>> -mpush-args -maccumulate-outgoing-args -m128bit-long-double @gol >>>>> -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol >>>>> -mregparm=@var{num} -msseregparm @gol >>>>> @@ -14598,6 +14599,24 @@ Expand into an inline loop. >>>>> Always use a library call. >>>>> @end table >>>>> >>>>> +@item -mmemcpy-strategy=@var{strategy} >>>>> +@opindex mmemcpy-strategy=@var{strategy} >>>>> +Override the internal decision heuristic to decide if @code{__builtin_memcpy} >>>>> +should be inlined and what inline algorithm to use when the expected size >>>>> +of the copy operation is known. @var{strategy} >>>>> +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets. >>>>> +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies >>>>> +the max byte size with which inline algorithm @var{alg} is allowed. For the last >>>>> +triplet, the @var{max_size} must be @code{-1}. 
The @var{max_size} of the triplets >>>>> +in the list must be specified in increasing order. The minimal byte size for >>>>> +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the >>>>> +preceding range. >>>>> + >>>>> +@item -mmemset-strategy=@var{strategy} >>>>> +@opindex mmemset-strategy=@var{strategy} >>>>> +The option is similar to @option{-mmemcpy-strategy=} except that it is to control >>>>> +@code{__builtin_memset} expansion. >>>>> + >>>>> @item -momit-leaf-frame-pointer >>>>> @opindex momit-leaf-frame-pointer >>>>> Don't keep the frame pointer in a register for leaf functions. This >>>>> Index: testsuite/gcc.target/i386/memcpy-strategy-1.c >>>>> =================================================================== >>>>> --- testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) >>>>> +++ testsuite/gcc.target/i386/memcpy-strategy-1.c (revision 0) >>>>> @@ -0,0 +1,12 @@ >>>>> +/* { dg-do compile } */ >>>>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */ >>>>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */ >>>>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ >>>>> + >>>>> +char a[2048]; >>>>> +char b[2048]; >>>>> +void t (void) >>>>> +{ >>>>> + __builtin_memcpy (a, b, 2048); >>>>> +} >>>>> + >>>>> Index: testsuite/gcc.target/i386/memcpy-strategy-2.c >>>>> =================================================================== >>>>> --- testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) >>>>> +++ testsuite/gcc.target/i386/memcpy-strategy-2.c (revision 0) >>>>> @@ -0,0 +1,12 @@ >>>>> +/* { dg-do compile } */ >>>>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */ >>>>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! 
{ ia32 } } } } } */ >>>>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */ >>>>> + >>>>> +char a[2048]; >>>>> +char b[2048]; >>>>> +void t (void) >>>>> +{ >>>>> + __builtin_memcpy (a, b, 2048); >>>>> +} >>>>> + >>>>> Index: testsuite/gcc.target/i386/memset-strategy-1.c >>>>> =================================================================== >>>>> --- testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) >>>>> +++ testsuite/gcc.target/i386/memset-strategy-1.c (revision 0) >>>>> @@ -0,0 +1,10 @@ >>>>> +/* { dg-do compile } */ >>>>> +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */ >>>>> +/* { dg-final { scan-assembler-times "memset" 2 } } */ >>>>> + >>>>> +char a[2048]; >>>>> +void t (void) >>>>> +{ >>>>> + __builtin_memset (a, 1, 2048); >>>>> +} >>>>> + >>>>> Index: testsuite/gcc.target/i386/memcpy-strategy-3.c >>>>> =================================================================== >>>>> --- testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) >>>>> +++ testsuite/gcc.target/i386/memcpy-strategy-3.c (revision 0) >>>>> @@ -0,0 +1,11 @@ >>>>> +/* { dg-do compile } */ >>>>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */ >>>>> +/* { dg-final { scan-assembler-times "memcpy" 2 } } */ >>>>> + >>>>> +char a[2048]; >>>>> +char b[2048]; >>>>> +void t (void) >>>>> +{ >>>>> + __builtin_memcpy (a, b, 2048); >>>>> +} >>>>> + >>>> >> >> >> >> -- >> --- >> Best regards, >> Michael V. Zolotukhin, >> Software Engineer >> Intel Corporation. -- --- Best regards, Michael V. Zolotukhin, Software Engineer Intel Corporation. ^ permalink raw reply [flat|nested] 23+ messages in thread
end of thread, other threads:[~2013-08-09 19:49 UTC | newest] Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2013-08-03 4:22 New parameters to control stringop expansion libcall strategy Xinliang David Li 2013-08-03 8:07 ` Backend specific params.def? (Was Re: New parameters to control stringop expansion libcall strategy) Jan Hubicka 2013-08-03 15:40 ` Xinliang David Li 2013-08-05 3:01 ` New parameters to control stringop expansion libcall strategy Xinliang David Li 2013-08-05 10:57 ` Michael V. Zolotukhin 2013-08-05 16:44 ` Xinliang David Li 2013-08-06 8:46 ` Michael Zolotukhin 2013-08-06 9:42 ` Jan Hubicka 2013-08-06 16:08 ` Xinliang David Li 2013-08-07 17:06 ` Xinliang David Li 2013-08-08 0:23 ` Joseph S. Myers 2013-08-08 0:29 ` Xinliang David Li 2013-08-08 1:04 ` Joseph S. Myers 2013-08-08 6:17 ` Xinliang David Li 2013-08-08 15:18 ` Joseph S. Myers 2013-08-08 16:31 ` Xinliang David Li 2013-08-09 18:25 ` Xinliang David Li 2013-08-09 18:33 ` Jan Hubicka 2013-08-09 19:41 ` Xinliang David Li 2013-08-09 19:49 ` Jan Hubicka 2013-08-06 16:42 ` Xinliang David Li 2013-08-06 16:45 ` Xinliang David Li 2013-08-07 7:14 ` Michael Zolotukhin
This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox, as well as the URLs for read-only IMAP folder(s) and NNTP newsgroup(s).