public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* New parameters to control stringop expansion libcall strategy
@ 2013-08-03  4:22 Xinliang David Li
  2013-08-03  8:07 ` Backend specific params.def? (Was Re: New parameters to control stringop expansion libcall strategy) Jan Hubicka
  2013-08-05  3:01 ` New parameters to control stringop expansion libcall strategy Xinliang David Li
  0 siblings, 2 replies; 23+ messages in thread
From: Xinliang David Li @ 2013-08-03  4:22 UTC (permalink / raw)
  To: GCC Patches; +Cc: Jan Hubicka, Teresa Johnson

[-- Attachment #1: Type: text/plain, Size: 669 bytes --]

On x86_64, when the expected size of memcpy/memset is known (e.g., with
FDO), the libcall strategy is used when the size is > 8192. This value is
hard-coded, which makes performance tuning difficult. This patch
adds two new parameters to make the threshold tunable. Potential uses
include per-application tuning of the libcall strategy's minimum size,
based on summary data from FDO (e.g., instruction working set size).

Bootstrapped and tested on x86_64/linux. OK for trunk?

thanks,

David


2013-08-02  Xinliang David Li  <davidxl@google.com>

        * params.def: New parameters.
        * config/i386/i386.c (ix86_option_override_internal):
        Override default libcall size limit with parameters.

[-- Attachment #2: stringop_inl_param.txt --]
[-- Type: text/plain, Size: 6543 bytes --]

Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 201458)
+++ config/i386/i386.c	(working copy)
@@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost =
 };
 
 /* Processor costs (relative to an add) */
-static const
+static
 struct processor_costs i386_cost = {	/* 386 specific costs */
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -226,7 +226,7 @@ struct processor_costs i386_cost = {	/*
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs i486_cost = {	/* 486 specific costs */
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -298,7 +298,7 @@ struct processor_costs i486_cost = {	/*
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs pentium_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -368,7 +368,7 @@ struct processor_costs pentium_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs pentiumpro_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost =
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs geode_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -518,7 +518,7 @@ struct processor_costs geode_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs k6_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
@@ -591,7 +591,7 @@ struct processor_costs k6_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs athlon_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
@@ -664,7 +664,7 @@ struct processor_costs athlon_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs k8_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
@@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs pentium4_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (3),			/* cost of a lea instruction */
@@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs nocona_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs atom_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
@@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = {
 };
 
 /* Generic64 should produce code tuned for Nocona and K8.  */
-static const
+static
 struct processor_costs generic64_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   /* On all chips taken into consideration lea is 2 cycles and more.  With
@@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost =
 };
 
 /* core_cost should produce code tuned for Core familly of CPUs.  */
-static const
+static
 struct processor_costs core_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   /* On all chips taken into consideration lea is 2 cycles and more.  With
@@ -1717,7 +1717,7 @@ struct processor_costs core_cost = {
 
 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
    Athlon and K8.  */
-static const
+static
 struct processor_costs generic32_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
@@ -4021,6 +4021,34 @@ ix86_option_override_internal (bool main
   /* Handle stack protector */
   if (!global_options_set.x_ix86_stack_protector_guard)
     ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
+
+  /* Now override the memcpy/memset inline strategy parameters  */
+  if (PARAM_VALUE (PARAM_MEMCPY_LIBCALL_MIN_SIZE) != -1
+      || PARAM_VALUE (PARAM_MEMSET_LIBCALL_MIN_SIZE) != -1)
+    {
+      const struct stringop_algs *algs[2];
+      int k;
+      int min_sizes[2];
+
+      algs[0] = &ix86_cost->memset[TARGET_64BIT != 0];
+      algs[1] = &ix86_cost->memcpy[TARGET_64BIT != 0];
+
+      min_sizes[0] = PARAM_VALUE (PARAM_MEMSET_LIBCALL_MIN_SIZE);
+      min_sizes[1] = PARAM_VALUE (PARAM_MEMCPY_LIBCALL_MIN_SIZE);
+
+      for (k = 0; k < 2; k++)
+        {
+          if (min_sizes[k] == -1)
+            continue;
+
+          for (i = 0; i < MAX_STRINGOP_ALGS - 1; i++)
+            {
+              if (algs[k]->size[i].max >= min_sizes[k]
+                  || algs[k]->size[i + 1].alg == libcall)
+                *const_cast<int *>(&algs[k]->size[i].max) = min_sizes[k] - 1;
+            }
+        }
+    }
 }
 
 /* Implement the TARGET_OPTION_OVERRIDE hook.  */
Index: params.def
===================================================================
--- params.def	(revision 201458)
+++ params.def	(working copy)
@@ -117,6 +117,18 @@ DEFPARAM (PARAM_COMDAT_SHARING_PROBABILI
 	  "Probability that COMDAT function will be shared with different compilation unit",
 	  20, 0, 0)
 
+/* Use libcall strategy when the expected size is no less than this parameter for memcpy.  */
+DEFPARAM (PARAM_MEMCPY_LIBCALL_MIN_SIZE,
+	  "memcpy-libcall-min-size",
+	  "The minimal expected size to force libcall expansion strategy for memcpy",
+	  -1, 1, 0)
+
+/* Use libcall strategy when the expected size is no less than this parameter for memset.  */
+DEFPARAM (PARAM_MEMSET_LIBCALL_MIN_SIZE,
+	  "memset-libcall-min-size",
+	  "The minimal expected size to force libcall expansion strategy for memset",
+	  -1, 1, 0)
+
 /* Limit on probability of entry BB.  */
 DEFPARAM (PARAM_PARTIAL_INLINING_ENTRY_PROBABILITY,
 	  "partial-inlining-entry-probability",

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Backend specific params.def? (Was Re: New parameters to control stringop expansion libcall strategy)
  2013-08-03  4:22 New parameters to control stringop expansion libcall strategy Xinliang David Li
@ 2013-08-03  8:07 ` Jan Hubicka
  2013-08-03 15:40   ` Xinliang David Li
  2013-08-05  3:01 ` New parameters to control stringop expansion libcall strategy Xinliang David Li
  1 sibling, 1 reply; 23+ messages in thread
From: Jan Hubicka @ 2013-08-03  8:07 UTC (permalink / raw)
  To: Xinliang David Li; +Cc: GCC Patches, Jan Hubicka, Teresa Johnson

> On x86_64, when the expected size of memcpy/memset is known (e.g, with
> FDO), libcall strategy is used with the size is > 8192. This value is
> hard coded, which makes it hard to do performance tuning. This patch
> adds two new parameters to do that. Potential usage includes
> per-application libcall strategy min-size tuning based on summary data
> with FDO (e.g, instruction workset size).
> 
> Bootstrap and tested on x86_64/linux. Ok for trunk?
> 
> thanks,
> 
> David
> 
> 
> 2013-08-02  Xinliang David Li  <davidxl@google.com>
> 
>         * params.def: New parameters.
>         * config/i386/i386.c (ix86_option_override_internal):
>         Override default libcall size limit with parameters.

Hi,
the problem with this is that we introduce a generic --param that is used only
by the x86 backend.  I am not really a guru on command-line options, but I think
this is the first time we have tried to do such a thing.  I wonder if
1) We want to introduce a target-specific params.def
2) We want to use the usual -msomething= options
3) We want to go this way?

Honza

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: Backend specific params.def? (Was Re: New parameters to control stringop expansion libcall strategy)
  2013-08-03  8:07 ` Backend specific params.def? (Was Re: New parameters to control stringop expansion libcall strategy) Jan Hubicka
@ 2013-08-03 15:40   ` Xinliang David Li
  0 siblings, 0 replies; 23+ messages in thread
From: Xinliang David Li @ 2013-08-03 15:40 UTC (permalink / raw)
  To: Jan Hubicka; +Cc: GCC Patches, Teresa Johnson

On Sat, Aug 3, 2013 at 1:06 AM, Jan Hubicka <hubicka@ucw.cz> wrote:
>> On x86_64, when the expected size of memcpy/memset is known (e.g, with
>> FDO), libcall strategy is used with the size is > 8192. This value is
>> hard coded, which makes it hard to do performance tuning. This patch
>> adds two new parameters to do that. Potential usage includes
>> per-application libcall strategy min-size tuning based on summary data
>> with FDO (e.g, instruction workset size).
>>
>> Bootstrap and tested on x86_64/linux. Ok for trunk?
>>
>> thanks,
>>
>> David
>>
>>
>> 2013-08-02  Xinliang David Li  <davidxl@google.com>
>>
>>         * params.def: New parameters.
>>         * config/i386/i386.c (ix86_option_override_internal):
>>         Override default libcall size limit with parameters.
>
> Hi,
> problem with this is that we introduce generic --param that is used only
> by x86 backend.  I am not really guru on the command line options, but I think
> this is first time we try to do such thing.  I wonder if
> 1) We want to introduce target specific params.def

We do have target-specific tuning code for parameters, though --
the backend overrides the default values -- so I think these are
essentially target-specific params.

> 2) We want to use usual -msomething= options
> 3) We want to go this way?

I don't have a strong opinion either way. To avoid controversy, let me
work on a -mxxx= version of the patch -- hopefully it will also be more
powerful.

thanks,

David

>
> Honza

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: New parameters to control stringop expansion libcall strategy
  2013-08-03  4:22 New parameters to control stringop expansion libcall strategy Xinliang David Li
  2013-08-03  8:07 ` Backend specific params.def? (Was Re: New parameters to control stringop expansion libcall strategy) Jan Hubicka
@ 2013-08-05  3:01 ` Xinliang David Li
  2013-08-05 10:57   ` Michael V. Zolotukhin
  1 sibling, 1 reply; 23+ messages in thread
From: Xinliang David Li @ 2013-08-05  3:01 UTC (permalink / raw)
  To: GCC Patches; +Cc: Jan Hubicka, Teresa Johnson

[-- Attachment #1: Type: text/plain, Size: 2205 bytes --]

Attached is a new patch that implements stringop inline strategy
control using two new -m options:

-mmemcpy-strategy=
-mmemset-strategy=

See the changes in doc/invoke.texi for a description of the new options. Example:
  -mmemcpy-strategy=rep_8byte:64:noalign,unrolled_loop:2048:noalign,libcall:-1:noalign

tells the compiler to inline memcpy using rep_8byte when the size is no
larger than 64 bytes, using unrolled_loop when the size is no larger than
2048 bytes, and to use a library call for sizes > 2048. In all cases,
destination alignment adjustment is not done.

Tested on x86-64/linux. Ok for trunk?

thanks,

David

2013-08-02  Xinliang David Li  <davidxl@google.com>

        * config/i386/stringop.def: New file.
        * config/i386/stringop.opt: New file.
        * config/i386/i386-opts.h: Include stringop.def.
        * config/i386/i386.opt: Include stringop.opt.
        * config/i386/i386.c (ix86_option_override_internal):
        Override default size based stringop inline strategies
        with options.
        * config/i386/i386.c (ix86_parse_stringop_strategy_string):
        New function.

2013-08-04  Xinliang David Li  <davidxl@google.com>

        * testsuite/gcc.target/i386/memcpy-strategy-1.c: New test.
        * testsuite/gcc.target/i386/memcpy-strategy-2.c: Ditto.
        * testsuite/gcc.target/i386/memset-strategy-1.c: Ditto.
        * testsuite/gcc.target/i386/memcpy-strategy-3.c: Ditto.




On Fri, Aug 2, 2013 at 9:21 PM, Xinliang David Li <davidxl@google.com> wrote:
> On x86_64, when the expected size of memcpy/memset is known (e.g, with
> FDO), libcall strategy is used with the size is > 8192. This value is
> hard coded, which makes it hard to do performance tuning. This patch
> adds two new parameters to do that. Potential usage includes
> per-application libcall strategy min-size tuning based on summary data
> with FDO (e.g, instruction workset size).
>
> Bootstrap and tested on x86_64/linux. Ok for trunk?
>
> thanks,
>
> David
>
>
> 2013-08-02  Xinliang David Li  <davidxl@google.com>
>
>         * params.def: New parameters.
>         * config/i386/i386.c (ix86_option_override_internal):
>         Override default libcall size limit with parameters.

[-- Attachment #2: stringop_inl_option.p.txt --]
[-- Type: text/plain, Size: 18835 bytes --]

Index: config/i386/stringop.def
===================================================================
--- config/i386/stringop.def	(revision 0)
+++ config/i386/stringop.def	(revision 0)
@@ -0,0 +1,42 @@
+/* Definitions for option handling for IA-32.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+DEF_ENUM
+DEF_ALG (no_stringop, no_stringop)
+DEF_ENUM
+DEF_ALG (libcall, libcall)
+DEF_ENUM
+DEF_ALG (rep_prefix_1_byte, rep_byte)
+DEF_ENUM
+DEF_ALG (rep_prefix_4_byte, rep_4byte)
+DEF_ENUM
+DEF_ALG (rep_prefix_8_byte, rep_8byte)
+DEF_ENUM
+DEF_ALG (loop_1_byte, byte_loop)
+DEF_ENUM
+DEF_ALG (loop, loop)
+DEF_ENUM
+DEF_ALG (unrolled_loop, unrolled_loop)
+DEF_ENUM
+DEF_ALG (vector_loop, vector_loop)
Index: config/i386/i386.opt
===================================================================
--- config/i386/i386.opt	(revision 201458)
+++ config/i386/i386.opt	(working copy)
@@ -316,6 +316,14 @@ mstack-arg-probe
 Target Report Mask(STACK_PROBE) Save
 Enable stack probing
 
+mmemcpy-strategy=
+Target RejectNegative Joined Var(ix86_tune_memcpy_strategy)
+Specify memcpy expansion strategy when expected size is known
+
+mmemset-strategy=
+Target RejectNegative Joined Var(ix86_tune_memset_strategy)
+Specify memset expansion strategy when expected size is known
+
 mstringop-strategy=
 Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop)
 Chose strategy to generate stringop using
Index: config/i386/stringop.opt
===================================================================
--- config/i386/stringop.opt	(revision 0)
+++ config/i386/stringop.opt	(revision 0)
@@ -0,0 +1,36 @@
+/* Definitions for option handling for IA-32.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte)
+
+#undef DEF_ENUM
+#define DEF_ENUM EnumValue
+
+#undef DEF_ALG
+#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg)
+
+#include "stringop.def"
+
+#undef DEF_ENUM
+#undef DEF_ALG
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 201458)
+++ config/i386/i386.c	(working copy)
@@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost =
 };
 
 /* Processor costs (relative to an add) */
-static const
+static
 struct processor_costs i386_cost = {	/* 386 specific costs */
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -226,7 +226,7 @@ struct processor_costs i386_cost = {	/*
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs i486_cost = {	/* 486 specific costs */
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -298,7 +298,7 @@ struct processor_costs i486_cost = {	/*
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs pentium_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -368,7 +368,7 @@ struct processor_costs pentium_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs pentiumpro_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost =
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs geode_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -518,7 +518,7 @@ struct processor_costs geode_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs k6_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
@@ -591,7 +591,7 @@ struct processor_costs k6_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs athlon_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
@@ -664,7 +664,7 @@ struct processor_costs athlon_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs k8_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
@@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs pentium4_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (3),			/* cost of a lea instruction */
@@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs nocona_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs atom_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
@@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = {
 };
 
 /* Generic64 should produce code tuned for Nocona and K8.  */
-static const
+static
 struct processor_costs generic64_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   /* On all chips taken into consideration lea is 2 cycles and more.  With
@@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost =
 };
 
 /* core_cost should produce code tuned for Core familly of CPUs.  */
-static const
+static
 struct processor_costs core_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   /* On all chips taken into consideration lea is 2 cycles and more.  With
@@ -1717,7 +1717,7 @@ struct processor_costs core_cost = {
 
 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
    Athlon and K8.  */
-static const
+static
 struct processor_costs generic32_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
@@ -2900,6 +2900,150 @@ ix86_debug_options (void)
 
   return;
 }
+
+static const char *stringop_alg_names[] = {
+#define DEF_ENUM
+#define DEF_ALG(alg, name) #name,
+#include "stringop.def"
+#undef DEF_ENUM
+#undef DEF_ALG
+};
+
+/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
+   The string is of the following form (or comma separated list of it):
+
+     strategy_alg:max_size:[align|noalign]
+
+   where the full size range for the strategy is either [0, max_size] or
+   [min_size, max_size], in which min_size is the max_size + 1 of the
+   preceding range.  The last size range must have max_size == -1.
+
+   Examples:
+
+    1.
+       -mmemcpy-strategy=libcall:-1:noalign
+
+      this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
+
+
+   2.
+      -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
+
+      This is to tell the compiler to use the following strategy for memset
+      1) when the expected size is between [1, 16], use rep_8byte strategy;
+      2) when the size is between [17, 2048], use vector_loop;
+      3) when the size is > 2048, use libcall.
+
+*/
+
+struct stringop_size_range
+{
+  int min;
+  int max;
+  stringop_alg alg;
+  bool noalign;
+};
+
+static void
+ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
+{
+  const struct stringop_algs *default_algs;
+  stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
+  char *curr_range_str, *next_range_str;
+  int i = 0, n = 0;
+
+  if (is_memset)
+    default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
+  else
+    default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
+
+  curr_range_str = strategy_str;
+
+  do {
+
+    int mins, maxs;
+    stringop_alg alg;
+    char alg_name[128];
+    char align[16];
+
+    next_range_str = strchr (curr_range_str, ',');
+    if (next_range_str)
+      *next_range_str++ = '\0';
+
+    if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align))
+      {
+        warning (0, "Wrong arg %s to option %s", curr_range_str,
+                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+        return;
+      }
+
+    if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1))
+      {
+        warning (0, "Size ranges of option %s should be increasing",
+                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+        return;
+      }
+
+    for (i = 0; i < last_alg; i++)
+      {
+        if (!strcmp (alg_name, stringop_alg_names[i]))
+	  {
+	    alg = (stringop_alg) i;
+	    break;
+          }
+      }
+
+    if (i == last_alg)
+      {
+        warning (0, "Wrong stringop strategy name %s specified for option %s",
+	         alg_name,
+                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+	return;
+      }
+
+    input_ranges[n].min = mins;
+    input_ranges[n].max = maxs;
+    input_ranges[n].alg = alg;
+    if (!strcmp (align, "align"))
+      input_ranges[n].noalign = false;
+    else if (!strcmp (align, "noalign"))
+      input_ranges[n].noalign = true;
+    else
+      {
+        warning (0, "Unknown alignment %s specified for option %s",
+                 align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+        return;
+      }
+    n++;
+    curr_range_str = next_range_str;
+  } while (curr_range_str);
+
+  if (input_ranges[n - 1].max != -1)
+    {
+      warning (0, "The max value for the last size range should be -1"
+               " for option %s",
+               is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+      return;
+    }
+
+  if (n > MAX_STRINGOP_ALGS)
+    {
+      warning (0, "Too many size ranges specified in option %s",
+               is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+      return;
+    }
+
+  /* Now override the default algs array  */
+  for (i = 0; i < n; i++)
+    {
+      *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
+      *const_cast<stringop_alg *>(&default_algs->size[i].alg)
+          = input_ranges[i].alg;
+      *const_cast<int *>(&default_algs->size[i].noalign)
+          = input_ranges[i].noalign;
+    }
+}
+
 \f
 /* Override various settings based on options.  If MAIN_ARGS_P, the
    options are from the command line, otherwise they are from
@@ -4021,6 +4165,21 @@ ix86_option_override_internal (bool main
   /* Handle stack protector */
   if (!global_options_set.x_ix86_stack_protector_guard)
     ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
+
+  /* Handle -mmemcpy-strategy= and -mmemset-strategy=  */
+  if (ix86_tune_memcpy_strategy)
+    {
+      char *str = xstrdup (ix86_tune_memcpy_strategy);
+      ix86_parse_stringop_strategy_string (str, false);
+      free (str);
+    }
+
+  if (ix86_tune_memset_strategy)
+    {
+      char *str = xstrdup (ix86_tune_memset_strategy);
+      ix86_parse_stringop_strategy_string (str, true);
+      free (str);
+    }
 }
 
 /* Implement the TARGET_OPTION_OVERRIDE hook.  */
@@ -22903,6 +23062,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
     {
     case libcall:
     case no_stringop:
+    case last_alg:
       gcc_unreachable ();
     case loop_1_byte:
       need_zero_guard = true;
@@ -23093,6 +23253,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
     {
     case libcall:
     case no_stringop:
+    case last_alg:
       gcc_unreachable ();
     case loop_1_byte:
     case loop:
@@ -23304,6 +23465,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
     {
     case libcall:
     case no_stringop:
+    case last_alg:
       gcc_unreachable ();
     case loop:
       need_zero_guard = true;
@@ -23481,6 +23643,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
     {
     case libcall:
     case no_stringop:
+    case last_alg:
       gcc_unreachable ();
     case loop_1_byte:
     case loop:
Index: config/i386/i386-opts.h
===================================================================
--- config/i386/i386-opts.h	(revision 201458)
+++ config/i386/i386-opts.h	(working copy)
@@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI
 /* Algorithm to expand string function with.  */
 enum stringop_alg
 {
-   no_stringop,
-   libcall,
-   rep_prefix_1_byte,
-   rep_prefix_4_byte,
-   rep_prefix_8_byte,
-   loop_1_byte,
-   loop,
-   unrolled_loop,
-   vector_loop
+#undef DEF_ENUM
+#define DEF_ENUM
+
+#undef DEF_ALG
+#define DEF_ALG(alg, name) alg, 
+
+#include "stringop.def"
+last_alg
+
+#undef DEF_ENUM
+#undef DEF_ALG
 };
 
 /* Available call abi.  */
Index: doc/invoke.texi
===================================================================
--- doc/invoke.texi	(revision 201458)
+++ doc/invoke.texi	(working copy)
@@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}.
 -mbmi2 -mrtm -mlwp -mthreads @gol
 -mno-align-stringops  -minline-all-stringops @gol
 -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
+-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy} 
 -mpush-args  -maccumulate-outgoing-args  -m128bit-long-double @gol
 -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol
 -mregparm=@var{num}  -msseregparm @gol
@@ -14598,6 +14599,24 @@ Expand into an inline loop.
 Always use a library call.
 @end table
 
+@item -mmemcpy-strategy=@var{strategy}
+@opindex mmemcpy-strategy=@var{strategy}
+Override the internal decision heuristic to decide if @code{__builtin_memcpy}
+should be inlined and what inline algorithm to use when the expected size
+of the copy operation is known. @var{strategy} 
+is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets. 
+@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies
+the max byte size with which inline algorithm @var{alg} is allowed. For the last
+triplet, the @var{max_size} must be @code{-1}. The @var{max_size} of the triplets
+in the list must be specified in increasing order. The minimal byte size for 
+@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the 
+preceding range.
+
+@item -mmemset-strategy=@var{strategy}
+@opindex mmemset-strategy=@var{strategy}
+The option is similar to @option{-mmemcpy-strategy=} except that it is to control
+@code{__builtin_memset} expansion.
+
 @item -momit-leaf-frame-pointer
 @opindex momit-leaf-frame-pointer
 Don't keep the frame pointer in a register for leaf functions.  This
Index: testsuite/gcc.target/i386/memcpy-strategy-1.c
===================================================================
--- testsuite/gcc.target/i386/memcpy-strategy-1.c	(revision 0)
+++ testsuite/gcc.target/i386/memcpy-strategy-1.c	(revision 0)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */
+/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
+/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
+
+char a[2048];
+char b[2048];
+void t (void)
+{
+  __builtin_memcpy (a, b, 2048);
+}
+
Index: testsuite/gcc.target/i386/memcpy-strategy-2.c
===================================================================
--- testsuite/gcc.target/i386/memcpy-strategy-2.c	(revision 0)
+++ testsuite/gcc.target/i386/memcpy-strategy-2.c	(revision 0)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */
+/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
+/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
+
+char a[2048];
+char b[2048];
+void t (void)
+{
+  __builtin_memcpy (a, b, 2048);
+}
+
Index: testsuite/gcc.target/i386/memset-strategy-1.c
===================================================================
--- testsuite/gcc.target/i386/memset-strategy-1.c	(revision 0)
+++ testsuite/gcc.target/i386/memset-strategy-1.c	(revision 0)
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */
+/* { dg-final { scan-assembler-times "memset" 2  } } */
+
+char a[2048];
+void t (void)
+{
+  __builtin_memset (a, 1, 2048);
+}
+
Index: testsuite/gcc.target/i386/memcpy-strategy-3.c
===================================================================
--- testsuite/gcc.target/i386/memcpy-strategy-3.c	(revision 0)
+++ testsuite/gcc.target/i386/memcpy-strategy-3.c	(revision 0)
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */
+/* { dg-final { scan-assembler-times "memcpy" 2  } } */
+
+char a[2048];
+char b[2048];
+void t (void)
+{
+  __builtin_memcpy (a, b, 2048);
+}
+

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: New parameters to control stringop expansion libcall strategy
  2013-08-05  3:01 ` New parameters to control stringop expansion libcall strategy Xinliang David Li
@ 2013-08-05 10:57   ` Michael V. Zolotukhin
  2013-08-05 16:44     ` Xinliang David Li
  0 siblings, 1 reply; 23+ messages in thread
From: Michael V. Zolotukhin @ 2013-08-05 10:57 UTC (permalink / raw)
  To: Xinliang David Li; +Cc: GCC Patches, Jan Hubicka, Teresa Johnson

Hi,
This is a really convenient option, thanks for working on it.
I can't approve it as I'm not a maintainer, but it looks ok to me,
except for a small nitpick: AFAIR, comments should end with
dot-space-space.

Michael

On 04 Aug 20:01, Xinliang David Li wrote:
> The attached is a new patch implementing the stringop inline strategy
> control using two new -m options:
> 
> -mmemcpy-strategy=
> -mmemset-strategy=
> 
> See changes in doc/invoke.texi for description of the new options. Example:
>   -mmemcpy-strategy=rep_8byte:64:noalign,unrolled_loop:2048:noalign,libcall:-1:noalign
> 
> tells compiler to inline memcpy using rep_8byte when the size is no
> larger than 64 byte, using unrolled_loop when size is no larger than
> 2048, and for size > 2048, using library call. In all cases,
> destination alignment adjustment is not done.
> 
> Tested on x86-64/linux. Ok for trunk?
> 
> thanks,
> 
> David
> 
> 2013-08-02  Xinliang David Li  <davidxl@google.com>
> 
>         * config/i386/stringop.def: New file.
>         * config/i386/stringop.opt: New file.
>         * config/i386/i386-opts.h: Include stringop.def.
>         * config/i386/i386.opt: Include stringop.opt.
>         * config/i386/i386.c (ix86_option_override_internal):
>         Override default size based stringop inline strategies
>         with options.
>         * config/i386/i386.c (ix86_parse_stringop_strategy_string):
>         New function.
> 
> 2013-08-04  Xinliang David Li  <davidxl@google.com>
> 
>         * testsuite/gcc.target/i386/memcpy-strategy-1.c: New test.
>         * testsuite/gcc.target/i386/memcpy-strategy-2.c: Ditto.
>         * testsuite/gcc.target/i386/memset-strategy-1.c: Ditto.
>         * testsuite/gcc.target/i386/memcpy-strategy-3.c: Ditto.
> 
> 
> 
> 
> On Fri, Aug 2, 2013 at 9:21 PM, Xinliang David Li <davidxl@google.com> wrote:
> > On x86_64, when the expected size of memcpy/memset is known (e.g, with
> > FDO), libcall strategy is used when the size is > 8192. This value is
> > hard coded, which makes it hard to do performance tuning. This patch
> > adds two new parameters to do that. Potential usage includes
> > per-application libcall strategy min-size tuning based on summary data
> > with FDO (e.g, instruction workset size).
> >
> > Bootstrap and tested on x86_64/linux. Ok for trunk?
> >
> > thanks,
> >
> > David
> >
> >
> > 2013-08-02  Xinliang David Li  <davidxl@google.com>
> >
> >         * params.def: New parameters.
> >         * config/i386/i386.c (ix86_option_override_internal):
> >         Override default libcall size limit with parameters.

> Index: config/i386/stringop.def
> ===================================================================
> --- config/i386/stringop.def	(revision 0)
> +++ config/i386/stringop.def	(revision 0)
> @@ -0,0 +1,42 @@
> +/* Definitions for option handling for IA-32.
> +   Copyright (C) 2013 Free Software Foundation, Inc.
> +
> +This file is part of GCC.
> +
> +GCC is free software; you can redistribute it and/or modify
> +it under the terms of the GNU General Public License as published by
> +the Free Software Foundation; either version 3, or (at your option)
> +any later version.
> +
> +GCC is distributed in the hope that it will be useful,
> +but WITHOUT ANY WARRANTY; without even the implied warranty of
> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +GNU General Public License for more details.
> +
> +Under Section 7 of GPL version 3, you are granted additional
> +permissions described in the GCC Runtime Library Exception, version
> +3.1, as published by the Free Software Foundation.
> +
> +You should have received a copy of the GNU General Public License and
> +a copy of the GCC Runtime Library Exception along with this program;
> +see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
> +<http://www.gnu.org/licenses/>.  */
> +
> +DEF_ENUM
> +DEF_ALG (no_stringop, no_stringop)
> +DEF_ENUM
> +DEF_ALG (libcall, libcall)
> +DEF_ENUM
> +DEF_ALG (rep_prefix_1_byte, rep_byte)
> +DEF_ENUM
> +DEF_ALG (rep_prefix_4_byte, rep_4byte)
> +DEF_ENUM
> +DEF_ALG (rep_prefix_8_byte, rep_8byte)
> +DEF_ENUM
> +DEF_ALG (loop_1_byte, byte_loop)
> +DEF_ENUM
> +DEF_ALG (loop, loop)
> +DEF_ENUM
> +DEF_ALG (unrolled_loop, unrolled_loop)
> +DEF_ENUM
> +DEF_ALG (vector_loop, vector_loop)
> Index: config/i386/i386.opt
> ===================================================================
> --- config/i386/i386.opt	(revision 201458)
> +++ config/i386/i386.opt	(working copy)
> @@ -316,6 +316,14 @@ mstack-arg-probe
>  Target Report Mask(STACK_PROBE) Save
>  Enable stack probing
>  
> +mmemcpy-strategy=
> +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy)
> +Specify memcpy expansion strategy when expected size is known
> +
> +mmemset-strategy=
> +Target RejectNegative Joined Var(ix86_tune_memset_strategy)
> +Specify memset expansion strategy when expected size is known
> +
>  mstringop-strategy=
>  Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop)
>  Chose strategy to generate stringop using
> Index: config/i386/stringop.opt
> ===================================================================
> --- config/i386/stringop.opt	(revision 0)
> +++ config/i386/stringop.opt	(revision 0)
> @@ -0,0 +1,36 @@
> +/* Definitions for option handling for IA-32.
> +   Copyright (C) 2013 Free Software Foundation, Inc.
> +
> +This file is part of GCC.
> +
> +GCC is free software; you can redistribute it and/or modify
> +it under the terms of the GNU General Public License as published by
> +the Free Software Foundation; either version 3, or (at your option)
> +any later version.
> +
> +GCC is distributed in the hope that it will be useful,
> +but WITHOUT ANY WARRANTY; without even the implied warranty of
> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +GNU General Public License for more details.
> +
> +Under Section 7 of GPL version 3, you are granted additional
> +permissions described in the GCC Runtime Library Exception, version
> +3.1, as published by the Free Software Foundation.
> +
> +You should have received a copy of the GNU General Public License and
> +a copy of the GCC Runtime Library Exception along with this program;
> +see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
> +<http://www.gnu.org/licenses/>.  */
> +
> +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte)
> +
> +#undef DEF_ENUM
> +#define DEF_ENUM EnumValue
> +
> +#undef DEF_ALG
> +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg)
> +
> +#include "stringop.def"
> +
> +#undef DEF_ENUM
> +#undef DEF_ALG
> Index: config/i386/i386.c
> ===================================================================
> --- config/i386/i386.c	(revision 201458)
> +++ config/i386/i386.c	(working copy)
> @@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost =
>  };
>  
>  /* Processor costs (relative to an add) */
> -static const
> +static
>  struct processor_costs i386_cost = {	/* 386 specific costs */
>    COSTS_N_INSNS (1),			/* cost of an add instruction */
>    COSTS_N_INSNS (1),			/* cost of a lea instruction */
> @@ -226,7 +226,7 @@ struct processor_costs i386_cost = {	/*
>    1,					/* cond_not_taken_branch_cost.  */
>  };
>  
> -static const
> +static
>  struct processor_costs i486_cost = {	/* 486 specific costs */
>    COSTS_N_INSNS (1),			/* cost of an add instruction */
>    COSTS_N_INSNS (1),			/* cost of a lea instruction */
> @@ -298,7 +298,7 @@ struct processor_costs i486_cost = {	/*
>    1,					/* cond_not_taken_branch_cost.  */
>  };
>  
> -static const
> +static
>  struct processor_costs pentium_cost = {
>    COSTS_N_INSNS (1),			/* cost of an add instruction */
>    COSTS_N_INSNS (1),			/* cost of a lea instruction */
> @@ -368,7 +368,7 @@ struct processor_costs pentium_cost = {
>    1,					/* cond_not_taken_branch_cost.  */
>  };
>  
> -static const
> +static
>  struct processor_costs pentiumpro_cost = {
>    COSTS_N_INSNS (1),			/* cost of an add instruction */
>    COSTS_N_INSNS (1),			/* cost of a lea instruction */
> @@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost =
>    1,					/* cond_not_taken_branch_cost.  */
>  };
>  
> -static const
> +static
>  struct processor_costs geode_cost = {
>    COSTS_N_INSNS (1),			/* cost of an add instruction */
>    COSTS_N_INSNS (1),			/* cost of a lea instruction */
> @@ -518,7 +518,7 @@ struct processor_costs geode_cost = {
>    1,					/* cond_not_taken_branch_cost.  */
>  };
>  
> -static const
> +static
>  struct processor_costs k6_cost = {
>    COSTS_N_INSNS (1),			/* cost of an add instruction */
>    COSTS_N_INSNS (2),			/* cost of a lea instruction */
> @@ -591,7 +591,7 @@ struct processor_costs k6_cost = {
>    1,					/* cond_not_taken_branch_cost.  */
>  };
>  
> -static const
> +static
>  struct processor_costs athlon_cost = {
>    COSTS_N_INSNS (1),			/* cost of an add instruction */
>    COSTS_N_INSNS (2),			/* cost of a lea instruction */
> @@ -664,7 +664,7 @@ struct processor_costs athlon_cost = {
>    1,					/* cond_not_taken_branch_cost.  */
>  };
>  
> -static const
> +static
>  struct processor_costs k8_cost = {
>    COSTS_N_INSNS (1),			/* cost of an add instruction */
>    COSTS_N_INSNS (2),			/* cost of a lea instruction */
> @@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = {
>    1,					/* cond_not_taken_branch_cost.  */
>  };
>  
> -static const
> +static
>  struct processor_costs pentium4_cost = {
>    COSTS_N_INSNS (1),			/* cost of an add instruction */
>    COSTS_N_INSNS (3),			/* cost of a lea instruction */
> @@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = {
>    1,					/* cond_not_taken_branch_cost.  */
>  };
>  
> -static const
> +static
>  struct processor_costs nocona_cost = {
>    COSTS_N_INSNS (1),			/* cost of an add instruction */
>    COSTS_N_INSNS (1),			/* cost of a lea instruction */
> @@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = {
>    1,					/* cond_not_taken_branch_cost.  */
>  };
>  
> -static const
> +static
>  struct processor_costs atom_cost = {
>    COSTS_N_INSNS (1),			/* cost of an add instruction */
>    COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
> @@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = {
>  };
>  
>  /* Generic64 should produce code tuned for Nocona and K8.  */
> -static const
> +static
>  struct processor_costs generic64_cost = {
>    COSTS_N_INSNS (1),			/* cost of an add instruction */
>    /* On all chips taken into consideration lea is 2 cycles and more.  With
> @@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost =
>  };
>  
>  /* core_cost should produce code tuned for Core familly of CPUs.  */
> -static const
> +static
>  struct processor_costs core_cost = {
>    COSTS_N_INSNS (1),			/* cost of an add instruction */
>    /* On all chips taken into consideration lea is 2 cycles and more.  With
> @@ -1717,7 +1717,7 @@ struct processor_costs core_cost = {
>  
>  /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
>     Athlon and K8.  */
> -static const
> +static
>  struct processor_costs generic32_cost = {
>    COSTS_N_INSNS (1),			/* cost of an add instruction */
>    COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
> @@ -2900,6 +2900,150 @@ ix86_debug_options (void)
>  
>    return;
>  }
> +
> +static const char *stringop_alg_names[] = {
> +#define DEF_ENUM
> +#define DEF_ALG(alg, name) #name,
> +#include "stringop.def"
> +#undef DEF_ENUM
> +#undef DEF_ALG
> +};
> +
> +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
> +   The string is of the following form (or comma separated list of it):
> +
> +     strategy_alg:max_size:[align|noalign]
> +
> +   where the full size range for the strategy is either [0, max_size] or
> +   [min_size, max_size], in which min_size is the max_size + 1 of the
> +   preceding range.  The last size range must have max_size == -1.
> +
> +   Examples:
> +
> +    1.
> +       -mmemcpy-strategy=libcall:-1:noalign
> +
> +      this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
> +
> +
> +   2.
> +      -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
> +
> +      This is to tell the compiler to use the following strategy for memset
> +      1) when the expected size is between [1, 16], use rep_8byte strategy;
> +      2) when the size is between [17, 2048], use vector_loop;
> +      3) when the size is > 2048, use libcall.
> +
> +*/
> +
> +struct stringop_size_range
> +{
> +  int min;
> +  int max;
> +  stringop_alg alg;
> +  bool noalign;
> +};
> +
> +static void
> +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
> +{
> +  const struct stringop_algs *default_algs;
> +  stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
> +  char *curr_range_str, *next_range_str;
> +  int i = 0, n = 0;
> +
> +  if (is_memset)
> +    default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
> +  else
> +    default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
> +
> +  curr_range_str = strategy_str;
> +
> +  do {
> +
> +    int mins, maxs;
> +    stringop_alg alg;
> +    char alg_name[128];
> +    char align[16];
> +
> +    next_range_str = strchr (curr_range_str, ',');
> +    if (next_range_str)
> +      *next_range_str++ = '\0';
> +
> +    if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align))
> +      {
> +        warning (0, "Wrong arg %s to option %s", curr_range_str,
> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
> +        return;
> +      }
> +
> +    if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1))
> +      {
> +        warning (0, "Size ranges of option %s should be increasing",
> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
> +        return;
> +      }
> +
> +    for (i = 0; i < last_alg; i++)
> +      {
> +        if (!strcmp (alg_name, stringop_alg_names[i]))
> +	  {
> +	    alg = (stringop_alg) i;
> +	    break;
> +          }
> +      }
> +
> +    if (i == last_alg)
> +      {
> +        warning (0, "Wrong stringop strategy name %s specified for option %s",
> +	         alg_name,
> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
> +	return;
> +      }
> +
> +    input_ranges[n].min = mins;
> +    input_ranges[n].max = maxs;
> +    input_ranges[n].alg = alg;
> +    if (!strcmp (align, "align"))
> +      input_ranges[n].noalign = false;
> +    else if (!strcmp (align, "noalign"))
> +      input_ranges[n].noalign = true;
> +    else
> +      {
> +        warning (0, "Unknown alignment %s specified for option %s",
> +                 align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
> +        return;
> +      }
> +    n++;
> +    curr_range_str = next_range_str;
> +  } while (curr_range_str);
> +
> +  if (input_ranges[n - 1].max != -1)
> +    {
> +      warning (0, "The max value for the last size range should be -1"
> +               " for option %s",
> +               is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
> +      return;
> +    }
> +
> +  if (n > MAX_STRINGOP_ALGS)
> +    {
> +      warning (0, "Too many size ranges specified in option %s",
> +               is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
> +      return;
> +    }
> +
> +  /* Now override the default algs array  */
> +  for (i = 0; i < n; i++)
> +    {
> +      *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
> +      *const_cast<stringop_alg *>(&default_algs->size[i].alg)
> +          = input_ranges[i].alg;
> +      *const_cast<int *>(&default_algs->size[i].noalign)
> +          = input_ranges[i].noalign;
> +    }
> +}
> +
>  \f
>  /* Override various settings based on options.  If MAIN_ARGS_P, the
>     options are from the command line, otherwise they are from
> @@ -4021,6 +4165,21 @@ ix86_option_override_internal (bool main
>    /* Handle stack protector */
>    if (!global_options_set.x_ix86_stack_protector_guard)
>      ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
> +
> +  /* Handle -mmemcpy-strategy= and -mmemset-strategy=  */
> +  if (ix86_tune_memcpy_strategy)
> +    {
> +      char *str = xstrdup (ix86_tune_memcpy_strategy);
> +      ix86_parse_stringop_strategy_string (str, false);
> +      free (str);
> +    }
> +
> +  if (ix86_tune_memset_strategy)
> +    {
> +      char *str = xstrdup (ix86_tune_memset_strategy);
> +      ix86_parse_stringop_strategy_string (str, true);
> +      free (str);
> +    }
>  }
>  
>  /* Implement the TARGET_OPTION_OVERRIDE hook.  */
> @@ -22903,6 +23062,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
>      {
>      case libcall:
>      case no_stringop:
> +    case last_alg:
>        gcc_unreachable ();
>      case loop_1_byte:
>        need_zero_guard = true;
> @@ -23093,6 +23253,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
>      {
>      case libcall:
>      case no_stringop:
> +    case last_alg:
>        gcc_unreachable ();
>      case loop_1_byte:
>      case loop:
> @@ -23304,6 +23465,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
>      {
>      case libcall:
>      case no_stringop:
> +    case last_alg:
>        gcc_unreachable ();
>      case loop:
>        need_zero_guard = true;
> @@ -23481,6 +23643,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
>      {
>      case libcall:
>      case no_stringop:
> +    case last_alg:
>        gcc_unreachable ();
>      case loop_1_byte:
>      case loop:
> Index: config/i386/i386-opts.h
> ===================================================================
> --- config/i386/i386-opts.h	(revision 201458)
> +++ config/i386/i386-opts.h	(working copy)
> @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI
>  /* Algorithm to expand string function with.  */
>  enum stringop_alg
>  {
> -   no_stringop,
> -   libcall,
> -   rep_prefix_1_byte,
> -   rep_prefix_4_byte,
> -   rep_prefix_8_byte,
> -   loop_1_byte,
> -   loop,
> -   unrolled_loop,
> -   vector_loop
> +#undef DEF_ENUM
> +#define DEF_ENUM
> +
> +#undef DEF_ALG
> +#define DEF_ALG(alg, name) alg, 
> +
> +#include "stringop.def"
> +last_alg
> +
> +#undef DEF_ENUM
> +#undef DEF_ALG
>  };
>  
>  /* Available call abi.  */
> Index: doc/invoke.texi
> ===================================================================
> --- doc/invoke.texi	(revision 201458)
> +++ doc/invoke.texi	(working copy)
> @@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}.
>  -mbmi2 -mrtm -mlwp -mthreads @gol
>  -mno-align-stringops  -minline-all-stringops @gol
>  -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
> +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy} 
>  -mpush-args  -maccumulate-outgoing-args  -m128bit-long-double @gol
>  -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol
>  -mregparm=@var{num}  -msseregparm @gol
> @@ -14598,6 +14599,24 @@ Expand into an inline loop.
>  Always use a library call.
>  @end table
>  
> +@item -mmemcpy-strategy=@var{strategy}
> +@opindex mmemcpy-strategy=@var{strategy}
> +Override the internal decision heuristic to decide if @code{__builtin_memcpy}
> +should be inlined and what inline algorithm to use when the expected size
> +of the copy operation is known. @var{strategy} 
> +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets. 
> +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies
> +the max byte size with which inline algorithm @var{alg} is allowed. For the last
> +triplet, the @var{max_size} must be @code{-1}. The @var{max_size} of the triplets
> +in the list must be specified in increasing order. The minimal byte size for 
> +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the 
> +preceding range.
> +
> +@item -mmemset-strategy=@var{strategy}
> +@opindex mmemset-strategy=@var{strategy}
> +The option is similar to @option{-mmemcpy-strategy=} except that it is to control
> +@code{__builtin_memset} expansion.
> +
>  @item -momit-leaf-frame-pointer
>  @opindex momit-leaf-frame-pointer
>  Don't keep the frame pointer in a register for leaf functions.  This
> Index: testsuite/gcc.target/i386/memcpy-strategy-1.c
> ===================================================================
> --- testsuite/gcc.target/i386/memcpy-strategy-1.c	(revision 0)
> +++ testsuite/gcc.target/i386/memcpy-strategy-1.c	(revision 0)
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */
> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
> +
> +char a[2048];
> +char b[2048];
> +void t (void)
> +{
> +  __builtin_memcpy (a, b, 2048);
> +}
> +
> Index: testsuite/gcc.target/i386/memcpy-strategy-2.c
> ===================================================================
> --- testsuite/gcc.target/i386/memcpy-strategy-2.c	(revision 0)
> +++ testsuite/gcc.target/i386/memcpy-strategy-2.c	(revision 0)
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */
> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
> +
> +char a[2048];
> +char b[2048];
> +void t (void)
> +{
> +  __builtin_memcpy (a, b, 2048);
> +}
> +
> Index: testsuite/gcc.target/i386/memset-strategy-1.c
> ===================================================================
> --- testsuite/gcc.target/i386/memset-strategy-1.c	(revision 0)
> +++ testsuite/gcc.target/i386/memset-strategy-1.c	(revision 0)
> @@ -0,0 +1,10 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */
> +/* { dg-final { scan-assembler-times "memset" 2  } } */
> +
> +char a[2048];
> +void t (void)
> +{
> +  __builtin_memset (a, 1, 2048);
> +}
> +
> Index: testsuite/gcc.target/i386/memcpy-strategy-3.c
> ===================================================================
> --- testsuite/gcc.target/i386/memcpy-strategy-3.c	(revision 0)
> +++ testsuite/gcc.target/i386/memcpy-strategy-3.c	(revision 0)
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */
> +/* { dg-final { scan-assembler-times "memcpy" 2  } } */
> +
> +char a[2048];
> +char b[2048];
> +void t (void)
> +{
> +  __builtin_memcpy (a, b, 2048);
> +}
> +

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: New parameters to control stringop expansion libcall strategy
  2013-08-05 10:57   ` Michael V. Zolotukhin
@ 2013-08-05 16:44     ` Xinliang David Li
  2013-08-06  8:46       ` Michael Zolotukhin
  0 siblings, 1 reply; 23+ messages in thread
From: Xinliang David Li @ 2013-08-05 16:44 UTC (permalink / raw)
  To: Michael V. Zolotukhin; +Cc: GCC Patches, Jan Hubicka, Teresa Johnson

[-- Attachment #1: Type: text/plain, Size: 24045 bytes --]

thanks. Updated patch attached.

David

On Mon, Aug 5, 2013 at 3:57 AM, Michael V. Zolotukhin
<michael.v.zolotukhin@gmail.com> wrote:
> Hi,
> This is a really convenient option, thanks for working on it.
> I can't approve it as I'm not a maintainer, but it looks ok to me,
> except for a small nitpick: AFAIR, comments should end with
> dot-space-space.
>
> Michael
>
> On 04 Aug 20:01, Xinliang David Li wrote:
>> The attached is a new patch implementing the stringop inline strategy
>> control using two new -m options:
>>
>> -mmemcpy-strategy=
>> -mmemset-strategy=
>>
>> See changes in doc/invoke.texi for description of the new options. Example:
>>   -mmemcpy-strategy=rep_8byte:64:noalign,unrolled_loop:2048:noalign,libcall:-1:noalign
>>
>> tells compiler to inline memcpy using rep_8byte when the size is no
>> larger than 64 byte, using unrolled_loop when size is no larger than
>> 2048, and for size > 2048, using library call. In all cases,
>> destination alignment adjustment is not done.
>>
>> Tested on x86-64/linux. Ok for trunk?
>>
>> thanks,
>>
>> David
>>
>> 2013-08-02  Xinliang David Li  <davidxl@google.com>
>>
>>         * config/i386/stringop.def: New file.
>>         * config/i386/stringop.opt: New file.
>>         * config/i386/i386-opts.h: Include stringop.def.
>>         * config/i386/i386.opt: Include stringop.opt.
>>         * config/i386/i386.c (ix86_option_override_internal):
>>         Override default size based stringop inline strategies
>>         with options.
>>         * config/i386/i386.c (ix86_parse_stringop_strategy_string):
>>         New function.
>>
>> 2013-08-04  Xinliang David Li  <davidxl@google.com>
>>
>>         * testsuite/gcc.target/i386/memcpy-strategy-1.c: New test.
>>         * testsuite/gcc.target/i386/memcpy-strategy-2.c: Ditto.
>>         * testsuite/gcc.target/i386/memset-strategy-1.c: Ditto.
>>         * testsuite/gcc.target/i386/memcpy-strategy-3.c: Ditto.
>>
>>
>>
>>
>> On Fri, Aug 2, 2013 at 9:21 PM, Xinliang David Li <davidxl@google.com> wrote:
>> > On x86_64, when the expected size of memcpy/memset is known (e.g, with
>> > FDO), libcall strategy is used when the size is > 8192. This value is
>> > hard coded, which makes it hard to do performance tuning. This patch
>> > adds two new parameters to do that. Potential usage includes
>> > per-application libcall strategy min-size tuning based on summary data
>> > with FDO (e.g, instruction workset size).
>> >
>> > Bootstrap and tested on x86_64/linux. Ok for trunk?
>> >
>> > thanks,
>> >
>> > David
>> >
>> >
>> > 2013-08-02  Xinliang David Li  <davidxl@google.com>
>> >
>> >         * params.def: New parameters.
>> >         * config/i386/i386.c (ix86_option_override_internal):
>> >         Override default libcall size limit with parameters.
>
>> Index: config/i386/stringop.def
>> ===================================================================
>> --- config/i386/stringop.def  (revision 0)
>> +++ config/i386/stringop.def  (revision 0)
>> @@ -0,0 +1,42 @@
>> +/* Definitions for option handling for IA-32.
>> +   Copyright (C) 2013 Free Software Foundation, Inc.
>> +
>> +This file is part of GCC.
>> +
>> +GCC is free software; you can redistribute it and/or modify
>> +it under the terms of the GNU General Public License as published by
>> +the Free Software Foundation; either version 3, or (at your option)
>> +any later version.
>> +
>> +GCC is distributed in the hope that it will be useful,
>> +but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> +GNU General Public License for more details.
>> +
>> +Under Section 7 of GPL version 3, you are granted additional
>> +permissions described in the GCC Runtime Library Exception, version
>> +3.1, as published by the Free Software Foundation.
>> +
>> +You should have received a copy of the GNU General Public License and
>> +a copy of the GCC Runtime Library Exception along with this program;
>> +see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
>> +<http://www.gnu.org/licenses/>.  */
>> +
>> +DEF_ENUM
>> +DEF_ALG (no_stringop, no_stringop)
>> +DEF_ENUM
>> +DEF_ALG (libcall, libcall)
>> +DEF_ENUM
>> +DEF_ALG (rep_prefix_1_byte, rep_byte)
>> +DEF_ENUM
>> +DEF_ALG (rep_prefix_4_byte, rep_4byte)
>> +DEF_ENUM
>> +DEF_ALG (rep_prefix_8_byte, rep_8byte)
>> +DEF_ENUM
>> +DEF_ALG (loop_1_byte, byte_loop)
>> +DEF_ENUM
>> +DEF_ALG (loop, loop)
>> +DEF_ENUM
>> +DEF_ALG (unrolled_loop, unrolled_loop)
>> +DEF_ENUM
>> +DEF_ALG (vector_loop, vector_loop)
>> Index: config/i386/i386.opt
>> ===================================================================
>> --- config/i386/i386.opt      (revision 201458)
>> +++ config/i386/i386.opt      (working copy)
>> @@ -316,6 +316,14 @@ mstack-arg-probe
>>  Target Report Mask(STACK_PROBE) Save
>>  Enable stack probing
>>
>> +mmemcpy-strategy=
>> +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy)
>> +Specify memcpy expansion strategy when expected size is known
>> +
>> +mmemset-strategy=
>> +Target RejectNegative Joined Var(ix86_tune_memset_strategy)
>> +Specify memset expansion strategy when expected size is known
>> +
>>  mstringop-strategy=
>>  Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop)
>>  Chose strategy to generate stringop using
>> Index: config/i386/stringop.opt
>> ===================================================================
>> --- config/i386/stringop.opt  (revision 0)
>> +++ config/i386/stringop.opt  (revision 0)
>> @@ -0,0 +1,36 @@
>> +/* Definitions for option handling for IA-32.
>> +   Copyright (C) 2013 Free Software Foundation, Inc.
>> +
>> +This file is part of GCC.
>> +
>> +GCC is free software; you can redistribute it and/or modify
>> +it under the terms of the GNU General Public License as published by
>> +the Free Software Foundation; either version 3, or (at your option)
>> +any later version.
>> +
>> +GCC is distributed in the hope that it will be useful,
>> +but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> +GNU General Public License for more details.
>> +
>> +Under Section 7 of GPL version 3, you are granted additional
>> +permissions described in the GCC Runtime Library Exception, version
>> +3.1, as published by the Free Software Foundation.
>> +
>> +You should have received a copy of the GNU General Public License and
>> +a copy of the GCC Runtime Library Exception along with this program;
>> +see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
>> +<http://www.gnu.org/licenses/>.  */
>> +
>> +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte)
>> +
>> +#undef DEF_ENUM
>> +#define DEF_ENUM EnumValue
>> +
>> +#undef DEF_ALG
>> +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg)
>> +
>> +#include "stringop.def"
>> +
>> +#undef DEF_ENUM
>> +#undef DEF_ALG
>> Index: config/i386/i386.c
>> ===================================================================
>> --- config/i386/i386.c        (revision 201458)
>> +++ config/i386/i386.c        (working copy)
>> @@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost =
>>  };
>>
>>  /* Processor costs (relative to an add) */
>> -static const
>> +static
>>  struct processor_costs i386_cost = { /* 386 specific costs */
>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>> @@ -226,7 +226,7 @@ struct processor_costs i386_cost = {      /*
>>    1,                                 /* cond_not_taken_branch_cost.  */
>>  };
>>
>> -static const
>> +static
>>  struct processor_costs i486_cost = { /* 486 specific costs */
>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>> @@ -298,7 +298,7 @@ struct processor_costs i486_cost = {      /*
>>    1,                                 /* cond_not_taken_branch_cost.  */
>>  };
>>
>> -static const
>> +static
>>  struct processor_costs pentium_cost = {
>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>> @@ -368,7 +368,7 @@ struct processor_costs pentium_cost = {
>>    1,                                 /* cond_not_taken_branch_cost.  */
>>  };
>>
>> -static const
>> +static
>>  struct processor_costs pentiumpro_cost = {
>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>> @@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost =
>>    1,                                 /* cond_not_taken_branch_cost.  */
>>  };
>>
>> -static const
>> +static
>>  struct processor_costs geode_cost = {
>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>> @@ -518,7 +518,7 @@ struct processor_costs geode_cost = {
>>    1,                                 /* cond_not_taken_branch_cost.  */
>>  };
>>
>> -static const
>> +static
>>  struct processor_costs k6_cost = {
>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>    COSTS_N_INSNS (2),                 /* cost of a lea instruction */
>> @@ -591,7 +591,7 @@ struct processor_costs k6_cost = {
>>    1,                                 /* cond_not_taken_branch_cost.  */
>>  };
>>
>> -static const
>> +static
>>  struct processor_costs athlon_cost = {
>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>    COSTS_N_INSNS (2),                 /* cost of a lea instruction */
>> @@ -664,7 +664,7 @@ struct processor_costs athlon_cost = {
>>    1,                                 /* cond_not_taken_branch_cost.  */
>>  };
>>
>> -static const
>> +static
>>  struct processor_costs k8_cost = {
>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>    COSTS_N_INSNS (2),                 /* cost of a lea instruction */
>> @@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = {
>>    1,                                 /* cond_not_taken_branch_cost.  */
>>  };
>>
>> -static const
>> +static
>>  struct processor_costs pentium4_cost = {
>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>    COSTS_N_INSNS (3),                 /* cost of a lea instruction */
>> @@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = {
>>    1,                                 /* cond_not_taken_branch_cost.  */
>>  };
>>
>> -static const
>> +static
>>  struct processor_costs nocona_cost = {
>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>> @@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = {
>>    1,                                 /* cond_not_taken_branch_cost.  */
>>  };
>>
>> -static const
>> +static
>>  struct processor_costs atom_cost = {
>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>    COSTS_N_INSNS (1) + 1,             /* cost of a lea instruction */
>> @@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = {
>>  };
>>
>>  /* Generic64 should produce code tuned for Nocona and K8.  */
>> -static const
>> +static
>>  struct processor_costs generic64_cost = {
>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>    /* On all chips taken into consideration lea is 2 cycles and more.  With
>> @@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost =
>>  };
>>
>>  /* core_cost should produce code tuned for Core familly of CPUs.  */
>> -static const
>> +static
>>  struct processor_costs core_cost = {
>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>    /* On all chips taken into consideration lea is 2 cycles and more.  With
>> @@ -1717,7 +1717,7 @@ struct processor_costs core_cost = {
>>
>>  /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
>>     Athlon and K8.  */
>> -static const
>> +static
>>  struct processor_costs generic32_cost = {
>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>    COSTS_N_INSNS (1) + 1,             /* cost of a lea instruction */
>> @@ -2900,6 +2900,150 @@ ix86_debug_options (void)
>>
>>    return;
>>  }
>> +
>> +static const char *stringop_alg_names[] = {
>> +#define DEF_ENUM
>> +#define DEF_ALG(alg, name) #name,
>> +#include "stringop.def"
>> +#undef DEF_ENUM
>> +#undef DEF_ALG
>> +};
>> +
>> +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
>> +   The string is of the following form (or comma separated list of it):
>> +
>> +     strategy_alg:max_size:[align|noalign]
>> +
>> +   where the full size range for the strategy is either [0, max_size] or
>> +   [min_size, max_size], in which min_size is the max_size + 1 of the
>> +   preceding range.  The last size range must have max_size == -1.
>> +
>> +   Examples:
>> +
>> +    1.
>> +       -mmemcpy-strategy=libcall:-1:noalign
>> +
>> +      this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
>> +
>> +
>> +   2.
>> +      -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
>> +
>> +      This is to tell the compiler to use the following strategy for memset
>> +      1) when the expected size is between [1, 16], use rep_8byte strategy;
>> +      2) when the size is between [17, 2048], use vector_loop;
>> +      3) when the size is > 2048, use libcall.
>> +
>> +*/
>> +
>> +struct stringop_size_range
>> +{
>> +  int min;
>> +  int max;
>> +  stringop_alg alg;
>> +  bool noalign;
>> +};
>> +
>> +static void
>> +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
>> +{
>> +  const struct stringop_algs *default_algs;
>> +  stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
>> +  char *curr_range_str, *next_range_str;
>> +  int i = 0, n = 0;
>> +
>> +  if (is_memset)
>> +    default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
>> +  else
>> +    default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
>> +
>> +  curr_range_str = strategy_str;
>> +
>> +  do {
>> +
>> +    int mins, maxs;
>> +    stringop_alg alg;
>> +    char alg_name[128];
>> +    char align[16];
>> +
>> +    next_range_str = strchr (curr_range_str, ',');
>> +    if (next_range_str)
>> +      *next_range_str++ = '\0';
>> +
>> +    if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align))
>> +      {
>> +        warning (0, "Wrong arg %s to option %s", curr_range_str,
>> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>> +        return;
>> +      }
>> +
>> +    if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1))
>> +      {
>> +        warning (0, "Size ranges of option %s should be increasing",
>> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>> +        return;
>> +      }
>> +
>> +    for (i = 0; i < last_alg; i++)
>> +      {
>> +        if (!strcmp (alg_name, stringop_alg_names[i]))
>> +       {
>> +         alg = (stringop_alg) i;
>> +         break;
>> +          }
>> +      }
>> +
>> +    if (i == last_alg)
>> +      {
>> +        warning (0, "Wrong stringop strategy name %s specified for option %s",
>> +              alg_name,
>> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>> +     return;
>> +      }
>> +
>> +    input_ranges[n].min = mins;
>> +    input_ranges[n].max = maxs;
>> +    input_ranges[n].alg = alg;
>> +    if (!strcmp (align, "align"))
>> +      input_ranges[n].noalign = false;
>> +    else if (!strcmp (align, "noalign"))
>> +      input_ranges[n].noalign = true;
>> +    else
>> +      {
>> +        warning (0, "Unknown alignment %s specified for option %s",
>> +                 align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>> +        return;
>> +      }
>> +    n++;
>> +    curr_range_str = next_range_str;
>> +  } while (curr_range_str);
>> +
>> +  if (input_ranges[n - 1].max != -1)
>> +    {
>> +      warning (0, "The max value for the last size range should be -1"
>> +               " for option %s",
>> +               is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>> +      return;
>> +    }
>> +
>> +  if (n > MAX_STRINGOP_ALGS)
>> +    {
>> +      warning (0, "Too many size ranges specified in option %s",
>> +               is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>> +      return;
>> +    }
>> +
>> +  /* Now override the default algs array  */
>> +  for (i = 0; i < n; i++)
>> +    {
>> +      *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
>> +      *const_cast<stringop_alg *>(&default_algs->size[i].alg)
>> +          = input_ranges[i].alg;
>> +      *const_cast<int *>(&default_algs->size[i].noalign)
>> +          = input_ranges[i].noalign;
>> +    }
>> +}
>> +
>>
>>  /* Override various settings based on options.  If MAIN_ARGS_P, the
>>     options are from the command line, otherwise they are from
>> @@ -4021,6 +4165,21 @@ ix86_option_override_internal (bool main
>>    /* Handle stack protector */
>>    if (!global_options_set.x_ix86_stack_protector_guard)
>>      ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
>> +
>> +  /* Handle -mmemcpy-strategy= and -mmemset-strategy=  */
>> +  if (ix86_tune_memcpy_strategy)
>> +    {
>> +      char *str = xstrdup (ix86_tune_memcpy_strategy);
>> +      ix86_parse_stringop_strategy_string (str, false);
>> +      free (str);
>> +    }
>> +
>> +  if (ix86_tune_memset_strategy)
>> +    {
>> +      char *str = xstrdup (ix86_tune_memset_strategy);
>> +      ix86_parse_stringop_strategy_string (str, true);
>> +      free (str);
>> +    }
>>  }
>>
>>  /* Implement the TARGET_OPTION_OVERRIDE hook.  */
>> @@ -22903,6 +23062,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
>>      {
>>      case libcall:
>>      case no_stringop:
>> +    case last_alg:
>>        gcc_unreachable ();
>>      case loop_1_byte:
>>        need_zero_guard = true;
>> @@ -23093,6 +23253,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
>>      {
>>      case libcall:
>>      case no_stringop:
>> +    case last_alg:
>>        gcc_unreachable ();
>>      case loop_1_byte:
>>      case loop:
>> @@ -23304,6 +23465,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
>>      {
>>      case libcall:
>>      case no_stringop:
>> +    case last_alg:
>>        gcc_unreachable ();
>>      case loop:
>>        need_zero_guard = true;
>> @@ -23481,6 +23643,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
>>      {
>>      case libcall:
>>      case no_stringop:
>> +    case last_alg:
>>        gcc_unreachable ();
>>      case loop_1_byte:
>>      case loop:
>> Index: config/i386/i386-opts.h
>> ===================================================================
>> --- config/i386/i386-opts.h   (revision 201458)
>> +++ config/i386/i386-opts.h   (working copy)
>> @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI
>>  /* Algorithm to expand string function with.  */
>>  enum stringop_alg
>>  {
>> -   no_stringop,
>> -   libcall,
>> -   rep_prefix_1_byte,
>> -   rep_prefix_4_byte,
>> -   rep_prefix_8_byte,
>> -   loop_1_byte,
>> -   loop,
>> -   unrolled_loop,
>> -   vector_loop
>> +#undef DEF_ENUM
>> +#define DEF_ENUM
>> +
>> +#undef DEF_ALG
>> +#define DEF_ALG(alg, name) alg,
>> +
>> +#include "stringop.def"
>> +last_alg
>> +
>> +#undef DEF_ENUM
>> +#undef DEF_ALG
>>  };
>>
>>  /* Available call abi.  */
>> Index: doc/invoke.texi
>> ===================================================================
>> --- doc/invoke.texi   (revision 201458)
>> +++ doc/invoke.texi   (working copy)
>> @@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}.
>>  -mbmi2 -mrtm -mlwp -mthreads @gol
>>  -mno-align-stringops  -minline-all-stringops @gol
>>  -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
>> +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy}
>>  -mpush-args  -maccumulate-outgoing-args  -m128bit-long-double @gol
>>  -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol
>>  -mregparm=@var{num}  -msseregparm @gol
>> @@ -14598,6 +14599,24 @@ Expand into an inline loop.
>>  Always use a library call.
>>  @end table
>>
>> +@item -mmemcpy-strategy=@var{strategy}
>> +@opindex mmemcpy-strategy=@var{strategy}
>> +Override the internal decision heuristic to decide if @code{__builtin_memcpy}
>> +should be inlined and what inline algorithm to use when the expected size
>> +of the copy operation is known. @var{strategy}
>> +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets.
>> +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies
>> +the max byte size with which inline algorithm @var{alg} is allowed. For the last
>> +triplet, the @var{max_size} must be @code{-1}. The @var{max_size} of the triplets
>> +in the list must be specified in increasing order. The minimal byte size for
>> +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the
>> +preceding range.
>> +
>> +@item -mmemset-strategy=@var{strategy}
>> +@opindex mmemset-strategy=@var{strategy}
>> +The option is similar to @option{-mmemcpy-strategy=} except that it is to control
>> +@code{__builtin_memset} expansion.
>> +
>>  @item -momit-leaf-frame-pointer
>>  @opindex momit-leaf-frame-pointer
>>  Don't keep the frame pointer in a register for leaf functions.  This
>> Index: testsuite/gcc.target/i386/memcpy-strategy-1.c
>> ===================================================================
>> --- testsuite/gcc.target/i386/memcpy-strategy-1.c     (revision 0)
>> +++ testsuite/gcc.target/i386/memcpy-strategy-1.c     (revision 0)
>> @@ -0,0 +1,12 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */
>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
>> +
>> +char a[2048];
>> +char b[2048];
>> +void t (void)
>> +{
>> +  __builtin_memcpy (a, b, 2048);
>> +}
>> +
>> Index: testsuite/gcc.target/i386/memcpy-strategy-2.c
>> ===================================================================
>> --- testsuite/gcc.target/i386/memcpy-strategy-2.c     (revision 0)
>> +++ testsuite/gcc.target/i386/memcpy-strategy-2.c     (revision 0)
>> @@ -0,0 +1,12 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */
>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
>> +
>> +char a[2048];
>> +char b[2048];
>> +void t (void)
>> +{
>> +  __builtin_memcpy (a, b, 2048);
>> +}
>> +
>> Index: testsuite/gcc.target/i386/memset-strategy-1.c
>> ===================================================================
>> --- testsuite/gcc.target/i386/memset-strategy-1.c     (revision 0)
>> +++ testsuite/gcc.target/i386/memset-strategy-1.c     (revision 0)
>> @@ -0,0 +1,10 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */
>> +/* { dg-final { scan-assembler-times "memset" 2  } } */
>> +
>> +char a[2048];
>> +void t (void)
>> +{
>> +  __builtin_memset (a, 1, 2048);
>> +}
>> +
>> Index: testsuite/gcc.target/i386/memcpy-strategy-3.c
>> ===================================================================
>> --- testsuite/gcc.target/i386/memcpy-strategy-3.c     (revision 0)
>> +++ testsuite/gcc.target/i386/memcpy-strategy-3.c     (revision 0)
>> @@ -0,0 +1,11 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */
>> +/* { dg-final { scan-assembler-times "memcpy" 2  } } */
>> +
>> +char a[2048];
>> +char b[2048];
>> +void t (void)
>> +{
>> +  __builtin_memcpy (a, b, 2048);
>> +}
>> +
>

[-- Attachment #2: stringop_inl_option.p.txt --]
[-- Type: text/plain, Size: 18833 bytes --]

Index: doc/invoke.texi
===================================================================
--- doc/invoke.texi	(revision 201458)
+++ doc/invoke.texi	(working copy)
@@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}.
 -mbmi2 -mrtm -mlwp -mthreads @gol
 -mno-align-stringops  -minline-all-stringops @gol
 -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
+-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy} @gol
 -mpush-args  -maccumulate-outgoing-args  -m128bit-long-double @gol
 -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol
 -mregparm=@var{num}  -msseregparm @gol
@@ -14598,6 +14599,24 @@ Expand into an inline loop.
 Always use a library call.
 @end table
 
+@item -mmemcpy-strategy=@var{strategy}
+@opindex mmemcpy-strategy=@var{strategy}
+Override the internal heuristic that decides whether @code{__builtin_memcpy}
+is inlined, and which inline algorithm is used, when the expected size of
+the copy operation is known.  @var{strategy} is a comma-separated list of
+@var{alg}:@var{max_size}:@var{dest_align} triplets.  @var{alg} is one of the
+algorithms accepted by @option{-mstringop-strategy}; @var{max_size} is the
+largest byte size for which @var{alg} is used; and @var{dest_align} is either
+@code{align} or @code{noalign}.  The @var{max_size} values must be given in
+increasing order, except that the last triplet must use @code{-1}.  The
+minimum byte size for @var{alg} is @code{0} for the first triplet and the
+preceding triplet's @code{@var{max_size} + 1} for every later one.
+
+@item -mmemset-strategy=@var{strategy}
+@opindex mmemset-strategy=@var{strategy}
+This option is similar to @option{-mmemcpy-strategy=} except that it controls
+@code{__builtin_memset} expansion.
+
 @item -momit-leaf-frame-pointer
 @opindex momit-leaf-frame-pointer
 Don't keep the frame pointer in a register for leaf functions.  This
Index: testsuite/gcc.target/i386/memcpy-strategy-2.c
===================================================================
--- testsuite/gcc.target/i386/memcpy-strategy-2.c	(revision 0)
+++ testsuite/gcc.target/i386/memcpy-strategy-2.c	(revision 0)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */
+/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
+/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
+
+char a[2048];
+char b[2048];
+void t (void)
+{
+  __builtin_memcpy (a, b, 2048);
+}
+
Index: testsuite/gcc.target/i386/memset-strategy-1.c
===================================================================
--- testsuite/gcc.target/i386/memset-strategy-1.c	(revision 0)
+++ testsuite/gcc.target/i386/memset-strategy-1.c	(revision 0)
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */
+/* { dg-final { scan-assembler-times "memset" 2  } } */
+
+char a[2048];
+void t (void)
+{
+  __builtin_memset (a, 1, 2048);
+}
+
Index: testsuite/gcc.target/i386/memcpy-strategy-3.c
===================================================================
--- testsuite/gcc.target/i386/memcpy-strategy-3.c	(revision 0)
+++ testsuite/gcc.target/i386/memcpy-strategy-3.c	(revision 0)
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */
+/* { dg-final { scan-assembler-times "memcpy" 2  } } */
+
+char a[2048];
+char b[2048];
+void t (void)
+{
+  __builtin_memcpy (a, b, 2048);
+}
+
Index: testsuite/gcc.target/i386/memcpy-strategy-1.c
===================================================================
--- testsuite/gcc.target/i386/memcpy-strategy-1.c	(revision 0)
+++ testsuite/gcc.target/i386/memcpy-strategy-1.c	(revision 0)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */
+/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
+/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
+
+char a[2048];
+char b[2048];
+void t (void)
+{
+  __builtin_memcpy (a, b, 2048);
+}
+
Index: config/i386/stringop.def
===================================================================
--- config/i386/stringop.def	(revision 0)
+++ config/i386/stringop.def	(revision 0)
@@ -0,0 +1,42 @@
+/* Definitions for option handling for IA-32.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+DEF_ENUM
+DEF_ALG (no_stringop, no_stringop)
+DEF_ENUM
+DEF_ALG (libcall, libcall)
+DEF_ENUM
+DEF_ALG (rep_prefix_1_byte, rep_byte)
+DEF_ENUM
+DEF_ALG (rep_prefix_4_byte, rep_4byte)
+DEF_ENUM
+DEF_ALG (rep_prefix_8_byte, rep_8byte)
+DEF_ENUM
+DEF_ALG (loop_1_byte, byte_loop)
+DEF_ENUM
+DEF_ALG (loop, loop)
+DEF_ENUM
+DEF_ALG (unrolled_loop, unrolled_loop)
+DEF_ENUM
+DEF_ALG (vector_loop, vector_loop)
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 201458)
+++ config/i386/i386.c	(working copy)
@@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost =
 };
 
 /* Processor costs (relative to an add) */
-static const
+static
 struct processor_costs i386_cost = {	/* 386 specific costs */
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -226,7 +226,7 @@ struct processor_costs i386_cost = {	/*
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs i486_cost = {	/* 486 specific costs */
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -298,7 +298,7 @@ struct processor_costs i486_cost = {	/*
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs pentium_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -368,7 +368,7 @@ struct processor_costs pentium_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs pentiumpro_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost =
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs geode_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -518,7 +518,7 @@ struct processor_costs geode_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs k6_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
@@ -591,7 +591,7 @@ struct processor_costs k6_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs athlon_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
@@ -664,7 +664,7 @@ struct processor_costs athlon_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs k8_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
@@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs pentium4_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (3),			/* cost of a lea instruction */
@@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs nocona_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs atom_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
@@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = {
 };
 
 /* Generic64 should produce code tuned for Nocona and K8.  */
-static const
+static
 struct processor_costs generic64_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   /* On all chips taken into consideration lea is 2 cycles and more.  With
@@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost =
 };
 
 /* core_cost should produce code tuned for Core familly of CPUs.  */
-static const
+static
 struct processor_costs core_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   /* On all chips taken into consideration lea is 2 cycles and more.  With
@@ -1717,7 +1717,7 @@ struct processor_costs core_cost = {
 
 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
    Athlon and K8.  */
-static const
+static
 struct processor_costs generic32_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
@@ -2900,6 +2900,148 @@ ix86_debug_options (void)
 
   return;
 }
+
+static const char *stringop_alg_names[] = {
+#define DEF_ENUM
+#define DEF_ALG(alg, name) #name,
+#include "stringop.def"
+#undef DEF_ENUM
+#undef DEF_ALG
+};
+
+/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
+   The string is of the following form (or comma separated list of it):
+
+     strategy_alg:max_size:[align|noalign]
+
+   where the full size range for the strategy is either [0, max_size] or
+   [min_size, max_size], in which min_size is the max_size + 1 of the
+   preceding range.  The last size range must have max_size == -1.
+
+   Examples:
+
+    1.
+       -mmemcpy-strategy=libcall:-1:noalign
+
+      this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
+
+
+   2.
+      -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
+
+      This is to tell the compiler to use the following strategy for memset
+      1) when the expected size is between [0, 16], use rep_8byte strategy;
+      2) when the size is between [17, 2048], use vector_loop;
+      3) when the size is > 2048, use libcall.  */
+
+struct stringop_size_range
+{
+  int min;
+  int max;
+  stringop_alg alg;
+  bool noalign;
+};
+
+static void
+ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
+{
+  const struct stringop_algs *default_algs;
+  stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
+  char *curr_range_str, *next_range_str;
+  int i = 0, n = 0;
+
+  if (is_memset)
+    default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
+  else
+    default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
+
+  curr_range_str = strategy_str;
+
+  do {
+
+    int mins, maxs;
+    stringop_alg alg;
+    char alg_name[128];
+    char align[16];
+
+    next_range_str = strchr (curr_range_str, ',');
+    if (next_range_str)
+      *next_range_str++ = '\0';
+
+    if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align))
+      {
+        warning (0, "Wrong arg %s to option %s", curr_range_str,
+                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+        return;
+      }
+
+    if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1))
+      {
+        warning (0, "Size ranges of option %s should be increasing",
+                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+        return;
+      }
+
+    for (i = 0; i < last_alg; i++)
+      {
+        if (!strcmp (alg_name, stringop_alg_names[i]))
+	  {
+	    alg = (stringop_alg) i;
+	    break;
+          }
+      }
+
+    if (i == last_alg)
+      {
+        warning (0, "Wrong stringop strategy name %s specified for option %s",
+	         alg_name,
+                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+	return;
+      }
+
+    input_ranges[n].min = mins;
+    input_ranges[n].max = maxs;
+    input_ranges[n].alg = alg;
+    if (!strcmp (align, "align"))
+      input_ranges[n].noalign = false;
+    else if (!strcmp (align, "noalign"))
+      input_ranges[n].noalign = true;
+    else
+      {
+        warning (0, "Unknown alignment %s specified for option %s",
+                 align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+        return;
+      }
+    n++;
+    curr_range_str = next_range_str;
+  } while (curr_range_str);
+
+  if (input_ranges[n - 1].max != -1)
+    {
+      warning (0, "The max value for the last size range should be -1"
+               " for option %s",
+               is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+      return;
+    }
+
+  if (n > MAX_STRINGOP_ALGS)
+    {
+      warning (0, "Too many size ranges specified in option %s",
+               is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+      return;
+    }
+
+  /* Now override the default algs array  */
+  for (i = 0; i < n; i++)
+    {
+      *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
+      *const_cast<stringop_alg *>(&default_algs->size[i].alg)
+          = input_ranges[i].alg;
+      *const_cast<int *>(&default_algs->size[i].noalign)
+          = input_ranges[i].noalign;
+    }
+}
+
 \f
 /* Override various settings based on options.  If MAIN_ARGS_P, the
    options are from the command line, otherwise they are from
@@ -4021,6 +4163,21 @@ ix86_option_override_internal (bool main
   /* Handle stack protector */
   if (!global_options_set.x_ix86_stack_protector_guard)
     ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
+
+  /* Handle -mmemcpy-strategy= and -mmemset-strategy=  */
+  if (ix86_tune_memcpy_strategy)
+    {
+      char *str = xstrdup (ix86_tune_memcpy_strategy);
+      ix86_parse_stringop_strategy_string (str, false);
+      free (str);
+    }
+
+  if (ix86_tune_memset_strategy)
+    {
+      char *str = xstrdup (ix86_tune_memset_strategy);
+      ix86_parse_stringop_strategy_string (str, true);
+      free (str);
+    }
 }
 
 /* Implement the TARGET_OPTION_OVERRIDE hook.  */
@@ -22903,6 +23060,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
     {
     case libcall:
     case no_stringop:
+    case last_alg:
       gcc_unreachable ();
     case loop_1_byte:
       need_zero_guard = true;
@@ -23093,6 +23251,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
     {
     case libcall:
     case no_stringop:
+    case last_alg:
       gcc_unreachable ();
     case loop_1_byte:
     case loop:
@@ -23304,6 +23463,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
     {
     case libcall:
     case no_stringop:
+    case last_alg:
       gcc_unreachable ();
     case loop:
       need_zero_guard = true;
@@ -23481,6 +23641,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
     {
     case libcall:
     case no_stringop:
+    case last_alg:
       gcc_unreachable ();
     case loop_1_byte:
     case loop:
Index: config/i386/stringop.opt
===================================================================
--- config/i386/stringop.opt	(revision 0)
+++ config/i386/stringop.opt	(revision 0)
@@ -0,0 +1,36 @@
+/* Definitions for option handling for IA-32.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte)
+
+#undef DEF_ENUM
+#define DEF_ENUM EnumValue
+
+#undef DEF_ALG
+#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg)
+
+#include "stringop.def"
+
+#undef DEF_ENUM
+#undef DEF_ALG
Index: config/i386/i386-opts.h
===================================================================
--- config/i386/i386-opts.h	(revision 201458)
+++ config/i386/i386-opts.h	(working copy)
@@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI
 /* Algorithm to expand string function with.  */
 enum stringop_alg
 {
-   no_stringop,
-   libcall,
-   rep_prefix_1_byte,
-   rep_prefix_4_byte,
-   rep_prefix_8_byte,
-   loop_1_byte,
-   loop,
-   unrolled_loop,
-   vector_loop
+#undef DEF_ENUM
+#define DEF_ENUM
+
+#undef DEF_ALG
+#define DEF_ALG(alg, name) alg, 
+
+#include "stringop.def"
+last_alg
+
+#undef DEF_ENUM
+#undef DEF_ALG
 };
 
 /* Available call abi.  */
Index: config/i386/i386.opt
===================================================================
--- config/i386/i386.opt	(revision 201458)
+++ config/i386/i386.opt	(working copy)
@@ -316,6 +316,14 @@ mstack-arg-probe
 Target Report Mask(STACK_PROBE) Save
 Enable stack probing
 
+mmemcpy-strategy=
+Target RejectNegative Joined Var(ix86_tune_memcpy_strategy)
+Specify memcpy expansion strategy when expected size is known
+
+mmemset-strategy=
+Target RejectNegative Joined Var(ix86_tune_memset_strategy)
+Specify memset expansion strategy when expected size is known
+
 mstringop-strategy=
 Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop)
 Chose strategy to generate stringop using

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: New parameters to control stringop expansion libcall strategy
  2013-08-05 16:44     ` Xinliang David Li
@ 2013-08-06  8:46       ` Michael Zolotukhin
  2013-08-06  9:42         ` Jan Hubicka
  2013-08-06 16:42         ` Xinliang David Li
  0 siblings, 2 replies; 23+ messages in thread
From: Michael Zolotukhin @ 2013-08-06  8:46 UTC (permalink / raw)
  To: Xinliang David Li; +Cc: GCC Patches, Jan Hubicka, Teresa Johnson

There are still some formatting issues (like 8 spaces instead of a
tab, wrong indentation of the do-loop and some other places) - to reveal
some of them you could use the contrib/check_GNU_style.sh script.
But that was nitpicking again :) Actually I wanted to ask whether
you're going to use this option for some performance experiments
involving memmove/memset - if so, perhaps you could tune the existing
cost models as well? Is that possible?

Michael

On 5 August 2013 20:44, Xinliang David Li <davidxl@google.com> wrote:
> thanks. Updated patch attached.
>
> David
>
> On Mon, Aug 5, 2013 at 3:57 AM, Michael V. Zolotukhin
> <michael.v.zolotukhin@gmail.com> wrote:
>> Hi,
>> This is a really convenient option, thanks for working on it.
>> I can't approve it as I'm not a maintainer, but it looks ok to me,
>> except for a small nitpick: afair, comments should end with
>> dot-space-space.
>>
>> Michael
>>
>> On 04 Aug 20:01, Xinliang David Li wrote:
>>> The attached is a new patch implementing the stringop inline strategy
>>> control using two new -m options:
>>>
>>> -mmemcpy-strategy=
>>> -mmemset-strategy=
>>>
>>> See changes in doc/invoke.texi for description of the new options. Example:
>>>   -mmemcpy-strategy=rep_8byte:64:noalign,unrolled_loop:2048:noalign,libcall:-1:noalign
>>>
>>> tells the compiler to inline memcpy using rep_8byte when the size is no
>>> larger than 64 bytes, using unrolled_loop when the size is no larger than
>>> 2048, and for size > 2048, using a library call. In all cases,
>>> destination alignment adjustment is not done.
>>>
>>> Tested on x86-64/linux. Ok for trunk?
>>>
>>> thanks,
>>>
>>> David
>>>
>>> 2013-08-02  Xinliang David Li  <davidxl@google.com>
>>>
>>>         * config/i386/stringop.def: New file.
>>>         * config/i386/stringop.opt: New file.
>>>         * config/i386/i386-opts.h: Include stringop.def.
>>>         * config/i386/i386.opt: Include stringop.opt.
>>>         * config/i386/i386.c (ix86_option_override_internal):
>>>         Override default size based stringop inline strategies
>>>         with options.
>>>         * config/i386/i386.c (ix86_parse_stringop_strategy_string):
>>>         New function.
>>>
>>> 2013-08-04  Xinliang David Li  <davidxl@google.com>
>>>
>>>         * testsuite/gcc.target/i386/memcpy-strategy-1.c: New test.
>>>         * testsuite/gcc.target/i386/memcpy-strategy-2.c: Ditto.
>>>         * testsuite/gcc.target/i386/memset-strategy-1.c: Ditto.
>>>         * testsuite/gcc.target/i386/memcpy-strategy-3.c: Ditto.
>>>
>>>
>>>
>>>
>>> On Fri, Aug 2, 2013 at 9:21 PM, Xinliang David Li <davidxl@google.com> wrote:
>>> > On x86_64, when the expected size of memcpy/memset is known (e.g., with
>>> > FDO), libcall strategy is used when the size is > 8192. This value is
>>> > hard coded, which makes it hard to do performance tuning. This patch
>>> > adds two new parameters to do that. Potential usage includes
>>> > per-application libcall strategy min-size tuning based on summary data
>>> > with FDO (e.g, instruction workset size).
>>> >
>>> > Bootstrap and tested on x86_64/linux. Ok for trunk?
>>> >
>>> > thanks,
>>> >
>>> > David
>>> >
>>> >
>>> > 2013-08-02  Xinliang David Li  <davidxl@google.com>
>>> >
>>> >         * params.def: New parameters.
>>> >         * config/i386/i386.c (ix86_option_override_internal):
>>> >         Override default libcall size limit with parameters.
>>
>>> Index: config/i386/stringop.def
>>> ===================================================================
>>> --- config/i386/stringop.def  (revision 0)
>>> +++ config/i386/stringop.def  (revision 0)
>>> @@ -0,0 +1,42 @@
>>> +/* Definitions for option handling for IA-32.
>>> +   Copyright (C) 2013 Free Software Foundation, Inc.
>>> +
>>> +This file is part of GCC.
>>> +
>>> +GCC is free software; you can redistribute it and/or modify
>>> +it under the terms of the GNU General Public License as published by
>>> +the Free Software Foundation; either version 3, or (at your option)
>>> +any later version.
>>> +
>>> +GCC is distributed in the hope that it will be useful,
>>> +but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>> +GNU General Public License for more details.
>>> +
>>> +Under Section 7 of GPL version 3, you are granted additional
>>> +permissions described in the GCC Runtime Library Exception, version
>>> +3.1, as published by the Free Software Foundation.
>>> +
>>> +You should have received a copy of the GNU General Public License and
>>> +a copy of the GCC Runtime Library Exception along with this program;
>>> +see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
>>> +<http://www.gnu.org/licenses/>.  */
>>> +
>>> +DEF_ENUM
>>> +DEF_ALG (no_stringop, no_stringop)
>>> +DEF_ENUM
>>> +DEF_ALG (libcall, libcall)
>>> +DEF_ENUM
>>> +DEF_ALG (rep_prefix_1_byte, rep_byte)
>>> +DEF_ENUM
>>> +DEF_ALG (rep_prefix_4_byte, rep_4byte)
>>> +DEF_ENUM
>>> +DEF_ALG (rep_prefix_8_byte, rep_8byte)
>>> +DEF_ENUM
>>> +DEF_ALG (loop_1_byte, byte_loop)
>>> +DEF_ENUM
>>> +DEF_ALG (loop, loop)
>>> +DEF_ENUM
>>> +DEF_ALG (unrolled_loop, unrolled_loop)
>>> +DEF_ENUM
>>> +DEF_ALG (vector_loop, vector_loop)
>>> Index: config/i386/i386.opt
>>> ===================================================================
>>> --- config/i386/i386.opt      (revision 201458)
>>> +++ config/i386/i386.opt      (working copy)
>>> @@ -316,6 +316,14 @@ mstack-arg-probe
>>>  Target Report Mask(STACK_PROBE) Save
>>>  Enable stack probing
>>>
>>> +mmemcpy-strategy=
>>> +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy)
>>> +Specify memcpy expansion strategy when expected size is known
>>> +
>>> +mmemset-strategy=
>>> +Target RejectNegative Joined Var(ix86_tune_memset_strategy)
>>> +Specify memset expansion strategy when expected size is known
>>> +
>>>  mstringop-strategy=
>>>  Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop)
>>>  Chose strategy to generate stringop using
>>> Index: config/i386/stringop.opt
>>> ===================================================================
>>> --- config/i386/stringop.opt  (revision 0)
>>> +++ config/i386/stringop.opt  (revision 0)
>>> @@ -0,0 +1,36 @@
>>> +/* Definitions for option handling for IA-32.
>>> +   Copyright (C) 2013 Free Software Foundation, Inc.
>>> +
>>> +This file is part of GCC.
>>> +
>>> +GCC is free software; you can redistribute it and/or modify
>>> +it under the terms of the GNU General Public License as published by
>>> +the Free Software Foundation; either version 3, or (at your option)
>>> +any later version.
>>> +
>>> +GCC is distributed in the hope that it will be useful,
>>> +but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>> +GNU General Public License for more details.
>>> +
>>> +Under Section 7 of GPL version 3, you are granted additional
>>> +permissions described in the GCC Runtime Library Exception, version
>>> +3.1, as published by the Free Software Foundation.
>>> +
>>> +You should have received a copy of the GNU General Public License and
>>> +a copy of the GCC Runtime Library Exception along with this program;
>>> +see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
>>> +<http://www.gnu.org/licenses/>.  */
>>> +
>>> +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte)
>>> +
>>> +#undef DEF_ENUM
>>> +#define DEF_ENUM EnumValue
>>> +
>>> +#undef DEF_ALG
>>> +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg)
>>> +
>>> +#include "stringop.def"
>>> +
>>> +#undef DEF_ENUM
>>> +#undef DEF_ALG
>>> Index: config/i386/i386.c
>>> ===================================================================
>>> --- config/i386/i386.c        (revision 201458)
>>> +++ config/i386/i386.c        (working copy)
>>> @@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost =
>>>  };
>>>
>>>  /* Processor costs (relative to an add) */
>>> -static const
>>> +static
>>>  struct processor_costs i386_cost = { /* 386 specific costs */
>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>>> @@ -226,7 +226,7 @@ struct processor_costs i386_cost = {      /*
>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>  };
>>>
>>> -static const
>>> +static
>>>  struct processor_costs i486_cost = { /* 486 specific costs */
>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>>> @@ -298,7 +298,7 @@ struct processor_costs i486_cost = {      /*
>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>  };
>>>
>>> -static const
>>> +static
>>>  struct processor_costs pentium_cost = {
>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>>> @@ -368,7 +368,7 @@ struct processor_costs pentium_cost = {
>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>  };
>>>
>>> -static const
>>> +static
>>>  struct processor_costs pentiumpro_cost = {
>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>>> @@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost =
>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>  };
>>>
>>> -static const
>>> +static
>>>  struct processor_costs geode_cost = {
>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>>> @@ -518,7 +518,7 @@ struct processor_costs geode_cost = {
>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>  };
>>>
>>> -static const
>>> +static
>>>  struct processor_costs k6_cost = {
>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>    COSTS_N_INSNS (2),                 /* cost of a lea instruction */
>>> @@ -591,7 +591,7 @@ struct processor_costs k6_cost = {
>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>  };
>>>
>>> -static const
>>> +static
>>>  struct processor_costs athlon_cost = {
>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>    COSTS_N_INSNS (2),                 /* cost of a lea instruction */
>>> @@ -664,7 +664,7 @@ struct processor_costs athlon_cost = {
>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>  };
>>>
>>> -static const
>>> +static
>>>  struct processor_costs k8_cost = {
>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>    COSTS_N_INSNS (2),                 /* cost of a lea instruction */
>>> @@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = {
>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>  };
>>>
>>> -static const
>>> +static
>>>  struct processor_costs pentium4_cost = {
>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>    COSTS_N_INSNS (3),                 /* cost of a lea instruction */
>>> @@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = {
>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>  };
>>>
>>> -static const
>>> +static
>>>  struct processor_costs nocona_cost = {
>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>>> @@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = {
>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>  };
>>>
>>> -static const
>>> +static
>>>  struct processor_costs atom_cost = {
>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>    COSTS_N_INSNS (1) + 1,             /* cost of a lea instruction */
>>> @@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = {
>>>  };
>>>
>>>  /* Generic64 should produce code tuned for Nocona and K8.  */
>>> -static const
>>> +static
>>>  struct processor_costs generic64_cost = {
>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>    /* On all chips taken into consideration lea is 2 cycles and more.  With
>>> @@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost =
>>>  };
>>>
>>>  /* core_cost should produce code tuned for Core familly of CPUs.  */
>>> -static const
>>> +static
>>>  struct processor_costs core_cost = {
>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>    /* On all chips taken into consideration lea is 2 cycles and more.  With
>>> @@ -1717,7 +1717,7 @@ struct processor_costs core_cost = {
>>>
>>>  /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
>>>     Athlon and K8.  */
>>> -static const
>>> +static
>>>  struct processor_costs generic32_cost = {
>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>    COSTS_N_INSNS (1) + 1,             /* cost of a lea instruction */
>>> @@ -2900,6 +2900,150 @@ ix86_debug_options (void)
>>>
>>>    return;
>>>  }
>>> +
>>> +static const char *stringop_alg_names[] = {
>>> +#define DEF_ENUM
>>> +#define DEF_ALG(alg, name) #name,
>>> +#include "stringop.def"
>>> +#undef DEF_ENUM
>>> +#undef DEF_ALG
>>> +};
>>> +
>>> +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
>>> +   The string is of the following form (or comma separated list of it):
>>> +
>>> +     strategy_alg:max_size:[align|noalign]
>>> +
>>> +   where the full size range for the strategy is either [0, max_size] or
>>> +   [min_size, max_size], in which min_size is the max_size + 1 of the
>>> +   preceding range.  The last size range must have max_size == -1.
>>> +
>>> +   Examples:
>>> +
>>> +    1.
>>> +       -mmemcpy-strategy=libcall:-1:noalign
>>> +
>>> +      this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
>>> +
>>> +
>>> +   2.
>>> +      -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
>>> +
>>> +      This is to tell the compiler to use the following strategy for memset
>>> +      1) when the expected size is between [1, 16], use rep_8byte strategy;
>>> +      2) when the size is between [17, 2048], use vector_loop;
>>> +      3) when the size is > 2048, use libcall.
>>> +
>>> +*/
>>> +
>>> +struct stringop_size_range
>>> +{
>>> +  int min;
>>> +  int max;
>>> +  stringop_alg alg;
>>> +  bool noalign;
>>> +};
>>> +
>>> +static void
>>> +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
>>> +{
>>> +  const struct stringop_algs *default_algs;
>>> +  stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
>>> +  char *curr_range_str, *next_range_str;
>>> +  int i = 0, n = 0;
>>> +
>>> +  if (is_memset)
>>> +    default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
>>> +  else
>>> +    default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
>>> +
>>> +  curr_range_str = strategy_str;
>>> +
>>> +  do {
>>> +
>>> +    int mins, maxs;
>>> +    stringop_alg alg;
>>> +    char alg_name[128];
>>> +    char align[16];
>>> +
>>> +    next_range_str = strchr (curr_range_str, ',');
>>> +    if (next_range_str)
>>> +      *next_range_str++ = '\0';
>>> +
>>> +    if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align))
>>> +      {
>>> +        warning (0, "Wrong arg %s to option %s", curr_range_str,
>>> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>>> +        return;
>>> +      }
>>> +
>>> +    if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1))
>>> +      {
>>> +        warning (0, "Size ranges of option %s should be increasing",
>>> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>>> +        return;
>>> +      }
>>> +
>>> +    for (i = 0; i < last_alg; i++)
>>> +      {
>>> +        if (!strcmp (alg_name, stringop_alg_names[i]))
>>> +       {
>>> +         alg = (stringop_alg) i;
>>> +         break;
>>> +          }
>>> +      }
>>> +
>>> +    if (i == last_alg)
>>> +      {
>>> +        warning (0, "Wrong stringop strategy name %s specified for option %s",
>>> +              alg_name,
>>> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>>> +     return;
>>> +      }
>>> +
>>> +    input_ranges[n].min = mins;
>>> +    input_ranges[n].max = maxs;
>>> +    input_ranges[n].alg = alg;
>>> +    if (!strcmp (align, "align"))
>>> +      input_ranges[n].noalign = false;
>>> +    else if (!strcmp (align, "noalign"))
>>> +      input_ranges[n].noalign = true;
>>> +    else
>>> +      {
>>> +        warning (0, "Unknown alignment %s specified for option %s",
>>> +                 align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>>> +        return;
>>> +      }
>>> +    n++;
>>> +    curr_range_str = next_range_str;
>>> +  } while (curr_range_str);
>>> +
>>> +  if (input_ranges[n - 1].max != -1)
>>> +    {
>>> +      warning (0, "The max value for the last size range should be -1"
>>> +               " for option %s",
>>> +               is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>>> +      return;
>>> +    }
>>> +
>>> +  if (n > MAX_STRINGOP_ALGS)
>>> +    {
>>> +      warning (0, "Too many size ranges specified in option %s",
>>> +               is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>>> +      return;
>>> +    }
>>> +
>>> +  /* Now override the default algs array  */
>>> +  for (i = 0; i < n; i++)
>>> +    {
>>> +      *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
>>> +      *const_cast<stringop_alg *>(&default_algs->size[i].alg)
>>> +          = input_ranges[i].alg;
>>> +      *const_cast<int *>(&default_algs->size[i].noalign)
>>> +          = input_ranges[i].noalign;
>>> +    }
>>> +}
>>> +
>>>
>>>  /* Override various settings based on options.  If MAIN_ARGS_P, the
>>>     options are from the command line, otherwise they are from
>>> @@ -4021,6 +4165,21 @@ ix86_option_override_internal (bool main
>>>    /* Handle stack protector */
>>>    if (!global_options_set.x_ix86_stack_protector_guard)
>>>      ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
>>> +
>>> +  /* Handle -mmemcpy-strategy= and -mmemset-strategy=  */
>>> +  if (ix86_tune_memcpy_strategy)
>>> +    {
>>> +      char *str = xstrdup (ix86_tune_memcpy_strategy);
>>> +      ix86_parse_stringop_strategy_string (str, false);
>>> +      free (str);
>>> +    }
>>> +
>>> +  if (ix86_tune_memset_strategy)
>>> +    {
>>> +      char *str = xstrdup (ix86_tune_memset_strategy);
>>> +      ix86_parse_stringop_strategy_string (str, true);
>>> +      free (str);
>>> +    }
>>>  }
>>>
>>>  /* Implement the TARGET_OPTION_OVERRIDE hook.  */
>>> @@ -22903,6 +23062,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
>>>      {
>>>      case libcall:
>>>      case no_stringop:
>>> +    case last_alg:
>>>        gcc_unreachable ();
>>>      case loop_1_byte:
>>>        need_zero_guard = true;
>>> @@ -23093,6 +23253,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
>>>      {
>>>      case libcall:
>>>      case no_stringop:
>>> +    case last_alg:
>>>        gcc_unreachable ();
>>>      case loop_1_byte:
>>>      case loop:
>>> @@ -23304,6 +23465,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
>>>      {
>>>      case libcall:
>>>      case no_stringop:
>>> +    case last_alg:
>>>        gcc_unreachable ();
>>>      case loop:
>>>        need_zero_guard = true;
>>> @@ -23481,6 +23643,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
>>>      {
>>>      case libcall:
>>>      case no_stringop:
>>> +    case last_alg:
>>>        gcc_unreachable ();
>>>      case loop_1_byte:
>>>      case loop:
>>> Index: config/i386/i386-opts.h
>>> ===================================================================
>>> --- config/i386/i386-opts.h   (revision 201458)
>>> +++ config/i386/i386-opts.h   (working copy)
>>> @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI
>>>  /* Algorithm to expand string function with.  */
>>>  enum stringop_alg
>>>  {
>>> -   no_stringop,
>>> -   libcall,
>>> -   rep_prefix_1_byte,
>>> -   rep_prefix_4_byte,
>>> -   rep_prefix_8_byte,
>>> -   loop_1_byte,
>>> -   loop,
>>> -   unrolled_loop,
>>> -   vector_loop
>>> +#undef DEF_ENUM
>>> +#define DEF_ENUM
>>> +
>>> +#undef DEF_ALG
>>> +#define DEF_ALG(alg, name) alg,
>>> +
>>> +#include "stringop.def"
>>> +last_alg
>>> +
>>> +#undef DEF_ENUM
>>> +#undef DEF_ALG
>>>  };
>>>
>>>  /* Available call abi.  */
>>> Index: doc/invoke.texi
>>> ===================================================================
>>> --- doc/invoke.texi   (revision 201458)
>>> +++ doc/invoke.texi   (working copy)
>>> @@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}.
>>>  -mbmi2 -mrtm -mlwp -mthreads @gol
>>>  -mno-align-stringops  -minline-all-stringops @gol
>>>  -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
>>> +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy} @gol
>>>  -mpush-args  -maccumulate-outgoing-args  -m128bit-long-double @gol
>>>  -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol
>>>  -mregparm=@var{num}  -msseregparm @gol
>>> @@ -14598,6 +14599,24 @@ Expand into an inline loop.
>>>  Always use a library call.
>>>  @end table
>>>
>>> +@item -mmemcpy-strategy=@var{strategy}
>>> +@opindex mmemcpy-strategy=@var{strategy}
>>> +Override the internal decision heuristic to decide if @code{__builtin_memcpy}
>>> +should be inlined and what inline algorithm to use when the expected size
>>> +of the copy operation is known. @var{strategy}
>>> +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets.
>>> +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies
>>> +the max byte size with which inline algorithm @var{alg} is allowed. For the last
>>> +triplet, the @var{max_size} must be @code{-1}. The @var{max_size} of the triplets
>>> +in the list must be specified in increasing order. The minimal byte size for
>>> +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the
>>> +preceding range.
>>> +
>>> +@item -mmemset-strategy=@var{strategy}
>>> +@opindex mmemset-strategy=@var{strategy}
>>> +The option is similar to @option{-mmemcpy-strategy=} except that it is to control
>>> +@code{__builtin_memset} expansion.
>>> +
>>>  @item -momit-leaf-frame-pointer
>>>  @opindex momit-leaf-frame-pointer
>>>  Don't keep the frame pointer in a register for leaf functions.  This
>>> Index: testsuite/gcc.target/i386/memcpy-strategy-1.c
>>> ===================================================================
>>> --- testsuite/gcc.target/i386/memcpy-strategy-1.c     (revision 0)
>>> +++ testsuite/gcc.target/i386/memcpy-strategy-1.c     (revision 0)
>>> @@ -0,0 +1,12 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */
>>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
>>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
>>> +
>>> +char a[2048];
>>> +char b[2048];
>>> +void t (void)
>>> +{
>>> +  __builtin_memcpy (a, b, 2048);
>>> +}
>>> +
>>> Index: testsuite/gcc.target/i386/memcpy-strategy-2.c
>>> ===================================================================
>>> --- testsuite/gcc.target/i386/memcpy-strategy-2.c     (revision 0)
>>> +++ testsuite/gcc.target/i386/memcpy-strategy-2.c     (revision 0)
>>> @@ -0,0 +1,12 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */
>>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
>>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
>>> +
>>> +char a[2048];
>>> +char b[2048];
>>> +void t (void)
>>> +{
>>> +  __builtin_memcpy (a, b, 2048);
>>> +}
>>> +
>>> Index: testsuite/gcc.target/i386/memset-strategy-1.c
>>> ===================================================================
>>> --- testsuite/gcc.target/i386/memset-strategy-1.c     (revision 0)
>>> +++ testsuite/gcc.target/i386/memset-strategy-1.c     (revision 0)
>>> @@ -0,0 +1,10 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */
>>> +/* { dg-final { scan-assembler-times "memset" 2  } } */
>>> +
>>> +char a[2048];
>>> +void t (void)
>>> +{
>>> +  __builtin_memset (a, 1, 2048);
>>> +}
>>> +
>>> Index: testsuite/gcc.target/i386/memcpy-strategy-3.c
>>> ===================================================================
>>> --- testsuite/gcc.target/i386/memcpy-strategy-3.c     (revision 0)
>>> +++ testsuite/gcc.target/i386/memcpy-strategy-3.c     (revision 0)
>>> @@ -0,0 +1,11 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */
>>> +/* { dg-final { scan-assembler-times "memcpy" 2  } } */
>>> +
>>> +char a[2048];
>>> +char b[2048];
>>> +void t (void)
>>> +{
>>> +  __builtin_memcpy (a, b, 2048);
>>> +}
>>> +
>>



-- 
---
Best regards,
Michael V. Zolotukhin,
Software Engineer
Intel Corporation.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: New parameters to control stringop expansion libcall strategy
  2013-08-06  8:46       ` Michael Zolotukhin
@ 2013-08-06  9:42         ` Jan Hubicka
  2013-08-06 16:08           ` Xinliang David Li
  2013-08-07 17:06           ` Xinliang David Li
  2013-08-06 16:42         ` Xinliang David Li
  1 sibling, 2 replies; 23+ messages in thread
From: Jan Hubicka @ 2013-08-06  9:42 UTC (permalink / raw)
  To: Michael Zolotukhin
  Cc: Xinliang David Li, GCC Patches, Jan Hubicka, Teresa Johnson

> >>> 2013-08-02  Xinliang David Li  <davidxl@google.com>
> >>>
> >>>         * config/i386/stringop.def: New file.
> >>>         * config/i386/stringop.opt: New file.
> >>>         * config/i386/i386-opts.h: Include stringopt.def.
> >>>         * config/i386/i386.opt: Include stringopt.opt.
> >>>         * config/i386/i386.c (ix86_option_override_internal):
> >>>         Override default size based stringop inline strategies
> >>>         with options.
> >>>         * config/i386/i386.c (ix86_parse_stringop_strategy_string):
> >>>         New function.
> >>>
> >>> 2013-08-04  Xinliang David Li  <davidxl@google.com>
> >>>
> >>>         * testsuite/gcc.target/i386/memcpy-strategy-1.c: New test.
> >>>         * testsuite/gcc.target/i386/memcpy-strategy-2.c: Ditto.
> >>>         * testsuite/gcc.target/i386/memset-strategy-1.c: Ditto.
> >>>         * testsuite/gcc.target/i386/memcpy-strategy-3.c: Ditto.

The patch looks reasonable to me in general.  I wonder why we need to make
all the cost tables non-const instead of just having writable storage for
the "current strategy" like we do with other flags anyway.

Your strings are definitely more readable than the in-memory representation
I came up with. Perhaps we can even turn the cost tables into strings
for easier maintenance?  I guess they are a bit confusing for people
not familiar with the code.

Honza
> >>>
> >>>
> >>>
> >>>
> >>> On Fri, Aug 2, 2013 at 9:21 PM, Xinliang David Li <davidxl@google.com> wrote:
> >>> > On x86_64, when the expected size of memcpy/memset is known (e.g, with
> >>> > FDO), libcall strategy is used with the size is > 8192. This value is
> >>> > hard coded, which makes it hard to do performance tuning. This patch
> >>> > adds two new parameters to do that. Potential usage includes
> >>> > per-application libcall strategy min-size tuning based on summary data
> >>> > with FDO (e.g, instruction workset size).
> >>> >
> >>> > Bootstrap and tested on x86_64/linux. Ok for trunk?
> >>> >
> >>> > thanks,
> >>> >
> >>> > David
> >>> >
> >>> >
> >>> > 2013-08-02  Xinliang David Li  <davidxl@google.com>
> >>> >
> >>> >         * params.def: New parameters.
> >>> >         * config/i386/i386.c (ix86_option_override_internal):
> >>> >         Override default libcall size limit with parameters.
> >>
> >>> Index: config/i386/stringop.def
> >>> ===================================================================
> >>> --- config/i386/stringop.def  (revision 0)
> >>> +++ config/i386/stringop.def  (revision 0)
> >>> @@ -0,0 +1,42 @@
> >>> +/* Definitions for option handling for IA-32.
> >>> +   Copyright (C) 2013 Free Software Foundation, Inc.
> >>> +
> >>> +This file is part of GCC.
> >>> +
> >>> +GCC is free software; you can redistribute it and/or modify
> >>> +it under the terms of the GNU General Public License as published by
> >>> +the Free Software Foundation; either version 3, or (at your option)
> >>> +any later version.
> >>> +
> >>> +GCC is distributed in the hope that it will be useful,
> >>> +but WITHOUT ANY WARRANTY; without even the implied warranty of
> >>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> >>> +GNU General Public License for more details.
> >>> +
> >>> +Under Section 7 of GPL version 3, you are granted additional
> >>> +permissions described in the GCC Runtime Library Exception, version
> >>> +3.1, as published by the Free Software Foundation.
> >>> +
> >>> +You should have received a copy of the GNU General Public License and
> >>> +a copy of the GCC Runtime Library Exception along with this program;
> >>> +see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
> >>> +<http://www.gnu.org/licenses/>.  */
> >>> +
> >>> +DEF_ENUM
> >>> +DEF_ALG (no_stringop, no_stringop)
> >>> +DEF_ENUM
> >>> +DEF_ALG (libcall, libcall)
> >>> +DEF_ENUM
> >>> +DEF_ALG (rep_prefix_1_byte, rep_byte)
> >>> +DEF_ENUM
> >>> +DEF_ALG (rep_prefix_4_byte, rep_4byte)
> >>> +DEF_ENUM
> >>> +DEF_ALG (rep_prefix_8_byte, rep_8byte)
> >>> +DEF_ENUM
> >>> +DEF_ALG (loop_1_byte, byte_loop)
> >>> +DEF_ENUM
> >>> +DEF_ALG (loop, loop)
> >>> +DEF_ENUM
> >>> +DEF_ALG (unrolled_loop, unrolled_loop)
> >>> +DEF_ENUM
> >>> +DEF_ALG (vector_loop, vector_loop)
> >>> Index: config/i386/i386.opt
> >>> ===================================================================
> >>> --- config/i386/i386.opt      (revision 201458)
> >>> +++ config/i386/i386.opt      (working copy)
> >>> @@ -316,6 +316,14 @@ mstack-arg-probe
> >>>  Target Report Mask(STACK_PROBE) Save
> >>>  Enable stack probing
> >>>
> >>> +mmemcpy-strategy=
> >>> +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy)
> >>> +Specify memcpy expansion strategy when expected size is known
> >>> +
> >>> +mmemset-strategy=
> >>> +Target RejectNegative Joined Var(ix86_tune_memset_strategy)
> >>> +Specify memset expansion strategy when expected size is known
> >>> +
> >>>  mstringop-strategy=
> >>>  Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop)
> >>>  Chose strategy to generate stringop using
> >>> Index: config/i386/stringop.opt
> >>> ===================================================================
> >>> --- config/i386/stringop.opt  (revision 0)
> >>> +++ config/i386/stringop.opt  (revision 0)
> >>> @@ -0,0 +1,36 @@
> >>> +/* Definitions for option handling for IA-32.
> >>> +   Copyright (C) 2013 Free Software Foundation, Inc.
> >>> +
> >>> +This file is part of GCC.
> >>> +
> >>> +GCC is free software; you can redistribute it and/or modify
> >>> +it under the terms of the GNU General Public License as published by
> >>> +the Free Software Foundation; either version 3, or (at your option)
> >>> +any later version.
> >>> +
> >>> +GCC is distributed in the hope that it will be useful,
> >>> +but WITHOUT ANY WARRANTY; without even the implied warranty of
> >>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> >>> +GNU General Public License for more details.
> >>> +
> >>> +Under Section 7 of GPL version 3, you are granted additional
> >>> +permissions described in the GCC Runtime Library Exception, version
> >>> +3.1, as published by the Free Software Foundation.
> >>> +
> >>> +You should have received a copy of the GNU General Public License and
> >>> +a copy of the GCC Runtime Library Exception along with this program;
> >>> +see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
> >>> +<http://www.gnu.org/licenses/>.  */
> >>> +
> >>> +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte)
> >>> +
> >>> +#undef DEF_ENUM
> >>> +#define DEF_ENUM EnumValue
> >>> +
> >>> +#undef DEF_ALG
> >>> +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg)
> >>> +
> >>> +#include "stringop.def"
> >>> +
> >>> +#undef DEF_ENUM
> >>> +#undef DEF_ALG
> >>> Index: config/i386/i386.c
> >>> ===================================================================
> >>> --- config/i386/i386.c        (revision 201458)
> >>> +++ config/i386/i386.c        (working copy)
> >>> @@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost =
> >>>  };
> >>>
> >>>  /* Processor costs (relative to an add) */
> >>> -static const
> >>> +static
> >>>  struct processor_costs i386_cost = { /* 386 specific costs */
> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
> >>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
> >>> @@ -226,7 +226,7 @@ struct processor_costs i386_cost = {      /*
> >>>    1,                                 /* cond_not_taken_branch_cost.  */
> >>>  };
> >>>
> >>> -static const
> >>> +static
> >>>  struct processor_costs i486_cost = { /* 486 specific costs */
> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
> >>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
> >>> @@ -298,7 +298,7 @@ struct processor_costs i486_cost = {      /*
> >>>    1,                                 /* cond_not_taken_branch_cost.  */
> >>>  };
> >>>
> >>> -static const
> >>> +static
> >>>  struct processor_costs pentium_cost = {
> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
> >>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
> >>> @@ -368,7 +368,7 @@ struct processor_costs pentium_cost = {
> >>>    1,                                 /* cond_not_taken_branch_cost.  */
> >>>  };
> >>>
> >>> -static const
> >>> +static
> >>>  struct processor_costs pentiumpro_cost = {
> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
> >>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
> >>> @@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost =
> >>>    1,                                 /* cond_not_taken_branch_cost.  */
> >>>  };
> >>>
> >>> -static const
> >>> +static
> >>>  struct processor_costs geode_cost = {
> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
> >>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
> >>> @@ -518,7 +518,7 @@ struct processor_costs geode_cost = {
> >>>    1,                                 /* cond_not_taken_branch_cost.  */
> >>>  };
> >>>
> >>> -static const
> >>> +static
> >>>  struct processor_costs k6_cost = {
> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
> >>>    COSTS_N_INSNS (2),                 /* cost of a lea instruction */
> >>> @@ -591,7 +591,7 @@ struct processor_costs k6_cost = {
> >>>    1,                                 /* cond_not_taken_branch_cost.  */
> >>>  };
> >>>
> >>> -static const
> >>> +static
> >>>  struct processor_costs athlon_cost = {
> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
> >>>    COSTS_N_INSNS (2),                 /* cost of a lea instruction */
> >>> @@ -664,7 +664,7 @@ struct processor_costs athlon_cost = {
> >>>    1,                                 /* cond_not_taken_branch_cost.  */
> >>>  };
> >>>
> >>> -static const
> >>> +static
> >>>  struct processor_costs k8_cost = {
> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
> >>>    COSTS_N_INSNS (2),                 /* cost of a lea instruction */
> >>> @@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = {
> >>>    1,                                 /* cond_not_taken_branch_cost.  */
> >>>  };
> >>>
> >>> -static const
> >>> +static
> >>>  struct processor_costs pentium4_cost = {
> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
> >>>    COSTS_N_INSNS (3),                 /* cost of a lea instruction */
> >>> @@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = {
> >>>    1,                                 /* cond_not_taken_branch_cost.  */
> >>>  };
> >>>
> >>> -static const
> >>> +static
> >>>  struct processor_costs nocona_cost = {
> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
> >>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
> >>> @@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = {
> >>>    1,                                 /* cond_not_taken_branch_cost.  */
> >>>  };
> >>>
> >>> -static const
> >>> +static
> >>>  struct processor_costs atom_cost = {
> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
> >>>    COSTS_N_INSNS (1) + 1,             /* cost of a lea instruction */
> >>> @@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = {
> >>>  };
> >>>
> >>>  /* Generic64 should produce code tuned for Nocona and K8.  */
> >>> -static const
> >>> +static
> >>>  struct processor_costs generic64_cost = {
> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
> >>>    /* On all chips taken into consideration lea is 2 cycles and more.  With
> >>> @@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost =
> >>>  };
> >>>
> >>>  /* core_cost should produce code tuned for Core familly of CPUs.  */
> >>> -static const
> >>> +static
> >>>  struct processor_costs core_cost = {
> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
> >>>    /* On all chips taken into consideration lea is 2 cycles and more.  With
> >>> @@ -1717,7 +1717,7 @@ struct processor_costs core_cost = {
> >>>
> >>>  /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
> >>>     Athlon and K8.  */
> >>> -static const
> >>> +static
> >>>  struct processor_costs generic32_cost = {
> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
> >>>    COSTS_N_INSNS (1) + 1,             /* cost of a lea instruction */
> >>> @@ -2900,6 +2900,150 @@ ix86_debug_options (void)
> >>>
> >>>    return;
> >>>  }
> >>> +
> >>> +static const char *stringop_alg_names[] = {
> >>> +#define DEF_ENUM
> >>> +#define DEF_ALG(alg, name) #name,
> >>> +#include "stringop.def"
> >>> +#undef DEF_ENUM
> >>> +#undef DEF_ALG
> >>> +};
> >>> +
> >>> +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
> >>> +   The string is of the following form (or comma separated list of it):
> >>> +
> >>> +     strategy_alg:max_size:[align|noalign]
> >>> +
> >>> +   where the full size range for the strategy is either [0, max_size] or
> >>> +   [min_size, max_size], in which min_size is the max_size + 1 of the
> >>> +   preceding range.  The last size range must have max_size == -1.
> >>> +
> >>> +   Examples:
> >>> +
> >>> +    1.
> >>> +       -mmemcpy-strategy=libcall:-1:noalign
> >>> +
> >>> +      this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
> >>> +
> >>> +
> >>> +   2.
> >>> +      -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
> >>> +
> >>> +      This is to tell the compiler to use the following strategy for memset
> >>> +      1) when the expected size is between [1, 16], use rep_8byte strategy;
> >>> +      2) when the size is between [17, 2048], use vector_loop;
> >>> +      3) when the size is > 2048, use libcall.
> >>> +
> >>> +*/
> >>> +
> >>> +struct stringop_size_range
> >>> +{
> >>> +  int min;
> >>> +  int max;
> >>> +  stringop_alg alg;
> >>> +  bool noalign;
> >>> +};
> >>> +
> >>> +static void
> >>> +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
> >>> +{
> >>> +  const struct stringop_algs *default_algs;
> >>> +  stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
> >>> +  char *curr_range_str, *next_range_str;
> >>> +  int i = 0, n = 0;
> >>> +
> >>> +  if (is_memset)
> >>> +    default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
> >>> +  else
> >>> +    default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
> >>> +
> >>> +  curr_range_str = strategy_str;
> >>> +
> >>> +  do {
> >>> +
> >>> +    int mins, maxs;
> >>> +    stringop_alg alg;
> >>> +    char alg_name[128];
> >>> +    char align[16];
> >>> +
> >>> +    next_range_str = strchr (curr_range_str, ',');
> >>> +    if (next_range_str)
> >>> +      *next_range_str++ = '\0';
> >>> +
> >>> +    if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align))
> >>> +      {
> >>> +        warning (0, "Wrong arg %s to option %s", curr_range_str,
> >>> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
> >>> +        return;
> >>> +      }
> >>> +
> >>> +    if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1))
> >>> +      {
> >>> +        warning (0, "Size ranges of option %s should be increasing",
> >>> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
> >>> +        return;
> >>> +      }
> >>> +
> >>> +    for (i = 0; i < last_alg; i++)
> >>> +      {
> >>> +        if (!strcmp (alg_name, stringop_alg_names[i]))
> >>> +       {
> >>> +         alg = (stringop_alg) i;
> >>> +         break;
> >>> +          }
> >>> +      }
> >>> +
> >>> +    if (i == last_alg)
> >>> +      {
> >>> +        warning (0, "Wrong stringop strategy name %s specified for option %s",
> >>> +              alg_name,
> >>> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
> >>> +     return;
> >>> +      }
> >>> +
> >>> +    input_ranges[n].min = mins;
> >>> +    input_ranges[n].max = maxs;
> >>> +    input_ranges[n].alg = alg;
> >>> +    if (!strcmp (align, "align"))
> >>> +      input_ranges[n].noalign = false;
> >>> +    else if (!strcmp (align, "noalign"))
> >>> +      input_ranges[n].noalign = true;
> >>> +    else
> >>> +      {
> >>> +        warning (0, "Unknown alignment %s specified for option %s",
> >>> +                 align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
> >>> +        return;
> >>> +      }
> >>> +    n++;
> >>> +    curr_range_str = next_range_str;
> >>> +  } while (curr_range_str);
> >>> +
> >>> +  if (input_ranges[n - 1].max != -1)
> >>> +    {
> >>> +      warning (0, "The max value for the last size range should be -1"
> >>> +               " for option %s",
> >>> +               is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
> >>> +      return;
> >>> +    }
> >>> +
> >>> +  if (n > MAX_STRINGOP_ALGS)
> >>> +    {
> >>> +      warning (0, "Too many size ranges specified in option %s",
> >>> +               is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
> >>> +      return;
> >>> +    }
> >>> +
> >>> +  /* Now override the default algs array  */
> >>> +  for (i = 0; i < n; i++)
> >>> +    {
> >>> +      *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
> >>> +      *const_cast<stringop_alg *>(&default_algs->size[i].alg)
> >>> +          = input_ranges[i].alg;
> >>> +      *const_cast<int *>(&default_algs->size[i].noalign)
> >>> +          = input_ranges[i].noalign;
> >>> +    }
> >>> +}
> >>> +
> >>>
> >>>  /* Override various settings based on options.  If MAIN_ARGS_P, the
> >>>     options are from the command line, otherwise they are from
> >>> @@ -4021,6 +4165,21 @@ ix86_option_override_internal (bool main
> >>>    /* Handle stack protector */
> >>>    if (!global_options_set.x_ix86_stack_protector_guard)
> >>>      ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
> >>> +
> >>> +  /* Handle -mmemcpy-strategy= and -mmemset-strategy=  */
> >>> +  if (ix86_tune_memcpy_strategy)
> >>> +    {
> >>> +      char *str = xstrdup (ix86_tune_memcpy_strategy);
> >>> +      ix86_parse_stringop_strategy_string (str, false);
> >>> +      free (str);
> >>> +    }
> >>> +
> >>> +  if (ix86_tune_memset_strategy)
> >>> +    {
> >>> +      char *str = xstrdup (ix86_tune_memset_strategy);
> >>> +      ix86_parse_stringop_strategy_string (str, true);
> >>> +      free (str);
> >>> +    }
> >>>  }
> >>>
> >>>  /* Implement the TARGET_OPTION_OVERRIDE hook.  */
> >>> @@ -22903,6 +23062,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
> >>>      {
> >>>      case libcall:
> >>>      case no_stringop:
> >>> +    case last_alg:
> >>>        gcc_unreachable ();
> >>>      case loop_1_byte:
> >>>        need_zero_guard = true;
> >>> @@ -23093,6 +23253,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
> >>>      {
> >>>      case libcall:
> >>>      case no_stringop:
> >>> +    case last_alg:
> >>>        gcc_unreachable ();
> >>>      case loop_1_byte:
> >>>      case loop:
> >>> @@ -23304,6 +23465,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
> >>>      {
> >>>      case libcall:
> >>>      case no_stringop:
> >>> +    case last_alg:
> >>>        gcc_unreachable ();
> >>>      case loop:
> >>>        need_zero_guard = true;
> >>> @@ -23481,6 +23643,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
> >>>      {
> >>>      case libcall:
> >>>      case no_stringop:
> >>> +    case last_alg:
> >>>        gcc_unreachable ();
> >>>      case loop_1_byte:
> >>>      case loop:
> >>> Index: config/i386/i386-opts.h
> >>> ===================================================================
> >>> --- config/i386/i386-opts.h   (revision 201458)
> >>> +++ config/i386/i386-opts.h   (working copy)
> >>> @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI
> >>>  /* Algorithm to expand string function with.  */
> >>>  enum stringop_alg
> >>>  {
> >>> -   no_stringop,
> >>> -   libcall,
> >>> -   rep_prefix_1_byte,
> >>> -   rep_prefix_4_byte,
> >>> -   rep_prefix_8_byte,
> >>> -   loop_1_byte,
> >>> -   loop,
> >>> -   unrolled_loop,
> >>> -   vector_loop
> >>> +#undef DEF_ENUM
> >>> +#define DEF_ENUM
> >>> +
> >>> +#undef DEF_ALG
> >>> +#define DEF_ALG(alg, name) alg,
> >>> +
> >>> +#include "stringop.def"
> >>> +last_alg
> >>> +
> >>> +#undef DEF_ENUM
> >>> +#undef DEF_ALG
> >>>  };
> >>>
> >>>  /* Available call abi.  */
> >>> Index: doc/invoke.texi
> >>> ===================================================================
> >>> --- doc/invoke.texi   (revision 201458)
> >>> +++ doc/invoke.texi   (working copy)
> >>> @@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}.
> >>>  -mbmi2 -mrtm -mlwp -mthreads @gol
> >>>  -mno-align-stringops  -minline-all-stringops @gol
> >>>  -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
> >>> +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy}
> >>>  -mpush-args  -maccumulate-outgoing-args  -m128bit-long-double @gol
> >>>  -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol
> >>>  -mregparm=@var{num}  -msseregparm @gol
> >>> @@ -14598,6 +14599,24 @@ Expand into an inline loop.
> >>>  Always use a library call.
> >>>  @end table
> >>>
> >>> +@item -mmemcpy-strategy=@var{strategy}
> >>> +@opindex mmemcpy-strategy=@var{strategy}
> >>> +Override the internal decision heuristic to decide if @code{__builtin_memcpy}
> >>> +should be inlined and what inline algorithm to use when the expected size
> >>> +of the copy operation is known. @var{strategy}
> >>> +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets.
> >>> +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies
> >>> +the max byte size with which inline algorithm @var{alg} is allowed. For the last
> >>> +triplet, the @var{max_size} must be @code{-1}. The @var{max_size} of the triplets
> >>> +in the list must be specified in increasing order. The minimal byte size for
> >>> +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the
> >>> +preceding range.
> >>> +
> >>> +@item -mmemset-strategy=@var{strategy}
> >>> +@opindex mmemset-strategy=@var{strategy}
> >>> +The option is similar to @option{-mmemcpy-strategy=} except that it is to control
> >>> +@code{__builtin_memset} expansion.
> >>> +
> >>>  @item -momit-leaf-frame-pointer
> >>>  @opindex momit-leaf-frame-pointer
> >>>  Don't keep the frame pointer in a register for leaf functions.  This
> >>> Index: testsuite/gcc.target/i386/memcpy-strategy-1.c
> >>> ===================================================================
> >>> --- testsuite/gcc.target/i386/memcpy-strategy-1.c     (revision 0)
> >>> +++ testsuite/gcc.target/i386/memcpy-strategy-1.c     (revision 0)
> >>> @@ -0,0 +1,12 @@
> >>> +/* { dg-do compile } */
> >>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */
> >>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
> >>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
> >>> +
> >>> +char a[2048];
> >>> +char b[2048];
> >>> +void t (void)
> >>> +{
> >>> +  __builtin_memcpy (a, b, 2048);
> >>> +}
> >>> +
> >>> Index: testsuite/gcc.target/i386/memcpy-strategy-2.c
> >>> ===================================================================
> >>> --- testsuite/gcc.target/i386/memcpy-strategy-2.c     (revision 0)
> >>> +++ testsuite/gcc.target/i386/memcpy-strategy-2.c     (revision 0)
> >>> @@ -0,0 +1,12 @@
> >>> +/* { dg-do compile } */
> >>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */
> >>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
> >>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
> >>> +
> >>> +char a[2048];
> >>> +char b[2048];
> >>> +void t (void)
> >>> +{
> >>> +  __builtin_memcpy (a, b, 2048);
> >>> +}
> >>> +
> >>> Index: testsuite/gcc.target/i386/memset-strategy-1.c
> >>> ===================================================================
> >>> --- testsuite/gcc.target/i386/memset-strategy-1.c     (revision 0)
> >>> +++ testsuite/gcc.target/i386/memset-strategy-1.c     (revision 0)
> >>> @@ -0,0 +1,10 @@
> >>> +/* { dg-do compile } */
> >>> +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */
> >>> +/* { dg-final { scan-assembler-times "memset" 2  } } */
> >>> +
> >>> +char a[2048];
> >>> +void t (void)
> >>> +{
> >>> +  __builtin_memset (a, 1, 2048);
> >>> +}
> >>> +
> >>> Index: testsuite/gcc.target/i386/memcpy-strategy-3.c
> >>> ===================================================================
> >>> --- testsuite/gcc.target/i386/memcpy-strategy-3.c     (revision 0)
> >>> +++ testsuite/gcc.target/i386/memcpy-strategy-3.c     (revision 0)
> >>> @@ -0,0 +1,11 @@
> >>> +/* { dg-do compile } */
> >>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */
> >>> +/* { dg-final { scan-assembler-times "memcpy" 2  } } */
> >>> +
> >>> +char a[2048];
> >>> +char b[2048];
> >>> +void t (void)
> >>> +{
> >>> +  __builtin_memcpy (a, b, 2048);
> >>> +}
> >>> +
> >>
> 
> 
> 
> -- 
> ---
> Best regards,
> Michael V. Zolotukhin,
> Software Engineer
> Intel Corporation.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: New parameters to control stringop expansion libcall strategy
  2013-08-06  9:42         ` Jan Hubicka
@ 2013-08-06 16:08           ` Xinliang David Li
  2013-08-07 17:06           ` Xinliang David Li
  1 sibling, 0 replies; 23+ messages in thread
From: Xinliang David Li @ 2013-08-06 16:08 UTC (permalink / raw)
  To: Jan Hubicka; +Cc: Michael Zolotukhin, GCC Patches, Teresa Johnson

On Tue, Aug 6, 2013 at 2:42 AM, Jan Hubicka <hubicka@ucw.cz> wrote:
>> >>> 2013-08-02  Xinliang David Li  <davidxl@google.com>
>> >>>
>> >>>         * config/i386/stringop.def: New file.
>> >>>         * config/i386/stringop.opt: New file.
>> >>>         * config/i386/i386-opts.h: Include stringopt.def.
>> >>>         * config/i386/i386.opt: Include stringopt.opt.
>> >>>         * config/i386/i386.c (ix86_option_override_internal):
>> >>>         Override default size based stringop inline strategies
>> >>>         with options.
>> >>>         * config/i386/i386.c (ix86_parse_stringop_strategy_string):
>> >>>         New function.
>> >>>
>> >>> 2013-08-04  Xinliang David Li  <davidxl@google.com>
>> >>>
>> >>>         * testsuite/gcc.target/i386/memcpy-strategy-1.c: New test.
>> >>>         * testsuite/gcc.target/i386/memcpy-strategy-2.c: Ditto.
>> >>>         * testsuite/gcc.target/i386/memset-strategy-1.c: Ditto.
>> >>>         * testsuite/gcc.target/i386/memcpy-strategy-3.c: Ditto.
>
> The patch looks reasonable to me in general.  I wonder why we need to make
> all the cost tables non-const instead of just having writable storage for
> the "current strategy" like we do with other flags anyway.

Having const on those arrays does not buy us anything -- those tables
are accessed indirectly, so const-prop won't happen anyway.
current_strategy is an embedded struct in the cost array, so it ends up
in RO data when the top-level array is const.

>
> Your strings are definitely more readable than the in-memory representation
> I came up with. Perhaps we can even turn the cost tables into strings
> for easier maintenance?  I guess they are a bit confusing for people
> not familiar with the code.

I think the in-memory representation is fine -- if there is a need for
internal representation cleanup, it should be done as another patch.
WDYT?

thanks,

David

>
> Honza
>> >>>
>> >>>
>> >>>
>> >>>
>> >>> On Fri, Aug 2, 2013 at 9:21 PM, Xinliang David Li <davidxl@google.com> wrote:
>> >>> > On x86_64, when the expected size of memcpy/memset is known (e.g, with
>> >>> > FDO), libcall strategy is used with the size is > 8192. This value is
>> >>> > hard coded, which makes it hard to do performance tuning. This patch
>> >>> > adds two new parameters to do that. Potential usage includes
>> >>> > per-application libcall strategy min-size tuning based on summary data
>> >>> > with FDO (e.g, instruction workset size).
>> >>> >
>> >>> > Bootstrap and tested on x86_64/linux. Ok for trunk?
>> >>> >
>> >>> > thanks,
>> >>> >
>> >>> > David
>> >>> >
>> >>> >
>> >>> > 2013-08-02  Xinliang David Li  <davidxl@google.com>
>> >>> >
>> >>> >         * params.def: New parameters.
>> >>> >         * config/i386/i386.c (ix86_option_override_internal):
>> >>> >         Override default libcall size limit with parameters.
>> >>
>> >>> Index: config/i386/stringop.def
>> >>> ===================================================================
>> >>> --- config/i386/stringop.def  (revision 0)
>> >>> +++ config/i386/stringop.def  (revision 0)
>> >>> @@ -0,0 +1,42 @@
>> >>> +/* Definitions for option handling for IA-32.
>> >>> +   Copyright (C) 2013 Free Software Foundation, Inc.
>> >>> +
>> >>> +This file is part of GCC.
>> >>> +
>> >>> +GCC is free software; you can redistribute it and/or modify
>> >>> +it under the terms of the GNU General Public License as published by
>> >>> +the Free Software Foundation; either version 3, or (at your option)
>> >>> +any later version.
>> >>> +
>> >>> +GCC is distributed in the hope that it will be useful,
>> >>> +but WITHOUT ANY WARRANTY; without even the implied warranty of
>> >>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> >>> +GNU General Public License for more details.
>> >>> +
>> >>> +Under Section 7 of GPL version 3, you are granted additional
>> >>> +permissions described in the GCC Runtime Library Exception, version
>> >>> +3.1, as published by the Free Software Foundation.
>> >>> +
>> >>> +You should have received a copy of the GNU General Public License and
>> >>> +a copy of the GCC Runtime Library Exception along with this program;
>> >>> +see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
>> >>> +<http://www.gnu.org/licenses/>.  */
>> >>> +
>> >>> +DEF_ENUM
>> >>> +DEF_ALG (no_stringop, no_stringop)
>> >>> +DEF_ENUM
>> >>> +DEF_ALG (libcall, libcall)
>> >>> +DEF_ENUM
>> >>> +DEF_ALG (rep_prefix_1_byte, rep_byte)
>> >>> +DEF_ENUM
>> >>> +DEF_ALG (rep_prefix_4_byte, rep_4byte)
>> >>> +DEF_ENUM
>> >>> +DEF_ALG (rep_prefix_8_byte, rep_8byte)
>> >>> +DEF_ENUM
>> >>> +DEF_ALG (loop_1_byte, byte_loop)
>> >>> +DEF_ENUM
>> >>> +DEF_ALG (loop, loop)
>> >>> +DEF_ENUM
>> >>> +DEF_ALG (unrolled_loop, unrolled_loop)
>> >>> +DEF_ENUM
>> >>> +DEF_ALG (vector_loop, vector_loop)
>> >>> Index: config/i386/i386.opt
>> >>> ===================================================================
>> >>> --- config/i386/i386.opt      (revision 201458)
>> >>> +++ config/i386/i386.opt      (working copy)
>> >>> @@ -316,6 +316,14 @@ mstack-arg-probe
>> >>>  Target Report Mask(STACK_PROBE) Save
>> >>>  Enable stack probing
>> >>>
>> >>> +mmemcpy-strategy=
>> >>> +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy)
>> >>> +Specify memcpy expansion strategy when expected size is known
>> >>> +
>> >>> +mmemset-strategy=
>> >>> +Target RejectNegative Joined Var(ix86_tune_memset_strategy)
>> >>> +Specify memset expansion strategy when expected size is known
>> >>> +
>> >>>  mstringop-strategy=
>> >>>  Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop)
>> >>>  Chose strategy to generate stringop using
>> >>> Index: config/i386/stringop.opt
>> >>> ===================================================================
>> >>> --- config/i386/stringop.opt  (revision 0)
>> >>> +++ config/i386/stringop.opt  (revision 0)
>> >>> @@ -0,0 +1,36 @@
>> >>> +/* Definitions for option handling for IA-32.
>> >>> +   Copyright (C) 2013 Free Software Foundation, Inc.
>> >>> +
>> >>> +This file is part of GCC.
>> >>> +
>> >>> +GCC is free software; you can redistribute it and/or modify
>> >>> +it under the terms of the GNU General Public License as published by
>> >>> +the Free Software Foundation; either version 3, or (at your option)
>> >>> +any later version.
>> >>> +
>> >>> +GCC is distributed in the hope that it will be useful,
>> >>> +but WITHOUT ANY WARRANTY; without even the implied warranty of
>> >>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> >>> +GNU General Public License for more details.
>> >>> +
>> >>> +Under Section 7 of GPL version 3, you are granted additional
>> >>> +permissions described in the GCC Runtime Library Exception, version
>> >>> +3.1, as published by the Free Software Foundation.
>> >>> +
>> >>> +You should have received a copy of the GNU General Public License and
>> >>> +a copy of the GCC Runtime Library Exception along with this program;
>> >>> +see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
>> >>> +<http://www.gnu.org/licenses/>.  */
>> >>> +
>> >>> +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte)
>> >>> +
>> >>> +#undef DEF_ENUM
>> >>> +#define DEF_ENUM EnumValue
>> >>> +
>> >>> +#undef DEF_ALG
>> >>> +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg)
>> >>> +
>> >>> +#include "stringop.def"
>> >>> +
>> >>> +#undef DEF_ENUM
>> >>> +#undef DEF_ALG
>> >>> Index: config/i386/i386.c
>> >>> ===================================================================
>> >>> --- config/i386/i386.c        (revision 201458)
>> >>> +++ config/i386/i386.c        (working copy)
>> >>> @@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost =
>> >>>  };
>> >>>
>> >>>  /* Processor costs (relative to an add) */
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs i386_cost = { /* 386 specific costs */
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>> >>> @@ -226,7 +226,7 @@ struct processor_costs i386_cost = {      /*
>> >>>    1,                                 /* cond_not_taken_branch_cost.  */
>> >>>  };
>> >>>
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs i486_cost = { /* 486 specific costs */
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>> >>> @@ -298,7 +298,7 @@ struct processor_costs i486_cost = {      /*
>> >>>    1,                                 /* cond_not_taken_branch_cost.  */
>> >>>  };
>> >>>
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs pentium_cost = {
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>> >>> @@ -368,7 +368,7 @@ struct processor_costs pentium_cost = {
>> >>>    1,                                 /* cond_not_taken_branch_cost.  */
>> >>>  };
>> >>>
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs pentiumpro_cost = {
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>> >>> @@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost =
>> >>>    1,                                 /* cond_not_taken_branch_cost.  */
>> >>>  };
>> >>>
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs geode_cost = {
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>> >>> @@ -518,7 +518,7 @@ struct processor_costs geode_cost = {
>> >>>    1,                                 /* cond_not_taken_branch_cost.  */
>> >>>  };
>> >>>
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs k6_cost = {
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    COSTS_N_INSNS (2),                 /* cost of a lea instruction */
>> >>> @@ -591,7 +591,7 @@ struct processor_costs k6_cost = {
>> >>>    1,                                 /* cond_not_taken_branch_cost.  */
>> >>>  };
>> >>>
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs athlon_cost = {
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    COSTS_N_INSNS (2),                 /* cost of a lea instruction */
>> >>> @@ -664,7 +664,7 @@ struct processor_costs athlon_cost = {
>> >>>    1,                                 /* cond_not_taken_branch_cost.  */
>> >>>  };
>> >>>
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs k8_cost = {
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    COSTS_N_INSNS (2),                 /* cost of a lea instruction */
>> >>> @@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = {
>> >>>    1,                                 /* cond_not_taken_branch_cost.  */
>> >>>  };
>> >>>
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs pentium4_cost = {
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    COSTS_N_INSNS (3),                 /* cost of a lea instruction */
>> >>> @@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = {
>> >>>    1,                                 /* cond_not_taken_branch_cost.  */
>> >>>  };
>> >>>
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs nocona_cost = {
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>> >>> @@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = {
>> >>>    1,                                 /* cond_not_taken_branch_cost.  */
>> >>>  };
>> >>>
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs atom_cost = {
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    COSTS_N_INSNS (1) + 1,             /* cost of a lea instruction */
>> >>> @@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = {
>> >>>  };
>> >>>
>> >>>  /* Generic64 should produce code tuned for Nocona and K8.  */
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs generic64_cost = {
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    /* On all chips taken into consideration lea is 2 cycles and more.  With
>> >>> @@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost =
>> >>>  };
>> >>>
>> >>>  /* core_cost should produce code tuned for Core familly of CPUs.  */
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs core_cost = {
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    /* On all chips taken into consideration lea is 2 cycles and more.  With
>> >>> @@ -1717,7 +1717,7 @@ struct processor_costs core_cost = {
>> >>>
>> >>>  /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
>> >>>     Athlon and K8.  */
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs generic32_cost = {
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    COSTS_N_INSNS (1) + 1,             /* cost of a lea instruction */
>> >>> @@ -2900,6 +2900,150 @@ ix86_debug_options (void)
>> >>>
>> >>>    return;
>> >>>  }
>> >>> +
>> >>> +static const char *stringop_alg_names[] = {
>> >>> +#define DEF_ENUM
>> >>> +#define DEF_ALG(alg, name) #name,
>> >>> +#include "stringop.def"
>> >>> +#undef DEF_ENUM
>> >>> +#undef DEF_ALG
>> >>> +};
>> >>> +
>> >>> +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
>> >>> +   The string is of the following form (or comma separated list of it):
>> >>> +
>> >>> +     strategy_alg:max_size:[align|noalign]
>> >>> +
>> >>> +   where the full size range for the strategy is either [0, max_size] or
>> >>> +   [min_size, max_size], in which min_size is the max_size + 1 of the
>> >>> +   preceding range.  The last size range must have max_size == -1.
>> >>> +
>> >>> +   Examples:
>> >>> +
>> >>> +    1.
>> >>> +       -mmemcpy-strategy=libcall:-1:noalign
>> >>> +
>> >>> +      this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
>> >>> +
>> >>> +
>> >>> +   2.
>> >>> +      -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
>> >>> +
>> >>> +      This is to tell the compiler to use the following strategy for memset
>> >>> +      1) when the expected size is between [1, 16], use rep_8byte strategy;
>> >>> +      2) when the size is between [17, 2048], use vector_loop;
>> >>> +      3) when the size is > 2048, use libcall.
>> >>> +
>> >>> +*/
>> >>> +
>> >>> +struct stringop_size_range
>> >>> +{
>> >>> +  int min;
>> >>> +  int max;
>> >>> +  stringop_alg alg;
>> >>> +  bool noalign;
>> >>> +};
>> >>> +
>> >>> +static void
>> >>> +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
>> >>> +{
>> >>> +  const struct stringop_algs *default_algs;
>> >>> +  stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
>> >>> +  char *curr_range_str, *next_range_str;
>> >>> +  int i = 0, n = 0;
>> >>> +
>> >>> +  if (is_memset)
>> >>> +    default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
>> >>> +  else
>> >>> +    default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
>> >>> +
>> >>> +  curr_range_str = strategy_str;
>> >>> +
>> >>> +  do {
>> >>> +
>> >>> +    int mins, maxs;
>> >>> +    stringop_alg alg;
>> >>> +    char alg_name[128];
>> >>> +    char align[16];
>> >>> +
>> >>> +    next_range_str = strchr (curr_range_str, ',');
>> >>> +    if (next_range_str)
>> >>> +      *next_range_str++ = '\0';
>> >>> +
>> >>> +    if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align))
>> >>> +      {
>> >>> +        warning (0, "Wrong arg %s to option %s", curr_range_str,
>> >>> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>> >>> +        return;
>> >>> +      }
>> >>> +
>> >>> +    if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1))
>> >>> +      {
>> >>> +        warning (0, "Size ranges of option %s should be increasing",
>> >>> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>> >>> +        return;
>> >>> +      }
>> >>> +
>> >>> +    for (i = 0; i < last_alg; i++)
>> >>> +      {
>> >>> +        if (!strcmp (alg_name, stringop_alg_names[i]))
>> >>> +       {
>> >>> +         alg = (stringop_alg) i;
>> >>> +         break;
>> >>> +          }
>> >>> +      }
>> >>> +
>> >>> +    if (i == last_alg)
>> >>> +      {
>> >>> +        warning (0, "Wrong stringop strategy name %s specified for option %s",
>> >>> +              alg_name,
>> >>> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>> >>> +     return;
>> >>> +      }
>> >>> +
>> >>> +    input_ranges[n].min = mins;
>> >>> +    input_ranges[n].max = maxs;
>> >>> +    input_ranges[n].alg = alg;
>> >>> +    if (!strcmp (align, "align"))
>> >>> +      input_ranges[n].noalign = false;
>> >>> +    else if (!strcmp (align, "noalign"))
>> >>> +      input_ranges[n].noalign = true;
>> >>> +    else
>> >>> +      {
>> >>> +        warning (0, "Unknown alignment %s specified for option %s",
>> >>> +                 align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>> >>> +        return;
>> >>> +      }
>> >>> +    n++;
>> >>> +    curr_range_str = next_range_str;
>> >>> +  } while (curr_range_str);
>> >>> +
>> >>> +  if (input_ranges[n - 1].max != -1)
>> >>> +    {
>> >>> +      warning (0, "The max value for the last size range should be -1"
>> >>> +               " for option %s",
>> >>> +               is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>> >>> +      return;
>> >>> +    }
>> >>> +
>> >>> +  if (n > MAX_STRINGOP_ALGS)
>> >>> +    {
>> >>> +      warning (0, "Too many size ranges specified in option %s",
>> >>> +               is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>> >>> +      return;
>> >>> +    }
>> >>> +
>> >>> +  /* Now override the default algs array  */
>> >>> +  for (i = 0; i < n; i++)
>> >>> +    {
>> >>> +      *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
>> >>> +      *const_cast<stringop_alg *>(&default_algs->size[i].alg)
>> >>> +          = input_ranges[i].alg;
>> >>> +      *const_cast<int *>(&default_algs->size[i].noalign)
>> >>> +          = input_ranges[i].noalign;
>> >>> +    }
>> >>> +}
>> >>> +
>> >>>
>> >>>  /* Override various settings based on options.  If MAIN_ARGS_P, the
>> >>>     options are from the command line, otherwise they are from
>> >>> @@ -4021,6 +4165,21 @@ ix86_option_override_internal (bool main
>> >>>    /* Handle stack protector */
>> >>>    if (!global_options_set.x_ix86_stack_protector_guard)
>> >>>      ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
>> >>> +
>> >>> +  /* Handle -mmemcpy-strategy= and -mmemset-strategy=  */
>> >>> +  if (ix86_tune_memcpy_strategy)
>> >>> +    {
>> >>> +      char *str = xstrdup (ix86_tune_memcpy_strategy);
>> >>> +      ix86_parse_stringop_strategy_string (str, false);
>> >>> +      free (str);
>> >>> +    }
>> >>> +
>> >>> +  if (ix86_tune_memset_strategy)
>> >>> +    {
>> >>> +      char *str = xstrdup (ix86_tune_memset_strategy);
>> >>> +      ix86_parse_stringop_strategy_string (str, true);
>> >>> +      free (str);
>> >>> +    }
>> >>>  }
>> >>>
>> >>>  /* Implement the TARGET_OPTION_OVERRIDE hook.  */
>> >>> @@ -22903,6 +23062,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
>> >>>      {
>> >>>      case libcall:
>> >>>      case no_stringop:
>> >>> +    case last_alg:
>> >>>        gcc_unreachable ();
>> >>>      case loop_1_byte:
>> >>>        need_zero_guard = true;
>> >>> @@ -23093,6 +23253,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
>> >>>      {
>> >>>      case libcall:
>> >>>      case no_stringop:
>> >>> +    case last_alg:
>> >>>        gcc_unreachable ();
>> >>>      case loop_1_byte:
>> >>>      case loop:
>> >>> @@ -23304,6 +23465,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
>> >>>      {
>> >>>      case libcall:
>> >>>      case no_stringop:
>> >>> +    case last_alg:
>> >>>        gcc_unreachable ();
>> >>>      case loop:
>> >>>        need_zero_guard = true;
>> >>> @@ -23481,6 +23643,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
>> >>>      {
>> >>>      case libcall:
>> >>>      case no_stringop:
>> >>> +    case last_alg:
>> >>>        gcc_unreachable ();
>> >>>      case loop_1_byte:
>> >>>      case loop:
>> >>> Index: config/i386/i386-opts.h
>> >>> ===================================================================
>> >>> --- config/i386/i386-opts.h   (revision 201458)
>> >>> +++ config/i386/i386-opts.h   (working copy)
>> >>> @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI
>> >>>  /* Algorithm to expand string function with.  */
>> >>>  enum stringop_alg
>> >>>  {
>> >>> -   no_stringop,
>> >>> -   libcall,
>> >>> -   rep_prefix_1_byte,
>> >>> -   rep_prefix_4_byte,
>> >>> -   rep_prefix_8_byte,
>> >>> -   loop_1_byte,
>> >>> -   loop,
>> >>> -   unrolled_loop,
>> >>> -   vector_loop
>> >>> +#undef DEF_ENUM
>> >>> +#define DEF_ENUM
>> >>> +
>> >>> +#undef DEF_ALG
>> >>> +#define DEF_ALG(alg, name) alg,
>> >>> +
>> >>> +#include "stringop.def"
>> >>> +last_alg
>> >>> +
>> >>> +#undef DEF_ENUM
>> >>> +#undef DEF_ALG
>> >>>  };
>> >>>
>> >>>  /* Available call abi.  */
>> >>> Index: doc/invoke.texi
>> >>> ===================================================================
>> >>> --- doc/invoke.texi   (revision 201458)
>> >>> +++ doc/invoke.texi   (working copy)
>> >>> @@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}.
>> >>>  -mbmi2 -mrtm -mlwp -mthreads @gol
>> >>>  -mno-align-stringops  -minline-all-stringops @gol
>> >>>  -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
>> >>> +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy}
>> >>>  -mpush-args  -maccumulate-outgoing-args  -m128bit-long-double @gol
>> >>>  -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol
>> >>>  -mregparm=@var{num}  -msseregparm @gol
>> >>> @@ -14598,6 +14599,24 @@ Expand into an inline loop.
>> >>>  Always use a library call.
>> >>>  @end table
>> >>>
>> >>> +@item -mmemcpy-strategy=@var{strategy}
>> >>> +@opindex mmemcpy-strategy=@var{strategy}
>> >>> +Override the internal decision heuristic to decide if @code{__builtin_memcpy}
>> >>> +should be inlined and what inline algorithm to use when the expected size
>> >>> +of the copy operation is known. @var{strategy}
>> >>> +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets.
>> >>> +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies
>> >>> +the max byte size with which inline algorithm @var{alg} is allowed. For the last
>> >>> +triplet, the @var{max_size} must be @code{-1}. The @var{max_size} of the triplets
>> >>> +in the list must be specified in increasing order. The minimal byte size for
>> >>> +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the
>> >>> +preceding range.
>> >>> +
>> >>> +@item -mmemset-strategy=@var{strategy}
>> >>> +@opindex mmemset-strategy=@var{strategy}
>> >>> +The option is similar to @option{-mmemcpy-strategy=} except that it is to control
>> >>> +@code{__builtin_memset} expansion.
>> >>> +
>> >>>  @item -momit-leaf-frame-pointer
>> >>>  @opindex momit-leaf-frame-pointer
>> >>>  Don't keep the frame pointer in a register for leaf functions.  This
>> >>> Index: testsuite/gcc.target/i386/memcpy-strategy-1.c
>> >>> ===================================================================
>> >>> --- testsuite/gcc.target/i386/memcpy-strategy-1.c     (revision 0)
>> >>> +++ testsuite/gcc.target/i386/memcpy-strategy-1.c     (revision 0)
>> >>> @@ -0,0 +1,12 @@
>> >>> +/* { dg-do compile } */
>> >>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */
>> >>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
>> >>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
>> >>> +
>> >>> +char a[2048];
>> >>> +char b[2048];
>> >>> +void t (void)
>> >>> +{
>> >>> +  __builtin_memcpy (a, b, 2048);
>> >>> +}
>> >>> +
>> >>> Index: testsuite/gcc.target/i386/memcpy-strategy-2.c
>> >>> ===================================================================
>> >>> --- testsuite/gcc.target/i386/memcpy-strategy-2.c     (revision 0)
>> >>> +++ testsuite/gcc.target/i386/memcpy-strategy-2.c     (revision 0)
>> >>> @@ -0,0 +1,12 @@
>> >>> +/* { dg-do compile } */
>> >>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */
>> >>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
>> >>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
>> >>> +
>> >>> +char a[2048];
>> >>> +char b[2048];
>> >>> +void t (void)
>> >>> +{
>> >>> +  __builtin_memcpy (a, b, 2048);
>> >>> +}
>> >>> +
>> >>> Index: testsuite/gcc.target/i386/memset-strategy-1.c
>> >>> ===================================================================
>> >>> --- testsuite/gcc.target/i386/memset-strategy-1.c     (revision 0)
>> >>> +++ testsuite/gcc.target/i386/memset-strategy-1.c     (revision 0)
>> >>> @@ -0,0 +1,10 @@
>> >>> +/* { dg-do compile } */
>> >>> +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */
>> >>> +/* { dg-final { scan-assembler-times "memset" 2  } } */
>> >>> +
>> >>> +char a[2048];
>> >>> +void t (void)
>> >>> +{
>> >>> +  __builtin_memset (a, 1, 2048);
>> >>> +}
>> >>> +
>> >>> Index: testsuite/gcc.target/i386/memcpy-strategy-3.c
>> >>> ===================================================================
>> >>> --- testsuite/gcc.target/i386/memcpy-strategy-3.c     (revision 0)
>> >>> +++ testsuite/gcc.target/i386/memcpy-strategy-3.c     (revision 0)
>> >>> @@ -0,0 +1,11 @@
>> >>> +/* { dg-do compile } */
>> >>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */
>> >>> +/* { dg-final { scan-assembler-times "memcpy" 2  } } */
>> >>> +
>> >>> +char a[2048];
>> >>> +char b[2048];
>> >>> +void t (void)
>> >>> +{
>> >>> +  __builtin_memcpy (a, b, 2048);
>> >>> +}
>> >>> +
>> >>
>>
>>
>>
>> --
>> ---
>> Best regards,
>> Michael V. Zolotukhin,
>> Software Engineer
>> Intel Corporation.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: New parameters to control stringop expansion libcall strategy
  2013-08-06  8:46       ` Michael Zolotukhin
  2013-08-06  9:42         ` Jan Hubicka
@ 2013-08-06 16:42         ` Xinliang David Li
  2013-08-06 16:45           ` Xinliang David Li
  2013-08-07  7:14           ` Michael Zolotukhin
  1 sibling, 2 replies; 23+ messages in thread
From: Xinliang David Li @ 2013-08-06 16:42 UTC (permalink / raw)
  To: Michael Zolotukhin; +Cc: GCC Patches, Jan Hubicka, Teresa Johnson

Corrected two small problems reported by the style checker (the
warnings about the EnumValue entries for options in stringop.opt are
not valid).

On Tue, Aug 6, 2013 at 1:46 AM, Michael Zolotukhin
<michael.v.zolotukhin@gmail.com> wrote:
> There are still some formatting issues (like 8 spaces instead of a
> tab, wrong indentation of do-loop and some other places) - to reveal
> some of them you could use contrib/check_GNU_style.sh script.
> But that was a nitpicking again:) Actually I wanted to ask whether
> you're going to use this option for some performance experiments
> involving memmov/memset - if so, probably you could tune existing
> cost-models as well? Is it possible?

The option is designed for purposes like this.

thanks,

David

>
> Michael
>
> On 5 August 2013 20:44, Xinliang David Li <davidxl@google.com> wrote:
>> thanks. Updated patch attached.
>>
>> David
>>
>> On Mon, Aug 5, 2013 at 3:57 AM, Michael V. Zolotukhin
>> <michael.v.zolotukhin@gmail.com> wrote:
>>> Hi,
>>> This is a really convenient option, thanks for working on it.
>>> I can't approve it as I'm not a maintainer, but it looks ok to me,
>>> except for a small nitpick: AFAIR, comments should end with
>>> dot-space-space.
>>>
>>> Michael
>>>
>>> On 04 Aug 20:01, Xinliang David Li wrote:
>>>> The attached is a new patch implementing the stringop inline strategy
>>>> control using two new -m options:
>>>>
>>>> -mmemcpy-strategy=
>>>> -mmemset-strategy=
>>>>
>>>> See changes in doc/invoke.texi for description of the new options. Example:
>>>>   -mmemcpy-strategy=rep_8byte:64:noalign,unrolled_loop:2048:noalign,libcall:-1:noalign
>>>>
>>>> tells the compiler to inline memcpy using rep_8byte when the size is no
>>>> larger than 64 bytes, using unrolled_loop when the size is no larger than
>>>> 2048 bytes, and using a library call for sizes > 2048. In all cases,
>>>> destination alignment adjustment is not done.
>>>>
>>>> Tested on x86-64/linux. Ok for trunk?
>>>>
>>>> thanks,
>>>>
>>>> David
>>>>
>>>> 2013-08-02  Xinliang David Li  <davidxl@google.com>
>>>>
>>>>         * config/i386/stringop.def: New file.
>>>>         * config/i386/stringop.opt: New file.
>>>>         * config/i386/i386-opts.h: Include stringop.def.
>>>>         * config/i386/i386.opt: Include stringop.opt.
>>>>         * config/i386/i386.c (ix86_option_override_internal):
>>>>         Override default size based stringop inline strategies
>>>>         with options.
>>>>         * config/i386/i386.c (ix86_parse_stringop_strategy_string):
>>>>         New function.
>>>>
>>>> 2013-08-04  Xinliang David Li  <davidxl@google.com>
>>>>
>>>>         * testsuite/gcc.target/i386/memcpy-strategy-1.c: New test.
>>>>         * testsuite/gcc.target/i386/memcpy-strategy-2.c: Ditto.
>>>>         * testsuite/gcc.target/i386/memset-strategy-1.c: Ditto.
>>>>         * testsuite/gcc.target/i386/memcpy-strategy-3.c: Ditto.
>>>>
>>>>
>>>>
>>>>
>>>> On Fri, Aug 2, 2013 at 9:21 PM, Xinliang David Li <davidxl@google.com> wrote:
>>>> > On x86_64, when the expected size of memcpy/memset is known (e.g., with
>>>> > FDO), the libcall strategy is used when the size is > 8192. This value is
>>>> > hard-coded, which makes performance tuning difficult. This patch
>>>> > adds two new parameters to control it. Potential uses include
>>>> > per-application libcall strategy min-size tuning based on summary data
>>>> > with FDO (e.g., instruction working set size).
>>>> >
>>>> > Bootstrap and tested on x86_64/linux. Ok for trunk?
>>>> >
>>>> > thanks,
>>>> >
>>>> > David
>>>> >
>>>> >
>>>> > 2013-08-02  Xinliang David Li  <davidxl@google.com>
>>>> >
>>>> >         * params.def: New parameters.
>>>> >         * config/i386/i386.c (ix86_option_override_internal):
>>>> >         Override default libcall size limit with parameters.
>>>
>>>> Index: config/i386/stringop.def
>>>> ===================================================================
>>>> --- config/i386/stringop.def  (revision 0)
>>>> +++ config/i386/stringop.def  (revision 0)
>>>> @@ -0,0 +1,42 @@
>>>> +/* Definitions for option handling for IA-32.
>>>> +   Copyright (C) 2013 Free Software Foundation, Inc.
>>>> +
>>>> +This file is part of GCC.
>>>> +
>>>> +GCC is free software; you can redistribute it and/or modify
>>>> +it under the terms of the GNU General Public License as published by
>>>> +the Free Software Foundation; either version 3, or (at your option)
>>>> +any later version.
>>>> +
>>>> +GCC is distributed in the hope that it will be useful,
>>>> +but WITHOUT ANY WARRANTY; without even the implied warranty of
>>>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>>> +GNU General Public License for more details.
>>>> +
>>>> +Under Section 7 of GPL version 3, you are granted additional
>>>> +permissions described in the GCC Runtime Library Exception, version
>>>> +3.1, as published by the Free Software Foundation.
>>>> +
>>>> +You should have received a copy of the GNU General Public License and
>>>> +a copy of the GCC Runtime Library Exception along with this program;
>>>> +see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
>>>> +<http://www.gnu.org/licenses/>.  */
>>>> +
>>>> +DEF_ENUM
>>>> +DEF_ALG (no_stringop, no_stringop)
>>>> +DEF_ENUM
>>>> +DEF_ALG (libcall, libcall)
>>>> +DEF_ENUM
>>>> +DEF_ALG (rep_prefix_1_byte, rep_byte)
>>>> +DEF_ENUM
>>>> +DEF_ALG (rep_prefix_4_byte, rep_4byte)
>>>> +DEF_ENUM
>>>> +DEF_ALG (rep_prefix_8_byte, rep_8byte)
>>>> +DEF_ENUM
>>>> +DEF_ALG (loop_1_byte, byte_loop)
>>>> +DEF_ENUM
>>>> +DEF_ALG (loop, loop)
>>>> +DEF_ENUM
>>>> +DEF_ALG (unrolled_loop, unrolled_loop)
>>>> +DEF_ENUM
>>>> +DEF_ALG (vector_loop, vector_loop)
>>>> Index: config/i386/i386.opt
>>>> ===================================================================
>>>> --- config/i386/i386.opt      (revision 201458)
>>>> +++ config/i386/i386.opt      (working copy)
>>>> @@ -316,6 +316,14 @@ mstack-arg-probe
>>>>  Target Report Mask(STACK_PROBE) Save
>>>>  Enable stack probing
>>>>
>>>> +mmemcpy-strategy=
>>>> +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy)
>>>> +Specify memcpy expansion strategy when expected size is known
>>>> +
>>>> +mmemset-strategy=
>>>> +Target RejectNegative Joined Var(ix86_tune_memset_strategy)
>>>> +Specify memset expansion strategy when expected size is known
>>>> +
>>>>  mstringop-strategy=
>>>>  Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop)
>>>>  Chose strategy to generate stringop using
>>>> Index: config/i386/stringop.opt
>>>> ===================================================================
>>>> --- config/i386/stringop.opt  (revision 0)
>>>> +++ config/i386/stringop.opt  (revision 0)
>>>> @@ -0,0 +1,36 @@
>>>> +/* Definitions for option handling for IA-32.
>>>> +   Copyright (C) 2013 Free Software Foundation, Inc.
>>>> +
>>>> +This file is part of GCC.
>>>> +
>>>> +GCC is free software; you can redistribute it and/or modify
>>>> +it under the terms of the GNU General Public License as published by
>>>> +the Free Software Foundation; either version 3, or (at your option)
>>>> +any later version.
>>>> +
>>>> +GCC is distributed in the hope that it will be useful,
>>>> +but WITHOUT ANY WARRANTY; without even the implied warranty of
>>>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>>> +GNU General Public License for more details.
>>>> +
>>>> +Under Section 7 of GPL version 3, you are granted additional
>>>> +permissions described in the GCC Runtime Library Exception, version
>>>> +3.1, as published by the Free Software Foundation.
>>>> +
>>>> +You should have received a copy of the GNU General Public License and
>>>> +a copy of the GCC Runtime Library Exception along with this program;
>>>> +see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
>>>> +<http://www.gnu.org/licenses/>.  */
>>>> +
>>>> +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte)
>>>> +
>>>> +#undef DEF_ENUM
>>>> +#define DEF_ENUM EnumValue
>>>> +
>>>> +#undef DEF_ALG
>>>> +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg)
>>>> +
>>>> +#include "stringop.def"
>>>> +
>>>> +#undef DEF_ENUM
>>>> +#undef DEF_ALG
>>>> Index: config/i386/i386.c
>>>> ===================================================================
>>>> --- config/i386/i386.c        (revision 201458)
>>>> +++ config/i386/i386.c        (working copy)
>>>> @@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost =
>>>>  };
>>>>
>>>>  /* Processor costs (relative to an add) */
>>>> -static const
>>>> +static
>>>>  struct processor_costs i386_cost = { /* 386 specific costs */
>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>>>> @@ -226,7 +226,7 @@ struct processor_costs i386_cost = {      /*
>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>  };
>>>>
>>>> -static const
>>>> +static
>>>>  struct processor_costs i486_cost = { /* 486 specific costs */
>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>>>> @@ -298,7 +298,7 @@ struct processor_costs i486_cost = {      /*
>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>  };
>>>>
>>>> -static const
>>>> +static
>>>>  struct processor_costs pentium_cost = {
>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>>>> @@ -368,7 +368,7 @@ struct processor_costs pentium_cost = {
>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>  };
>>>>
>>>> -static const
>>>> +static
>>>>  struct processor_costs pentiumpro_cost = {
>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>>>> @@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost =
>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>  };
>>>>
>>>> -static const
>>>> +static
>>>>  struct processor_costs geode_cost = {
>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>>>> @@ -518,7 +518,7 @@ struct processor_costs geode_cost = {
>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>  };
>>>>
>>>> -static const
>>>> +static
>>>>  struct processor_costs k6_cost = {
>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>    COSTS_N_INSNS (2),                 /* cost of a lea instruction */
>>>> @@ -591,7 +591,7 @@ struct processor_costs k6_cost = {
>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>  };
>>>>
>>>> -static const
>>>> +static
>>>>  struct processor_costs athlon_cost = {
>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>    COSTS_N_INSNS (2),                 /* cost of a lea instruction */
>>>> @@ -664,7 +664,7 @@ struct processor_costs athlon_cost = {
>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>  };
>>>>
>>>> -static const
>>>> +static
>>>>  struct processor_costs k8_cost = {
>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>    COSTS_N_INSNS (2),                 /* cost of a lea instruction */
>>>> @@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = {
>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>  };
>>>>
>>>> -static const
>>>> +static
>>>>  struct processor_costs pentium4_cost = {
>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>    COSTS_N_INSNS (3),                 /* cost of a lea instruction */
>>>> @@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = {
>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>  };
>>>>
>>>> -static const
>>>> +static
>>>>  struct processor_costs nocona_cost = {
>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>>>> @@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = {
>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>  };
>>>>
>>>> -static const
>>>> +static
>>>>  struct processor_costs atom_cost = {
>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>    COSTS_N_INSNS (1) + 1,             /* cost of a lea instruction */
>>>> @@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = {
>>>>  };
>>>>
>>>>  /* Generic64 should produce code tuned for Nocona and K8.  */
>>>> -static const
>>>> +static
>>>>  struct processor_costs generic64_cost = {
>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>    /* On all chips taken into consideration lea is 2 cycles and more.  With
>>>> @@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost =
>>>>  };
>>>>
>>>>  /* core_cost should produce code tuned for Core familly of CPUs.  */
>>>> -static const
>>>> +static
>>>>  struct processor_costs core_cost = {
>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>    /* On all chips taken into consideration lea is 2 cycles and more.  With
>>>> @@ -1717,7 +1717,7 @@ struct processor_costs core_cost = {
>>>>
>>>>  /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
>>>>     Athlon and K8.  */
>>>> -static const
>>>> +static
>>>>  struct processor_costs generic32_cost = {
>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>    COSTS_N_INSNS (1) + 1,             /* cost of a lea instruction */
>>>> @@ -2900,6 +2900,150 @@ ix86_debug_options (void)
>>>>
>>>>    return;
>>>>  }
>>>> +
>>>> +static const char *stringop_alg_names[] = {
>>>> +#define DEF_ENUM
>>>> +#define DEF_ALG(alg, name) #name,
>>>> +#include "stringop.def"
>>>> +#undef DEF_ENUM
>>>> +#undef DEF_ALG
>>>> +};
>>>> +
>>>> +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
>>>> +   The string is of the following form (or comma separated list of it):
>>>> +
>>>> +     strategy_alg:max_size:[align|noalign]
>>>> +
>>>> +   where the full size range for the strategy is either [0, max_size] or
>>>> +   [min_size, max_size], in which min_size is the max_size + 1 of the
>>>> +   preceding range.  The last size range must have max_size == -1.
>>>> +
>>>> +   Examples:
>>>> +
>>>> +    1.
>>>> +       -mmemcpy-strategy=libcall:-1:noalign
>>>> +
>>>> +      this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
>>>> +
>>>> +
>>>> +   2.
>>>> +      -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
>>>> +
>>>> +      This is to tell the compiler to use the following strategy for memset
>>>> +      1) when the expected size is between [1, 16], use rep_8byte strategy;
>>>> +      2) when the size is between [17, 2048], use vector_loop;
>>>> +      3) when the size is > 2048, use libcall.
>>>> +
>>>> +*/
>>>> +
>>>> +struct stringop_size_range
>>>> +{
>>>> +  int min;
>>>> +  int max;
>>>> +  stringop_alg alg;
>>>> +  bool noalign;
>>>> +};
>>>> +
>>>> +static void
>>>> +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
>>>> +{
>>>> +  const struct stringop_algs *default_algs;
>>>> +  stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
>>>> +  char *curr_range_str, *next_range_str;
>>>> +  int i = 0, n = 0;
>>>> +
>>>> +  if (is_memset)
>>>> +    default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
>>>> +  else
>>>> +    default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
>>>> +
>>>> +  curr_range_str = strategy_str;
>>>> +
>>>> +  do {
>>>> +
>>>> +    int mins, maxs;
>>>> +    stringop_alg alg;
>>>> +    char alg_name[128];
>>>> +    char align[16];
>>>> +
>>>> +    next_range_str = strchr (curr_range_str, ',');
>>>> +    if (next_range_str)
>>>> +      *next_range_str++ = '\0';
>>>> +
>>>> +    if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align))
>>>> +      {
>>>> +        warning (0, "Wrong arg %s to option %s", curr_range_str,
>>>> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>>>> +        return;
>>>> +      }
>>>> +
>>>> +    if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1))
>>>> +      {
>>>> +        warning (0, "Size ranges of option %s should be increasing",
>>>> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>>>> +        return;
>>>> +      }
>>>> +
>>>> +    for (i = 0; i < last_alg; i++)
>>>> +      {
>>>> +        if (!strcmp (alg_name, stringop_alg_names[i]))
>>>> +       {
>>>> +         alg = (stringop_alg) i;
>>>> +         break;
>>>> +          }
>>>> +      }
>>>> +
>>>> +    if (i == last_alg)
>>>> +      {
>>>> +        warning (0, "Wrong stringop strategy name %s specified for option %s",
>>>> +              alg_name,
>>>> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>>>> +     return;
>>>> +      }
>>>> +
>>>> +    input_ranges[n].min = mins;
>>>> +    input_ranges[n].max = maxs;
>>>> +    input_ranges[n].alg = alg;
>>>> +    if (!strcmp (align, "align"))
>>>> +      input_ranges[n].noalign = false;
>>>> +    else if (!strcmp (align, "noalign"))
>>>> +      input_ranges[n].noalign = true;
>>>> +    else
>>>> +      {
>>>> +        warning (0, "Unknown alignment %s specified for option %s",
>>>> +                 align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>>>> +        return;
>>>> +      }
>>>> +    n++;
>>>> +    curr_range_str = next_range_str;
>>>> +  } while (curr_range_str);
>>>> +
>>>> +  if (input_ranges[n - 1].max != -1)
>>>> +    {
>>>> +      warning (0, "The max value for the last size range should be -1"
>>>> +               " for option %s",
>>>> +               is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>>>> +      return;
>>>> +    }
>>>> +
>>>> +  if (n > MAX_STRINGOP_ALGS)
>>>> +    {
>>>> +      warning (0, "Too many size ranges specified in option %s",
>>>> +               is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>>>> +      return;
>>>> +    }
>>>> +
>>>> +  /* Now override the default algs array  */
>>>> +  for (i = 0; i < n; i++)
>>>> +    {
>>>> +      *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
>>>> +      *const_cast<stringop_alg *>(&default_algs->size[i].alg)
>>>> +          = input_ranges[i].alg;
>>>> +      *const_cast<int *>(&default_algs->size[i].noalign)
>>>> +          = input_ranges[i].noalign;
>>>> +    }
>>>> +}
>>>> +
>>>>
>>>>  /* Override various settings based on options.  If MAIN_ARGS_P, the
>>>>     options are from the command line, otherwise they are from
>>>> @@ -4021,6 +4165,21 @@ ix86_option_override_internal (bool main
>>>>    /* Handle stack protector */
>>>>    if (!global_options_set.x_ix86_stack_protector_guard)
>>>>      ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
>>>> +
>>>> +  /* Handle -mmemcpy-strategy= and -mmemset-strategy=  */
>>>> +  if (ix86_tune_memcpy_strategy)
>>>> +    {
>>>> +      char *str = xstrdup (ix86_tune_memcpy_strategy);
>>>> +      ix86_parse_stringop_strategy_string (str, false);
>>>> +      free (str);
>>>> +    }
>>>> +
>>>> +  if (ix86_tune_memset_strategy)
>>>> +    {
>>>> +      char *str = xstrdup (ix86_tune_memset_strategy);
>>>> +      ix86_parse_stringop_strategy_string (str, true);
>>>> +      free (str);
>>>> +    }
>>>>  }
>>>>
>>>>  /* Implement the TARGET_OPTION_OVERRIDE hook.  */
>>>> @@ -22903,6 +23062,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
>>>>      {
>>>>      case libcall:
>>>>      case no_stringop:
>>>> +    case last_alg:
>>>>        gcc_unreachable ();
>>>>      case loop_1_byte:
>>>>        need_zero_guard = true;
>>>> @@ -23093,6 +23253,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
>>>>      {
>>>>      case libcall:
>>>>      case no_stringop:
>>>> +    case last_alg:
>>>>        gcc_unreachable ();
>>>>      case loop_1_byte:
>>>>      case loop:
>>>> @@ -23304,6 +23465,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
>>>>      {
>>>>      case libcall:
>>>>      case no_stringop:
>>>> +    case last_alg:
>>>>        gcc_unreachable ();
>>>>      case loop:
>>>>        need_zero_guard = true;
>>>> @@ -23481,6 +23643,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
>>>>      {
>>>>      case libcall:
>>>>      case no_stringop:
>>>> +    case last_alg:
>>>>        gcc_unreachable ();
>>>>      case loop_1_byte:
>>>>      case loop:
>>>> Index: config/i386/i386-opts.h
>>>> ===================================================================
>>>> --- config/i386/i386-opts.h   (revision 201458)
>>>> +++ config/i386/i386-opts.h   (working copy)
>>>> @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI
>>>>  /* Algorithm to expand string function with.  */
>>>>  enum stringop_alg
>>>>  {
>>>> -   no_stringop,
>>>> -   libcall,
>>>> -   rep_prefix_1_byte,
>>>> -   rep_prefix_4_byte,
>>>> -   rep_prefix_8_byte,
>>>> -   loop_1_byte,
>>>> -   loop,
>>>> -   unrolled_loop,
>>>> -   vector_loop
>>>> +#undef DEF_ENUM
>>>> +#define DEF_ENUM
>>>> +
>>>> +#undef DEF_ALG
>>>> +#define DEF_ALG(alg, name) alg,
>>>> +
>>>> +#include "stringop.def"
>>>> +last_alg
>>>> +
>>>> +#undef DEF_ENUM
>>>> +#undef DEF_ALG
>>>>  };
>>>>
>>>>  /* Available call abi.  */
>>>> Index: doc/invoke.texi
>>>> ===================================================================
>>>> --- doc/invoke.texi   (revision 201458)
>>>> +++ doc/invoke.texi   (working copy)
>>>> @@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}.
>>>>  -mbmi2 -mrtm -mlwp -mthreads @gol
>>>>  -mno-align-stringops  -minline-all-stringops @gol
>>>>  -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
>>>> +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy}
>>>>  -mpush-args  -maccumulate-outgoing-args  -m128bit-long-double @gol
>>>>  -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol
>>>>  -mregparm=@var{num}  -msseregparm @gol
>>>> @@ -14598,6 +14599,24 @@ Expand into an inline loop.
>>>>  Always use a library call.
>>>>  @end table
>>>>
>>>> +@item -mmemcpy-strategy=@var{strategy}
>>>> +@opindex mmemcpy-strategy=@var{strategy}
>>>> +Override the internal decision heuristic to decide if @code{__builtin_memcpy}
>>>> +should be inlined and what inline algorithm to use when the expected size
>>>> +of the copy operation is known. @var{strategy}
>>>> +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets.
>>>> +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies
>>>> +the max byte size with which inline algorithm @var{alg} is allowed. For the last
>>>> +triplet, the @var{max_size} must be @code{-1}. The @var{max_size} of the triplets
>>>> +in the list must be specified in increasing order. The minimal byte size for
>>>> +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the
>>>> +preceding range.
>>>> +
>>>> +@item -mmemset-strategy=@var{strategy}
>>>> +@opindex mmemset-strategy=@var{strategy}
>>>> +The option is similar to @option{-mmemcpy-strategy=} except that it is to control
>>>> +@code{__builtin_memset} expansion.
>>>> +
>>>>  @item -momit-leaf-frame-pointer
>>>>  @opindex momit-leaf-frame-pointer
>>>>  Don't keep the frame pointer in a register for leaf functions.  This
>>>> Index: testsuite/gcc.target/i386/memcpy-strategy-1.c
>>>> ===================================================================
>>>> --- testsuite/gcc.target/i386/memcpy-strategy-1.c     (revision 0)
>>>> +++ testsuite/gcc.target/i386/memcpy-strategy-1.c     (revision 0)
>>>> @@ -0,0 +1,12 @@
>>>> +/* { dg-do compile } */
>>>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */
>>>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
>>>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
>>>> +
>>>> +char a[2048];
>>>> +char b[2048];
>>>> +void t (void)
>>>> +{
>>>> +  __builtin_memcpy (a, b, 2048);
>>>> +}
>>>> +
>>>> Index: testsuite/gcc.target/i386/memcpy-strategy-2.c
>>>> ===================================================================
>>>> --- testsuite/gcc.target/i386/memcpy-strategy-2.c     (revision 0)
>>>> +++ testsuite/gcc.target/i386/memcpy-strategy-2.c     (revision 0)
>>>> @@ -0,0 +1,12 @@
>>>> +/* { dg-do compile } */
>>>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */
>>>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
>>>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
>>>> +
>>>> +char a[2048];
>>>> +char b[2048];
>>>> +void t (void)
>>>> +{
>>>> +  __builtin_memcpy (a, b, 2048);
>>>> +}
>>>> +
>>>> Index: testsuite/gcc.target/i386/memset-strategy-1.c
>>>> ===================================================================
>>>> --- testsuite/gcc.target/i386/memset-strategy-1.c     (revision 0)
>>>> +++ testsuite/gcc.target/i386/memset-strategy-1.c     (revision 0)
>>>> @@ -0,0 +1,10 @@
>>>> +/* { dg-do compile } */
>>>> +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */
>>>> +/* { dg-final { scan-assembler-times "memset" 2  } } */
>>>> +
>>>> +char a[2048];
>>>> +void t (void)
>>>> +{
>>>> +  __builtin_memset (a, 1, 2048);
>>>> +}
>>>> +
>>>> Index: testsuite/gcc.target/i386/memcpy-strategy-3.c
>>>> ===================================================================
>>>> --- testsuite/gcc.target/i386/memcpy-strategy-3.c     (revision 0)
>>>> +++ testsuite/gcc.target/i386/memcpy-strategy-3.c     (revision 0)
>>>> @@ -0,0 +1,11 @@
>>>> +/* { dg-do compile } */
>>>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */
>>>> +/* { dg-final { scan-assembler-times "memcpy" 2  } } */
>>>> +
>>>> +char a[2048];
>>>> +char b[2048];
>>>> +void t (void)
>>>> +{
>>>> +  __builtin_memcpy (a, b, 2048);
>>>> +}
>>>> +
>>>
>
>
>
> --
> ---
> Best regards,
> Michael V. Zolotukhin,
> Software Engineer
> Intel Corporation.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: New parameters to control stringop expansion libcall strategy
  2013-08-06 16:42         ` Xinliang David Li
@ 2013-08-06 16:45           ` Xinliang David Li
  2013-08-07  7:14           ` Michael Zolotukhin
  1 sibling, 0 replies; 23+ messages in thread
From: Xinliang David Li @ 2013-08-06 16:45 UTC (permalink / raw)
  To: Michael Zolotukhin; +Cc: GCC Patches, Jan Hubicka, Teresa Johnson

[-- Attachment #1: Type: text/plain, Size: 27083 bytes --]

Forgot the patch.

David

On Tue, Aug 6, 2013 at 9:42 AM, Xinliang David Li <davidxl@google.com> wrote:
> Corrected two small problems reported by the style checker (the
> warnings about the EnumValue for options in stringop.opt are not
> valid).
>
> On Tue, Aug 6, 2013 at 1:46 AM, Michael Zolotukhin
> <michael.v.zolotukhin@gmail.com> wrote:
>> There are still some formatting issues (like 8 spaces instead of a
>> tab, wrong indentation of do-loop and some other places) - to reveal
>> some of them you could use contrib/check_GNU_style.sh script.
>> But that was nitpicking again :) Actually, I wanted to ask whether
>> you're going to use this option for some performance experiments
>> involving memmove/memset - if so, perhaps you could tune the existing
>> cost models as well? Is that possible?
>
> The option is designed for purposes like this.
>
> thanks,
>
> David
>
>>
>> Michael
>>
>> On 5 August 2013 20:44, Xinliang David Li <davidxl@google.com> wrote:
>>> thanks. Updated patch attached.
>>>
>>> David
>>>
>>> On Mon, Aug 5, 2013 at 3:57 AM, Michael V. Zolotukhin
>>> <michael.v.zolotukhin@gmail.com> wrote:
>>>> Hi,
>>>> This is a really convenient option, thanks for working on it.
>>>> I can't approve it as I'm not a maintainer, but it looks ok to me,
>>>> except for a small nitpick: AFAIR, comments should end with
>>>> dot-space-space.
>>>>
>>>> Michael
>>>>
>>>> On 04 Aug 20:01, Xinliang David Li wrote:
>>>>> The attached is a new patch implementing the stringop inline strategy
>>>>> control using two new -m options:
>>>>>
>>>>> -mmemcpy-strategy=
>>>>> -mmemset-strategy=
>>>>>
>>>>> See changes in doc/invoke.texi for description of the new options. Example:
>>>>>   -mmemcpy-strategy=rep_8byte:64:noalign,unrolled_loop:2048:noalign,libcall:-1:noalign
>>>>>
>>>>> tells the compiler to inline memcpy using rep_8byte when the size is no
>>>>> larger than 64 bytes, using unrolled_loop when the size is no larger than
>>>>> 2048, and for size > 2048, using a library call. In all cases,
>>>>> destination alignment adjustment is not done.
>>>>>
>>>>> Tested on x86-64/linux. Ok for trunk?
>>>>>
>>>>> thanks,
>>>>>
>>>>> David
>>>>>
>>>>> 2013-08-02  Xinliang David Li  <davidxl@google.com>
>>>>>
>>>>>         * config/i386/stringop.def: New file.
>>>>>         * config/i386/stringop.opt: New file.
>>>>>         * config/i386/i386-opts.h: Include stringop.def.
>>>>>         * config/i386/i386.opt: Include stringop.opt.
>>>>>         * config/i386/i386.c (ix86_option_override_internal):
>>>>>         Override default size based stringop inline strategies
>>>>>         with options.
>>>>>         * config/i386/i386.c (ix86_parse_stringop_strategy_string):
>>>>>         New function.
>>>>>
>>>>> 2013-08-04  Xinliang David Li  <davidxl@google.com>
>>>>>
>>>>>         * testsuite/gcc.target/i386/memcpy-strategy-1.c: New test.
>>>>>         * testsuite/gcc.target/i386/memcpy-strategy-2.c: Ditto.
>>>>>         * testsuite/gcc.target/i386/memset-strategy-1.c: Ditto.
>>>>>         * testsuite/gcc.target/i386/memcpy-strategy-3.c: Ditto.
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> On Fri, Aug 2, 2013 at 9:21 PM, Xinliang David Li <davidxl@google.com> wrote:
>>>>> > On x86_64, when the expected size of memcpy/memset is known (e.g., with
>>>>> > FDO), the libcall strategy is used when the size is > 8192. This value is
>>>>> > hard coded, which makes it hard to do performance tuning. This patch
>>>>> > adds two new parameters to do that. Potential usage includes
>>>>> > per-application libcall strategy min-size tuning based on summary data
>>>>> > with FDO (e.g., instruction working-set size).
>>>>> >
>>>>> > Bootstrap and tested on x86_64/linux. Ok for trunk?
>>>>> >
>>>>> > thanks,
>>>>> >
>>>>> > David
>>>>> >
>>>>> >
>>>>> > 2013-08-02  Xinliang David Li  <davidxl@google.com>
>>>>> >
>>>>> >         * params.def: New parameters.
>>>>> >         * config/i386/i386.c (ix86_option_override_internal):
>>>>> >         Override default libcall size limit with parameters.
>>>>
>>>>> Index: config/i386/stringop.def
>>>>> ===================================================================
>>>>> --- config/i386/stringop.def  (revision 0)
>>>>> +++ config/i386/stringop.def  (revision 0)
>>>>> @@ -0,0 +1,42 @@
>>>>> +/* Definitions for option handling for IA-32.
>>>>> +   Copyright (C) 2013 Free Software Foundation, Inc.
>>>>> +
>>>>> +This file is part of GCC.
>>>>> +
>>>>> +GCC is free software; you can redistribute it and/or modify
>>>>> +it under the terms of the GNU General Public License as published by
>>>>> +the Free Software Foundation; either version 3, or (at your option)
>>>>> +any later version.
>>>>> +
>>>>> +GCC is distributed in the hope that it will be useful,
>>>>> +but WITHOUT ANY WARRANTY; without even the implied warranty of
>>>>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>>>> +GNU General Public License for more details.
>>>>> +
>>>>> +Under Section 7 of GPL version 3, you are granted additional
>>>>> +permissions described in the GCC Runtime Library Exception, version
>>>>> +3.1, as published by the Free Software Foundation.
>>>>> +
>>>>> +You should have received a copy of the GNU General Public License and
>>>>> +a copy of the GCC Runtime Library Exception along with this program;
>>>>> +see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
>>>>> +<http://www.gnu.org/licenses/>.  */
>>>>> +
>>>>> +DEF_ENUM
>>>>> +DEF_ALG (no_stringop, no_stringop)
>>>>> +DEF_ENUM
>>>>> +DEF_ALG (libcall, libcall)
>>>>> +DEF_ENUM
>>>>> +DEF_ALG (rep_prefix_1_byte, rep_byte)
>>>>> +DEF_ENUM
>>>>> +DEF_ALG (rep_prefix_4_byte, rep_4byte)
>>>>> +DEF_ENUM
>>>>> +DEF_ALG (rep_prefix_8_byte, rep_8byte)
>>>>> +DEF_ENUM
>>>>> +DEF_ALG (loop_1_byte, byte_loop)
>>>>> +DEF_ENUM
>>>>> +DEF_ALG (loop, loop)
>>>>> +DEF_ENUM
>>>>> +DEF_ALG (unrolled_loop, unrolled_loop)
>>>>> +DEF_ENUM
>>>>> +DEF_ALG (vector_loop, vector_loop)
>>>>> Index: config/i386/i386.opt
>>>>> ===================================================================
>>>>> --- config/i386/i386.opt      (revision 201458)
>>>>> +++ config/i386/i386.opt      (working copy)
>>>>> @@ -316,6 +316,14 @@ mstack-arg-probe
>>>>>  Target Report Mask(STACK_PROBE) Save
>>>>>  Enable stack probing
>>>>>
>>>>> +mmemcpy-strategy=
>>>>> +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy)
>>>>> +Specify memcpy expansion strategy when expected size is known
>>>>> +
>>>>> +mmemset-strategy=
>>>>> +Target RejectNegative Joined Var(ix86_tune_memset_strategy)
>>>>> +Specify memset expansion strategy when expected size is known
>>>>> +
>>>>>  mstringop-strategy=
>>>>>  Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop)
>>>>>  Chose strategy to generate stringop using
>>>>> Index: config/i386/stringop.opt
>>>>> ===================================================================
>>>>> --- config/i386/stringop.opt  (revision 0)
>>>>> +++ config/i386/stringop.opt  (revision 0)
>>>>> @@ -0,0 +1,36 @@
>>>>> +/* Definitions for option handling for IA-32.
>>>>> +   Copyright (C) 2013 Free Software Foundation, Inc.
>>>>> +
>>>>> +This file is part of GCC.
>>>>> +
>>>>> +GCC is free software; you can redistribute it and/or modify
>>>>> +it under the terms of the GNU General Public License as published by
>>>>> +the Free Software Foundation; either version 3, or (at your option)
>>>>> +any later version.
>>>>> +
>>>>> +GCC is distributed in the hope that it will be useful,
>>>>> +but WITHOUT ANY WARRANTY; without even the implied warranty of
>>>>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>>>> +GNU General Public License for more details.
>>>>> +
>>>>> +Under Section 7 of GPL version 3, you are granted additional
>>>>> +permissions described in the GCC Runtime Library Exception, version
>>>>> +3.1, as published by the Free Software Foundation.
>>>>> +
>>>>> +You should have received a copy of the GNU General Public License and
>>>>> +a copy of the GCC Runtime Library Exception along with this program;
>>>>> +see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
>>>>> +<http://www.gnu.org/licenses/>.  */
>>>>> +
>>>>> +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte)
>>>>> +
>>>>> +#undef DEF_ENUM
>>>>> +#define DEF_ENUM EnumValue
>>>>> +
>>>>> +#undef DEF_ALG
>>>>> +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg)
>>>>> +
>>>>> +#include "stringop.def"
>>>>> +
>>>>> +#undef DEF_ENUM
>>>>> +#undef DEF_ALG
>>>>> Index: config/i386/i386.c
>>>>> ===================================================================
>>>>> --- config/i386/i386.c        (revision 201458)
>>>>> +++ config/i386/i386.c        (working copy)
>>>>> @@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost =
>>>>>  };
>>>>>
>>>>>  /* Processor costs (relative to an add) */
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs i386_cost = { /* 386 specific costs */
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>>>>> @@ -226,7 +226,7 @@ struct processor_costs i386_cost = {      /*
>>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>>  };
>>>>>
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs i486_cost = { /* 486 specific costs */
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>>>>> @@ -298,7 +298,7 @@ struct processor_costs i486_cost = {      /*
>>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>>  };
>>>>>
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs pentium_cost = {
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>>>>> @@ -368,7 +368,7 @@ struct processor_costs pentium_cost = {
>>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>>  };
>>>>>
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs pentiumpro_cost = {
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>>>>> @@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost =
>>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>>  };
>>>>>
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs geode_cost = {
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>>>>> @@ -518,7 +518,7 @@ struct processor_costs geode_cost = {
>>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>>  };
>>>>>
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs k6_cost = {
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    COSTS_N_INSNS (2),                 /* cost of a lea instruction */
>>>>> @@ -591,7 +591,7 @@ struct processor_costs k6_cost = {
>>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>>  };
>>>>>
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs athlon_cost = {
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    COSTS_N_INSNS (2),                 /* cost of a lea instruction */
>>>>> @@ -664,7 +664,7 @@ struct processor_costs athlon_cost = {
>>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>>  };
>>>>>
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs k8_cost = {
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    COSTS_N_INSNS (2),                 /* cost of a lea instruction */
>>>>> @@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = {
>>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>>  };
>>>>>
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs pentium4_cost = {
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    COSTS_N_INSNS (3),                 /* cost of a lea instruction */
>>>>> @@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = {
>>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>>  };
>>>>>
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs nocona_cost = {
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>>>>> @@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = {
>>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>>  };
>>>>>
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs atom_cost = {
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    COSTS_N_INSNS (1) + 1,             /* cost of a lea instruction */
>>>>> @@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = {
>>>>>  };
>>>>>
>>>>>  /* Generic64 should produce code tuned for Nocona and K8.  */
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs generic64_cost = {
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    /* On all chips taken into consideration lea is 2 cycles and more.  With
>>>>> @@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost =
>>>>>  };
>>>>>
>>>>>  /* core_cost should produce code tuned for Core familly of CPUs.  */
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs core_cost = {
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    /* On all chips taken into consideration lea is 2 cycles and more.  With
>>>>> @@ -1717,7 +1717,7 @@ struct processor_costs core_cost = {
>>>>>
>>>>>  /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
>>>>>     Athlon and K8.  */
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs generic32_cost = {
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    COSTS_N_INSNS (1) + 1,             /* cost of a lea instruction */
>>>>> @@ -2900,6 +2900,150 @@ ix86_debug_options (void)
>>>>>
>>>>>    return;
>>>>>  }
>>>>> +
>>>>> +static const char *stringop_alg_names[] = {
>>>>> +#define DEF_ENUM
>>>>> +#define DEF_ALG(alg, name) #name,
>>>>> +#include "stringop.def"
>>>>> +#undef DEF_ENUM
>>>>> +#undef DEF_ALG
>>>>> +};
>>>>> +
>>>>> +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
>>>>> +   The string is of the following form (or comma separated list of it):
>>>>> +
>>>>> +     strategy_alg:max_size:[align|noalign]
>>>>> +
>>>>> +   where the full size range for the strategy is either [0, max_size] or
>>>>> +   [min_size, max_size], in which min_size is the max_size + 1 of the
>>>>> +   preceding range.  The last size range must have max_size == -1.
>>>>> +
>>>>> +   Examples:
>>>>> +
>>>>> +    1.
>>>>> +       -mmemcpy-strategy=libcall:-1:noalign
>>>>> +
>>>>> +      this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
>>>>> +
>>>>> +
>>>>> +   2.
>>>>> +      -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
>>>>> +
>>>>> +      This is to tell the compiler to use the following strategy for memset
>>>>> +      1) when the expected size is between [1, 16], use rep_8byte strategy;
>>>>> +      2) when the size is between [17, 2048], use vector_loop;
>>>>> +      3) when the size is > 2048, use libcall.
>>>>> +
>>>>> +*/
>>>>> +
>>>>> +struct stringop_size_range
>>>>> +{
>>>>> +  int min;
>>>>> +  int max;
>>>>> +  stringop_alg alg;
>>>>> +  bool noalign;
>>>>> +};
>>>>> +
>>>>> +static void
>>>>> +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
>>>>> +{
>>>>> +  const struct stringop_algs *default_algs;
>>>>> +  stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
>>>>> +  char *curr_range_str, *next_range_str;
>>>>> +  int i = 0, n = 0;
>>>>> +
>>>>> +  if (is_memset)
>>>>> +    default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
>>>>> +  else
>>>>> +    default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
>>>>> +
>>>>> +  curr_range_str = strategy_str;
>>>>> +
>>>>> +  do {
>>>>> +
>>>>> +    int mins, maxs;
>>>>> +    stringop_alg alg;
>>>>> +    char alg_name[128];
>>>>> +    char align[16];
>>>>> +
>>>>> +    next_range_str = strchr (curr_range_str, ',');
>>>>> +    if (next_range_str)
>>>>> +      *next_range_str++ = '\0';
>>>>> +
>>>>> +    if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align))
>>>>> +      {
>>>>> +        warning (0, "Wrong arg %s to option %s", curr_range_str,
>>>>> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>>>>> +        return;
>>>>> +      }
>>>>> +
>>>>> +    if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1))
>>>>> +      {
>>>>> +        warning (0, "Size ranges of option %s should be increasing",
>>>>> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>>>>> +        return;
>>>>> +      }
>>>>> +
>>>>> +    for (i = 0; i < last_alg; i++)
>>>>> +      {
>>>>> +        if (!strcmp (alg_name, stringop_alg_names[i]))
>>>>> +       {
>>>>> +         alg = (stringop_alg) i;
>>>>> +         break;
>>>>> +          }
>>>>> +      }
>>>>> +
>>>>> +    if (i == last_alg)
>>>>> +      {
>>>>> +        warning (0, "Wrong stringop strategy name %s specified for option %s",
>>>>> +              alg_name,
>>>>> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>>>>> +     return;
>>>>> +      }
>>>>> +
>>>>> +    input_ranges[n].min = mins;
>>>>> +    input_ranges[n].max = maxs;
>>>>> +    input_ranges[n].alg = alg;
>>>>> +    if (!strcmp (align, "align"))
>>>>> +      input_ranges[n].noalign = false;
>>>>> +    else if (!strcmp (align, "noalign"))
>>>>> +      input_ranges[n].noalign = true;
>>>>> +    else
>>>>> +      {
>>>>> +        warning (0, "Unknown alignment %s specified for option %s",
>>>>> +                 align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>>>>> +        return;
>>>>> +      }
>>>>> +    n++;
>>>>> +    curr_range_str = next_range_str;
>>>>> +  } while (curr_range_str);
>>>>> +
>>>>> +  if (input_ranges[n - 1].max != -1)
>>>>> +    {
>>>>> +      warning (0, "The max value for the last size range should be -1"
>>>>> +               " for option %s",
>>>>> +               is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>>>>> +      return;
>>>>> +    }
>>>>> +
>>>>> +  if (n > MAX_STRINGOP_ALGS)
>>>>> +    {
>>>>> +      warning (0, "Too many size ranges specified in option %s",
>>>>> +               is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>>>>> +      return;
>>>>> +    }
>>>>> +
>>>>> +  /* Now override the default algs array  */
>>>>> +  for (i = 0; i < n; i++)
>>>>> +    {
>>>>> +      *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
>>>>> +      *const_cast<stringop_alg *>(&default_algs->size[i].alg)
>>>>> +          = input_ranges[i].alg;
>>>>> +      *const_cast<int *>(&default_algs->size[i].noalign)
>>>>> +          = input_ranges[i].noalign;
>>>>> +    }
>>>>> +}
>>>>> +
>>>>>
>>>>>  /* Override various settings based on options.  If MAIN_ARGS_P, the
>>>>>     options are from the command line, otherwise they are from
>>>>> @@ -4021,6 +4165,21 @@ ix86_option_override_internal (bool main
>>>>>    /* Handle stack protector */
>>>>>    if (!global_options_set.x_ix86_stack_protector_guard)
>>>>>      ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
>>>>> +
>>>>> +  /* Handle -mmemcpy-strategy= and -mmemset-strategy=  */
>>>>> +  if (ix86_tune_memcpy_strategy)
>>>>> +    {
>>>>> +      char *str = xstrdup (ix86_tune_memcpy_strategy);
>>>>> +      ix86_parse_stringop_strategy_string (str, false);
>>>>> +      free (str);
>>>>> +    }
>>>>> +
>>>>> +  if (ix86_tune_memset_strategy)
>>>>> +    {
>>>>> +      char *str = xstrdup (ix86_tune_memset_strategy);
>>>>> +      ix86_parse_stringop_strategy_string (str, true);
>>>>> +      free (str);
>>>>> +    }
>>>>>  }
>>>>>
>>>>>  /* Implement the TARGET_OPTION_OVERRIDE hook.  */
>>>>> @@ -22903,6 +23062,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
>>>>>      {
>>>>>      case libcall:
>>>>>      case no_stringop:
>>>>> +    case last_alg:
>>>>>        gcc_unreachable ();
>>>>>      case loop_1_byte:
>>>>>        need_zero_guard = true;
>>>>> @@ -23093,6 +23253,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
>>>>>      {
>>>>>      case libcall:
>>>>>      case no_stringop:
>>>>> +    case last_alg:
>>>>>        gcc_unreachable ();
>>>>>      case loop_1_byte:
>>>>>      case loop:
>>>>> @@ -23304,6 +23465,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
>>>>>      {
>>>>>      case libcall:
>>>>>      case no_stringop:
>>>>> +    case last_alg:
>>>>>        gcc_unreachable ();
>>>>>      case loop:
>>>>>        need_zero_guard = true;
>>>>> @@ -23481,6 +23643,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
>>>>>      {
>>>>>      case libcall:
>>>>>      case no_stringop:
>>>>> +    case last_alg:
>>>>>        gcc_unreachable ();
>>>>>      case loop_1_byte:
>>>>>      case loop:
>>>>> Index: config/i386/i386-opts.h
>>>>> ===================================================================
>>>>> --- config/i386/i386-opts.h   (revision 201458)
>>>>> +++ config/i386/i386-opts.h   (working copy)
>>>>> @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI
>>>>>  /* Algorithm to expand string function with.  */
>>>>>  enum stringop_alg
>>>>>  {
>>>>> -   no_stringop,
>>>>> -   libcall,
>>>>> -   rep_prefix_1_byte,
>>>>> -   rep_prefix_4_byte,
>>>>> -   rep_prefix_8_byte,
>>>>> -   loop_1_byte,
>>>>> -   loop,
>>>>> -   unrolled_loop,
>>>>> -   vector_loop
>>>>> +#undef DEF_ENUM
>>>>> +#define DEF_ENUM
>>>>> +
>>>>> +#undef DEF_ALG
>>>>> +#define DEF_ALG(alg, name) alg,
>>>>> +
>>>>> +#include "stringop.def"
>>>>> +last_alg
>>>>> +
>>>>> +#undef DEF_ENUM
>>>>> +#undef DEF_ALG
>>>>>  };
>>>>>
>>>>>  /* Available call abi.  */
>>>>> Index: doc/invoke.texi
>>>>> ===================================================================
>>>>> --- doc/invoke.texi   (revision 201458)
>>>>> +++ doc/invoke.texi   (working copy)
>>>>> @@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}.
>>>>>  -mbmi2 -mrtm -mlwp -mthreads @gol
>>>>>  -mno-align-stringops  -minline-all-stringops @gol
>>>>>  -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
>>>>> +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy}
>>>>>  -mpush-args  -maccumulate-outgoing-args  -m128bit-long-double @gol
>>>>>  -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol
>>>>>  -mregparm=@var{num}  -msseregparm @gol
>>>>> @@ -14598,6 +14599,24 @@ Expand into an inline loop.
>>>>>  Always use a library call.
>>>>>  @end table
>>>>>
>>>>> +@item -mmemcpy-strategy=@var{strategy}
>>>>> +@opindex mmemcpy-strategy=@var{strategy}
>>>>> +Override the internal decision heuristic to decide if @code{__builtin_memcpy}
>>>>> +should be inlined and what inline algorithm to use when the expected size
>>>>> +of the copy operation is known. @var{strategy}
>>>>> +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets.
>>>>> +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies
>>>>> +the max byte size with which inline algorithm @var{alg} is allowed. For the last
>>>>> +triplet, the @var{max_size} must be @code{-1}. The @var{max_size} of the triplets
>>>>> +in the list must be specified in increasing order. The minimal byte size for
>>>>> +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the
>>>>> +preceding range.
>>>>> +
>>>>> +@item -mmemset-strategy=@var{strategy}
>>>>> +@opindex mmemset-strategy=@var{strategy}
>>>>> +The option is similar to @option{-mmemcpy-strategy=} except that it is to control
>>>>> +@code{__builtin_memset} expansion.
>>>>> +
>>>>>  @item -momit-leaf-frame-pointer
>>>>>  @opindex momit-leaf-frame-pointer
>>>>>  Don't keep the frame pointer in a register for leaf functions.  This
>>>>> Index: testsuite/gcc.target/i386/memcpy-strategy-1.c
>>>>> ===================================================================
>>>>> --- testsuite/gcc.target/i386/memcpy-strategy-1.c     (revision 0)
>>>>> +++ testsuite/gcc.target/i386/memcpy-strategy-1.c     (revision 0)
>>>>> @@ -0,0 +1,12 @@
>>>>> +/* { dg-do compile } */
>>>>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */
>>>>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
>>>>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
>>>>> +
>>>>> +char a[2048];
>>>>> +char b[2048];
>>>>> +void t (void)
>>>>> +{
>>>>> +  __builtin_memcpy (a, b, 2048);
>>>>> +}
>>>>> +
>>>>> Index: testsuite/gcc.target/i386/memcpy-strategy-2.c
>>>>> ===================================================================
>>>>> --- testsuite/gcc.target/i386/memcpy-strategy-2.c     (revision 0)
>>>>> +++ testsuite/gcc.target/i386/memcpy-strategy-2.c     (revision 0)
>>>>> @@ -0,0 +1,12 @@
>>>>> +/* { dg-do compile } */
>>>>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */
>>>>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
>>>>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
>>>>> +
>>>>> +char a[2048];
>>>>> +char b[2048];
>>>>> +void t (void)
>>>>> +{
>>>>> +  __builtin_memcpy (a, b, 2048);
>>>>> +}
>>>>> +
>>>>> Index: testsuite/gcc.target/i386/memset-strategy-1.c
>>>>> ===================================================================
>>>>> --- testsuite/gcc.target/i386/memset-strategy-1.c     (revision 0)
>>>>> +++ testsuite/gcc.target/i386/memset-strategy-1.c     (revision 0)
>>>>> @@ -0,0 +1,10 @@
>>>>> +/* { dg-do compile } */
>>>>> +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */
>>>>> +/* { dg-final { scan-assembler-times "memset" 2  } } */
>>>>> +
>>>>> +char a[2048];
>>>>> +void t (void)
>>>>> +{
>>>>> +  __builtin_memset (a, 1, 2048);
>>>>> +}
>>>>> +
>>>>> Index: testsuite/gcc.target/i386/memcpy-strategy-3.c
>>>>> ===================================================================
>>>>> --- testsuite/gcc.target/i386/memcpy-strategy-3.c     (revision 0)
>>>>> +++ testsuite/gcc.target/i386/memcpy-strategy-3.c     (revision 0)
>>>>> @@ -0,0 +1,11 @@
>>>>> +/* { dg-do compile } */
>>>>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */
>>>>> +/* { dg-final { scan-assembler-times "memcpy" 2  } } */
>>>>> +
>>>>> +char a[2048];
>>>>> +char b[2048];
>>>>> +void t (void)
>>>>> +{
>>>>> +  __builtin_memcpy (a, b, 2048);
>>>>> +}
>>>>> +
>>>>
>>
>>
>>
>> --
>> ---
>> Best regards,
>> Michael V. Zolotukhin,
>> Software Engineer
>> Intel Corporation.

[-- Attachment #2: stringop_inl_option.p.txt --]
[-- Type: text/plain, Size: 18836 bytes --]

Index: doc/invoke.texi
===================================================================
--- doc/invoke.texi	(revision 201458)
+++ doc/invoke.texi	(working copy)
@@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}.
 -mbmi2 -mrtm -mlwp -mthreads @gol
 -mno-align-stringops  -minline-all-stringops @gol
 -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
+-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy} 
 -mpush-args  -maccumulate-outgoing-args  -m128bit-long-double @gol
 -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol
 -mregparm=@var{num}  -msseregparm @gol
@@ -14598,6 +14599,24 @@ Expand into an inline loop.
 Always use a library call.
 @end table
 
+@item -mmemcpy-strategy=@var{strategy}
+@opindex mmemcpy-strategy=@var{strategy}
+Override the internal decision heuristic to decide if @code{__builtin_memcpy}
+should be inlined and what inline algorithm to use when the expected size
+of the copy operation is known. @var{strategy} 
+is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets. 
+@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies
+the max byte size with which inline algorithm @var{alg} is allowed.  For the last
+triplet, the @var{max_size} must be @code{-1}. The @var{max_size} of the triplets
+in the list must be specified in increasing order.  The minimal byte size for 
+@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the 
+preceding range.
+
+@item -mmemset-strategy=@var{strategy}
+@opindex mmemset-strategy=@var{strategy}
+The option is similar to @option{-mmemcpy-strategy=} except that it is to control
+@code{__builtin_memset} expansion.
+
 @item -momit-leaf-frame-pointer
 @opindex momit-leaf-frame-pointer
 Don't keep the frame pointer in a register for leaf functions.  This
Index: testsuite/gcc.target/i386/memcpy-strategy-2.c
===================================================================
--- testsuite/gcc.target/i386/memcpy-strategy-2.c	(revision 0)
+++ testsuite/gcc.target/i386/memcpy-strategy-2.c	(revision 0)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */
+/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
+/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
+
+char a[2048];
+char b[2048];
+void t (void)
+{
+  __builtin_memcpy (a, b, 2048);
+}
+
Index: testsuite/gcc.target/i386/memset-strategy-1.c
===================================================================
--- testsuite/gcc.target/i386/memset-strategy-1.c	(revision 0)
+++ testsuite/gcc.target/i386/memset-strategy-1.c	(revision 0)
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */
+/* { dg-final { scan-assembler-times "memset" 2  } } */
+
+char a[2048];
+void t (void)
+{
+  __builtin_memset (a, 1, 2048);
+}
+
Index: testsuite/gcc.target/i386/memcpy-strategy-3.c
===================================================================
--- testsuite/gcc.target/i386/memcpy-strategy-3.c	(revision 0)
+++ testsuite/gcc.target/i386/memcpy-strategy-3.c	(revision 0)
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */
+/* { dg-final { scan-assembler-times "memcpy" 2  } } */
+
+char a[2048];
+char b[2048];
+void t (void)
+{
+  __builtin_memcpy (a, b, 2048);
+}
+
Index: testsuite/gcc.target/i386/memcpy-strategy-1.c
===================================================================
--- testsuite/gcc.target/i386/memcpy-strategy-1.c	(revision 0)
+++ testsuite/gcc.target/i386/memcpy-strategy-1.c	(revision 0)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */
+/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
+/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
+
+char a[2048];
+char b[2048];
+void t (void)
+{
+  __builtin_memcpy (a, b, 2048);
+}
+
Index: config/i386/stringop.opt
===================================================================
--- config/i386/stringop.opt	(revision 0)
+++ config/i386/stringop.opt	(revision 0)
@@ -0,0 +1,36 @@
+/* Definitions for option handling for IA-32.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte)
+
+#undef DEF_ENUM
+#define DEF_ENUM EnumValue
+
+#undef DEF_ALG
+#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg)
+
+#include "stringop.def"
+
+#undef DEF_ENUM
+#undef DEF_ALG
Index: config/i386/i386-opts.h
===================================================================
--- config/i386/i386-opts.h	(revision 201458)
+++ config/i386/i386-opts.h	(working copy)
@@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI
 /* Algorithm to expand string function with.  */
 enum stringop_alg
 {
-   no_stringop,
-   libcall,
-   rep_prefix_1_byte,
-   rep_prefix_4_byte,
-   rep_prefix_8_byte,
-   loop_1_byte,
-   loop,
-   unrolled_loop,
-   vector_loop
+#undef DEF_ENUM
+#define DEF_ENUM
+
+#undef DEF_ALG
+#define DEF_ALG(alg, name) alg, 
+
+#include "stringop.def"
+last_alg
+
+#undef DEF_ENUM
+#undef DEF_ALG
 };
 
 /* Available call abi.  */
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 201458)
+++ config/i386/i386.c	(working copy)
@@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost =
 };
 
 /* Processor costs (relative to an add) */
-static const
+static
 struct processor_costs i386_cost = {	/* 386 specific costs */
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -226,7 +226,7 @@ struct processor_costs i386_cost = {	/*
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs i486_cost = {	/* 486 specific costs */
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -298,7 +298,7 @@ struct processor_costs i486_cost = {	/*
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs pentium_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -368,7 +368,7 @@ struct processor_costs pentium_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs pentiumpro_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost =
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs geode_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -518,7 +518,7 @@ struct processor_costs geode_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs k6_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
@@ -591,7 +591,7 @@ struct processor_costs k6_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs athlon_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
@@ -664,7 +664,7 @@ struct processor_costs athlon_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs k8_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
@@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs pentium4_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (3),			/* cost of a lea instruction */
@@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs nocona_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs atom_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
@@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = {
 };
 
 /* Generic64 should produce code tuned for Nocona and K8.  */
-static const
+static
 struct processor_costs generic64_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   /* On all chips taken into consideration lea is 2 cycles and more.  With
@@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost =
 };
 
 /* core_cost should produce code tuned for Core familly of CPUs.  */
-static const
+static
 struct processor_costs core_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   /* On all chips taken into consideration lea is 2 cycles and more.  With
@@ -1717,7 +1717,7 @@ struct processor_costs core_cost = {
 
 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
    Athlon and K8.  */
-static const
+static
 struct processor_costs generic32_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
@@ -2900,6 +2900,148 @@ ix86_debug_options (void)
 
   return;
 }
+
+static const char *stringop_alg_names[] = {  /* One string per DEF_ALG.  */
+#define DEF_ENUM  /* Expands to nothing here; only DEF_ALG contributes.  */
+#define DEF_ALG(alg, name) #name,  /* Stringify the option spelling.  */
+#include "stringop.def"  /* Same list that defines enum stringop_alg.  */
+#undef DEF_ENUM
+#undef DEF_ALG
+};
+
+/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
+   The string is of the following form (or comma separated list of it):
+
+     strategy_alg:max_size:[align|noalign]
+
+   where the full size range for the strategy is either [0, max_size] or
+   [min_size, max_size], in which min_size is the max_size + 1 of the
+   preceding range.  The last size range must have max_size == -1.
+
+   Examples:
+
+    1.
+       -mmemcpy-strategy=libcall:-1:noalign
+
+      this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
+
+
+   2.
+      -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
+
+      This is to tell the compiler to use the following strategy for memset
+      1) when the expected size is between [1, 16], use rep_8byte strategy;
+      2) when the size is between [17, 2048], use vector_loop;
+      3) when the size is > 2048, use libcall.  */
+
+struct stringop_size_range
+{
+  int min;  /* Lower bound recorded by the parser for this range.  */
+  int max;  /* Parsed max_size; the last range is required to use -1.  */
+  stringop_alg alg;  /* Algorithm whose name matched stringop_alg_names.  */
+  bool noalign;  /* True for "noalign", false for "align".  */
+};
+
+static void
+ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
+{
+  const struct stringop_algs *default_algs;
+  stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
+  char *curr_range_str, *next_range_str;
+  /* Spell the option as i386.opt declares it (hyphens) in diagnostics.  */
+  const char *opt = is_memset ? "-mmemset-strategy=" : "-mmemcpy-strategy=";
+  int i = 0, n = 0;
+
+  /* Pick the table of the active cost model that is to be overridden.  */
+  if (is_memset)
+    default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
+  else
+    default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
+
+  curr_range_str = strategy_str;
+  do {
+    /* The first range starts at 0; later ones at the previous max + 1.  */
+    int mins = n > 0 ? input_ranges[n - 1].max + 1 : 0;
+    int maxs;
+    stringop_alg alg = no_stringop;
+    char alg_name[128];
+    char align[16];
+
+    /* Split off the current range by terminating it at the next comma.  */
+    next_range_str = strchr (curr_range_str, ',');
+    if (next_range_str)
+      *next_range_str++ = '\0';
+
+    /* Reject surplus ranges before input_ranges[n] is written.  */
+    if (n >= MAX_STRINGOP_ALGS)
+      {
+        warning (0, "Too many size ranges specified in option %s", opt);
+        return;
+      }
+
+    /* Field widths keep sscanf within alg_name[128] and align[16].  */
+    if (3 != sscanf (curr_range_str, "%127[^:]:%d:%15s",
+                     alg_name, &maxs, align))
+      {
+        warning (0, "Wrong arg %s to option %s", curr_range_str, opt);
+        return;
+      }
+
+    if (n > 0 && maxs < mins && maxs != -1)
+      {
+        warning (0, "Size ranges of option %s should be increasing", opt);
+        return;
+      }
+
+    for (i = 0; i < last_alg; i++)
+      if (!strcmp (alg_name, stringop_alg_names[i]))
+        {
+          alg = (stringop_alg) i;
+          break;
+        }
+
+    if (i == last_alg)
+      {
+        warning (0, "Wrong stringop strategy name %s specified for option %s",
+                 alg_name, opt);
+        return;
+      }
+
+    input_ranges[n].min = mins;
+    input_ranges[n].max = maxs;
+    input_ranges[n].alg = alg;
+    if (!strcmp (align, "align"))
+      input_ranges[n].noalign = false;
+    else if (!strcmp (align, "noalign"))
+      input_ranges[n].noalign = true;
+    else
+      {
+        warning (0, "Unknown alignment %s specified for option %s",
+                 align, opt);
+        return;
+      }
+    n++;
+    curr_range_str = next_range_str;
+  } while (curr_range_str);
+
+  if (input_ranges[n - 1].max != -1)
+    {
+      warning (0, "The max value for the last size range should be -1"
+               " for option %s", opt);
+      return;
+    }
+
+  /* Now override the default algs array.  */
+  for (i = 0; i < n; i++)
+    {
+      *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
+      *const_cast<stringop_alg *>(&default_algs->size[i].alg)
+          = input_ranges[i].alg;
+      *const_cast<int *>(&default_algs->size[i].noalign)
+          = input_ranges[i].noalign;
+    }
+}
+
 \f
 /* Override various settings based on options.  If MAIN_ARGS_P, the
    options are from the command line, otherwise they are from
@@ -4021,6 +4163,21 @@ ix86_option_override_internal (bool main
   /* Handle stack protector */
   if (!global_options_set.x_ix86_stack_protector_guard)
     ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
+
+  /* Handle -mmemcpy-strategy= and -mmemset-strategy=  */
+  if (ix86_tune_memcpy_strategy)
+    {
+      char *str = xstrdup (ix86_tune_memcpy_strategy);
+      ix86_parse_stringop_strategy_string (str, false);
+      free (str);
+    }
+
+  if (ix86_tune_memset_strategy)
+    {
+      char *str = xstrdup (ix86_tune_memset_strategy);
+      ix86_parse_stringop_strategy_string (str, true);
+      free (str);
+    }
 }
 
 /* Implement the TARGET_OPTION_OVERRIDE hook.  */
@@ -22903,6 +23060,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
     {
     case libcall:
     case no_stringop:
+    case last_alg:
       gcc_unreachable ();
     case loop_1_byte:
       need_zero_guard = true;
@@ -23093,6 +23251,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
     {
     case libcall:
     case no_stringop:
+    case last_alg:
       gcc_unreachable ();
     case loop_1_byte:
     case loop:
@@ -23304,6 +23463,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
     {
     case libcall:
     case no_stringop:
+    case last_alg:
       gcc_unreachable ();
     case loop:
       need_zero_guard = true;
@@ -23481,6 +23641,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
     {
     case libcall:
     case no_stringop:
+    case last_alg:
       gcc_unreachable ();
     case loop_1_byte:
     case loop:
Index: config/i386/i386.opt
===================================================================
--- config/i386/i386.opt	(revision 201458)
+++ config/i386/i386.opt	(working copy)
@@ -316,6 +316,14 @@ mstack-arg-probe
 Target Report Mask(STACK_PROBE) Save
 Enable stack probing
 
+mmemcpy-strategy=
+Target RejectNegative Joined Var(ix86_tune_memcpy_strategy)
+Specify memcpy expansion strategy when expected size is known
+
+mmemset-strategy=
+Target RejectNegative Joined Var(ix86_tune_memset_strategy)
+Specify memset expansion strategy when expected size is known
+
 mstringop-strategy=
 Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop)
 Chose strategy to generate stringop using
Index: config/i386/stringop.def
===================================================================
--- config/i386/stringop.def	(revision 0)
+++ config/i386/stringop.def	(revision 0)
@@ -0,0 +1,42 @@
+/* Definitions for option handling for IA-32.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+DEF_ENUM
+DEF_ALG (no_stringop, no_stringop)
+DEF_ENUM
+DEF_ALG (libcall, libcall)
+DEF_ENUM
+DEF_ALG (rep_prefix_1_byte, rep_byte)
+DEF_ENUM
+DEF_ALG (rep_prefix_4_byte, rep_4byte)
+DEF_ENUM
+DEF_ALG (rep_prefix_8_byte, rep_8byte)
+DEF_ENUM
+DEF_ALG (loop_1_byte, byte_loop)
+DEF_ENUM
+DEF_ALG (loop, loop)
+DEF_ENUM
+DEF_ALG (unrolled_loop, unrolled_loop)
+DEF_ENUM
+DEF_ALG (vector_loop, vector_loop)

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: New parameters to control stringop expansion libcall strategy
  2013-08-06 16:42         ` Xinliang David Li
  2013-08-06 16:45           ` Xinliang David Li
@ 2013-08-07  7:14           ` Michael Zolotukhin
  1 sibling, 0 replies; 23+ messages in thread
From: Michael Zolotukhin @ 2013-08-07  7:14 UTC (permalink / raw)
  To: Xinliang David Li; +Cc: GCC Patches, Jan Hubicka, Teresa Johnson

> the option is designed for purpose like this.
That's great, thanks!

Michael
> David

On 6 August 2013 20:42, Xinliang David Li <davidxl@google.com> wrote:
> Corrected two small problems reported by the style checker (The
> warnings about the EnumValue for options in stringop.opt are not
> valid).
>
> On Tue, Aug 6, 2013 at 1:46 AM, Michael Zolotukhin
> <michael.v.zolotukhin@gmail.com> wrote:
>> There are still some formatting issues (like 8 spaces instead of a
>> tab, wrong indentation of do-loop and some other places) - to reveal
>> some of them you could use contrib/check_GNU_style.sh script.
>> But that was a nitpicking again:) Actually I wanted to ask whether
>> you're going to use this option for some performance experiments
>> involving memmov/memset - if so, probably you could tune existing
>> cost-models as well? Is it possible?
>
> the option is designed for purpose like this.
>
> thanks,
>
> David
>
>>
>> Michael
>>
>> On 5 August 2013 20:44, Xinliang David Li <davidxl@google.com> wrote:
>>> thanks. Updated patch attached.
>>>
>>> David
>>>
>>> On Mon, Aug 5, 2013 at 3:57 AM, Michael V. Zolotukhin
>>> <michael.v.zolotukhin@gmail.com> wrote:
>>>> Hi,
>>>> This is a really convenient option, thanks for working on it.
>>>> I can't approve it as I'm not a maintainer, but it looks ok to me,
>>>> except for a small nitpicking: afair, comments should end with
>>>> dot-space-space.
>>>>
>>>> Michael
>>>>
>>>> On 04 Aug 20:01, Xinliang David Li wrote:
>>>>> The attached is a new patch implementing the stringop inline strategy
>>>>> control using two new -m options:
>>>>>
>>>>> -mmemcpy-strategy=
>>>>> -mmemset-strategy=
>>>>>
>>>>> See changes in doc/invoke.texi for description of the new options. Example:
>>>>>   -mmemcpy-strategy=rep_8byte:64:noalign,unrolled_loop:2048:noalign,libcall:-1:noalign
>>>>>
>>>>> tells compiler to inline memcpy using rep_8byte when the size is no
>>>>> larger than 64 byte, using unrolled_loop when size is no larger than
>>>>> 2048, and for size > 2048, using library call. In all cases,
>>>>> destination alignment adjustment is not done.
>>>>>
>>>>> Tested on x86-64/linux. Ok for trunk?
>>>>>
>>>>> thanks,
>>>>>
>>>>> David
>>>>>
>>>>> 2013-08-02  Xinliang David Li  <davidxl@google.com>
>>>>>
>>>>>         * config/i386/stringop.def: New file.
>>>>>         * config/i386/stringop.opt: New file.
>>>>>         * config/i386/i386-opts.h: Include stringop.def.
>>>>>         * config/i386/i386.opt: Include stringop.opt.
>>>>>         * config/i386/i386.c (ix86_option_override_internal):
>>>>>         Override default size based stringop inline strategies
>>>>>         with options.
>>>>>         * config/i386/i386.c (ix86_parse_stringop_strategy_string):
>>>>>         New function.
>>>>>
>>>>> 2013-08-04  Xinliang David Li  <davidxl@google.com>
>>>>>
>>>>>         * testsuite/gcc.target/i386/memcpy-strategy-1.c: New test.
>>>>>         * testsuite/gcc.target/i386/memcpy-strategy-2.c: Ditto.
>>>>>         * testsuite/gcc.target/i386/memset-strategy-1.c: Ditto.
>>>>>         * testsuite/gcc.target/i386/memcpy-strategy-3.c: Ditto.
>>>>>
>>>>>
>>>>>
>>>>>
>>>>> On Fri, Aug 2, 2013 at 9:21 PM, Xinliang David Li <davidxl@google.com> wrote:
>>>>> > On x86_64, when the expected size of memcpy/memset is known (e.g., with
>>>>> > FDO), libcall strategy is used when the size is > 8192. This value is
>>>>> > hard coded, which makes it hard to do performance tuning. This patch
>>>>> > adds two new parameters to do that. Potential usage includes
>>>>> > per-application libcall strategy min-size tuning based on summary data
>>>>> > with FDO (e.g, instruction workset size).
>>>>> >
>>>>> > Bootstrap and tested on x86_64/linux. Ok for trunk?
>>>>> >
>>>>> > thanks,
>>>>> >
>>>>> > David
>>>>> >
>>>>> >
>>>>> > 2013-08-02  Xinliang David Li  <davidxl@google.com>
>>>>> >
>>>>> >         * params.def: New parameters.
>>>>> >         * config/i386/i386.c (ix86_option_override_internal):
>>>>> >         Override default libcall size limit with parameters.
>>>>
>>>>> Index: config/i386/stringop.def
>>>>> ===================================================================
>>>>> --- config/i386/stringop.def  (revision 0)
>>>>> +++ config/i386/stringop.def  (revision 0)
>>>>> @@ -0,0 +1,42 @@
>>>>> +/* Definitions for option handling for IA-32.
>>>>> +   Copyright (C) 2013 Free Software Foundation, Inc.
>>>>> +
>>>>> +This file is part of GCC.
>>>>> +
>>>>> +GCC is free software; you can redistribute it and/or modify
>>>>> +it under the terms of the GNU General Public License as published by
>>>>> +the Free Software Foundation; either version 3, or (at your option)
>>>>> +any later version.
>>>>> +
>>>>> +GCC is distributed in the hope that it will be useful,
>>>>> +but WITHOUT ANY WARRANTY; without even the implied warranty of
>>>>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>>>> +GNU General Public License for more details.
>>>>> +
>>>>> +Under Section 7 of GPL version 3, you are granted additional
>>>>> +permissions described in the GCC Runtime Library Exception, version
>>>>> +3.1, as published by the Free Software Foundation.
>>>>> +
>>>>> +You should have received a copy of the GNU General Public License and
>>>>> +a copy of the GCC Runtime Library Exception along with this program;
>>>>> +see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
>>>>> +<http://www.gnu.org/licenses/>.  */
>>>>> +
>>>>> +DEF_ENUM
>>>>> +DEF_ALG (no_stringop, no_stringop)
>>>>> +DEF_ENUM
>>>>> +DEF_ALG (libcall, libcall)
>>>>> +DEF_ENUM
>>>>> +DEF_ALG (rep_prefix_1_byte, rep_byte)
>>>>> +DEF_ENUM
>>>>> +DEF_ALG (rep_prefix_4_byte, rep_4byte)
>>>>> +DEF_ENUM
>>>>> +DEF_ALG (rep_prefix_8_byte, rep_8byte)
>>>>> +DEF_ENUM
>>>>> +DEF_ALG (loop_1_byte, byte_loop)
>>>>> +DEF_ENUM
>>>>> +DEF_ALG (loop, loop)
>>>>> +DEF_ENUM
>>>>> +DEF_ALG (unrolled_loop, unrolled_loop)
>>>>> +DEF_ENUM
>>>>> +DEF_ALG (vector_loop, vector_loop)
>>>>> Index: config/i386/i386.opt
>>>>> ===================================================================
>>>>> --- config/i386/i386.opt      (revision 201458)
>>>>> +++ config/i386/i386.opt      (working copy)
>>>>> @@ -316,6 +316,14 @@ mstack-arg-probe
>>>>>  Target Report Mask(STACK_PROBE) Save
>>>>>  Enable stack probing
>>>>>
>>>>> +mmemcpy-strategy=
>>>>> +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy)
>>>>> +Specify memcpy expansion strategy when expected size is known
>>>>> +
>>>>> +mmemset-strategy=
>>>>> +Target RejectNegative Joined Var(ix86_tune_memset_strategy)
>>>>> +Specify memset expansion strategy when expected size is known
>>>>> +
>>>>>  mstringop-strategy=
>>>>>  Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop)
>>>>>  Chose strategy to generate stringop using
>>>>> Index: config/i386/stringop.opt
>>>>> ===================================================================
>>>>> --- config/i386/stringop.opt  (revision 0)
>>>>> +++ config/i386/stringop.opt  (revision 0)
>>>>> @@ -0,0 +1,36 @@
>>>>> +/* Definitions for option handling for IA-32.
>>>>> +   Copyright (C) 2013 Free Software Foundation, Inc.
>>>>> +
>>>>> +This file is part of GCC.
>>>>> +
>>>>> +GCC is free software; you can redistribute it and/or modify
>>>>> +it under the terms of the GNU General Public License as published by
>>>>> +the Free Software Foundation; either version 3, or (at your option)
>>>>> +any later version.
>>>>> +
>>>>> +GCC is distributed in the hope that it will be useful,
>>>>> +but WITHOUT ANY WARRANTY; without even the implied warranty of
>>>>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>>>>> +GNU General Public License for more details.
>>>>> +
>>>>> +Under Section 7 of GPL version 3, you are granted additional
>>>>> +permissions described in the GCC Runtime Library Exception, version
>>>>> +3.1, as published by the Free Software Foundation.
>>>>> +
>>>>> +You should have received a copy of the GNU General Public License and
>>>>> +a copy of the GCC Runtime Library Exception along with this program;
>>>>> +see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
>>>>> +<http://www.gnu.org/licenses/>.  */
>>>>> +
>>>>> +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte)
>>>>> +
>>>>> +#undef DEF_ENUM
>>>>> +#define DEF_ENUM EnumValue
>>>>> +
>>>>> +#undef DEF_ALG
>>>>> +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg)
>>>>> +
>>>>> +#include "stringop.def"
>>>>> +
>>>>> +#undef DEF_ENUM
>>>>> +#undef DEF_ALG
>>>>> Index: config/i386/i386.c
>>>>> ===================================================================
>>>>> --- config/i386/i386.c        (revision 201458)
>>>>> +++ config/i386/i386.c        (working copy)
>>>>> @@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost =
>>>>>  };
>>>>>
>>>>>  /* Processor costs (relative to an add) */
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs i386_cost = { /* 386 specific costs */
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>>>>> @@ -226,7 +226,7 @@ struct processor_costs i386_cost = {      /*
>>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>>  };
>>>>>
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs i486_cost = { /* 486 specific costs */
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>>>>> @@ -298,7 +298,7 @@ struct processor_costs i486_cost = {      /*
>>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>>  };
>>>>>
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs pentium_cost = {
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>>>>> @@ -368,7 +368,7 @@ struct processor_costs pentium_cost = {
>>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>>  };
>>>>>
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs pentiumpro_cost = {
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>>>>> @@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost =
>>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>>  };
>>>>>
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs geode_cost = {
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>>>>> @@ -518,7 +518,7 @@ struct processor_costs geode_cost = {
>>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>>  };
>>>>>
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs k6_cost = {
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    COSTS_N_INSNS (2),                 /* cost of a lea instruction */
>>>>> @@ -591,7 +591,7 @@ struct processor_costs k6_cost = {
>>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>>  };
>>>>>
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs athlon_cost = {
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    COSTS_N_INSNS (2),                 /* cost of a lea instruction */
>>>>> @@ -664,7 +664,7 @@ struct processor_costs athlon_cost = {
>>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>>  };
>>>>>
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs k8_cost = {
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    COSTS_N_INSNS (2),                 /* cost of a lea instruction */
>>>>> @@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = {
>>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>>  };
>>>>>
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs pentium4_cost = {
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    COSTS_N_INSNS (3),                 /* cost of a lea instruction */
>>>>> @@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = {
>>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>>  };
>>>>>
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs nocona_cost = {
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>>>>> @@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = {
>>>>>    1,                                 /* cond_not_taken_branch_cost.  */
>>>>>  };
>>>>>
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs atom_cost = {
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    COSTS_N_INSNS (1) + 1,             /* cost of a lea instruction */
>>>>> @@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = {
>>>>>  };
>>>>>
>>>>>  /* Generic64 should produce code tuned for Nocona and K8.  */
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs generic64_cost = {
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    /* On all chips taken into consideration lea is 2 cycles and more.  With
>>>>> @@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost =
>>>>>  };
>>>>>
>>>>>  /* core_cost should produce code tuned for Core familly of CPUs.  */
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs core_cost = {
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    /* On all chips taken into consideration lea is 2 cycles and more.  With
>>>>> @@ -1717,7 +1717,7 @@ struct processor_costs core_cost = {
>>>>>
>>>>>  /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
>>>>>     Athlon and K8.  */
>>>>> -static const
>>>>> +static
>>>>>  struct processor_costs generic32_cost = {
>>>>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>>>>>    COSTS_N_INSNS (1) + 1,             /* cost of a lea instruction */
>>>>> @@ -2900,6 +2900,150 @@ ix86_debug_options (void)
>>>>>
>>>>>    return;
>>>>>  }
>>>>> +
>>>>> +static const char *stringop_alg_names[] = {
>>>>> +#define DEF_ENUM
>>>>> +#define DEF_ALG(alg, name) #name,
>>>>> +#include "stringop.def"
>>>>> +#undef DEF_ENUM
>>>>> +#undef DEF_ALG
>>>>> +};
>>>>> +
>>>>> +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
>>>>> +   The string is of the following form (or comma separated list of it):
>>>>> +
>>>>> +     strategy_alg:max_size:[align|noalign]
>>>>> +
>>>>> +   where the full size range for the strategy is either [0, max_size] or
>>>>> +   [min_size, max_size], in which min_size is the max_size + 1 of the
>>>>> +   preceding range.  The last size range must have max_size == -1.
>>>>> +
>>>>> +   Examples:
>>>>> +
>>>>> +    1.
>>>>> +       -mmemcpy-strategy=libcall:-1:noalign
>>>>> +
>>>>> +      this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
>>>>> +
>>>>> +
>>>>> +   2.
>>>>> +      -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
>>>>> +
>>>>> +      This is to tell the compiler to use the following strategy for memset
>>>>> +      1) when the expected size is between [1, 16], use rep_8byte strategy;
>>>>> +      2) when the size is between [17, 2048], use vector_loop;
>>>>> +      3) when the size is > 2048, use libcall.
>>>>> +
>>>>> +*/
>>>>> +
>>>>> +struct stringop_size_range
>>>>> +{
>>>>> +  int min;
>>>>> +  int max;
>>>>> +  stringop_alg alg;
>>>>> +  bool noalign;
>>>>> +};
>>>>> +
>>>>> +static void
>>>>> +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
>>>>> +{
>>>>> +  const struct stringop_algs *default_algs;
>>>>> +  stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
>>>>> +  char *curr_range_str, *next_range_str;
>>>>> +  int i = 0, n = 0;
>>>>> +
>>>>> +  if (is_memset)
>>>>> +    default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
>>>>> +  else
>>>>> +    default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
>>>>> +
>>>>> +  curr_range_str = strategy_str;
>>>>> +
>>>>> +  do {
>>>>> +
>>>>> +    int mins, maxs;
>>>>> +    stringop_alg alg;
>>>>> +    char alg_name[128];
>>>>> +    char align[16];
>>>>> +
>>>>> +    next_range_str = strchr (curr_range_str, ',');
>>>>> +    if (next_range_str)
>>>>> +      *next_range_str++ = '\0';
>>>>> +
>>>>> +    if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align))
>>>>> +      {
>>>>> +        warning (0, "Wrong arg %s to option %s", curr_range_str,
>>>>> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>>>>> +        return;
>>>>> +      }
>>>>> +
>>>>> +    if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1))
>>>>> +      {
>>>>> +        warning (0, "Size ranges of option %s should be increasing",
>>>>> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>>>>> +        return;
>>>>> +      }
>>>>> +
>>>>> +    for (i = 0; i < last_alg; i++)
>>>>> +      {
>>>>> +        if (!strcmp (alg_name, stringop_alg_names[i]))
>>>>> +       {
>>>>> +         alg = (stringop_alg) i;
>>>>> +         break;
>>>>> +          }
>>>>> +      }
>>>>> +
>>>>> +    if (i == last_alg)
>>>>> +      {
>>>>> +        warning (0, "Wrong stringop strategy name %s specified for option %s",
>>>>> +              alg_name,
>>>>> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>>>>> +     return;
>>>>> +      }
>>>>> +
>>>>> +    input_ranges[n].min = mins;
>>>>> +    input_ranges[n].max = maxs;
>>>>> +    input_ranges[n].alg = alg;
>>>>> +    if (!strcmp (align, "align"))
>>>>> +      input_ranges[n].noalign = false;
>>>>> +    else if (!strcmp (align, "noalign"))
>>>>> +      input_ranges[n].noalign = true;
>>>>> +    else
>>>>> +      {
>>>>> +        warning (0, "Unknown alignment %s specified for option %s",
>>>>> +                 align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>>>>> +        return;
>>>>> +      }
>>>>> +    n++;
>>>>> +    curr_range_str = next_range_str;
>>>>> +  } while (curr_range_str);
>>>>> +
>>>>> +  if (input_ranges[n - 1].max != -1)
>>>>> +    {
>>>>> +      warning (0, "The max value for the last size range should be -1"
>>>>> +               " for option %s",
>>>>> +               is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>>>>> +      return;
>>>>> +    }
>>>>> +
>>>>> +  if (n > MAX_STRINGOP_ALGS)
>>>>> +    {
>>>>> +      warning (0, "Too many size ranges specified in option %s",
>>>>> +               is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>>>>> +      return;
>>>>> +    }
>>>>> +
>>>>> +  /* Now override the default algs array  */
>>>>> +  for (i = 0; i < n; i++)
>>>>> +    {
>>>>> +      *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
>>>>> +      *const_cast<stringop_alg *>(&default_algs->size[i].alg)
>>>>> +          = input_ranges[i].alg;
>>>>> +      *const_cast<int *>(&default_algs->size[i].noalign)
>>>>> +          = input_ranges[i].noalign;
>>>>> +    }
>>>>> +}
>>>>> +
>>>>>
>>>>>  /* Override various settings based on options.  If MAIN_ARGS_P, the
>>>>>     options are from the command line, otherwise they are from
>>>>> @@ -4021,6 +4165,21 @@ ix86_option_override_internal (bool main
>>>>>    /* Handle stack protector */
>>>>>    if (!global_options_set.x_ix86_stack_protector_guard)
>>>>>      ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
>>>>> +
>>>>> +  /* Handle -mmemcpy-strategy= and -mmemset-strategy=  */
>>>>> +  if (ix86_tune_memcpy_strategy)
>>>>> +    {
>>>>> +      char *str = xstrdup (ix86_tune_memcpy_strategy);
>>>>> +      ix86_parse_stringop_strategy_string (str, false);
>>>>> +      free (str);
>>>>> +    }
>>>>> +
>>>>> +  if (ix86_tune_memset_strategy)
>>>>> +    {
>>>>> +      char *str = xstrdup (ix86_tune_memset_strategy);
>>>>> +      ix86_parse_stringop_strategy_string (str, true);
>>>>> +      free (str);
>>>>> +    }
>>>>>  }
>>>>>
>>>>>  /* Implement the TARGET_OPTION_OVERRIDE hook.  */
>>>>> @@ -22903,6 +23062,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
>>>>>      {
>>>>>      case libcall:
>>>>>      case no_stringop:
>>>>> +    case last_alg:
>>>>>        gcc_unreachable ();
>>>>>      case loop_1_byte:
>>>>>        need_zero_guard = true;
>>>>> @@ -23093,6 +23253,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
>>>>>      {
>>>>>      case libcall:
>>>>>      case no_stringop:
>>>>> +    case last_alg:
>>>>>        gcc_unreachable ();
>>>>>      case loop_1_byte:
>>>>>      case loop:
>>>>> @@ -23304,6 +23465,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
>>>>>      {
>>>>>      case libcall:
>>>>>      case no_stringop:
>>>>> +    case last_alg:
>>>>>        gcc_unreachable ();
>>>>>      case loop:
>>>>>        need_zero_guard = true;
>>>>> @@ -23481,6 +23643,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
>>>>>      {
>>>>>      case libcall:
>>>>>      case no_stringop:
>>>>> +    case last_alg:
>>>>>        gcc_unreachable ();
>>>>>      case loop_1_byte:
>>>>>      case loop:
>>>>> Index: config/i386/i386-opts.h
>>>>> ===================================================================
>>>>> --- config/i386/i386-opts.h   (revision 201458)
>>>>> +++ config/i386/i386-opts.h   (working copy)
>>>>> @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI
>>>>>  /* Algorithm to expand string function with.  */
>>>>>  enum stringop_alg
>>>>>  {
>>>>> -   no_stringop,
>>>>> -   libcall,
>>>>> -   rep_prefix_1_byte,
>>>>> -   rep_prefix_4_byte,
>>>>> -   rep_prefix_8_byte,
>>>>> -   loop_1_byte,
>>>>> -   loop,
>>>>> -   unrolled_loop,
>>>>> -   vector_loop
>>>>> +#undef DEF_ENUM
>>>>> +#define DEF_ENUM
>>>>> +
>>>>> +#undef DEF_ALG
>>>>> +#define DEF_ALG(alg, name) alg,
>>>>> +
>>>>> +#include "stringop.def"
>>>>> +last_alg
>>>>> +
>>>>> +#undef DEF_ENUM
>>>>> +#undef DEF_ALG
>>>>>  };
>>>>>
>>>>>  /* Available call abi.  */
>>>>> Index: doc/invoke.texi
>>>>> ===================================================================
>>>>> --- doc/invoke.texi   (revision 201458)
>>>>> +++ doc/invoke.texi   (working copy)
>>>>> @@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}.
>>>>>  -mbmi2 -mrtm -mlwp -mthreads @gol
>>>>>  -mno-align-stringops  -minline-all-stringops @gol
>>>>>  -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
>>>>> +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy}
>>>>>  -mpush-args  -maccumulate-outgoing-args  -m128bit-long-double @gol
>>>>>  -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol
>>>>>  -mregparm=@var{num}  -msseregparm @gol
>>>>> @@ -14598,6 +14599,24 @@ Expand into an inline loop.
>>>>>  Always use a library call.
>>>>>  @end table
>>>>>
>>>>> +@item -mmemcpy-strategy=@var{strategy}
>>>>> +@opindex mmemcpy-strategy=@var{strategy}
>>>>> +Override the internal decision heuristic to decide if @code{__builtin_memcpy}
>>>>> +should be inlined and what inline algorithm to use when the expected size
>>>>> +of the copy operation is known. @var{strategy}
>>>>> +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets.
>>>>> +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies
>>>>> +the max byte size with which inline algorithm @var{alg} is allowed. For the last
>>>>> +triplet, the @var{max_size} must be @code{-1}. The @var{max_size} of the triplets
>>>>> +in the list must be specified in increasing order. The minimal byte size for
>>>>> +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the
>>>>> +preceding range.
>>>>> +
>>>>> +@item -mmemset-strategy=@var{strategy}
>>>>> +@opindex mmemset-strategy=@var{strategy}
>>>>> +The option is similar to @option{-mmemcpy-strategy=} except that it is to control
>>>>> +@code{__builtin_memset} expansion.
>>>>> +
>>>>>  @item -momit-leaf-frame-pointer
>>>>>  @opindex momit-leaf-frame-pointer
>>>>>  Don't keep the frame pointer in a register for leaf functions.  This
>>>>> Index: testsuite/gcc.target/i386/memcpy-strategy-1.c
>>>>> ===================================================================
>>>>> --- testsuite/gcc.target/i386/memcpy-strategy-1.c     (revision 0)
>>>>> +++ testsuite/gcc.target/i386/memcpy-strategy-1.c     (revision 0)
>>>>> @@ -0,0 +1,12 @@
>>>>> +/* { dg-do compile } */
>>>>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */
>>>>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
>>>>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
>>>>> +
>>>>> +char a[2048];
>>>>> +char b[2048];
>>>>> +void t (void)
>>>>> +{
>>>>> +  __builtin_memcpy (a, b, 2048);
>>>>> +}
>>>>> +
>>>>> Index: testsuite/gcc.target/i386/memcpy-strategy-2.c
>>>>> ===================================================================
>>>>> --- testsuite/gcc.target/i386/memcpy-strategy-2.c     (revision 0)
>>>>> +++ testsuite/gcc.target/i386/memcpy-strategy-2.c     (revision 0)
>>>>> @@ -0,0 +1,12 @@
>>>>> +/* { dg-do compile } */
>>>>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */
>>>>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
>>>>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
>>>>> +
>>>>> +char a[2048];
>>>>> +char b[2048];
>>>>> +void t (void)
>>>>> +{
>>>>> +  __builtin_memcpy (a, b, 2048);
>>>>> +}
>>>>> +
>>>>> Index: testsuite/gcc.target/i386/memset-strategy-1.c
>>>>> ===================================================================
>>>>> --- testsuite/gcc.target/i386/memset-strategy-1.c     (revision 0)
>>>>> +++ testsuite/gcc.target/i386/memset-strategy-1.c     (revision 0)
>>>>> @@ -0,0 +1,10 @@
>>>>> +/* { dg-do compile } */
>>>>> +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */
>>>>> +/* { dg-final { scan-assembler-times "memset" 2  } } */
>>>>> +
>>>>> +char a[2048];
>>>>> +void t (void)
>>>>> +{
>>>>> +  __builtin_memset (a, 1, 2048);
>>>>> +}
>>>>> +
>>>>> Index: testsuite/gcc.target/i386/memcpy-strategy-3.c
>>>>> ===================================================================
>>>>> --- testsuite/gcc.target/i386/memcpy-strategy-3.c     (revision 0)
>>>>> +++ testsuite/gcc.target/i386/memcpy-strategy-3.c     (revision 0)
>>>>> @@ -0,0 +1,11 @@
>>>>> +/* { dg-do compile } */
>>>>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */
>>>>> +/* { dg-final { scan-assembler-times "memcpy" 2  } } */
>>>>> +
>>>>> +char a[2048];
>>>>> +char b[2048];
>>>>> +void t (void)
>>>>> +{
>>>>> +  __builtin_memcpy (a, b, 2048);
>>>>> +}
>>>>> +
>>>>
>>
>>
>>
>> --
>> ---
>> Best regards,
>> Michael V. Zolotukhin,
>> Software Engineer
>> Intel Corporation.



-- 
---
Best regards,
Michael V. Zolotukhin,
Software Engineer
Intel Corporation.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: New parameters to control stringop expansion libcall strategy
  2013-08-06  9:42         ` Jan Hubicka
  2013-08-06 16:08           ` Xinliang David Li
@ 2013-08-07 17:06           ` Xinliang David Li
  2013-08-08  0:23             ` Joseph S. Myers
  1 sibling, 1 reply; 23+ messages in thread
From: Xinliang David Li @ 2013-08-07 17:06 UTC (permalink / raw)
  To: Jan Hubicka; +Cc: GCC Patches, Teresa Johnson

[-- Attachment #1: Type: text/plain, Size: 26171 bytes --]

Fixed the do while formatting.  Ok for trunk with this version?

thanks,

David

On Tue, Aug 6, 2013 at 2:42 AM, Jan Hubicka <hubicka@ucw.cz> wrote:
>> >>> 2013-08-02  Xinliang David Li  <davidxl@google.com>
>> >>>
>> >>>         * config/i386/stringop.def: New file.
>> >>>         * config/i386/stringop.opt: New file.
>> >>>         * config/i386/i386-opts.h: Include stringop.def.
>> >>>         * config/i386/i386.opt: Include stringop.opt.
>> >>>         * config/i386/i386.c (ix86_option_override_internal):
>> >>>         Override default size based stringop inline strategies
>> >>>         with options.
>> >>>         * config/i386/i386.c (ix86_parse_stringop_strategy_string):
>> >>>         New function.
>> >>>
>> >>> 2013-08-04  Xinliang David Li  <davidxl@google.com>
>> >>>
>> >>>         * testsuite/gcc.target/i386/memcpy-strategy-1.c: New test.
>> >>>         * testsuite/gcc.target/i386/memcpy-strategy-2.c: Ditto.
>> >>>         * testsuite/gcc.target/i386/memset-strategy-1.c: Ditto.
>> >>>         * testsuite/gcc.target/i386/memcpy-strategy-3.c: Ditto.
>
> The patch looks reasonable to me in general.  I wonder why we need to make
> all the cost tables non-const instead of just having writable storage for
> the "current strategy" like we do with other flags anyway.
>
> Your strings are definitely more readable than the in-memory representation
> I came up with. Perhaps we can even turn the cost tables into strings
> for easier maintenance?  I guess they are a bit confusing for people
> not familiar with the code.
>
> Honza
>> >>>
>> >>>
>> >>>
>> >>>
>> >>> On Fri, Aug 2, 2013 at 9:21 PM, Xinliang David Li <davidxl@google.com> wrote:
>> >>> > On x86_64, when the expected size of memcpy/memset is known (e.g., with
>> >>> > FDO), libcall strategy is used when the size is > 8192. This value is
>> >>> > hard coded, which makes it hard to do performance tuning. This patch
>> >>> > adds two new parameters to control it. Potential usage includes
>> >>> > per-application libcall strategy min-size tuning based on summary data
>> >>> > with FDO (e.g., instruction workset size).
>> >>> >
>> >>> > Bootstrap and tested on x86_64/linux. Ok for trunk?
>> >>> >
>> >>> > thanks,
>> >>> >
>> >>> > David
>> >>> >
>> >>> >
>> >>> > 2013-08-02  Xinliang David Li  <davidxl@google.com>
>> >>> >
>> >>> >         * params.def: New parameters.
>> >>> >         * config/i386/i386.c (ix86_option_override_internal):
>> >>> >         Override default libcall size limit with parameters.
>> >>
>> >>> Index: config/i386/stringop.def
>> >>> ===================================================================
>> >>> --- config/i386/stringop.def  (revision 0)
>> >>> +++ config/i386/stringop.def  (revision 0)
>> >>> @@ -0,0 +1,42 @@
>> >>> +/* Definitions for option handling for IA-32.
>> >>> +   Copyright (C) 2013 Free Software Foundation, Inc.
>> >>> +
>> >>> +This file is part of GCC.
>> >>> +
>> >>> +GCC is free software; you can redistribute it and/or modify
>> >>> +it under the terms of the GNU General Public License as published by
>> >>> +the Free Software Foundation; either version 3, or (at your option)
>> >>> +any later version.
>> >>> +
>> >>> +GCC is distributed in the hope that it will be useful,
>> >>> +but WITHOUT ANY WARRANTY; without even the implied warranty of
>> >>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> >>> +GNU General Public License for more details.
>> >>> +
>> >>> +Under Section 7 of GPL version 3, you are granted additional
>> >>> +permissions described in the GCC Runtime Library Exception, version
>> >>> +3.1, as published by the Free Software Foundation.
>> >>> +
>> >>> +You should have received a copy of the GNU General Public License and
>> >>> +a copy of the GCC Runtime Library Exception along with this program;
>> >>> +see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
>> >>> +<http://www.gnu.org/licenses/>.  */
>> >>> +
>> >>> +DEF_ENUM
>> >>> +DEF_ALG (no_stringop, no_stringop)
>> >>> +DEF_ENUM
>> >>> +DEF_ALG (libcall, libcall)
>> >>> +DEF_ENUM
>> >>> +DEF_ALG (rep_prefix_1_byte, rep_byte)
>> >>> +DEF_ENUM
>> >>> +DEF_ALG (rep_prefix_4_byte, rep_4byte)
>> >>> +DEF_ENUM
>> >>> +DEF_ALG (rep_prefix_8_byte, rep_8byte)
>> >>> +DEF_ENUM
>> >>> +DEF_ALG (loop_1_byte, byte_loop)
>> >>> +DEF_ENUM
>> >>> +DEF_ALG (loop, loop)
>> >>> +DEF_ENUM
>> >>> +DEF_ALG (unrolled_loop, unrolled_loop)
>> >>> +DEF_ENUM
>> >>> +DEF_ALG (vector_loop, vector_loop)
>> >>> Index: config/i386/i386.opt
>> >>> ===================================================================
>> >>> --- config/i386/i386.opt      (revision 201458)
>> >>> +++ config/i386/i386.opt      (working copy)
>> >>> @@ -316,6 +316,14 @@ mstack-arg-probe
>> >>>  Target Report Mask(STACK_PROBE) Save
>> >>>  Enable stack probing
>> >>>
>> >>> +mmemcpy-strategy=
>> >>> +Target RejectNegative Joined Var(ix86_tune_memcpy_strategy)
>> >>> +Specify memcpy expansion strategy when expected size is known
>> >>> +
>> >>> +mmemset-strategy=
>> >>> +Target RejectNegative Joined Var(ix86_tune_memset_strategy)
>> >>> +Specify memset expansion strategy when expected size is known
>> >>> +
>> >>>  mstringop-strategy=
>> >>>  Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop)
>> >>>  Chose strategy to generate stringop using
>> >>> Index: config/i386/stringop.opt
>> >>> ===================================================================
>> >>> --- config/i386/stringop.opt  (revision 0)
>> >>> +++ config/i386/stringop.opt  (revision 0)
>> >>> @@ -0,0 +1,36 @@
>> >>> +/* Definitions for option handling for IA-32.
>> >>> +   Copyright (C) 2013 Free Software Foundation, Inc.
>> >>> +
>> >>> +This file is part of GCC.
>> >>> +
>> >>> +GCC is free software; you can redistribute it and/or modify
>> >>> +it under the terms of the GNU General Public License as published by
>> >>> +the Free Software Foundation; either version 3, or (at your option)
>> >>> +any later version.
>> >>> +
>> >>> +GCC is distributed in the hope that it will be useful,
>> >>> +but WITHOUT ANY WARRANTY; without even the implied warranty of
>> >>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> >>> +GNU General Public License for more details.
>> >>> +
>> >>> +Under Section 7 of GPL version 3, you are granted additional
>> >>> +permissions described in the GCC Runtime Library Exception, version
>> >>> +3.1, as published by the Free Software Foundation.
>> >>> +
>> >>> +You should have received a copy of the GNU General Public License and
>> >>> +a copy of the GCC Runtime Library Exception along with this program;
>> >>> +see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
>> >>> +<http://www.gnu.org/licenses/>.  */
>> >>> +
>> >>> +Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte)
>> >>> +
>> >>> +#undef DEF_ENUM
>> >>> +#define DEF_ENUM EnumValue
>> >>> +
>> >>> +#undef DEF_ALG
>> >>> +#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg)
>> >>> +
>> >>> +#include "stringop.def"
>> >>> +
>> >>> +#undef DEF_ENUM
>> >>> +#undef DEF_ALG
>> >>> Index: config/i386/i386.c
>> >>> ===================================================================
>> >>> --- config/i386/i386.c        (revision 201458)
>> >>> +++ config/i386/i386.c        (working copy)
>> >>> @@ -156,7 +156,7 @@ struct processor_costs ix86_size_cost =
>> >>>  };
>> >>>
>> >>>  /* Processor costs (relative to an add) */
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs i386_cost = { /* 386 specific costs */
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>> >>> @@ -226,7 +226,7 @@ struct processor_costs i386_cost = {      /*
>> >>>    1,                                 /* cond_not_taken_branch_cost.  */
>> >>>  };
>> >>>
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs i486_cost = { /* 486 specific costs */
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>> >>> @@ -298,7 +298,7 @@ struct processor_costs i486_cost = {      /*
>> >>>    1,                                 /* cond_not_taken_branch_cost.  */
>> >>>  };
>> >>>
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs pentium_cost = {
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>> >>> @@ -368,7 +368,7 @@ struct processor_costs pentium_cost = {
>> >>>    1,                                 /* cond_not_taken_branch_cost.  */
>> >>>  };
>> >>>
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs pentiumpro_cost = {
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>> >>> @@ -447,7 +447,7 @@ struct processor_costs pentiumpro_cost =
>> >>>    1,                                 /* cond_not_taken_branch_cost.  */
>> >>>  };
>> >>>
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs geode_cost = {
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>> >>> @@ -518,7 +518,7 @@ struct processor_costs geode_cost = {
>> >>>    1,                                 /* cond_not_taken_branch_cost.  */
>> >>>  };
>> >>>
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs k6_cost = {
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    COSTS_N_INSNS (2),                 /* cost of a lea instruction */
>> >>> @@ -591,7 +591,7 @@ struct processor_costs k6_cost = {
>> >>>    1,                                 /* cond_not_taken_branch_cost.  */
>> >>>  };
>> >>>
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs athlon_cost = {
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    COSTS_N_INSNS (2),                 /* cost of a lea instruction */
>> >>> @@ -664,7 +664,7 @@ struct processor_costs athlon_cost = {
>> >>>    1,                                 /* cond_not_taken_branch_cost.  */
>> >>>  };
>> >>>
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs k8_cost = {
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    COSTS_N_INSNS (2),                 /* cost of a lea instruction */
>> >>> @@ -1265,7 +1265,7 @@ struct processor_costs btver2_cost = {
>> >>>    1,                                 /* cond_not_taken_branch_cost.  */
>> >>>  };
>> >>>
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs pentium4_cost = {
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    COSTS_N_INSNS (3),                 /* cost of a lea instruction */
>> >>> @@ -1336,7 +1336,7 @@ struct processor_costs pentium4_cost = {
>> >>>    1,                                 /* cond_not_taken_branch_cost.  */
>> >>>  };
>> >>>
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs nocona_cost = {
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    COSTS_N_INSNS (1),                 /* cost of a lea instruction */
>> >>> @@ -1409,7 +1409,7 @@ struct processor_costs nocona_cost = {
>> >>>    1,                                 /* cond_not_taken_branch_cost.  */
>> >>>  };
>> >>>
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs atom_cost = {
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    COSTS_N_INSNS (1) + 1,             /* cost of a lea instruction */
>> >>> @@ -1556,7 +1556,7 @@ struct processor_costs slm_cost = {
>> >>>  };
>> >>>
>> >>>  /* Generic64 should produce code tuned for Nocona and K8.  */
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs generic64_cost = {
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    /* On all chips taken into consideration lea is 2 cycles and more.  With
>> >>> @@ -1635,7 +1635,7 @@ struct processor_costs generic64_cost =
>> >>>  };
>> >>>
>> >>>  /* core_cost should produce code tuned for Core familly of CPUs.  */
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs core_cost = {
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    /* On all chips taken into consideration lea is 2 cycles and more.  With
>> >>> @@ -1717,7 +1717,7 @@ struct processor_costs core_cost = {
>> >>>
>> >>>  /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
>> >>>     Athlon and K8.  */
>> >>> -static const
>> >>> +static
>> >>>  struct processor_costs generic32_cost = {
>> >>>    COSTS_N_INSNS (1),                 /* cost of an add instruction */
>> >>>    COSTS_N_INSNS (1) + 1,             /* cost of a lea instruction */
>> >>> @@ -2900,6 +2900,150 @@ ix86_debug_options (void)
>> >>>
>> >>>    return;
>> >>>  }
>> >>> +
>> >>> +static const char *stringop_alg_names[] = {
>> >>> +#define DEF_ENUM
>> >>> +#define DEF_ALG(alg, name) #name,
>> >>> +#include "stringop.def"
>> >>> +#undef DEF_ENUM
>> >>> +#undef DEF_ALG
>> >>> +};
>> >>> +
>> >>> +/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
>> >>> +   The string is of the following form (or comma separated list of it):
>> >>> +
>> >>> +     strategy_alg:max_size:[align|noalign]
>> >>> +
>> >>> +   where the full size range for the strategy is either [0, max_size] or
>> >>> +   [min_size, max_size], in which min_size is the max_size + 1 of the
>> >>> +   preceding range.  The last size range must have max_size == -1.
>> >>> +
>> >>> +   Examples:
>> >>> +
>> >>> +    1.
>> >>> +       -mmemcpy-strategy=libcall:-1:noalign
>> >>> +
>> >>> +      this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
>> >>> +
>> >>> +
>> >>> +   2.
>> >>> +      -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
>> >>> +
>> >>> +      This is to tell the compiler to use the following strategy for memset
>> >>> +      1) when the expected size is between [1, 16], use rep_8byte strategy;
>> >>> +      2) when the size is between [17, 2048], use vector_loop;
>> >>> +      3) when the size is > 2048, use libcall.
>> >>> +
>> >>> +*/
>> >>> +
>> >>> +struct stringop_size_range
>> >>> +{
>> >>> +  int min;
>> >>> +  int max;
>> >>> +  stringop_alg alg;
>> >>> +  bool noalign;
>> >>> +};
>> >>> +
>> >>> +static void
>> >>> +ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
>> >>> +{
>> >>> +  const struct stringop_algs *default_algs;
>> >>> +  stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
>> >>> +  char *curr_range_str, *next_range_str;
>> >>> +  int i = 0, n = 0;
>> >>> +
>> >>> +  if (is_memset)
>> >>> +    default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
>> >>> +  else
>> >>> +    default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
>> >>> +
>> >>> +  curr_range_str = strategy_str;
>> >>> +
>> >>> +  do {
>> >>> +
>> >>> +    int mins, maxs;
>> >>> +    stringop_alg alg;
>> >>> +    char alg_name[128];
>> >>> +    char align[16];
>> >>> +
>> >>> +    next_range_str = strchr (curr_range_str, ',');
>> >>> +    if (next_range_str)
>> >>> +      *next_range_str++ = '\0';
>> >>> +
>> >>> +    if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align))
>> >>> +      {
>> >>> +        warning (0, "Wrong arg %s to option %s", curr_range_str,
>> >>> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>> >>> +        return;
>> >>> +      }
>> >>> +
>> >>> +    if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1))
>> >>> +      {
>> >>> +        warning (0, "Size ranges of option %s should be increasing",
>> >>> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>> >>> +        return;
>> >>> +      }
>> >>> +
>> >>> +    for (i = 0; i < last_alg; i++)
>> >>> +      {
>> >>> +        if (!strcmp (alg_name, stringop_alg_names[i]))
>> >>> +       {
>> >>> +         alg = (stringop_alg) i;
>> >>> +         break;
>> >>> +          }
>> >>> +      }
>> >>> +
>> >>> +    if (i == last_alg)
>> >>> +      {
>> >>> +        warning (0, "Wrong stringop strategy name %s specified for option %s",
>> >>> +              alg_name,
>> >>> +                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>> >>> +     return;
>> >>> +      }
>> >>> +
>> >>> +    input_ranges[n].min = mins;
>> >>> +    input_ranges[n].max = maxs;
>> >>> +    input_ranges[n].alg = alg;
>> >>> +    if (!strcmp (align, "align"))
>> >>> +      input_ranges[n].noalign = false;
>> >>> +    else if (!strcmp (align, "noalign"))
>> >>> +      input_ranges[n].noalign = true;
>> >>> +    else
>> >>> +      {
>> >>> +        warning (0, "Unknown alignment %s specified for option %s",
>> >>> +                 align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>> >>> +        return;
>> >>> +      }
>> >>> +    n++;
>> >>> +    curr_range_str = next_range_str;
>> >>> +  } while (curr_range_str);
>> >>> +
>> >>> +  if (input_ranges[n - 1].max != -1)
>> >>> +    {
>> >>> +      warning (0, "The max value for the last size range should be -1"
>> >>> +               " for option %s",
>> >>> +               is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>> >>> +      return;
>> >>> +    }
>> >>> +
>> >>> +  if (n > MAX_STRINGOP_ALGS)
>> >>> +    {
>> >>> +      warning (0, "Too many size ranges specified in option %s",
>> >>> +               is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>> >>> +      return;
>> >>> +    }
>> >>> +
>> >>> +  /* Now override the default algs array  */
>> >>> +  for (i = 0; i < n; i++)
>> >>> +    {
>> >>> +      *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
>> >>> +      *const_cast<stringop_alg *>(&default_algs->size[i].alg)
>> >>> +          = input_ranges[i].alg;
>> >>> +      *const_cast<int *>(&default_algs->size[i].noalign)
>> >>> +          = input_ranges[i].noalign;
>> >>> +    }
>> >>> +}
>> >>> +
>> >>>
>> >>>  /* Override various settings based on options.  If MAIN_ARGS_P, the
>> >>>     options are from the command line, otherwise they are from
>> >>> @@ -4021,6 +4165,21 @@ ix86_option_override_internal (bool main
>> >>>    /* Handle stack protector */
>> >>>    if (!global_options_set.x_ix86_stack_protector_guard)
>> >>>      ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
>> >>> +
>> >>> +  /* Handle -mmemcpy-strategy= and -mmemset-strategy=  */
>> >>> +  if (ix86_tune_memcpy_strategy)
>> >>> +    {
>> >>> +      char *str = xstrdup (ix86_tune_memcpy_strategy);
>> >>> +      ix86_parse_stringop_strategy_string (str, false);
>> >>> +      free (str);
>> >>> +    }
>> >>> +
>> >>> +  if (ix86_tune_memset_strategy)
>> >>> +    {
>> >>> +      char *str = xstrdup (ix86_tune_memset_strategy);
>> >>> +      ix86_parse_stringop_strategy_string (str, true);
>> >>> +      free (str);
>> >>> +    }
>> >>>  }
>> >>>
>> >>>  /* Implement the TARGET_OPTION_OVERRIDE hook.  */
>> >>> @@ -22903,6 +23062,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
>> >>>      {
>> >>>      case libcall:
>> >>>      case no_stringop:
>> >>> +    case last_alg:
>> >>>        gcc_unreachable ();
>> >>>      case loop_1_byte:
>> >>>        need_zero_guard = true;
>> >>> @@ -23093,6 +23253,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
>> >>>      {
>> >>>      case libcall:
>> >>>      case no_stringop:
>> >>> +    case last_alg:
>> >>>        gcc_unreachable ();
>> >>>      case loop_1_byte:
>> >>>      case loop:
>> >>> @@ -23304,6 +23465,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
>> >>>      {
>> >>>      case libcall:
>> >>>      case no_stringop:
>> >>> +    case last_alg:
>> >>>        gcc_unreachable ();
>> >>>      case loop:
>> >>>        need_zero_guard = true;
>> >>> @@ -23481,6 +23643,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
>> >>>      {
>> >>>      case libcall:
>> >>>      case no_stringop:
>> >>> +    case last_alg:
>> >>>        gcc_unreachable ();
>> >>>      case loop_1_byte:
>> >>>      case loop:
>> >>> Index: config/i386/i386-opts.h
>> >>> ===================================================================
>> >>> --- config/i386/i386-opts.h   (revision 201458)
>> >>> +++ config/i386/i386-opts.h   (working copy)
>> >>> @@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI
>> >>>  /* Algorithm to expand string function with.  */
>> >>>  enum stringop_alg
>> >>>  {
>> >>> -   no_stringop,
>> >>> -   libcall,
>> >>> -   rep_prefix_1_byte,
>> >>> -   rep_prefix_4_byte,
>> >>> -   rep_prefix_8_byte,
>> >>> -   loop_1_byte,
>> >>> -   loop,
>> >>> -   unrolled_loop,
>> >>> -   vector_loop
>> >>> +#undef DEF_ENUM
>> >>> +#define DEF_ENUM
>> >>> +
>> >>> +#undef DEF_ALG
>> >>> +#define DEF_ALG(alg, name) alg,
>> >>> +
>> >>> +#include "stringop.def"
>> >>> +last_alg
>> >>> +
>> >>> +#undef DEF_ENUM
>> >>> +#undef DEF_ALG
>> >>>  };
>> >>>
>> >>>  /* Available call abi.  */
>> >>> Index: doc/invoke.texi
>> >>> ===================================================================
>> >>> --- doc/invoke.texi   (revision 201458)
>> >>> +++ doc/invoke.texi   (working copy)
>> >>> @@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}.
>> >>>  -mbmi2 -mrtm -mlwp -mthreads @gol
>> >>>  -mno-align-stringops  -minline-all-stringops @gol
>> >>>  -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
>> >>> +-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy}
>> >>>  -mpush-args  -maccumulate-outgoing-args  -m128bit-long-double @gol
>> >>>  -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol
>> >>>  -mregparm=@var{num}  -msseregparm @gol
>> >>> @@ -14598,6 +14599,24 @@ Expand into an inline loop.
>> >>>  Always use a library call.
>> >>>  @end table
>> >>>
>> >>> +@item -mmemcpy-strategy=@var{strategy}
>> >>> +@opindex mmemcpy-strategy=@var{strategy}
>> >>> +Override the internal decision heuristic to decide if @code{__builtin_memcpy}
>> >>> +should be inlined and what inline algorithm to use when the expected size
>> >>> +of the copy operation is known. @var{strategy}
>> >>> +is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets.
>> >>> +@var{alg} is specified in @option{-mstringop-strategy}, @var{max_size} specifies
>> >>> +the max byte size with which inline algorithm @var{alg} is allowed. For the last
>> >>> +triplet, the @var{max_size} must be @code{-1}. The @var{max_size} of the triplets
>> >>> +in the list must be specified in increasing order. The minimal byte size for
>> >>> +@var{alg} is @code{0} for the first triplet and @code{@var{max_size} + 1} of the
>> >>> +preceding range.
>> >>> +
>> >>> +@item -mmemset-strategy=@var{strategy}
>> >>> +@opindex mmemset-strategy=@var{strategy}
>> >>> +The option is similar to @option{-mmemcpy-strategy=} except that it is to control
>> >>> +@code{__builtin_memset} expansion.
>> >>> +
>> >>>  @item -momit-leaf-frame-pointer
>> >>>  @opindex momit-leaf-frame-pointer
>> >>>  Don't keep the frame pointer in a register for leaf functions.  This
>> >>> Index: testsuite/gcc.target/i386/memcpy-strategy-1.c
>> >>> ===================================================================
>> >>> --- testsuite/gcc.target/i386/memcpy-strategy-1.c     (revision 0)
>> >>> +++ testsuite/gcc.target/i386/memcpy-strategy-1.c     (revision 0)
>> >>> @@ -0,0 +1,12 @@
>> >>> +/* { dg-do compile } */
>> >>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */
>> >>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
>> >>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
>> >>> +
>> >>> +char a[2048];
>> >>> +char b[2048];
>> >>> +void t (void)
>> >>> +{
>> >>> +  __builtin_memcpy (a, b, 2048);
>> >>> +}
>> >>> +
>> >>> Index: testsuite/gcc.target/i386/memcpy-strategy-2.c
>> >>> ===================================================================
>> >>> --- testsuite/gcc.target/i386/memcpy-strategy-2.c     (revision 0)
>> >>> +++ testsuite/gcc.target/i386/memcpy-strategy-2.c     (revision 0)
>> >>> @@ -0,0 +1,12 @@
>> >>> +/* { dg-do compile } */
>> >>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */
>> >>> +/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
>> >>> +/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
>> >>> +
>> >>> +char a[2048];
>> >>> +char b[2048];
>> >>> +void t (void)
>> >>> +{
>> >>> +  __builtin_memcpy (a, b, 2048);
>> >>> +}
>> >>> +
>> >>> Index: testsuite/gcc.target/i386/memset-strategy-1.c
>> >>> ===================================================================
>> >>> --- testsuite/gcc.target/i386/memset-strategy-1.c     (revision 0)
>> >>> +++ testsuite/gcc.target/i386/memset-strategy-1.c     (revision 0)
>> >>> @@ -0,0 +1,10 @@
>> >>> +/* { dg-do compile } */
>> >>> +/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */
>> >>> +/* { dg-final { scan-assembler-times "memset" 2  } } */
>> >>> +
>> >>> +char a[2048];
>> >>> +void t (void)
>> >>> +{
>> >>> +  __builtin_memset (a, 1, 2048);
>> >>> +}
>> >>> +
>> >>> Index: testsuite/gcc.target/i386/memcpy-strategy-3.c
>> >>> ===================================================================
>> >>> --- testsuite/gcc.target/i386/memcpy-strategy-3.c     (revision 0)
>> >>> +++ testsuite/gcc.target/i386/memcpy-strategy-3.c     (revision 0)
>> >>> @@ -0,0 +1,11 @@
>> >>> +/* { dg-do compile } */
>> >>> +/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */
>> >>> +/* { dg-final { scan-assembler-times "memcpy" 2  } } */
>> >>> +
>> >>> +char a[2048];
>> >>> +char b[2048];
>> >>> +void t (void)
>> >>> +{
>> >>> +  __builtin_memcpy (a, b, 2048);
>> >>> +}
>> >>> +
>> >>
>>
>>
>>
>> --
>> ---
>> Best regards,
>> Michael V. Zolotukhin,
>> Software Engineer
>> Intel Corporation.

[-- Attachment #2: stringop_inl_option.p.txt --]
[-- Type: text/plain, Size: 18977 bytes --]

Index: testsuite/gcc.target/i386/memcpy-strategy-3.c
===================================================================
--- testsuite/gcc.target/i386/memcpy-strategy-3.c	(revision 0)
+++ testsuite/gcc.target/i386/memcpy-strategy-3.c	(revision 0)
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */
+/* { dg-final { scan-assembler-times "memcpy" 2  } } */
+
+char a[2048];
+char b[2048];
+void t (void)
+{
+  __builtin_memcpy (a, b, 2048);
+}
+
Index: testsuite/gcc.target/i386/memcpy-strategy-1.c
===================================================================
--- testsuite/gcc.target/i386/memcpy-strategy-1.c	(revision 0)
+++ testsuite/gcc.target/i386/memcpy-strategy-1.c	(revision 0)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */
+/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
+/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
+
+char a[2048];
+char b[2048];
+void t (void)
+{
+  __builtin_memcpy (a, b, 2048);
+}
+
Index: testsuite/gcc.target/i386/memcpy-strategy-2.c
===================================================================
--- testsuite/gcc.target/i386/memcpy-strategy-2.c	(revision 0)
+++ testsuite/gcc.target/i386/memcpy-strategy-2.c	(revision 0)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */
+/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
+/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
+
+char a[2048];
+char b[2048];
+void t (void)
+{
+  __builtin_memcpy (a, b, 2048);
+}
+
Index: testsuite/gcc.target/i386/memset-strategy-1.c
===================================================================
--- testsuite/gcc.target/i386/memset-strategy-1.c	(revision 0)
+++ testsuite/gcc.target/i386/memset-strategy-1.c	(revision 0)
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */
+/* { dg-final { scan-assembler-times "memset" 2  } } */
+
+char a[2048];
+void t (void)
+{
+  __builtin_memset (a, 1, 2048);
+}
+
Index: doc/invoke.texi
===================================================================
--- doc/invoke.texi	(revision 201540)
+++ doc/invoke.texi	(working copy)
@@ -649,6 +649,7 @@ Objective-C and Objective-C++ Dialects}.
 -mbmi2 -mrtm -mlwp -mthreads @gol
 -mno-align-stringops  -minline-all-stringops @gol
 -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
+-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy} @gol
 -mpush-args  -maccumulate-outgoing-args  -m128bit-long-double @gol
 -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol
 -mregparm=@var{num}  -msseregparm @gol
@@ -14598,6 +14599,24 @@ Expand into an inline loop.
 Always use a library call.
 @end table
 
+@item -mmemcpy-strategy=@var{strategy}
+@opindex mmemcpy-strategy=@var{strategy}
+Override the internal decision heuristic to decide if @code{__builtin_memcpy}
+should be inlined and what inline algorithm to use when the expected size
+of the copy operation is known.  @var{strategy} is a comma-separated list of
+@var{alg}:@var{max_size}:@var{dest_align} triplets.  @var{alg} is one of the
+algorithms described for @option{-mstringop-strategy}; @var{max_size} is the
+largest byte size for which @var{alg} is used; @var{dest_align} is either
+@code{align} or @code{noalign}.  The @var{max_size} values must be given in
+increasing order, and the @var{max_size} of the last triplet must be @code{-1}.
+The smallest byte size for @var{alg} is @code{0} for the first triplet and
+the @var{max_size} of the preceding triplet plus one for every later triplet.
+
+@item -mmemset-strategy=@var{strategy}
+@opindex mmemset-strategy=@var{strategy}
+This option is similar to @option{-mmemcpy-strategy=} except that it controls
+@code{__builtin_memset} expansion.
+
 @item -momit-leaf-frame-pointer
 @opindex momit-leaf-frame-pointer
 Don't keep the frame pointer in a register for leaf functions.  This
Index: config/i386/stringop.def
===================================================================
--- config/i386/stringop.def	(revision 0)
+++ config/i386/stringop.def	(revision 0)
@@ -0,0 +1,42 @@
+/* Definitions for option handling for IA-32.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+DEF_ENUM
+DEF_ALG (no_stringop, no_stringop)
+DEF_ENUM
+DEF_ALG (libcall, libcall)
+DEF_ENUM
+DEF_ALG (rep_prefix_1_byte, rep_byte)
+DEF_ENUM
+DEF_ALG (rep_prefix_4_byte, rep_4byte)
+DEF_ENUM
+DEF_ALG (rep_prefix_8_byte, rep_8byte)
+DEF_ENUM
+DEF_ALG (loop_1_byte, byte_loop)
+DEF_ENUM
+DEF_ALG (loop, loop)
+DEF_ENUM
+DEF_ALG (unrolled_loop, unrolled_loop)
+DEF_ENUM
+DEF_ALG (vector_loop, vector_loop)
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 201540)
+++ config/i386/i386.c	(working copy)
@@ -158,7 +158,7 @@ struct processor_costs ix86_size_cost =
 };
 
 /* Processor costs (relative to an add) */
-static const
+static
 struct processor_costs i386_cost = {	/* 386 specific costs */
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -228,7 +228,7 @@ struct processor_costs i386_cost = {	/*
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs i486_cost = {	/* 486 specific costs */
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -300,7 +300,7 @@ struct processor_costs i486_cost = {	/*
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs pentium_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -370,7 +370,7 @@ struct processor_costs pentium_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs pentiumpro_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -449,7 +449,7 @@ struct processor_costs pentiumpro_cost =
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs geode_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -520,7 +520,7 @@ struct processor_costs geode_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs k6_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
@@ -593,7 +593,7 @@ struct processor_costs k6_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs athlon_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
@@ -666,7 +666,7 @@ struct processor_costs athlon_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs k8_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
@@ -1267,7 +1267,7 @@ struct processor_costs btver2_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs pentium4_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (3),			/* cost of a lea instruction */
@@ -1338,7 +1338,7 @@ struct processor_costs pentium4_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs nocona_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -1411,7 +1411,7 @@ struct processor_costs nocona_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs atom_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
@@ -1558,7 +1558,7 @@ struct processor_costs slm_cost = {
 };
 
 /* Generic64 should produce code tuned for Nocona and K8.  */
-static const
+static
 struct processor_costs generic64_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   /* On all chips taken into consideration lea is 2 cycles and more.  With
@@ -1637,7 +1637,7 @@ struct processor_costs generic64_cost =
 };
 
 /* core_cost should produce code tuned for Core familly of CPUs.  */
-static const
+static
 struct processor_costs core_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   /* On all chips taken into consideration lea is 2 cycles and more.  With
@@ -1719,7 +1719,7 @@ struct processor_costs core_cost = {
 
 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
    Athlon and K8.  */
-static const
+static
 struct processor_costs generic32_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
@@ -2919,6 +2919,149 @@ ix86_debug_options (void)
 
   return;
 }
+
+static const char *stringop_alg_names[] = {
+#define DEF_ENUM
+#define DEF_ALG(alg, name) #name,
+#include "stringop.def"
+#undef DEF_ENUM
+#undef DEF_ALG
+};
+
+/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
+   The string is of the following form (or comma separated list of it):
+
+     strategy_alg:max_size:[align|noalign]
+
+   where the full size range for the strategy is either [0, max_size] or
+   [min_size, max_size], in which min_size is the max_size + 1 of the
+   preceding range.  The last size range must have max_size == -1.
+
+   Examples:
+
+    1.
+       -mmemcpy-strategy=libcall:-1:noalign
+
+      this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
+
+
+   2.
+      -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
+
+      This is to tell the compiler to use the following strategy for memset
+      1) when the expected size is between [1, 16], use rep_8byte strategy;
+      2) when the size is between [17, 2048], use vector_loop;
+      3) when the size is > 2048, use libcall.  */
+
+struct stringop_size_range
+{
+  int min;
+  int max;
+  stringop_alg alg;
+  bool noalign;
+};
+
+static void
+ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
+{
+  const struct stringop_algs *default_algs;
+  stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
+  char *curr_range_str, *next_range_str;
+  int i = 0, n = 0;
+
+  if (is_memset)
+    default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
+  else
+    default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
+
+  curr_range_str = strategy_str;
+
+  do
+    {
+      int mins, maxs;
+      stringop_alg alg;
+      char alg_name[128];
+      char align[16];
+
+      next_range_str = strchr (curr_range_str, ',');
+      if (next_range_str)
+        *next_range_str++ = '\0';
+
+      if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align))
+        {
+          warning (0, "Wrong arg %s to option %s", curr_range_str,
+                   is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+          return;
+        }
+
+      if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1))
+        {
+          warning (0, "Size ranges of option %s should be increasing",
+                   is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+          return;
+        }
+
+      for (i = 0; i < last_alg; i++)
+        {
+          if (!strcmp (alg_name, stringop_alg_names[i]))
+            {
+              alg = (stringop_alg) i;
+              break;
+            }
+        }
+
+      if (i == last_alg)
+        {
+          warning (0, "Wrong stringop strategy name %s specified for option %s",
+                   alg_name,
+                   is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+          return;
+        }
+
+      input_ranges[n].min = mins;
+      input_ranges[n].max = maxs;
+      input_ranges[n].alg = alg;
+      if (!strcmp (align, "align"))
+        input_ranges[n].noalign = false;
+      else if (!strcmp (align, "noalign"))
+        input_ranges[n].noalign = true;
+      else
+        {
+          warning (0, "Unknown alignment %s specified for option %s",
+                   align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+          return;
+        }
+      n++;
+      curr_range_str = next_range_str;
+    }
+  while (curr_range_str);
+
+  if (input_ranges[n - 1].max != -1)
+    {
+      warning (0, "The max value for the last size range should be -1"
+               " for option %s",
+               is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+      return;
+    }
+
+  if (n > MAX_STRINGOP_ALGS)
+    {
+      warning (0, "Too many size ranges specified in option %s",
+               is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+      return;
+    }
+
+  /* Now override the default algs array.  */
+  for (i = 0; i < n; i++)
+    {
+      *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
+      *const_cast<stringop_alg *>(&default_algs->size[i].alg)
+          = input_ranges[i].alg;
+      *const_cast<int *>(&default_algs->size[i].noalign)
+          = input_ranges[i].noalign;
+    }
+}
+
 \f
 /* Override various settings based on options.  If MAIN_ARGS_P, the
    options are from the command line, otherwise they are from
@@ -4040,6 +4183,21 @@ ix86_option_override_internal (bool main
   /* Handle stack protector */
   if (!global_options_set.x_ix86_stack_protector_guard)
     ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
+
+  /* Handle -mmemcpy-strategy= and -mmemset-strategy=  */
+  if (ix86_tune_memcpy_strategy)
+    {
+      char *str = xstrdup (ix86_tune_memcpy_strategy);
+      ix86_parse_stringop_strategy_string (str, false);
+      free (str);
+    }
+
+  if (ix86_tune_memset_strategy)
+    {
+      char *str = xstrdup (ix86_tune_memset_strategy);
+      ix86_parse_stringop_strategy_string (str, true);
+      free (str);
+    }
 }
 
 /* Implement the TARGET_OPTION_OVERRIDE hook.  */
@@ -22923,6 +23081,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
     {
     case libcall:
     case no_stringop:
+    case last_alg:
       gcc_unreachable ();
     case loop_1_byte:
       need_zero_guard = true;
@@ -23113,6 +23272,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
     {
     case libcall:
     case no_stringop:
+    case last_alg:
       gcc_unreachable ();
     case loop_1_byte:
     case loop:
@@ -23324,6 +23484,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
     {
     case libcall:
     case no_stringop:
+    case last_alg:
       gcc_unreachable ();
     case loop:
       need_zero_guard = true;
@@ -23501,6 +23662,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
     {
     case libcall:
     case no_stringop:
+    case last_alg:
       gcc_unreachable ();
     case loop_1_byte:
     case loop:
Index: config/i386/i386-opts.h
===================================================================
--- config/i386/i386-opts.h	(revision 201540)
+++ config/i386/i386-opts.h	(working copy)
@@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI
 /* Algorithm to expand string function with.  */
 enum stringop_alg
 {
-   no_stringop,
-   libcall,
-   rep_prefix_1_byte,
-   rep_prefix_4_byte,
-   rep_prefix_8_byte,
-   loop_1_byte,
-   loop,
-   unrolled_loop,
-   vector_loop
+#undef DEF_ENUM
+#define DEF_ENUM
+
+#undef DEF_ALG
+#define DEF_ALG(alg, name) alg, 
+
+#include "stringop.def"
+last_alg
+
+#undef DEF_ENUM
+#undef DEF_ALG
 };
 
 /* Available call abi.  */
Index: config/i386/stringop.opt
===================================================================
--- config/i386/stringop.opt	(revision 0)
+++ config/i386/stringop.opt	(revision 0)
@@ -0,0 +1,36 @@
+/* Definitions for option handling for IA-32.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte)
+
+#undef DEF_ENUM
+#define DEF_ENUM EnumValue
+
+#undef DEF_ALG
+#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg)
+
+#include "stringop.def"
+
+#undef DEF_ENUM
+#undef DEF_ALG
Index: config/i386/i386.opt
===================================================================
--- config/i386/i386.opt	(revision 201540)
+++ config/i386/i386.opt	(working copy)
@@ -316,6 +316,14 @@ mstack-arg-probe
 Target Report Mask(STACK_PROBE) Save
 Enable stack probing
 
+mmemcpy-strategy=
+Target RejectNegative Joined Var(ix86_tune_memcpy_strategy)
+Specify memcpy expansion strategy when expected size is known
+
+mmemset-strategy=
+Target RejectNegative Joined Var(ix86_tune_memset_strategy)
+Specify memset expansion strategy when expected size is known
+
 mstringop-strategy=
 Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop)
 Chose strategy to generate stringop using

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: New parameters to control stringop expansion libcall strategy
  2013-08-07 17:06           ` Xinliang David Li
@ 2013-08-08  0:23             ` Joseph S. Myers
  2013-08-08  0:29               ` Xinliang David Li
  0 siblings, 1 reply; 23+ messages in thread
From: Joseph S. Myers @ 2013-08-08  0:23 UTC (permalink / raw)
  To: Xinliang David Li; +Cc: Jan Hubicka, GCC Patches, Teresa Johnson

On Wed, 7 Aug 2013, Xinliang David Li wrote:

> Index: config/i386/stringop.def
> ===================================================================
> --- config/i386/stringop.def	(revision 0)
> +++ config/i386/stringop.def	(revision 0)
> @@ -0,0 +1,42 @@
> +/* Definitions for option handling for IA-32.
> +   Copyright (C) 2013 Free Software Foundation, Inc.
> +
> +This file is part of GCC.
> +
> +GCC is free software; you can redistribute it and/or modify
> +it under the terms of the GNU General Public License as published by
> +the Free Software Foundation; either version 3, or (at your option)
> +any later version.
> +
> +GCC is distributed in the hope that it will be useful,
> +but WITHOUT ANY WARRANTY; without even the implied warranty of
> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +GNU General Public License for more details.
> +
> +Under Section 7 of GPL version 3, you are granted additional
> +permissions described in the GCC Runtime Library Exception, version
> +3.1, as published by the Free Software Foundation.

Why the exception?  This should only be used on the host, not the target.

> +  do
> +    {
> +      int mins, maxs;
> +      stringop_alg alg;
> +      char alg_name[128];
> +      char align[16];
> +
> +      next_range_str = strchr (curr_range_str, ',');
> +      if (next_range_str)
> +        *next_range_str++ = '\0';
> +
> +      if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align))

This appears to introduce buffer overruns, which is never OK - whatever 
the length of strings in the command-line arguments, you must not overflow 
fixed-width buffers, so you must specify maximum field widths for the %[] 
and %s.

> +        {
> +          warning (0, "Wrong arg %s to option %s", curr_range_str,
> +                   is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
> +          return;

Invalid option arguments should be errors, not warnings, and diagnostics 
should not start with a capital letter.  Same applies to other diagnostics 
here.

> Index: config/i386/stringop.opt
> ===================================================================
> --- config/i386/stringop.opt	(revision 0)
> +++ config/i386/stringop.opt	(revision 0)
> @@ -0,0 +1,36 @@
> +/* Definitions for option handling for IA-32.
> +   Copyright (C) 2013 Free Software Foundation, Inc.
> +
> +This file is part of GCC.
> +
> +GCC is free software; you can redistribute it and/or modify
> +it under the terms of the GNU General Public License as published by
> +the Free Software Foundation; either version 3, or (at your option)
> +any later version.
> +
> +GCC is distributed in the hope that it will be useful,
> +but WITHOUT ANY WARRANTY; without even the implied warranty of
> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +GNU General Public License for more details.
> +
> +Under Section 7 of GPL version 3, you are granted additional
> +permissions described in the GCC Runtime Library Exception, version
> +3.1, as published by the Free Software Foundation.

Again, why the exception?

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: New parameters to control stringop expansion libcall strategy
  2013-08-08  0:23             ` Joseph S. Myers
@ 2013-08-08  0:29               ` Xinliang David Li
  2013-08-08  1:04                 ` Joseph S. Myers
  0 siblings, 1 reply; 23+ messages in thread
From: Xinliang David Li @ 2013-08-08  0:29 UTC (permalink / raw)
  To: Joseph S. Myers; +Cc: Jan Hubicka, GCC Patches, Teresa Johnson

On Wed, Aug 7, 2013 at 5:23 PM, Joseph S. Myers <joseph@codesourcery.com> wrote:
> On Wed, 7 Aug 2013, Xinliang David Li wrote:
>
>> Index: config/i386/stringop.def
>> ===================================================================
>> --- config/i386/stringop.def  (revision 0)
>> +++ config/i386/stringop.def  (revision 0)
>> @@ -0,0 +1,42 @@
>> +/* Definitions for option handling for IA-32.
>> +   Copyright (C) 2013 Free Software Foundation, Inc.
>> +
>> +This file is part of GCC.
>> +
>> +GCC is free software; you can redistribute it and/or modify
>> +it under the terms of the GNU General Public License as published by
>> +the Free Software Foundation; either version 3, or (at your option)
>> +any later version.
>> +
>> +GCC is distributed in the hope that it will be useful,
>> +but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> +GNU General Public License for more details.
>> +
>> +Under Section 7 of GPL version 3, you are granted additional
>> +permissions described in the GCC Runtime Library Exception, version
>> +3.1, as published by the Free Software Foundation.
>
> Why the exception?  This should only be used on the host, not the target.


Sorry, I copied the boiler-plate header from i386.h -- is it wrong there too?


>
>> +  do
>> +    {
>> +      int mins, maxs;
>> +      stringop_alg alg;
>> +      char alg_name[128];
>> +      char align[16];
>> +
>> +      next_range_str = strchr (curr_range_str, ',');
>> +      if (next_range_str)
>> +        *next_range_str++ = '\0';
>> +
>> +      if (3 != sscanf (curr_range_str, "%[^:]:%d:%s", alg_name, &maxs, align))
>
> This appears to introduce buffer overruns, which is never OK - whatever
> the length of strings in the command-line arguments, you must not overflow
> fixed-width buffers, so you must specify maximum field widths for the %[]
> and %s.
>

Ok will fix.


>> +        {
>> +          warning (0, "Wrong arg %s to option %s", curr_range_str,
>> +                   is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
>> +          return;
>
> Invalid option arguments should be errors, not warnings, and diagnostics
> should not start with a capital letter.  Same applies to other diagnostics
> here.
>

Ok will fix.


>> Index: config/i386/stringop.opt
>> ===================================================================
>> --- config/i386/stringop.opt  (revision 0)
>> +++ config/i386/stringop.opt  (revision 0)
>> @@ -0,0 +1,36 @@
>> +/* Definitions for option handling for IA-32.
>> +   Copyright (C) 2013 Free Software Foundation, Inc.
>> +
>> +This file is part of GCC.
>> +
>> +GCC is free software; you can redistribute it and/or modify
>> +it under the terms of the GNU General Public License as published by
>> +the Free Software Foundation; either version 3, or (at your option)
>> +any later version.
>> +
>> +GCC is distributed in the hope that it will be useful,
>> +but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> +GNU General Public License for more details.
>> +
>> +Under Section 7 of GPL version 3, you are granted additional
>> +permissions described in the GCC Runtime Library Exception, version
>> +3.1, as published by the Free Software Foundation.
>
> Again, why the exception?

Wrong template used.

thanks,

David

>
> --
> Joseph S. Myers
> joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: New parameters to control stringop expansion libcall strategy
  2013-08-08  0:29               ` Xinliang David Li
@ 2013-08-08  1:04                 ` Joseph S. Myers
  2013-08-08  6:17                   ` Xinliang David Li
  0 siblings, 1 reply; 23+ messages in thread
From: Joseph S. Myers @ 2013-08-08  1:04 UTC (permalink / raw)
  To: Xinliang David Li; +Cc: Jan Hubicka, GCC Patches, Teresa Johnson

On Wed, 7 Aug 2013, Xinliang David Li wrote:

> > Why the exception?  This should only be used on the host, not the target.
> 
> Sorry, I copied the boiler-plate header from i386.h -- is it wrong there too?

tm.h gets included in target code because we haven't finished separating 
target macros used on the target from those used on the host.

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: New parameters to control stringop expansion libcall strategy
  2013-08-08  1:04                 ` Joseph S. Myers
@ 2013-08-08  6:17                   ` Xinliang David Li
  2013-08-08 15:18                     ` Joseph S. Myers
  0 siblings, 1 reply; 23+ messages in thread
From: Xinliang David Li @ 2013-08-08  6:17 UTC (permalink / raw)
  To: Joseph S. Myers; +Cc: Jan Hubicka, GCC Patches, Teresa Johnson

[-- Attachment #1: Type: text/plain, Size: 605 bytes --]

Updated patch attached (fixed header, buffer overflow, and warning -->
error problems).

Ok for trunk?

thanks,

David

On Wed, Aug 7, 2013 at 6:04 PM, Joseph S. Myers <joseph@codesourcery.com> wrote:
> On Wed, 7 Aug 2013, Xinliang David Li wrote:
>
>> > Why the exception?  This should only be used on the host, not the target.
>>
>> Sorry, I copied the boiler-plate header from i386.h -- is it wrong there too?
>
> tm.h gets included in target code because we haven't finished separating
> target macros used on the target from those used on the host.
>
> --
> Joseph S. Myers
> joseph@codesourcery.com

[-- Attachment #2: stringop_inl_option.p.txt --]
[-- Type: text/plain, Size: 18415 bytes --]

Index: config/i386/stringop.def
===================================================================
--- config/i386/stringop.def	(revision 0)
+++ config/i386/stringop.def	(revision 0)
@@ -0,0 +1,37 @@
+/* Definitions for stringop strategy for IA-32.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License 
+along with GCC; see the files COPYING3.  If not,
+see <http://www.gnu.org/licenses/>.  */
+
+DEF_ENUM
+DEF_ALG (no_stringop, no_stringop)
+DEF_ENUM
+DEF_ALG (libcall, libcall)
+DEF_ENUM
+DEF_ALG (rep_prefix_1_byte, rep_byte)
+DEF_ENUM
+DEF_ALG (rep_prefix_4_byte, rep_4byte)
+DEF_ENUM
+DEF_ALG (rep_prefix_8_byte, rep_8byte)
+DEF_ENUM
+DEF_ALG (loop_1_byte, byte_loop)
+DEF_ENUM
+DEF_ALG (loop, loop)
+DEF_ENUM
+DEF_ALG (unrolled_loop, unrolled_loop)
+DEF_ENUM
+DEF_ALG (vector_loop, vector_loop)
Index: config/i386/stringop.opt
===================================================================
--- config/i386/stringop.opt	(revision 0)
+++ config/i386/stringop.opt	(revision 0)
@@ -0,0 +1,31 @@
+/* Definitions for stringop option handling for IA-32.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the files COPYING3.  If not,
+see <http://www.gnu.org/licenses/>.  */
+
+Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte)
+
+#undef DEF_ENUM
+#define DEF_ENUM EnumValue
+
+#undef DEF_ALG
+#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg)
+
+#include "stringop.def"
+
+#undef DEF_ENUM
+#undef DEF_ALG
Index: config/i386/i386-opts.h
===================================================================
--- config/i386/i386-opts.h	(revision 201581)
+++ config/i386/i386-opts.h	(working copy)
@@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI
 /* Algorithm to expand string function with.  */
 enum stringop_alg
 {
-   no_stringop,
-   libcall,
-   rep_prefix_1_byte,
-   rep_prefix_4_byte,
-   rep_prefix_8_byte,
-   loop_1_byte,
-   loop,
-   unrolled_loop,
-   vector_loop
+#undef DEF_ENUM
+#define DEF_ENUM
+
+#undef DEF_ALG
+#define DEF_ALG(alg, name) alg, 
+
+#include "stringop.def"
+last_alg
+
+#undef DEF_ENUM
+#undef DEF_ALG
 };
 
 /* Available call abi.  */
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 201582)
+++ config/i386/i386.c	(working copy)
@@ -158,7 +158,7 @@ struct processor_costs ix86_size_cost =
 };
 
 /* Processor costs (relative to an add) */
-static const
+static
 struct processor_costs i386_cost = {	/* 386 specific costs */
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -228,7 +228,7 @@ struct processor_costs i386_cost = {	/*
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs i486_cost = {	/* 486 specific costs */
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -300,7 +300,7 @@ struct processor_costs i486_cost = {	/*
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs pentium_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -370,7 +370,7 @@ struct processor_costs pentium_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs pentiumpro_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -449,7 +449,7 @@ struct processor_costs pentiumpro_cost =
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs geode_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -520,7 +520,7 @@ struct processor_costs geode_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs k6_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
@@ -593,7 +593,7 @@ struct processor_costs k6_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs athlon_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
@@ -666,7 +666,7 @@ struct processor_costs athlon_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs k8_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
@@ -1267,7 +1267,7 @@ struct processor_costs btver2_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs pentium4_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (3),			/* cost of a lea instruction */
@@ -1338,7 +1338,7 @@ struct processor_costs pentium4_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs nocona_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -1411,7 +1411,7 @@ struct processor_costs nocona_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs atom_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
@@ -1558,7 +1558,7 @@ struct processor_costs slm_cost = {
 };
 
 /* Generic64 should produce code tuned for Nocona and K8.  */
-static const
+static
 struct processor_costs generic64_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   /* On all chips taken into consideration lea is 2 cycles and more.  With
@@ -1637,7 +1637,7 @@ struct processor_costs generic64_cost =
 };
 
 /* core_cost should produce code tuned for Core familly of CPUs.  */
-static const
+static
 struct processor_costs core_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   /* On all chips taken into consideration lea is 2 cycles and more.  With
@@ -1719,7 +1719,7 @@ struct processor_costs core_cost = {
 
 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
    Athlon and K8.  */
-static const
+static
 struct processor_costs generic32_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
@@ -2926,6 +2926,149 @@ ix86_debug_options (void)
 
   return;
 }
+
+static const char *stringop_alg_names[] = {
+#define DEF_ENUM
+#define DEF_ALG(alg, name) #name,
+#include "stringop.def"
+#undef DEF_ENUM
+#undef DEF_ALG
+};
+
+/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
+   The string is of the following form (or comma separated list of it):
+
+     strategy_alg:max_size:[align|noalign]
+
+   where the full size range for the strategy is either [0, max_size] or
+   [min_size, max_size], in which min_size is the max_size + 1 of the
+   preceding range.  The last size range must have max_size == -1.
+
+   Examples:
+
+   1.
+      -mmemcpy-strategy=libcall:-1:noalign
+
+      This is equivalent to -mstringop-strategy=libcall for memcpy of known size.
+
+
+   2.
+      -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
+
+      This tells the compiler to use the following strategy for memset:
+      1) when the expected size is in [0, 16], use rep_8byte;
+      2) when the expected size is in [17, 2048], use vector_loop;
+      3) when the expected size is > 2048, use libcall.  */
+
+struct stringop_size_range
+{
+  int min;
+  int max;
+  stringop_alg alg;
+  bool noalign;
+};
+
+static void
+ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
+{
+  const struct stringop_algs *default_algs;
+  stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
+  char *curr_range_str, *next_range_str;
+  int i = 0, n = 0;
+
+  if (is_memset)
+    default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
+  else
+    default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
+
+  curr_range_str = strategy_str;
+
+  do
+    {
+      int mins, maxs;
+      stringop_alg alg;
+      char alg_name[128];
+      char align[16];
+      next_range_str = strchr (curr_range_str, ',');
+      if (next_range_str)
+        *next_range_str++ = '\0';
+
+      if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
+                       alg_name, &maxs, align))
+        {
+          error ("Wrong arg %s to option %s", curr_range_str,
+                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+          return;
+        }
+
+      if (n > 0 && (maxs < (mins = input_ranges[n - 1].max + 1) && maxs != -1))
+        {
+          error ("Size ranges of option %s should be increasing",
+                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+          return;
+        }
+
+      for (i = 0; i < last_alg; i++)
+        {
+          if (!strcmp (alg_name, stringop_alg_names[i]))
+            {
+              alg = (stringop_alg) i;
+              break;
+            }
+        }
+
+      if (i == last_alg)
+        {
+          error ("Wrong stringop strategy name %s specified for option %s",
+                 alg_name,
+                 is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+          return;
+        }
+
+      input_ranges[n].min = mins;
+      input_ranges[n].max = maxs;
+      input_ranges[n].alg = alg;
+      if (!strcmp (align, "align"))
+        input_ranges[n].noalign = false;
+      else if (!strcmp (align, "noalign"))
+        input_ranges[n].noalign = true;
+      else
+        {
+          error ("Unknown alignment %s specified for option %s",
+                 align, is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+          return;
+        }
+      n++;
+      curr_range_str = next_range_str;
+    }
+  while (curr_range_str);
+
+  if (input_ranges[n - 1].max != -1)
+    {
+      error ("The max value for the last size range should be -1"
+             " for option %s",
+             is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+      return;
+    }
+
+  if (n > MAX_STRINGOP_ALGS)
+    {
+      error ("Too many size ranges specified in option %s",
+             is_memset ? "-mmemset_strategy=" : "-mmemcpy_strategy=");
+      return;
+    }
+
+  /* Now override the default algs array.  */
+  for (i = 0; i < n; i++)
+    {
+      *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
+      *const_cast<stringop_alg *>(&default_algs->size[i].alg)
+          = input_ranges[i].alg;
+      *const_cast<int *>(&default_algs->size[i].noalign)
+          = input_ranges[i].noalign;
+    }
+}
+
 \f
 /* Override various settings based on options.  If MAIN_ARGS_P, the
    options are from the command line, otherwise they are from
@@ -4081,6 +4224,21 @@ ix86_option_override_internal (bool main
   /* Handle stack protector */
   if (!global_options_set.x_ix86_stack_protector_guard)
     ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
+
+  /* Handle -mmemcpy-strategy= and -mmemset-strategy=  */
+  if (ix86_tune_memcpy_strategy)
+    {
+      char *str = xstrdup (ix86_tune_memcpy_strategy);
+      ix86_parse_stringop_strategy_string (str, false);
+      free (str);
+    }
+
+  if (ix86_tune_memset_strategy)
+    {
+      char *str = xstrdup (ix86_tune_memset_strategy);
+      ix86_parse_stringop_strategy_string (str, true);
+      free (str);
+    }
 }
 
 /* Implement the TARGET_OPTION_OVERRIDE hook.  */
@@ -22964,6 +23122,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
     {
     case libcall:
     case no_stringop:
+    case last_alg:
       gcc_unreachable ();
     case loop_1_byte:
       need_zero_guard = true;
@@ -23154,6 +23313,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
     {
     case libcall:
     case no_stringop:
+    case last_alg:
       gcc_unreachable ();
     case loop_1_byte:
     case loop:
@@ -23365,6 +23525,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
     {
     case libcall:
     case no_stringop:
+    case last_alg:
       gcc_unreachable ();
     case loop:
       need_zero_guard = true;
@@ -23542,6 +23703,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
     {
     case libcall:
     case no_stringop:
+    case last_alg:
       gcc_unreachable ();
     case loop_1_byte:
     case loop:
Index: config/i386/i386.opt
===================================================================
--- config/i386/i386.opt	(revision 201582)
+++ config/i386/i386.opt	(working copy)
@@ -316,6 +316,14 @@ mstack-arg-probe
 Target Report Mask(STACK_PROBE) Save
 Enable stack probing
 
+mmemcpy-strategy=
+Target RejectNegative Joined Var(ix86_tune_memcpy_strategy)
+Specify memcpy expansion strategy when expected size is known
+
+mmemset-strategy=
+Target RejectNegative Joined Var(ix86_tune_memset_strategy)
+Specify memset expansion strategy when expected size is known
+
 mstringop-strategy=
 Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop)
 Chose strategy to generate stringop using
Index: doc/invoke.texi
===================================================================
--- doc/invoke.texi	(revision 201581)
+++ doc/invoke.texi	(working copy)
@@ -652,6 +652,7 @@ Objective-C and Objective-C++ Dialects}.
 -mbmi2 -mrtm -mlwp -mthreads @gol
 -mno-align-stringops  -minline-all-stringops @gol
 -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
+-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy} @gol
 -mpush-args  -maccumulate-outgoing-args  -m128bit-long-double @gol
 -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol
 -mregparm=@var{num}  -msseregparm @gol
@@ -14651,6 +14652,24 @@ Expand into an inline loop.
 Always use a library call.
 @end table
 
+@item -mmemcpy-strategy=@var{strategy}
+@opindex mmemcpy-strategy=@var{strategy}
+Override the internal decision heuristic to decide if @code{__builtin_memcpy}
+should be inlined and what inline algorithm to use when the expected size
+of the copy operation is known.  @var{strategy}
+is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets.
+@var{alg} is one of the algorithms accepted by @option{-mstringop-strategy};
+@var{max_size} is the largest byte size for which @var{alg} is used; and
+@var{dest_align} is either @code{align} or @code{noalign}.  The @var{max_size}
+values must be given in increasing order, except that the last triplet must use
+@code{-1}.  The minimal byte size for @var{alg} is @code{0} for the first triplet
+and the preceding triplet's @code{@var{max_size} + 1} for every later one.
+
+@item -mmemset-strategy=@var{strategy}
+@opindex mmemset-strategy=@var{strategy}
+The option is similar to @option{-mmemcpy-strategy=} except that it is to control
+@code{__builtin_memset} expansion.
+
 @item -momit-leaf-frame-pointer
 @opindex momit-leaf-frame-pointer
 Don't keep the frame pointer in a register for leaf functions.  This
Index: testsuite/gcc.target/i386/memcpy-strategy-2.c
===================================================================
--- testsuite/gcc.target/i386/memcpy-strategy-2.c	(revision 0)
+++ testsuite/gcc.target/i386/memcpy-strategy-2.c	(revision 0)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */
+/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
+/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
+
+char a[2048];
+char b[2048];
+void t (void)
+{
+  __builtin_memcpy (a, b, 2048);
+}
+
Index: testsuite/gcc.target/i386/memset-strategy-1.c
===================================================================
--- testsuite/gcc.target/i386/memset-strategy-1.c	(revision 0)
+++ testsuite/gcc.target/i386/memset-strategy-1.c	(revision 0)
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */
+/* { dg-final { scan-assembler-times "memset" 2  } } */
+
+char a[2048];
+void t (void)
+{
+  __builtin_memset (a, 1, 2048);
+}
+
Index: testsuite/gcc.target/i386/memcpy-strategy-3.c
===================================================================
--- testsuite/gcc.target/i386/memcpy-strategy-3.c	(revision 0)
+++ testsuite/gcc.target/i386/memcpy-strategy-3.c	(revision 0)
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */
+/* { dg-final { scan-assembler-times "memcpy" 2  } } */
+
+char a[2048];
+char b[2048];
+void t (void)
+{
+  __builtin_memcpy (a, b, 2048);
+}
Index: testsuite/gcc.target/i386/memcpy-strategy-1.c
===================================================================
--- testsuite/gcc.target/i386/memcpy-strategy-1.c	(revision 0)
+++ testsuite/gcc.target/i386/memcpy-strategy-1.c	(revision 0)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */
+/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
+/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
+
+char a[2048];
+char b[2048];
+void t (void)
+{
+  __builtin_memcpy (a, b, 2048);
+}
+

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: New parameters to control stringop expansion libcall strategy
  2013-08-08  6:17                   ` Xinliang David Li
@ 2013-08-08 15:18                     ` Joseph S. Myers
  2013-08-08 16:31                       ` Xinliang David Li
  0 siblings, 1 reply; 23+ messages in thread
From: Joseph S. Myers @ 2013-08-08 15:18 UTC (permalink / raw)
  To: Xinliang David Li; +Cc: Jan Hubicka, GCC Patches, Teresa Johnson

On Wed, 7 Aug 2013, Xinliang David Li wrote:

> Updated patch attached (fixed header, buffer overflow, and warning -->
> error problems).

You still have diagnostics starting with a capital letter, contrary to the 
GNU Coding Standards.

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: New parameters to control stringop expansion libcall strategy
  2013-08-08 15:18                     ` Joseph S. Myers
@ 2013-08-08 16:31                       ` Xinliang David Li
  2013-08-09 18:25                         ` Xinliang David Li
  0 siblings, 1 reply; 23+ messages in thread
From: Xinliang David Li @ 2013-08-08 16:31 UTC (permalink / raw)
  To: Joseph S. Myers; +Cc: Jan Hubicka, GCC Patches, Teresa Johnson

[-- Attachment #1: Type: text/plain, Size: 404 bytes --]

Updated.

thanks,

David

On Thu, Aug 8, 2013 at 8:18 AM, Joseph S. Myers <joseph@codesourcery.com> wrote:
> On Wed, 7 Aug 2013, Xinliang David Li wrote:
>
>> Updated patch attached (fixed header, buffer overflow, and warning -->
>> error problems).
>
> You still have diagnostics starting with a capital letter, contrary to the
> GNU Coding Standards.
>
> --
> Joseph S. Myers
> joseph@codesourcery.com

[-- Attachment #2: stringop_inl_option.p.txt --]
[-- Type: text/plain, Size: 18415 bytes --]

Index: testsuite/gcc.target/i386/memcpy-strategy-1.c
===================================================================
--- testsuite/gcc.target/i386/memcpy-strategy-1.c	(revision 0)
+++ testsuite/gcc.target/i386/memcpy-strategy-1.c	(revision 0)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:-1:align" } */
+/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
+/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
+
+char a[2048];
+char b[2048];
+void t (void)
+{
+  __builtin_memcpy (a, b, 2048);
+}
+
Index: testsuite/gcc.target/i386/memcpy-strategy-2.c
===================================================================
--- testsuite/gcc.target/i386/memcpy-strategy-2.c	(revision 0)
+++ testsuite/gcc.target/i386/memcpy-strategy-2.c	(revision 0)
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:3000:align,libcall:-1:align" } */
+/* { dg-final { scan-assembler-times "movdqa" 8 { target { ! { ia32 } } } } } */
+/* { dg-final { scan-assembler-times "movdqa" 4 { target { ia32 } } } } */
+
+char a[2048];
+char b[2048];
+void t (void)
+{
+  __builtin_memcpy (a, b, 2048);
+}
+
Index: testsuite/gcc.target/i386/memset-strategy-1.c
===================================================================
--- testsuite/gcc.target/i386/memset-strategy-1.c	(revision 0)
+++ testsuite/gcc.target/i386/memset-strategy-1.c	(revision 0)
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -mmemset-strategy=libcall:-1:align" } */
+/* { dg-final { scan-assembler-times "memset" 2  } } */
+
+char a[2048];
+void t (void)
+{
+  __builtin_memset (a, 1, 2048);
+}
+
Index: testsuite/gcc.target/i386/memcpy-strategy-3.c
===================================================================
--- testsuite/gcc.target/i386/memcpy-strategy-3.c	(revision 0)
+++ testsuite/gcc.target/i386/memcpy-strategy-3.c	(revision 0)
@@ -0,0 +1,10 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=atom -mmemcpy-strategy=vector_loop:2000:align,libcall:-1:align" } */
+/* { dg-final { scan-assembler-times "memcpy" 2  } } */
+
+char a[2048];
+char b[2048];
+void t (void)
+{
+  __builtin_memcpy (a, b, 2048);
+}
Index: doc/invoke.texi
===================================================================
--- doc/invoke.texi	(revision 201581)
+++ doc/invoke.texi	(working copy)
@@ -652,6 +652,7 @@ Objective-C and Objective-C++ Dialects}.
 -mbmi2 -mrtm -mlwp -mthreads @gol
 -mno-align-stringops  -minline-all-stringops @gol
 -minline-stringops-dynamically -mstringop-strategy=@var{alg} @gol
+-mmemcpy-strategy=@var{strategy} -mmemset-strategy=@var{strategy} @gol
 -mpush-args  -maccumulate-outgoing-args  -m128bit-long-double @gol
 -m96bit-long-double -mlong-double-64 -mlong-double-80 @gol
 -mregparm=@var{num}  -msseregparm @gol
@@ -14651,6 +14652,24 @@ Expand into an inline loop.
 Always use a library call.
 @end table
 
+@item -mmemcpy-strategy=@var{strategy}
+@opindex mmemcpy-strategy=@var{strategy}
+Override the internal decision heuristic to decide if @code{__builtin_memcpy}
+should be inlined and what inline algorithm to use when the expected size
+of the copy operation is known.  @var{strategy}
+is a comma-separated list of @var{alg}:@var{max_size}:@var{dest_align} triplets.
+@var{alg} is one of the algorithms accepted by @option{-mstringop-strategy};
+@var{max_size} is the largest byte size for which @var{alg} is used; and
+@var{dest_align} is either @code{align} or @code{noalign}.  The @var{max_size}
+values must be given in increasing order, except that the last triplet must use
+@code{-1}.  The minimal byte size for @var{alg} is @code{0} for the first triplet
+and the preceding triplet's @code{@var{max_size} + 1} for every later one.
+
+@item -mmemset-strategy=@var{strategy}
+@opindex mmemset-strategy=@var{strategy}
+The option is similar to @option{-mmemcpy-strategy=} except that it is to control
+@code{__builtin_memset} expansion.
+
 @item -momit-leaf-frame-pointer
 @opindex momit-leaf-frame-pointer
 Don't keep the frame pointer in a register for leaf functions.  This
Index: config/i386/stringop.def
===================================================================
--- config/i386/stringop.def	(revision 0)
+++ config/i386/stringop.def	(revision 0)
@@ -0,0 +1,37 @@
+/* Definitions for stringop strategy for IA-32.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License 
+along with GCC; see the files COPYING3.  If not,
+see <http://www.gnu.org/licenses/>.  */
+
+DEF_ENUM
+DEF_ALG (no_stringop, no_stringop)
+DEF_ENUM
+DEF_ALG (libcall, libcall)
+DEF_ENUM
+DEF_ALG (rep_prefix_1_byte, rep_byte)
+DEF_ENUM
+DEF_ALG (rep_prefix_4_byte, rep_4byte)
+DEF_ENUM
+DEF_ALG (rep_prefix_8_byte, rep_8byte)
+DEF_ENUM
+DEF_ALG (loop_1_byte, byte_loop)
+DEF_ENUM
+DEF_ALG (loop, loop)
+DEF_ENUM
+DEF_ALG (unrolled_loop, unrolled_loop)
+DEF_ENUM
+DEF_ALG (vector_loop, vector_loop)
Index: config/i386/stringop.opt
===================================================================
--- config/i386/stringop.opt	(revision 0)
+++ config/i386/stringop.opt	(revision 0)
@@ -0,0 +1,31 @@
+/* Definitions for stringop option handling for IA-32.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3, or (at your option)
+any later version.
+
+GCC is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the files COPYING3.  If not,
+see <http://www.gnu.org/licenses/>.  */
+
+Enum(stringop_alg) String(rep_byte) Value(rep_prefix_1_byte)
+
+#undef DEF_ENUM
+#define DEF_ENUM EnumValue
+
+#undef DEF_ALG
+#define DEF_ALG(alg, name) Enum(stringop_alg) String(name) Value(alg)
+
+#include "stringop.def"
+
+#undef DEF_ENUM
+#undef DEF_ALG
Index: config/i386/i386-opts.h
===================================================================
--- config/i386/i386-opts.h	(revision 201581)
+++ config/i386/i386-opts.h	(working copy)
@@ -28,15 +28,17 @@ see the files COPYING3 and COPYING.RUNTI
 /* Algorithm to expand string function with.  */
 enum stringop_alg
 {
-   no_stringop,
-   libcall,
-   rep_prefix_1_byte,
-   rep_prefix_4_byte,
-   rep_prefix_8_byte,
-   loop_1_byte,
-   loop,
-   unrolled_loop,
-   vector_loop
+#undef DEF_ENUM
+#define DEF_ENUM
+
+#undef DEF_ALG
+#define DEF_ALG(alg, name) alg, 
+
+#include "stringop.def"
+last_alg
+
+#undef DEF_ENUM
+#undef DEF_ALG
 };
 
 /* Available call abi.  */
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 201582)
+++ config/i386/i386.c	(working copy)
@@ -158,7 +158,7 @@ struct processor_costs ix86_size_cost =
 };
 
 /* Processor costs (relative to an add) */
-static const
+static
 struct processor_costs i386_cost = {	/* 386 specific costs */
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -228,7 +228,7 @@ struct processor_costs i386_cost = {	/*
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs i486_cost = {	/* 486 specific costs */
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -300,7 +300,7 @@ struct processor_costs i486_cost = {	/*
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs pentium_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -370,7 +370,7 @@ struct processor_costs pentium_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs pentiumpro_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -449,7 +449,7 @@ struct processor_costs pentiumpro_cost =
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs geode_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -520,7 +520,7 @@ struct processor_costs geode_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs k6_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
@@ -593,7 +593,7 @@ struct processor_costs k6_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs athlon_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
@@ -666,7 +666,7 @@ struct processor_costs athlon_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs k8_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (2),			/* cost of a lea instruction */
@@ -1267,7 +1267,7 @@ struct processor_costs btver2_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs pentium4_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (3),			/* cost of a lea instruction */
@@ -1338,7 +1338,7 @@ struct processor_costs pentium4_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs nocona_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1),			/* cost of a lea instruction */
@@ -1411,7 +1411,7 @@ struct processor_costs nocona_cost = {
   1,					/* cond_not_taken_branch_cost.  */
 };
 
-static const
+static
 struct processor_costs atom_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
@@ -1558,7 +1558,7 @@ struct processor_costs slm_cost = {
 };
 
 /* Generic64 should produce code tuned for Nocona and K8.  */
-static const
+static
 struct processor_costs generic64_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   /* On all chips taken into consideration lea is 2 cycles and more.  With
@@ -1637,7 +1637,7 @@ struct processor_costs generic64_cost =
 };
 
 /* core_cost should produce code tuned for Core familly of CPUs.  */
-static const
+static
 struct processor_costs core_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   /* On all chips taken into consideration lea is 2 cycles and more.  With
@@ -1719,7 +1719,7 @@ struct processor_costs core_cost = {
 
 /* Generic32 should produce code tuned for PPro, Pentium4, Nocona,
    Athlon and K8.  */
-static const
+static
 struct processor_costs generic32_cost = {
   COSTS_N_INSNS (1),			/* cost of an add instruction */
   COSTS_N_INSNS (1) + 1,		/* cost of a lea instruction */
@@ -2926,6 +2926,149 @@ ix86_debug_options (void)
 
   return;
 }
+
+static const char *stringop_alg_names[] = {  /* Indexed by stringop_alg.  */
+#define DEF_ENUM
+#define DEF_ALG(alg, name) #name,  /* Emit NAME as a string literal.  */
+#include "stringop.def"
+#undef DEF_ENUM
+#undef DEF_ALG
+};
+
+/* Parse parameter string passed to -mmemcpy-strategy= or -mmemset-strategy=.
+   The string is of the following form (or comma separated list of it):
+
+     strategy_alg:max_size:[align|noalign]
+
+   where the full size range for the strategy is either [0, max_size] or
+   [min_size, max_size], in which min_size is the max_size + 1 of the
+   preceding range.  The last size range must have max_size == -1.
+
+   Examples:
+
+    1.
+       -mmemcpy-strategy=libcall:-1:noalign
+
+      this is equivalent to (for known size memcpy) -mstringop-strategy=libcall
+
+
+   2.
+      -mmemset-strategy=rep_8byte:16:noalign,vector_loop:2048:align,libcall:-1:noalign
+
+      This is to tell the compiler to use the following strategy for memset
+      1) when the expected size is between [1, 16], use rep_8byte strategy;
+      2) when the size is between [17, 2048], use vector_loop;
+      3) when the size is > 2048, use libcall.  */
+
+struct stringop_size_range
+{
+  int min;           /* Smallest size covered by the range.  */
+  int max;           /* Largest size covered; -1 for the last range.  */
+  stringop_alg alg;  /* Algorithm requested for the range.  */
+  bool noalign;      /* True if "noalign" was given, false for "align".  */
+};
+
+/* Parse STRATEGY_STR, the argument of -mmemset-strategy= (IS_MEMSET) or
+   -mmemcpy-strategy=, and override the matching stringop_algs table of
+   ix86_cost with its ranges.  STRATEGY_STR is split in place at each comma,
+   so the caller must pass a writable copy.  If a range is rejected, an
+   error is reported and the table is left untouched.  */
+
+static void
+ix86_parse_stringop_strategy_string (char *strategy_str, bool is_memset)
+{
+  const struct stringop_algs *default_algs;
+  stringop_size_range input_ranges[MAX_STRINGOP_ALGS];
+  const char *opt = is_memset ? "-mmemset-strategy=" : "-mmemcpy-strategy=";
+  char *curr_range_str, *next_range_str;
+  int i = 0, n = 0;
+
+  if (is_memset)
+    default_algs = &ix86_cost->memset[TARGET_64BIT != 0];
+  else
+    default_algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
+
+  curr_range_str = strategy_str;
+
+  do
+    {
+      /* Range 0 starts at size 0, others just past the previous max.  */
+      int mins = n > 0 ? input_ranges[n - 1].max + 1 : 0;
+      int maxs;
+      char alg_name[128];
+      char align[16];
+
+      /* Check before input_ranges[n] is written so it cannot overflow.  */
+      if (n == MAX_STRINGOP_ALGS)
+        {
+          error ("too many size ranges specified in option %s", opt);
+          return;
+        }
+
+      next_range_str = strchr (curr_range_str, ',');
+      if (next_range_str)
+        *next_range_str++ = '\0';
+
+      if (3 != sscanf (curr_range_str, "%20[^:]:%d:%10s",
+                       alg_name, &maxs, align))
+        {
+          error ("wrong arg %s to option %s", curr_range_str, opt);
+          return;
+        }
+
+      if (n > 0 && maxs < mins && maxs != -1)
+        {
+          error ("size ranges of option %s should be increasing", opt);
+          return;
+        }
+
+      /* Look the algorithm up by name; I ends up as its stringop_alg.  */
+      for (i = 0; i < last_alg; i++)
+        if (!strcmp (alg_name, stringop_alg_names[i]))
+          break;
+
+      if (i == last_alg)
+        {
+          error ("wrong stringop strategy name %s specified for option %s",
+                 alg_name, opt);
+          return;
+        }
+
+      input_ranges[n].min = mins;
+      input_ranges[n].max = maxs;
+      input_ranges[n].alg = (stringop_alg) i;
+      if (!strcmp (align, "align"))
+        input_ranges[n].noalign = false;
+      else if (!strcmp (align, "noalign"))
+        input_ranges[n].noalign = true;
+      else
+        {
+          error ("unknown alignment %s specified for option %s", align, opt);
+          return;
+        }
+      n++;
+      curr_range_str = next_range_str;
+    }
+  while (curr_range_str);
+
+  if (input_ranges[n - 1].max != -1)
+    {
+      error ("the max value for the last size range should be -1"
+             " for option %s", opt);
+      return;
+    }
+
+  /* Now override the default algs array.  */
+  for (i = 0; i < n; i++)
+    {
+      *const_cast<int *>(&default_algs->size[i].max) = input_ranges[i].max;
+      *const_cast<stringop_alg *>(&default_algs->size[i].alg)
+          = input_ranges[i].alg;
+      *const_cast<int *>(&default_algs->size[i].noalign)
+          = input_ranges[i].noalign;
+    }
+}
+
 \f
 /* Override various settings based on options.  If MAIN_ARGS_P, the
    options are from the command line, otherwise they are from
@@ -4081,6 +4224,21 @@ ix86_option_override_internal (bool main
   /* Handle stack protector */
   if (!global_options_set.x_ix86_stack_protector_guard)
     ix86_stack_protector_guard = TARGET_HAS_BIONIC ? SSP_GLOBAL : SSP_TLS;
+
+  /* Handle -mmemcpy-strategy= and -mmemset-strategy=  */
+  if (ix86_tune_memcpy_strategy)
+    {
+      char *str = xstrdup (ix86_tune_memcpy_strategy);
+      ix86_parse_stringop_strategy_string (str, false);
+      free (str);
+    }
+
+  if (ix86_tune_memset_strategy)
+    {
+      char *str = xstrdup (ix86_tune_memset_strategy);
+      ix86_parse_stringop_strategy_string (str, true);
+      free (str);
+    }
 }
 
 /* Implement the TARGET_OPTION_OVERRIDE hook.  */
@@ -22964,6 +23122,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
     {
     case libcall:
     case no_stringop:
+    case last_alg:
       gcc_unreachable ();
     case loop_1_byte:
       need_zero_guard = true;
@@ -23154,6 +23313,7 @@ ix86_expand_movmem (rtx dst, rtx src, rt
     {
     case libcall:
     case no_stringop:
+    case last_alg:
       gcc_unreachable ();
     case loop_1_byte:
     case loop:
@@ -23365,6 +23525,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
     {
     case libcall:
     case no_stringop:
+    case last_alg:
       gcc_unreachable ();
     case loop:
       need_zero_guard = true;
@@ -23542,6 +23703,7 @@ ix86_expand_setmem (rtx dst, rtx count_e
     {
     case libcall:
     case no_stringop:
+    case last_alg:
       gcc_unreachable ();
     case loop_1_byte:
     case loop:
Index: config/i386/i386.opt
===================================================================
--- config/i386/i386.opt	(revision 201582)
+++ config/i386/i386.opt	(working copy)
@@ -316,6 +316,14 @@ mstack-arg-probe
 Target Report Mask(STACK_PROBE) Save
 Enable stack probing
 
+mmemcpy-strategy=
+Target RejectNegative Joined Var(ix86_tune_memcpy_strategy)
+Specify memcpy expansion strategy when expected size is known
+
+mmemset-strategy=
+Target RejectNegative Joined Var(ix86_tune_memset_strategy)
+Specify memset expansion strategy when expected size is known
+
 mstringop-strategy=
 Target RejectNegative Joined Enum(stringop_alg) Var(ix86_stringop_alg) Init(no_stringop)
 Chose strategy to generate stringop using

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: New parameters to control stringop expansion libcall strategy
  2013-08-08 16:31                       ` Xinliang David Li
@ 2013-08-09 18:25                         ` Xinliang David Li
  2013-08-09 18:33                           ` Jan Hubicka
  0 siblings, 1 reply; 23+ messages in thread
From: Xinliang David Li @ 2013-08-09 18:25 UTC (permalink / raw)
  To: Joseph S. Myers; +Cc: Jan Hubicka, GCC Patches, Teresa Johnson

Is this version ok for trunk?

thanks,

David

On Thu, Aug 8, 2013 at 9:31 AM, Xinliang David Li <davidxl@google.com> wrote:
> Updated.
>
> thanks,
>
> David
>
> On Thu, Aug 8, 2013 at 8:18 AM, Joseph S. Myers <joseph@codesourcery.com> wrote:
>> On Wed, 7 Aug 2013, Xinliang David Li wrote:
>>
>>> Updated patch attached (fixed header, buffer overflow, and warning -->
>>> error problems).
>>
>> You still have diagnostics starting with a capital letter, contrary to the
>> GNU Coding Standards.
>>
>> --
>> Joseph S. Myers
>> joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: New parameters to control stringop expansion libcall strategy
  2013-08-09 18:25                         ` Xinliang David Li
@ 2013-08-09 18:33                           ` Jan Hubicka
  2013-08-09 19:41                             ` Xinliang David Li
  0 siblings, 1 reply; 23+ messages in thread
From: Jan Hubicka @ 2013-08-09 18:33 UTC (permalink / raw)
  To: Xinliang David Li
  Cc: Joseph S. Myers, Jan Hubicka, GCC Patches, Teresa Johnson

> Is this version ok for trunk?

It looks reasonable, but I still do not much like the removal of const for tables.
Doing so will push them all into David Malcolm's per-thread global universe.

Currently the algorithm is selected based on cost->memset/cost->memcpy.
Instead of removing the const of all the CPU tables, I would prefer
introducing two read-write global variables memset_algs/memcpy_algs and feed
them with proper table at a time we set up ix86_tune_features.

This has a chance to do the right thing with optimize attribute specifying algorithms
and with the longer term threading plan.

Honza
> 
> thanks,
> 
> David
> 
> On Thu, Aug 8, 2013 at 9:31 AM, Xinliang David Li <davidxl@google.com> wrote:
> > Updated.
> >
> > thanks,
> >
> > David
> >
> > On Thu, Aug 8, 2013 at 8:18 AM, Joseph S. Myers <joseph@codesourcery.com> wrote:
> >> On Wed, 7 Aug 2013, Xinliang David Li wrote:
> >>
> >>> Updated patch attached (fixed header, buffer overflow, and warning -->
> >>> error problems).
> >>
> >> You still have diagnostics starting with a capital letter, contrary to the
> >> GNU Coding Standards.
> >>
> >> --
> >> Joseph S. Myers
> >> joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: New parameters to control stringop expansion libcall strategy
  2013-08-09 18:33                           ` Jan Hubicka
@ 2013-08-09 19:41                             ` Xinliang David Li
  2013-08-09 19:49                               ` Jan Hubicka
  0 siblings, 1 reply; 23+ messages in thread
From: Xinliang David Li @ 2013-08-09 19:41 UTC (permalink / raw)
  To: Jan Hubicka; +Cc: Joseph S. Myers, GCC Patches, Teresa Johnson

On Fri, Aug 9, 2013 at 11:33 AM, Jan Hubicka <hubicka@ucw.cz> wrote:
>> Is this version ok for trunk?
>
> It looks resonable, but I still do not like much the removal of const for tables.
> Doing so will push them all into David Malcom's per-thread global universe.
>
> Currently the algorithm is selected based on cost->memset/cost->memcpy.
> Instead of removing the const of all the CPU tables, I would preffer
> introducing two readwrite global variables memset_algs/memcpy_algs and feed
> them with proper table at a time we set up ix86_tune_features.
>

I can do that in this patch. In the future, when we need to do tunings
for those constants, we can revisit it.

thanks,

David

> This has chance to do the right thing with optimize attribute specifying algorithms
> and with the longer term threading plan.
>
> Honza
>>
>> thanks,
>>
>> David
>>
>> On Thu, Aug 8, 2013 at 9:31 AM, Xinliang David Li <davidxl@google.com> wrote:
>> > Updated.
>> >
>> > thanks,
>> >
>> > David
>> >
>> > On Thu, Aug 8, 2013 at 8:18 AM, Joseph S. Myers <joseph@codesourcery.com> wrote:
>> >> On Wed, 7 Aug 2013, Xinliang David Li wrote:
>> >>
>> >>> Updated patch attached (fixed header, buffer overflow, and warning -->
>> >>> error problems).
>> >>
>> >> You still have diagnostics starting with a capital letter, contrary to the
>> >> GNU Coding Standards.
>> >>
>> >> --
>> >> Joseph S. Myers
>> >> joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: New parameters to control stringop expansion libcall strategy
  2013-08-09 19:41                             ` Xinliang David Li
@ 2013-08-09 19:49                               ` Jan Hubicka
  0 siblings, 0 replies; 23+ messages in thread
From: Jan Hubicka @ 2013-08-09 19:49 UTC (permalink / raw)
  To: Xinliang David Li
  Cc: Jan Hubicka, Joseph S. Myers, GCC Patches, Teresa Johnson

> On Fri, Aug 9, 2013 at 11:33 AM, Jan Hubicka <hubicka@ucw.cz> wrote:
> >> Is this version ok for trunk?
> >
> > It looks resonable, but I still do not like much the removal of const for tables.
> > Doing so will push them all into David Malcom's per-thread global universe.
> >
> > Currently the algorithm is selected based on cost->memset/cost->memcpy.
> > Instead of removing the const of all the CPU tables, I would preffer
> > introducing two readwrite global variables memset_algs/memcpy_algs and feed
> > them with proper table at a time we set up ix86_tune_features.
> >
> 
> I can do that in this patch. In the future, when we need to do tunings
> for those constants, we can revisit it.

Yep, I think we can follow the same strategy and just move them to a global constant.
Those are part of the context/universe since they will be user-rewritable then.

Thanks, the patch is OK with this change.
Honza
> 
> thanks,
> 
> David
> 
> > This has chance to do the right thing with optimize attribute specifying algorithms
> > and with the longer term threading plan.
> >
> > Honza
> >>
> >> thanks,
> >>
> >> David
> >>
> >> On Thu, Aug 8, 2013 at 9:31 AM, Xinliang David Li <davidxl@google.com> wrote:
> >> > Updated.
> >> >
> >> > thanks,
> >> >
> >> > David
> >> >
> >> > On Thu, Aug 8, 2013 at 8:18 AM, Joseph S. Myers <joseph@codesourcery.com> wrote:
> >> >> On Wed, 7 Aug 2013, Xinliang David Li wrote:
> >> >>
> >> >>> Updated patch attached (fixed header, buffer overflow, and warning -->
> >> >>> error problems).
> >> >>
> >> >> You still have diagnostics starting with a capital letter, contrary to the
> >> >> GNU Coding Standards.
> >> >>
> >> >> --
> >> >> Joseph S. Myers
> >> >> joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2013-08-09 19:49 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-08-03  4:22 New parameters to control stringop expansion libcall strategy Xinliang David Li
2013-08-03  8:07 ` Backend specific params.def? (Was Re: New parameters to control stringop expansion libcall strategy) Jan Hubicka
2013-08-03 15:40   ` Xinliang David Li
2013-08-05  3:01 ` New parameters to control stringop expansion libcall strategy Xinliang David Li
2013-08-05 10:57   ` Michael V. Zolotukhin
2013-08-05 16:44     ` Xinliang David Li
2013-08-06  8:46       ` Michael Zolotukhin
2013-08-06  9:42         ` Jan Hubicka
2013-08-06 16:08           ` Xinliang David Li
2013-08-07 17:06           ` Xinliang David Li
2013-08-08  0:23             ` Joseph S. Myers
2013-08-08  0:29               ` Xinliang David Li
2013-08-08  1:04                 ` Joseph S. Myers
2013-08-08  6:17                   ` Xinliang David Li
2013-08-08 15:18                     ` Joseph S. Myers
2013-08-08 16:31                       ` Xinliang David Li
2013-08-09 18:25                         ` Xinliang David Li
2013-08-09 18:33                           ` Jan Hubicka
2013-08-09 19:41                             ` Xinliang David Li
2013-08-09 19:49                               ` Jan Hubicka
2013-08-06 16:42         ` Xinliang David Li
2013-08-06 16:45           ` Xinliang David Li
2013-08-07  7:14           ` Michael Zolotukhin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).