public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH,i386] fix PR 11001
@ 2007-10-03 15:01 Nathan Froyd
  0 siblings, 0 replies; 11+ messages in thread
From: Nathan Froyd @ 2007-10-03 15:01 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2075 bytes --]

The attached patch fixes PR11001, a long-standing bad interaction
between register globals and strlen/memset/memcpy on the x86.  The
primary issue is that the expanders for those functions would generate
(in some cases) x86 instructions that required particular registers
without checking whether those registers were available for use.

The fix is to ensure that the registers are available before generating
the instructions.  Note that the code is not optimal: in the memset
case, for instance, if we choose an inlining strategy requiring 'rep
stosl' and then discover that the necessary registers are not available,
we generate a full call to 'memset' rather than generating an inline
copy loop.  I don't see this as a serious defect; if you are using
register globals on the x86, you deserve a performance penalty.

The testcase in the PR no longer ICEs on mainline--it does produce
incorrect code, but that is mostly due to the semantics of
parameter-scope register variables differing from global reigster
variables and is therefore a bug in the code.  It is not included.  The
nine testcases included in the patch *do* ICE on current mainline,
however, and no longer ICE with the patch.

Tested on x86_64-unknown-linux-gnu with no regressions.  OK to commit?

-Nathan

2007-10-03  Nathan Froyd  <froydnj@codesourcery.com>

	PR 11001
	gcc/
	* config/i386/i386.md (strmov): Check for esi and edi usage.
	* config/i386/i386.c (ix86_expand_movmem): Check for ecx, esi,
	and edi usage.
	(ix86_expand_setmem): Check for eax, ecx, and edi usage.
	(ix86_expand_strlen): Likewise.

	gcc/testsuite/
	* gcc.target/i386/pr11001-strlen-1.c: New testcase.
	* gcc.target/i386/pr11001-strlen-2.c: New testcase.
	* gcc.target/i386/pr11001-strlen-3.c: New testcase.
	* gcc.target/i386/pr11001-memset-1.c: New testcase.
	* gcc.target/i386/pr11001-memset-2.c: New testcase.
	* gcc.target/i386/pr11001-memset-3.c: New testcase.
	* gcc.target/i386/pr11001-memcpy-1.c: New testcase.
	* gcc.target/i386/pr11001-memcpy-2.c: New testcase.
	* gcc.target/i386/pr11001-memcpy-3.c: New testcase.


[-- Attachment #2: pr11001.patch --]
[-- Type: text/x-diff, Size: 9006 bytes --]

Index: gcc/config/i386/i386.md
===================================================================
--- gcc/config/i386/i386.md	(revision 128981)
+++ gcc/config/i386/i386.md	(working copy)
@@ -18699,7 +18699,9 @@ (define_expand "strmov"
   operands[5] = gen_rtx_PLUS (Pmode, operands[0], adjust);
   operands[6] = gen_rtx_PLUS (Pmode, operands[2], adjust);
 
-  if (TARGET_SINGLE_STRINGOP || optimize_size)
+  /* Can't use this if the user has appropriated esi or edi.  */
+  if ((TARGET_SINGLE_STRINGOP || optimize_size)
+      && !(global_regs[4] || global_regs[5]))
     {
       emit_insn (gen_strmov_singleop (operands[0], operands[1],
 				      operands[2], operands[3],
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c	(revision 128981)
+++ gcc/config/i386/i386.c	(working copy)
@@ -15286,6 +15286,13 @@ ix86_expand_movmem (rtx dst, rtx src, rt
       break;
     }
 
+  /* Can't use this if the user has appropriated ecx, esi, or edi.  */
+  if ((alg == rep_prefix_1_byte
+       || alg == rep_prefix_4_byte
+       || alg == rep_prefix_8_byte)
+      && (global_regs[2] || global_regs[4] || global_regs[5]))
+    return 0;
+
   epilogue_size_needed = size_needed;
 
   /* Step 1: Prologue guard.  */
@@ -15594,6 +15601,14 @@ ix86_expand_setmem (rtx dst, rtx count_e
       size_needed = 1;
       break;
     }
+
+  /* Can't use this if the user has appropriated eax, ecx, or edi.  */
+  if ((alg == rep_prefix_1_byte
+       || alg == rep_prefix_4_byte
+       || alg == rep_prefix_8_byte)
+      && (global_regs[0] || global_regs[2] || global_regs[5]))
+    return 0;
+
   epilogue_size_needed = size_needed;
 
   /* Step 1: Prologue guard.  */
@@ -15985,6 +16000,11 @@ ix86_expand_strlen (rtx out, rtx src, rt
   else
     {
       rtx unspec;
+
+      /* Can't use this if the user has appropriated eax, ecx, or edi.  */
+      if (global_regs[0] || global_regs[2] || global_regs[5])
+        return false;
+
       scratch2 = gen_reg_rtx (Pmode);
       scratch3 = gen_reg_rtx (Pmode);
       scratch4 = force_reg (Pmode, constm1_rtx);
Index: gcc/testsuite/gcc.target/i386/pr11001-strlen-2.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr11001-strlen-2.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/pr11001-strlen-2.c	(revision 0)
@@ -0,0 +1,16 @@
+/* Ensure that we don't use 'repnz scasb' in the presence of register globals.  */
+/* { dg-do compile } */
+/* { dg-options "-O1 -m32" } */
+
+extern __SIZE_TYPE__ strlen (const char *);
+extern void *malloc (__SIZE_TYPE__);
+
+register int regvar asm("%eax"); /* { dg-warning "call-clobbered register" } */
+
+char *
+do_copy (char *str)
+{
+  return malloc (strlen (str) + 1);
+}
+
+/* { dg-final { scan-assembler-not "repnz scasb" } } */
Index: gcc/testsuite/gcc.target/i386/pr11001-memset-2.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr11001-memset-2.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/pr11001-memset-2.c	(revision 0)
@@ -0,0 +1,23 @@
+/* Ensure that we don't use 'rep stoX' in the presence of register globals.  */
+/* { dg-do compile } */
+/* { dg-options "-Os -m32" } */
+
+extern void *memset (void *, int, __SIZE_TYPE__);
+
+register int regvar asm("%ecx"); /* { dg-warning "call-clobbered register" } */
+
+int foo[10];
+int bar[10];
+
+char baz[15];
+char quux[15];
+
+void
+do_copy ()
+{
+  memset (foo, 0, sizeof foo);
+  memset (baz, 0, sizeof baz);
+}
+
+/* { dg-final { scan-assembler-not "rep stosl" } } */
+/* { dg-final { scan-assembler-not "rep stosb" } } */
Index: gcc/testsuite/gcc.target/i386/pr11001-strlen-3.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr11001-strlen-3.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/pr11001-strlen-3.c	(revision 0)
@@ -0,0 +1,16 @@
+/* Ensure that we don't use 'repnz scasb' in the presence of register globals.  */
+/* { dg-do compile } */
+/* { dg-options "-O1 -m32" } */
+
+extern __SIZE_TYPE__ strlen (const char *);
+extern void *malloc (__SIZE_TYPE__);
+
+register int regvar asm("%ecx"); /* { dg-warning "call-clobbered register" } */
+
+char *
+do_copy (char *str)
+{
+  return malloc (strlen (str) + 1);
+}
+
+/* { dg-final { scan-assembler-not "repnz scasb" } } */
Index: gcc/testsuite/gcc.target/i386/pr11001-memset-3.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr11001-memset-3.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/pr11001-memset-3.c	(revision 0)
@@ -0,0 +1,23 @@
+/* Ensure that we don't use 'rep stoX' in the presence of register globals.  */
+/* { dg-do compile } */
+/* { dg-options "-Os -m32" } */
+
+extern void *memset (void *, int, __SIZE_TYPE__);
+
+register int regvar asm("%edi");
+
+int foo[10];
+int bar[10];
+
+char baz[15];
+char quux[15];
+
+void
+do_copy ()
+{
+  memset (foo, 0, sizeof foo);
+  memset (baz, 0, sizeof baz);
+}
+
+/* { dg-final { scan-assembler-not "rep stosl" } } */
+/* { dg-final { scan-assembler-not "rep stosb" } } */
Index: gcc/testsuite/gcc.target/i386/pr11001-memcpy-1.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr11001-memcpy-1.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/pr11001-memcpy-1.c	(revision 0)
@@ -0,0 +1,23 @@
+/* Ensure that we don't use 'rep movX' in the presence of register globals.  */
+/* { dg-do compile } */
+/* { dg-options "-Os -m32" } */
+
+extern void *memcpy (void *, const void *, __SIZE_TYPE__);
+
+register int regvar asm("%esi");
+
+int foo[10];
+int bar[10];
+
+char baz[15];
+char quux[15];
+
+void
+do_copy ()
+{
+  memcpy (foo, bar, sizeof foo);
+  memcpy (baz, quux, sizeof baz);
+}
+
+/* { dg-final { scan-assembler-not "rep movsl" } } */
+/* { dg-final { scan-assembler-not "rep movsb" } } */
Index: gcc/testsuite/gcc.target/i386/pr11001-memcpy-2.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr11001-memcpy-2.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/pr11001-memcpy-2.c	(revision 0)
@@ -0,0 +1,23 @@
+/* Ensure that we don't use 'rep movX' in the presence of register globals.  */
+/* { dg-do compile } */
+/* { dg-options "-Os -m32" } */
+
+extern void *memcpy (void *, const void *, __SIZE_TYPE__);
+
+register int regvar asm("%edi");
+
+int foo[10];
+int bar[10];
+
+char baz[15];
+char quux[15];
+
+void
+do_copy ()
+{
+  memcpy (foo, bar, sizeof foo);
+  memcpy (baz, quux, sizeof baz);
+}
+
+/* { dg-final { scan-assembler-not "rep movsl" } } */
+/* { dg-final { scan-assembler-not "rep movsb" } } */
Index: gcc/testsuite/gcc.target/i386/pr11001-memcpy-3.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr11001-memcpy-3.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/pr11001-memcpy-3.c	(revision 0)
@@ -0,0 +1,23 @@
+/* Ensure that we don't use 'rep movX' in the presence of register globals.  */
+/* { dg-do compile } */
+/* { dg-options "-Os -m32" } */
+
+extern void *memcpy (void *, const void *, __SIZE_TYPE__);
+
+register int regvar asm("%ecx"); /* { dg-warning "call-clobbered register" } */
+
+int foo[10];
+int bar[10];
+
+char baz[15];
+char quux[15];
+
+void
+do_copy ()
+{
+  memcpy (foo, bar, sizeof foo);
+  memcpy (baz, quux, sizeof baz);
+}
+
+/* { dg-final { scan-assembler-not "rep movsl" } } */
+/* { dg-final { scan-assembler-not "rep movsb" } } */
Index: gcc/testsuite/gcc.target/i386/pr11001-strlen-1.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr11001-strlen-1.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/pr11001-strlen-1.c	(revision 0)
@@ -0,0 +1,16 @@
+/* Ensure that we don't use 'repnz scasb' in the presence of register globals.  */
+/* { dg-do compile } */
+/* { dg-options "-O1 -m32" } */
+
+extern __SIZE_TYPE__ strlen (const char *);
+extern void *malloc (__SIZE_TYPE__);
+
+register int regvar asm("%edi");
+
+char *
+do_copy (char *str)
+{
+  return malloc (strlen (str) + 1);
+}
+
+/* { dg-final { scan-assembler-not "repnz scasb" } } */
Index: gcc/testsuite/gcc.target/i386/pr11001-memset-1.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr11001-memset-1.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/pr11001-memset-1.c	(revision 0)
@@ -0,0 +1,23 @@
+/* Ensure that we don't use 'rep stoX' in the presence of register globals.  */
+/* { dg-do compile } */
+/* { dg-options "-Os -m32" } */
+
+extern void *memset (void *, int, __SIZE_TYPE__);
+
+register int regvar asm("%eax"); /* { dg-warning "call-clobbered register" } */
+
+int foo[10];
+int bar[10];
+
+char baz[15];
+char quux[15];
+
+void
+do_copy ()
+{
+  memset (foo, 0, sizeof foo);
+  memset (baz, 0, sizeof baz);
+}
+
+/* { dg-final { scan-assembler-not "rep stosl" } } */
+/* { dg-final { scan-assembler-not "rep stosb" } } */

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH,i386] fix PR 11001
  2007-10-12 19:34           ` H.J. Lu
@ 2007-10-12 19:40             ` Nathan Froyd
  0 siblings, 0 replies; 11+ messages in thread
From: Nathan Froyd @ 2007-10-12 19:40 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Uros Bizjak, GCC Patches, Jan Hubicka

On Fri, Oct 12, 2007 at 12:33:56PM -0700, H.J. Lu wrote:
> On Fri, Oct 12, 2007 at 10:13:23AM -0700, Nathan Froyd wrote:
> > On Fri, Oct 12, 2007 at 09:40:27AM -0700, H.J. Lu wrote:
> > > Can you use/add AX_REG, CX_REG, DI_REG, SI_REG instead using 0, 2,                                       
> > > 4, 5?
> > 
> > Sure.  Patch below.  OK to commit if it passes bootstrap/testing?
> 
> Did you use the old ChangeLog entries for checkin? I didn't
> see SI_REG, DI_REG in ChangeLog.

I checked in the original patch prior to receiving your mail.  The patch
above was a separate patch, which I will check in shortly.

-Nathan

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH,i386] fix PR 11001
  2007-10-12 17:13         ` Nathan Froyd
  2007-10-12 17:28           ` Uros Bizjak
@ 2007-10-12 19:34           ` H.J. Lu
  2007-10-12 19:40             ` Nathan Froyd
  1 sibling, 1 reply; 11+ messages in thread
From: H.J. Lu @ 2007-10-12 19:34 UTC (permalink / raw)
  To: Uros Bizjak, GCC Patches, Jan Hubicka

On Fri, Oct 12, 2007 at 10:13:23AM -0700, Nathan Froyd wrote:
> On Fri, Oct 12, 2007 at 09:40:27AM -0700, H.J. Lu wrote:
> > On Fri, Oct 12, 2007 at 09:12:47AM -0700, Nathan Froyd wrote:
> > > +      /* Can't use this if the user has appropriated eax, ecx, or edi.  */
> > > +      if (global_regs[0] || global_regs[2] || global_regs[5])
> > > +        return false;
> > > +
> >
> > Can you use/add AX_REG, CX_REG, DI_REG, SI_REG instead using 0, 2,                                       
> > 4, 5?
> 
> Sure.  Patch below.  OK to commit if it passes bootstrap/testing?
> 
> -Nathan
> 
> 2007-10-12  Nathan Froyd  <froydnj@codesourcery.com>
> 
> 	* config/i386/i386.md (SI_REG, DI_REG): New constants.
> 	(strmov): Use defined constants.
> 	(cmpstrnsi): Likewise.
> 	* config/i386/i386.c (decide_alg): Use defined constants.
> 	(ix86_expand_strlen): Likewise.
> 

Did you use the old ChangeLog entries for checkin? I didn't
see SI_REG, DI_REG in ChangeLog.


H.J.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH,i386] fix PR 11001
  2007-10-12 17:13         ` Nathan Froyd
@ 2007-10-12 17:28           ` Uros Bizjak
  2007-10-12 19:34           ` H.J. Lu
  1 sibling, 0 replies; 11+ messages in thread
From: Uros Bizjak @ 2007-10-12 17:28 UTC (permalink / raw)
  To: H.J. Lu, Uros Bizjak, GCC Patches, Jan Hubicka

Nathan Froyd wrote:

>>> +      /* Can't use this if the user has appropriated eax, ecx, or edi.  */
>>> +      if (global_regs[0] || global_regs[2] || global_regs[5])
>>> +        return false;
>>> +
>>>       
>> Can you use/add AX_REG, CX_REG, DI_REG, SI_REG instead using 0, 2,                                       
>> 4, 5?
>>     
>
> Sure.  Patch below.  OK to commit if it passes bootstrap/testing?
>
> -Nathan
>
> 2007-10-12  Nathan Froyd  <froydnj@codesourcery.com>
>
> 	* config/i386/i386.md (SI_REG, DI_REG): New constants.
> 	(strmov): Use defined constants.
> 	(cmpstrnsi): Likewise.
> 	* config/i386/i386.c (decide_alg): Use defined constants.
> 	(ix86_expand_strlen): Likewise.
>
>   

Uh, I was late for a second ;)

Patch is OK for mainline.

Thanks,
Uros.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH,i386] fix PR 11001
@ 2007-10-12 17:26 Uros Bizjak
  0 siblings, 0 replies; 11+ messages in thread
From: Uros Bizjak @ 2007-10-12 17:26 UTC (permalink / raw)
  To: GCC Patches; +Cc: Nathan Froyd, H. J. Lu

Hello!

> > +      /* Can't use this if the user has appropriated eax, ecx, or edi.  */
> > +      if (global_regs[0] || global_regs[2] || global_regs[5])
> > +        return false;
> > +
>
> Can you use/add AX_REG, CX_REG, DI_REG, SI_REG instead using 0, 2,
> 4, 5?
>   

I'll commit the update as soon as bootstrap finishes. A couple of 
constants has to be added to i386.md.

Uros.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH,i386] fix PR 11001
  2007-10-12 16:40       ` H.J. Lu
@ 2007-10-12 17:13         ` Nathan Froyd
  2007-10-12 17:28           ` Uros Bizjak
  2007-10-12 19:34           ` H.J. Lu
  0 siblings, 2 replies; 11+ messages in thread
From: Nathan Froyd @ 2007-10-12 17:13 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Uros Bizjak, GCC Patches, Jan Hubicka

On Fri, Oct 12, 2007 at 09:40:27AM -0700, H.J. Lu wrote:
> On Fri, Oct 12, 2007 at 09:12:47AM -0700, Nathan Froyd wrote:
> > +      /* Can't use this if the user has appropriated eax, ecx, or edi.  */
> > +      if (global_regs[0] || global_regs[2] || global_regs[5])
> > +        return false;
> > +
>
> Can you use/add AX_REG, CX_REG, DI_REG, SI_REG instead using 0, 2,                                       
> 4, 5?

Sure.  Patch below.  OK to commit if it passes bootstrap/testing?

-Nathan

2007-10-12  Nathan Froyd  <froydnj@codesourcery.com>

	* config/i386/i386.md (SI_REG, DI_REG): New constants.
	(strmov): Use defined constants.
	(cmpstrnsi): Likewise.
	* config/i386/i386.c (decide_alg): Use defined constants.
	(ix86_expand_strlen): Likewise.

Index: i386.c
===================================================================
--- i386.c	(revision 129265)
+++ i386.c	(working copy)
@@ -15060,8 +15060,8 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
      additionally, memset wants eax and memcpy wants esi.  Don't
      consider such algorithms if the user has appropriated those
      registers for their own purposes.	*/
-  bool rep_prefix_usable = !(global_regs[2] || global_regs[5]
-                             || (memset ? global_regs[0] : global_regs[4]));
+  bool rep_prefix_usable = !(global_regs[CX_REG] || global_regs[DI_REG]
+                             || (memset ? global_regs[AX_REG] : global_regs[SI_REG]));
 
 #define ALG_USABLE_P(alg) (rep_prefix_usable			\
 			   || (alg != rep_prefix_1_byte		\
@@ -16022,7 +16022,7 @@ ix86_expand_strlen (rtx out, rtx src, rt
       rtx unspec;
 
       /* Can't use this if the user has appropriated eax, ecx, or edi.  */
-      if (global_regs[0] || global_regs[2] || global_regs[5])
+      if (global_regs[AX_REG] || global_regs[CX_REG] || global_regs[DI_REG])
         return false;
 
       scratch2 = gen_reg_rtx (Pmode);
Index: i386.md
===================================================================
--- i386.md	(revision 129265)
+++ i386.md	(working copy)
@@ -222,6 +222,8 @@ (define_constants
   [(AX_REG			 0)
    (DX_REG			 1)
    (CX_REG			 2)
+   (SI_REG			 4)
+   (DI_REG			 5)
    (BP_REG			 6)
    (SP_REG			 7)
    (FLAGS_REG			17)
@@ -18704,7 +18706,7 @@ (define_expand "strmov"
 
   /* Can't use this if the user has appropriated esi or edi.  */
   if ((TARGET_SINGLE_STRINGOP || optimize_size)
-      && !(global_regs[4] || global_regs[5]))
+      && !(global_regs[SI_REG] || global_regs[DI_REG]))
     {
       emit_insn (gen_strmov_singleop (operands[0], operands[1],
 				      operands[2], operands[3],
@@ -19190,7 +19192,7 @@ (define_expand "cmpstrnsi"
   rtx addr1, addr2, out, outlow, count, countreg, align;
 
   /* Can't use this if the user has appropriated esi or edi.  */
-  if (global_regs[4] || global_regs[5])
+  if (global_regs[SI_REG] || global_regs[DI_REG])
     FAIL;
 
   out = operands[0];

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH,i386] fix PR 11001
  2007-10-12 16:13     ` Nathan Froyd
@ 2007-10-12 16:40       ` H.J. Lu
  2007-10-12 17:13         ` Nathan Froyd
  0 siblings, 1 reply; 11+ messages in thread
From: H.J. Lu @ 2007-10-12 16:40 UTC (permalink / raw)
  To: Uros Bizjak, GCC Patches, Jan Hubicka

On Fri, Oct 12, 2007 at 09:12:47AM -0700, Nathan Froyd wrote:
> On Fri, Oct 05, 2007 at 07:59:04AM +0200, Uros Bizjak wrote:
> > Sorry for not being clear here. TARGET_INLINE_ALL_STRINGOPS* should be
> > respected, and libcalls should not be generated in this case. For
> > example, we use -minline-all-stringops to build crtfastmath.o. For
> > some targets, we don't  want to link against libc to use memset
> > routine.
> > 
> > The patch is OK  with above change, but please wait a day or two for
> > possilbe comments from Jan, as he is the original author of stringops
> > code.
> 
> Thanks, the below is what I committed.
> 
> -Nathan
> 
> gcc/
> 	PR 11001
> 	* config/i386/i386.md (strmov): Check for esi and edi usage.
> 	* config/i386/i386.c (decide_alg): Check whether we can use a
> 	rep prefix and adjust algorithm choice accordingly.
> 	(ix86_expand_strlen): Check for eax, ecx, and edi usage.
> 
> Index: gcc/config/i386/i386.c
> ===================================================================
> --- gcc/config/i386/i386.c	(revision 129263)
> +++ gcc/config/i386/i386.c	(working copy)
> @@ -15056,21 +15056,32 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
>  	    int *dynamic_check)
>  {
>    const struct stringop_algs * algs;
> +  /* Algorithms using the rep prefix want at least edi and ecx;
> +     additionally, memset wants eax and memcpy wants esi.  Don't
> +     consider such algorithms if the user has appropriated those
> +     registers for their own purposes.	*/
> +  bool rep_prefix_usable = !(global_regs[2] || global_regs[5]
> +                             || (memset ? global_regs[0] : global_regs[4]));
> +
> +
> +      /* Can't use this if the user has appropriated eax, ecx, or edi.  */
> +      if (global_regs[0] || global_regs[2] || global_regs[5])
> +        return false;
> +

Can you use/add AX_REG, CX_REG, DI_REG, SI_REG instead using 0, 2,
4, 5?

Thanks.

H.J.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH,i386] fix PR 11001
  2007-10-05  5:59   ` Uros Bizjak
@ 2007-10-12 16:13     ` Nathan Froyd
  2007-10-12 16:40       ` H.J. Lu
  0 siblings, 1 reply; 11+ messages in thread
From: Nathan Froyd @ 2007-10-12 16:13 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: GCC Patches, Jan Hubicka

On Fri, Oct 05, 2007 at 07:59:04AM +0200, Uros Bizjak wrote:
> Sorry for not being clear here. TARGET_INLINE_ALL_STRINGOPS* should be
> respected, and libcalls should not be generated in this case. For
> example, we use -minline-all-stringops to build crtfastmath.o. For
> some targets, we don't  want to link against libc to use memset
> routine.
> 
> The patch is OK  with above change, but please wait a day or two for
> possilbe comments from Jan, as he is the original author of stringops
> code.

Thanks, the below is what I committed.

-Nathan

gcc/
	PR 11001
	* config/i386/i386.md (strmov): Check for esi and edi usage.
	* config/i386/i386.c (decide_alg): Check whether we can use a
	rep prefix and adjust algorithm choice accordingly.
	(ix86_expand_strlen): Check for eax, ecx, and edi usage.

gcc/testsuite/
	PR 11001
	* gcc.target/i386/pr11001-strlen-1.c: New testcase.
	* gcc.target/i386/pr11001-strlen-2.c: New testcase.
	* gcc.target/i386/pr11001-strlen-3.c: New testcase.
	* gcc.target/i386/pr11001-memset-1.c: New testcase.
	* gcc.target/i386/pr11001-memset-2.c: New testcase.
	* gcc.target/i386/pr11001-memset-3.c: New testcase.
	* gcc.target/i386/pr11001-memcpy-1.c: New testcase.
	* gcc.target/i386/pr11001-memcpy-2.c: New testcase.
	* gcc.target/i386/pr11001-memcpy-3.c: New testcase.

Index: gcc/testsuite/gcc.target/i386/pr11001-strlen-2.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr11001-strlen-2.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/pr11001-strlen-2.c	(revision 0)
@@ -0,0 +1,16 @@
+/* Ensure that we don't use 'repnz scasb' in the presence of register globals.  */
+/* { dg-do compile } */
+/* { dg-options "-O1 -m32" } */
+
+extern __SIZE_TYPE__ strlen (const char *);
+extern void *malloc (__SIZE_TYPE__);
+
+register int regvar asm("%eax"); /* { dg-warning "call-clobbered register" } */
+
+char *
+do_copy (char *str)
+{
+  return malloc (strlen (str) + 1);
+}
+
+/* { dg-final { scan-assembler-not "repnz scasb" } } */
Index: gcc/testsuite/gcc.target/i386/pr11001-memset-2.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr11001-memset-2.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/pr11001-memset-2.c	(revision 0)
@@ -0,0 +1,23 @@
+/* Ensure that we don't use 'rep stoX' in the presence of register globals.  */
+/* { dg-do compile } */
+/* { dg-options "-Os -m32" } */
+
+extern void *memset (void *, int, __SIZE_TYPE__);
+
+register int regvar asm("%ecx"); /* { dg-warning "call-clobbered register" } */
+
+int foo[10];
+int bar[10];
+
+char baz[15];
+char quux[15];
+
+void
+do_copy ()
+{
+  memset (foo, 0, sizeof foo);
+  memset (baz, 0, sizeof baz);
+}
+
+/* { dg-final { scan-assembler-not "rep stosl" } } */
+/* { dg-final { scan-assembler-not "rep stosb" } } */
Index: gcc/testsuite/gcc.target/i386/pr11001-strlen-3.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr11001-strlen-3.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/pr11001-strlen-3.c	(revision 0)
@@ -0,0 +1,16 @@
+/* Ensure that we don't use 'repnz scasb' in the presence of register globals.  */
+/* { dg-do compile } */
+/* { dg-options "-O1 -m32" } */
+
+extern __SIZE_TYPE__ strlen (const char *);
+extern void *malloc (__SIZE_TYPE__);
+
+register int regvar asm("%ecx"); /* { dg-warning "call-clobbered register" } */
+
+char *
+do_copy (char *str)
+{
+  return malloc (strlen (str) + 1);
+}
+
+/* { dg-final { scan-assembler-not "repnz scasb" } } */
Index: gcc/testsuite/gcc.target/i386/pr11001-memset-3.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr11001-memset-3.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/pr11001-memset-3.c	(revision 0)
@@ -0,0 +1,23 @@
+/* Ensure that we don't use 'rep stoX' in the presence of register globals.  */
+/* { dg-do compile } */
+/* { dg-options "-Os -m32" } */
+
+extern void *memset (void *, int, __SIZE_TYPE__);
+
+register int regvar asm("%edi");
+
+int foo[10];
+int bar[10];
+
+char baz[15];
+char quux[15];
+
+void
+do_copy ()
+{
+  memset (foo, 0, sizeof foo);
+  memset (baz, 0, sizeof baz);
+}
+
+/* { dg-final { scan-assembler-not "rep stosl" } } */
+/* { dg-final { scan-assembler-not "rep stosb" } } */
Index: gcc/testsuite/gcc.target/i386/pr11001-memcpy-1.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr11001-memcpy-1.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/pr11001-memcpy-1.c	(revision 0)
@@ -0,0 +1,23 @@
+/* Ensure that we don't use 'rep movX' in the presence of register globals.  */
+/* { dg-do compile } */
+/* { dg-options "-Os -m32" } */
+
+extern void *memcpy (void *, const void *, __SIZE_TYPE__);
+
+register int regvar asm("%esi");
+
+int foo[10];
+int bar[10];
+
+char baz[15];
+char quux[15];
+
+void
+do_copy ()
+{
+  memcpy (foo, bar, sizeof foo);
+  memcpy (baz, quux, sizeof baz);
+}
+
+/* { dg-final { scan-assembler-not "rep movsl" } } */
+/* { dg-final { scan-assembler-not "rep movsb" } } */
Index: gcc/testsuite/gcc.target/i386/pr11001-memcpy-2.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr11001-memcpy-2.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/pr11001-memcpy-2.c	(revision 0)
@@ -0,0 +1,23 @@
+/* Ensure that we don't use 'rep movX' in the presence of register globals.  */
+/* { dg-do compile } */
+/* { dg-options "-Os -m32" } */
+
+extern void *memcpy (void *, const void *, __SIZE_TYPE__);
+
+register int regvar asm("%edi");
+
+int foo[10];
+int bar[10];
+
+char baz[15];
+char quux[15];
+
+void
+do_copy ()
+{
+  memcpy (foo, bar, sizeof foo);
+  memcpy (baz, quux, sizeof baz);
+}
+
+/* { dg-final { scan-assembler-not "rep movsl" } } */
+/* { dg-final { scan-assembler-not "rep movsb" } } */
Index: gcc/testsuite/gcc.target/i386/pr11001-memcpy-3.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr11001-memcpy-3.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/pr11001-memcpy-3.c	(revision 0)
@@ -0,0 +1,23 @@
+/* Ensure that we don't use 'rep movX' in the presence of register globals.  */
+/* { dg-do compile } */
+/* { dg-options "-Os -m32" } */
+
+extern void *memcpy (void *, const void *, __SIZE_TYPE__);
+
+register int regvar asm("%ecx"); /* { dg-warning "call-clobbered register" } */
+
+int foo[10];
+int bar[10];
+
+char baz[15];
+char quux[15];
+
+void
+do_copy ()
+{
+  memcpy (foo, bar, sizeof foo);
+  memcpy (baz, quux, sizeof baz);
+}
+
+/* { dg-final { scan-assembler-not "rep movsl" } } */
+/* { dg-final { scan-assembler-not "rep movsb" } } */
Index: gcc/testsuite/gcc.target/i386/pr11001-strlen-1.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr11001-strlen-1.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/pr11001-strlen-1.c	(revision 0)
@@ -0,0 +1,16 @@
+/* Ensure that we don't use 'repnz scasb' in the presence of register globals.  */
+/* { dg-do compile } */
+/* { dg-options "-O1 -m32" } */
+
+extern __SIZE_TYPE__ strlen (const char *);
+extern void *malloc (__SIZE_TYPE__);
+
+register int regvar asm("%edi");
+
+char *
+do_copy (char *str)
+{
+  return malloc (strlen (str) + 1);
+}
+
+/* { dg-final { scan-assembler-not "repnz scasb" } } */
Index: gcc/testsuite/gcc.target/i386/pr11001-memset-1.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr11001-memset-1.c	(revision 0)
+++ gcc/testsuite/gcc.target/i386/pr11001-memset-1.c	(revision 0)
@@ -0,0 +1,23 @@
+/* Ensure that we don't use 'rep stoX' in the presence of register globals.  */
+/* { dg-do compile } */
+/* { dg-options "-Os -m32" } */
+
+extern void *memset (void *, int, __SIZE_TYPE__);
+
+register int regvar asm("%eax"); /* { dg-warning "call-clobbered register" } */
+
+int foo[10];
+int bar[10];
+
+char baz[15];
+char quux[15];
+
+void
+do_copy ()
+{
+  memset (foo, 0, sizeof foo);
+  memset (baz, 0, sizeof baz);
+}
+
+/* { dg-final { scan-assembler-not "rep stosl" } } */
+/* { dg-final { scan-assembler-not "rep stosb" } } */
Index: gcc/config/i386/i386.md
===================================================================
--- gcc/config/i386/i386.md	(revision 129263)
+++ gcc/config/i386/i386.md	(working copy)
@@ -18702,7 +18702,9 @@ (define_expand "strmov"
   operands[5] = gen_rtx_PLUS (Pmode, operands[0], adjust);
   operands[6] = gen_rtx_PLUS (Pmode, operands[2], adjust);
 
-  if (TARGET_SINGLE_STRINGOP || optimize_size)
+  /* Can't use this if the user has appropriated esi or edi.  */
+  if ((TARGET_SINGLE_STRINGOP || optimize_size)
+      && !(global_regs[4] || global_regs[5]))
     {
       emit_insn (gen_strmov_singleop (operands[0], operands[1],
 				      operands[2], operands[3],
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c	(revision 129263)
+++ gcc/config/i386/i386.c	(working copy)
@@ -15056,21 +15056,32 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
 	    int *dynamic_check)
 {
   const struct stringop_algs * algs;
+  /* Algorithms using the rep prefix want at least edi and ecx;
+     additionally, memset wants eax and memcpy wants esi.  Don't
+     consider such algorithms if the user has appropriated those
+     registers for their own purposes.	*/
+  bool rep_prefix_usable = !(global_regs[2] || global_regs[5]
+                             || (memset ? global_regs[0] : global_regs[4]));
+
+#define ALG_USABLE_P(alg) (rep_prefix_usable			\
+			   || (alg != rep_prefix_1_byte		\
+			       && alg != rep_prefix_4_byte      \
+			       && alg != rep_prefix_8_byte))
 
   *dynamic_check = -1;
   if (memset)
     algs = &ix86_cost->memset[TARGET_64BIT != 0];
   else
     algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
-  if (stringop_alg != no_stringop)
+  if (stringop_alg != no_stringop && ALG_USABLE_P (stringop_alg))
     return stringop_alg;
   /* rep; movq or rep; movl is the smallest variant.  */
   else if (optimize_size)
     {
       if (!count || (count & 3))
-	return rep_prefix_1_byte;
+	return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
       else
-	return rep_prefix_4_byte;
+	return rep_prefix_usable ? rep_prefix_4_byte : loop;
     }
   /* Very tiny blocks are best handled via the loop, REP is expensive to setup.
    */
@@ -15082,27 +15093,34 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
       enum stringop_alg alg = libcall;
       for (i = 0; i < NAX_STRINGOP_ALGS; i++)
 	{
-	  gcc_assert (algs->size[i].max);
+	  /* We get here if the algorithms that were not libcall-based
+	     were rep-prefix based and we are unable to use rep prefixes
+	     based on global register usage.  Break out of the loop and
+	     use the heuristic below.  */
+	  if (algs->size[i].max == 0)
+	    break;
 	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
 	    {
-	      if (algs->size[i].alg != libcall)
-		alg = algs->size[i].alg;
+	      enum stringop_alg candidate = algs->size[i].alg;
+
+	      if (candidate != libcall && ALG_USABLE_P (candidate))
+		alg = candidate;
 	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
-	         last non-libcall inline algorithm.  */
+		 last non-libcall inline algorithm.  */
 	      if (TARGET_INLINE_ALL_STRINGOPS)
 		{
 		  /* When the current size is best to be copied by a libcall,
-		     but we are still forced to inline, run the heuristic bellow
+		     but we are still forced to inline, run the heuristic below
 		     that will pick code for medium sized blocks.  */
 		  if (alg != libcall)
 		    return alg;
 		  break;
 		}
-	      else
-		return algs->size[i].alg;
+	      else if (ALG_USABLE_P (candidate))
+		return candidate;
 	    }
 	}
-      gcc_assert (TARGET_INLINE_ALL_STRINGOPS);
+      gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
     }
   /* When asked to inline the call anyway, try to pick meaningful choice.
      We look for maximal size of block that is faster to copy by hand and
@@ -15112,15 +15130,32 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
      If this turns out to be bad, we might simply specify the preferred
      choice in ix86_costs.  */
   if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
-      && algs->unknown_size == libcall)
+      && (algs->unknown_size == libcall || !ALG_USABLE_P (algs->unknown_size)))
     {
       int max = -1;
       enum stringop_alg alg;
       int i;
+      bool any_alg_usable_p = true;
 
       for (i = 0; i < NAX_STRINGOP_ALGS; i++)
-	if (algs->size[i].alg != libcall && algs->size[i].alg)
-	  max = algs->size[i].max;
+        {
+          enum stringop_alg candidate = algs->size[i].alg;
+          any_alg_usable_p = any_alg_usable_p && ALG_USABLE_P (candidate);
+
+          if (candidate != libcall && candidate
+              && ALG_USABLE_P (candidate))
+              max = algs->size[i].max;
+        }
+      /* If there aren't any usable algorithms, then recursing on
+         smaller sizes isn't going to find anything.  Just return the
+         simple byte-at-a-time copy loop.  */
+      if (!any_alg_usable_p)
+        {
+          /* Pick something reasonable.  */
+          if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
+            *dynamic_check = 128;
+          return loop_1_byte;
+        }
       if (max == -1)
 	max = 4096;
       alg = decide_alg (count, max / 2, memset, dynamic_check);
@@ -15130,7 +15165,8 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
 	*dynamic_check = max;
       return alg;
     }
-  return algs->unknown_size;
+  return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
+#undef ALG_USABLE_P
 }
 
 /* Decide on alignment.  We know that the operand is already aligned to ALIGN
@@ -15984,6 +16020,11 @@ ix86_expand_strlen (rtx out, rtx src, rt
   else
     {
       rtx unspec;
+
+      /* Can't use this if the user has appropriated eax, ecx, or edi.  */
+      if (global_regs[0] || global_regs[2] || global_regs[5])
+        return false;
+
       scratch2 = gen_reg_rtx (Pmode);
       scratch3 = gen_reg_rtx (Pmode);
       scratch4 = force_reg (Pmode, constm1_rtx);

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH,i386] fix PR 11001
  2007-10-04 20:55 ` Nathan Froyd
@ 2007-10-05  5:59   ` Uros Bizjak
  2007-10-12 16:13     ` Nathan Froyd
  0 siblings, 1 reply; 11+ messages in thread
From: Uros Bizjak @ 2007-10-05  5:59 UTC (permalink / raw)
  To: Nathan Froyd, GCC Patches; +Cc: Jan Hubicka

On 10/4/07, Nathan Froyd <froydnj@codesourcery.com> wrote:

> - we don't run the heuristic if we cannot use rep prefixes and default
>   to libcalls if we cannot find another strategy (loops or unrolled
>   loops) for the size of block we are setting/copying--for situations
>   where rep prefixes cannot be used.  I am unsure of what you were
>   suggesting here--I read your first paragraph as saying that
>   TARGET_INLINE_ALL_STRINGOPS* be respected whereas the second paragraph
>   seems to say that it's ok to punt and choose libcalls if we can't do
>   rep-based copying.

Sorry for not being clear here. TARGET_INLINE_ALL_STRINGOPS* should be
respected, and libcalls should not be generated in this case. For
example, we use -minline-all-stringops to build crtfastmath.o. For
some targets, we don't  want to link against libc to use memset
routine.

> Bootstrapped and tested on x86_64-unknown-linux-gnu.  Modified ChangeLog
> entry below; entries for testcases not included.

>         PR 11001
>         gcc/
>         * config/i386/i386.md (strmov): Check for esi and edi usage.
>         * config/i386/i386.c (decide_alg): Check whether we can use a
>         rep prefix and adjust algorithm choice accordingly.
>         (ix86_expand_strlen): Check for eax, ecx, and edi usage.

The patch is OK  with above change, but please wait a day or two for
possilbe comments from Jan, as he is the original author of stringops
code.

Thanks,
Uros.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH,i386] fix PR 11001
  2007-10-04  6:34 Uros Bizjak
@ 2007-10-04 20:55 ` Nathan Froyd
  2007-10-05  5:59   ` Uros Bizjak
  0 siblings, 1 reply; 11+ messages in thread
From: Nathan Froyd @ 2007-10-04 20:55 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: GCC Patches

On Thu, Oct 04, 2007 at 08:34:11AM +0200, Uros Bizjak wrote:
> > +  /* Can't use this if the user has appropriated ecx, esi, or edi.  */
> > +  if ((alg == rep_prefix_1_byte
> > +       || alg == rep_prefix_4_byte
> > +       || alg == rep_prefix_8_byte)
> > +      && (global_regs[2] || global_regs[4] || global_regs[5]))
> > +    return 0;
> > +
> 
> I think that you should put this check into decide_alg(). There you
> can decide between copy loop and libcall, also taking into account
> optimize_size flag, as well as TARGET_INLINE_ALL_STRINGOPS and
> TARGET_INLINE_ALL_STRINGOPS_DYNAMICALLY.
> 
> Please note, that rep_prefix_* algorithms are used for larger blocks.
> Perhaps in your case, we should scan algorithms table backwards until
> we hit other non-rep_prefix algorithm. If we found none, we should
> default to a loop algorithm for optimize_size or to a libcall for
> other cases.

OK, I think this patch implements what you want:

- decide_alg now consults global_regs;

- loops are used if optimize_size and we cannot use rep prefixes;

- we don't run the heuristic if we cannot use rep prefixes and default
  to libcalls if we cannot find another strategy (loops or unrolled
  loops) for the size of block we are setting/copying--for situations
  where rep prefixes cannot be used.  I am unsure of what you were
  suggesting here--I read your first paragraph as saying that
  TARGET_INLINE_ALL_STRINGOPS* be respected whereas the second paragraph
  seems to say that it's ok to punt and choose libcalls if we can't do
  rep-based copying.

Bootstrapped and tested on x86_64-unknown-linux-gnu.  Modified ChangeLog
entry below; entries for testcases not included.

-Nathan

	PR 11001
	gcc/
	* config/i386/i386.md (strmov): Check for esi and edi usage.
	* config/i386/i386.c (decide_alg): Check whether we can use a
	rep prefix and adjust algorithm choice accordingly.
	(ix86_expand_strlen): Check for eax, ecx, and edi usage.

Index: i386.c
===================================================================
--- i386.c	(revision 128981)
+++ i386.c	(working copy)
@@ -15057,21 +15057,32 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
 	    int *dynamic_check)
 {
   const struct stringop_algs * algs;
+  /* Algorithms using the rep prefix want at least edi and ecx;
+     additionally, memset wants eax and memcpy wants esi.  Don't
+     consider such algorithms if the user has appropriated those
+     registers for their own purposes.	*/
+  int rep_prefix_usable = !(global_regs[2] || global_regs[5]
+			    || (memset ? global_regs[0] : global_regs[4]));
+
+#define ALG_USABLE_P(alg) ((alg != rep_prefix_1_byte		\
+			    && alg != rep_prefix_4_byte		\
+			    && alg != rep_prefix_8_byte)	\
+			   || rep_prefix_usable)
 
   *dynamic_check = -1;
   if (memset)
     algs = &ix86_cost->memset[TARGET_64BIT != 0];
   else
     algs = &ix86_cost->memcpy[TARGET_64BIT != 0];
-  if (stringop_alg != no_stringop)
+  if (stringop_alg != no_stringop && ALG_USABLE_P (stringop_alg))
     return stringop_alg;
   /* rep; movq or rep; movl is the smallest variant.  */
   else if (optimize_size)
     {
       if (!count || (count & 3))
-	return rep_prefix_1_byte;
+	return rep_prefix_usable ? rep_prefix_1_byte : loop_1_byte;
       else
-	return rep_prefix_4_byte;
+	return rep_prefix_usable ? rep_prefix_4_byte : loop;
     }
   /* Very tiny blocks are best handled via the loop, REP is expensive to setup.
    */
@@ -15083,27 +15094,34 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
       enum stringop_alg alg = libcall;
       for (i = 0; i < NAX_STRINGOP_ALGS; i++)
 	{
-	  gcc_assert (algs->size[i].max);
+	  /* We get here if the algorithms that were not libcall-based
+	     were rep-prefix based and we are unable to use rep prefixes
+	     based on global register usage.  Break out of the loop and
+	     use the heuristic below.  */
+	  if (algs->size[i].max == 0)
+	    break;
 	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
 	    {
-	      if (algs->size[i].alg != libcall)
-		alg = algs->size[i].alg;
+	      enum stringop_alg candidate = algs->size[i].alg;
+
+	      if (candidate != libcall && ALG_USABLE_P (candidate))
+		alg = candidate;
 	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
-	         last non-libcall inline algorithm.  */
+		 last non-libcall inline algorithm.  */
 	      if (TARGET_INLINE_ALL_STRINGOPS)
 		{
 		  /* When the current size is best to be copied by a libcall,
-		     but we are still forced to inline, run the heuristic bellow
+		     but we are still forced to inline, run the heuristic below
 		     that will pick code for medium sized blocks.  */
 		  if (alg != libcall)
 		    return alg;
 		  break;
 		}
-	      else
-		return algs->size[i].alg;
+	      else if (ALG_USABLE_P (candidate))
+		return candidate;
 	    }
 	}
-      gcc_assert (TARGET_INLINE_ALL_STRINGOPS);
+      gcc_assert (TARGET_INLINE_ALL_STRINGOPS || !rep_prefix_usable);
     }
   /* When asked to inline the call anyway, try to pick meaningful choice.
      We look for maximal size of block that is faster to copy by hand and
@@ -15113,14 +15131,15 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
      If this turns out to be bad, we might simply specify the preferred
      choice in ix86_costs.  */
   if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
-      && algs->unknown_size == libcall)
+      && algs->unknown_size == libcall && rep_prefix_usable)
     {
       int max = -1;
       enum stringop_alg alg;
       int i;
 
       for (i = 0; i < NAX_STRINGOP_ALGS; i++)
-	if (algs->size[i].alg != libcall && algs->size[i].alg)
+	if (algs->size[i].alg != libcall && algs->size[i].alg
+	    && ALG_USABLE_P (algs->size[i].alg))
 	  max = algs->size[i].max;
       if (max == -1)
 	max = 4096;
@@ -15131,7 +15150,8 @@ decide_alg (HOST_WIDE_INT count, HOST_WI
 	*dynamic_check = max;
       return alg;
     }
-  return algs->unknown_size;
+  return ALG_USABLE_P (algs->unknown_size) ? algs->unknown_size : libcall;
+#undef ALG_USABLE_P
 }
 
 /* Decide on alignment.  We know that the operand is already aligned to ALIGN
@@ -15985,6 +16005,11 @@ ix86_expand_strlen (rtx out, rtx src, rt
   else
     {
       rtx unspec;
+
+      /* Can't use this if the user has appropriated eax, ecx, or edi.  */
+      if (global_regs[0] || global_regs[2] || global_regs[5])
+        return false;
+
       scratch2 = gen_reg_rtx (Pmode);
       scratch3 = gen_reg_rtx (Pmode);
       scratch4 = force_reg (Pmode, constm1_rtx);

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH,i386] fix PR 11001
@ 2007-10-04  6:34 Uros Bizjak
  2007-10-04 20:55 ` Nathan Froyd
  0 siblings, 1 reply; 11+ messages in thread
From: Uros Bizjak @ 2007-10-04  6:34 UTC (permalink / raw)
  To: GCC Patches; +Cc: Nathan Froyd

Hello!

> The fix is to ensure that the registers are available before generating
> the instructions.  Note that the code is not optimal: in the memset
> case, for instance, if we choose an inlining strategy requiring 'rep
> stosl' and then discover that the necessary registers are not available,
> we generate a full call to 'memset' rather than generating an inline
> copy loop.  I don't see this as a serious defect; if you are using
> register globals on the x86, you deserve a performance penalty.


> Index: gcc/config/i386/i386.c
> ===================================================================
> --- gcc/config/i386/i386.c	(revision 128981)
> +++ gcc/config/i386/i386.c	(working copy)
> @@ -15286,6 +15286,13 @@ ix86_expand_movmem (rtx dst, rtx src, rt
>        break;
>      }
>
> +  /* Can't use this if the user has appropriated ecx, esi, or edi.  */
> +  if ((alg == rep_prefix_1_byte
> +       || alg == rep_prefix_4_byte
> +       || alg == rep_prefix_8_byte)
> +      && (global_regs[2] || global_regs[4] || global_regs[5]))
> +    return 0;
> +

I think that you should put this check into decide_alg(). There you
can decide between copy loop and libcall, also taking into account
optimize_size flag, as well as TARGET_INLINE_ALL_STRINGOPS and
TARGET_INLINE_ALL_STRINGOPS_DYNAMICALLY.

Please note, that rep_prefix_* algorithms are used for larger blocks.
Perhaps in your case, we should scan algorithms table backwards until
we hit other non-rep_prefix algorithm. If we found none, we should
default to a loop algorithm for optimize_size or to a libcall for
other cases.

Uros.

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2007-10-12 19:40 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-10-03 15:01 [PATCH,i386] fix PR 11001 Nathan Froyd
2007-10-04  6:34 Uros Bizjak
2007-10-04 20:55 ` Nathan Froyd
2007-10-05  5:59   ` Uros Bizjak
2007-10-12 16:13     ` Nathan Froyd
2007-10-12 16:40       ` H.J. Lu
2007-10-12 17:13         ` Nathan Froyd
2007-10-12 17:28           ` Uros Bizjak
2007-10-12 19:34           ` H.J. Lu
2007-10-12 19:40             ` Nathan Froyd
2007-10-12 17:26 Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).