public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH, i386]: Avoid store forwarding stalls for floatdi (fildll) insns
@ 2008-04-02 14:24 Uros Bizjak
  0 siblings, 0 replies; only message in thread
From: Uros Bizjak @ 2008-04-02 14:24 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 2244 bytes --]

Hello!

This is a similar problem to PR 13958 for unsigned ints, but this one is
with long long values. The problem is the same - two SImode values are
stored in two consecutive memory locations and read back as a DImode
value. This defeats store forwarding. The testcase, slightly
modified from PR 13958:

--cut here--
/* Two-element lookup table read by foo1 and foo2.  */
unsigned a[2]={1,2};

/* Return a[i] widened from unsigned to long long.  */
inline long long foo1(int i) { return a[i]; }
/* Return a[i] converted from unsigned to int.  */
inline int       foo2(int i) { return a[i]; }

/* Add foo1(i%2) to a double accumulator 100000000 times, then return the
   accumulated value converted to int.  */
int main()
{
    double x=0;
    int    i;

    /* Each iteration converts the long long returned by foo1 to double
       before the addition.  */
    for ( i=0; i<100000000; ++i )
        x+=foo1(i%2);

    return (int)x;
}
--cut here--

Without the store forwarding fix, the following code is generated for the loop:

.L6:
        movl    %ecx, %eax
        xorl    %edx, %edx
        andl    $1, %eax
        movl    %edx, -20(%ebp)
        movl    a(,%eax,4), %eax
        incl    %ecx
        movl    %eax, -24(%ebp)
        cmpl    $100000000, %ecx
        fildll  -24(%ebp)
        faddp   %st, %st(1)
        jne     .L6

When XMM registers are available, we can construct the DImode value in an
XMM register and store it to memory from there. For -O2 -march=pentium4,
the following code is generated for the loop:

.L6:
        movl    %ecx, %eax
        xorl    %edx, %edx
        andl    $1, %eax
        movd    %edx, %xmm1
        movl    a(,%eax,4), %eax
        incl    %ecx
        movd    %eax, %xmm0
        cmpl    $100000000, %ecx
        punpckldq       %xmm1, %xmm0
        movq    %xmm0, -24(%ebp)
        fildll  -24(%ebp)
        faddp   %st, %st(1)
        jne     .L6

Timings on an Intel(R) Xeon(TM) CPU 3.60GHz show the benefit of the second approach:

real    0m2.110s
user    0m2.112s
sys     0m0.000s

for the former code  vs.

real    0m0.306s
user    0m0.304s
sys     0m0.000s

for the latter.

2008-04-02  Uros Bizjak  <ubizjak@gmail.com>

	* config/i386/i386.md (*float<SSEMODEI24:mode><X87MODEF:mode>2_1):
	Emit gen_floatdi<X87MODEF:mode>2_i387_with_xmm when XMM registers
	are available in 32bit mode to avoid store forwarding stall.
	(floatdi<X87MODEF:mode>2_i387_with_xmm): New insn pattern and
	corresponding post-reload splitters.

The patch was bootstrapped on i686-pc-linux-gnu; regression testing
(-mtune=core2) is in progress. The patch will be committed to SVN after
the regression test finishes.

Uros.

[-- Attachment #2: p.diff.txt --]
[-- Type: text/plain, Size: 3556 bytes --]

Index: i386.md
===================================================================
--- i386.md	(revision 133819)
+++ i386.md	(working copy)
@@ -4919,8 +4919,21 @@
   "&& 1"
   [(parallel [(set (match_dup 0) (float:X87MODEF (match_dup 1)))
 	      (clobber (match_dup 2))])]
-  "operands[2] = assign_386_stack_local (<SSEMODEI24:MODE>mode, SLOT_TEMP);")
+{
+  operands[2] = assign_386_stack_local (<SSEMODEI24:MODE>mode, SLOT_TEMP);
 
+  /* Avoid store forwarding (partial memory) stall penalty
+     by passing DImode value through XMM registers.  */
+  if (TARGET_80387 && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES 
+      && !TARGET_64BIT && !optimize_size)
+    {
+      emit_insn (gen_floatdi<X87MODEF:mode>2_i387_with_xmm (operands[0],
+							    operands[1],
+							    operands[2]));
+      DONE;
+    }
+})
+
 (define_insn "*floatsi<mode>2_vector_mixed_with_temp"
   [(set (match_operand:MODEF 0 "register_operand" "=f,f,x,x,x")
 	(float:MODEF
@@ -5304,6 +5317,61 @@
   [(set (match_dup 0) (float:X87MODEF (match_dup 1)))]
   "")
 
+;; Avoid store forwarding (partial memory) stall penalty
+;; by passing DImode value through XMM registers.
+
+(define_insn "floatdi<X87MODEF:mode>2_i387_with_xmm"
+  [(set (match_operand:X87MODEF 0 "register_operand" "=f,f")
+	(float:X87MODEF
+	  (match_operand:DI 1 "nonimmediate_operand" "m,?r")))
+   (clobber (match_scratch:V4SI 3 "=&x,x"))
+   (clobber (match_scratch:V4SI 4 "=&x,x"))
+   (clobber (match_operand:DI 2 "memory_operand" "=m,m"))]
+  "TARGET_80387 && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES
+   && !TARGET_64BIT && !optimize_size"
+  "#"
+  [(set_attr "type" "multi")
+   (set_attr "mode" "<X87MODEF:MODE>")
+   (set_attr "unit" "i387")
+   (set_attr "fp_int_src" "true")])
+
+(define_split
+  [(set (match_operand:X87MODEF 0 "register_operand" "")
+	(float:X87MODEF (match_operand:DI 1 "register_operand" "")))
+   (clobber (match_operand:V4SI 3 "register_operand" ""))
+   (clobber (match_operand:V4SI 4 "register_operand" ""))
+   (clobber (match_operand:DI 2 "memory_operand" ""))]
+  "TARGET_80387 && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES
+   && !TARGET_64BIT && !optimize_size
+   && reload_completed
+   && FP_REG_P (operands[0])"
+  [(set (match_dup 2) (match_dup 3))
+   (set (match_dup 0) (float:X87MODEF (match_dup 2)))]
+{
+  /* The DImode arrived in a pair of integral registers (e.g. %edx:%eax).
+     Assemble the 64-bit DImode value in an xmm register.  */
+  emit_insn (gen_sse2_loadld (operands[3], CONST0_RTX (V4SImode),
+			      gen_rtx_SUBREG (SImode, operands[1], 0)));
+  emit_insn (gen_sse2_loadld (operands[4], CONST0_RTX (V4SImode),
+			      gen_rtx_SUBREG (SImode, operands[1], 4)));
+  emit_insn (gen_sse2_punpckldq (operands[3], operands[3], operands[4]));
+
+  operands[3] = gen_rtx_REG (DImode, REGNO (operands[3]));
+})
+
+(define_split
+  [(set (match_operand:X87MODEF 0 "register_operand" "")
+	(float:X87MODEF (match_operand:DI 1 "memory_operand" "")))
+   (clobber (match_operand:V4SI 2 "register_operand" ""))
+   (clobber (match_operand:V4SI 3 "register_operand" ""))
+   (clobber (match_operand:DI 4 "memory_operand" ""))]
+  "TARGET_80387 && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES
+   && !TARGET_64BIT && !optimize_size
+   && reload_completed
+   && FP_REG_P (operands[0])"
+  [(set (match_dup 0) (float:X87MODEF (match_dup 1)))]
+  "")
+
 ;; Avoid store forwarding (partial memory) stall penalty by extending
 ;; SImode value to DImode through XMM register instead of pushing two
 ;; SImode values to stack. Note that even !TARGET_INTER_UNIT_MOVES

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2008-04-02 14:24 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-04-02 14:24 [PATCH, i386]: Avoid store forwarding stalls for floatdi (fildll) insns Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).