* gcc 4.2.3 and MMX to mem move oddity
@ 2008-02-22 22:07 Prakash Punnoor
From: Prakash Punnoor @ 2008-02-22 22:07 UTC (permalink / raw)
To: gcc
Hi,
I am playing with the following code (from ffmpeg), translated to intrinsics.
Original code:
#include <stdint.h>

#define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

void diff_pixels_mmx(char *block, const uint8_t *s1, const uint8_t *s2,
                     long stride)
{
    long offset = -128;
    MOVQ_ZERO(mm7);
    do {
        asm volatile(
            "movq (%0), %%mm0       \n\t"
            "movq (%1), %%mm2       \n\t"
            "movq %%mm0, %%mm1      \n\t"
            "movq %%mm2, %%mm3      \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpckhbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpckhbw %%mm7, %%mm3 \n\t"
            "psubw %%mm2, %%mm0     \n\t"
            "psubw %%mm3, %%mm1     \n\t"
            "movq %%mm0, (%2, %3)   \n\t"
            "movq %%mm1, 8(%2, %3)  \n\t"
            : : "r" (s1), "r" (s2), "r" (block+64), "r" (offset)
            : "memory");
        s1 += stride;
        s2 += stride;
        offset += 16;
    } while (offset < 0);
}
compiles to:
0000000000000000 <diff_pixels_mmx>:
0: 0f ef ff pxor %mm7,%mm7
3: 48 c7 c0 80 ff ff ff mov $0xffffffffffffff80,%rax
a: 48 83 c7 40 add $0x40,%rdi
e: 66 90 xchg %ax,%ax
10: 0f 6f 06 movq (%rsi),%mm0
13: 0f 6f 12 movq (%rdx),%mm2
16: 0f 6f c8 movq %mm0,%mm1
19: 0f 6f da movq %mm2,%mm3
1c: 0f 60 c7 punpcklbw %mm7,%mm0
1f: 0f 68 cf punpckhbw %mm7,%mm1
22: 0f 60 d7 punpcklbw %mm7,%mm2
25: 0f 68 df punpckhbw %mm7,%mm3
28: 0f f9 c2 psubw %mm2,%mm0
2b: 0f f9 cb psubw %mm3,%mm1
2e: 0f 7f 04 07 movq %mm0,(%rdi,%rax,1)
32: 0f 7f 4c 07 08 movq %mm1,0x8(%rdi,%rax,1)
37: 48 01 ce add %rcx,%rsi
3a: 48 01 ca add %rcx,%rdx
3d: 48 83 c0 10 add $0x10,%rax
41: 75 cd jne 10 <diff_pixels_mmx+0x10>
43: f3 c3 repz retq
45: 66 66 2e 0f 1f 84 00 nopw %cs:0x0(%rax,%rax,1)
4c: 00 00 00 00
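For reference, the routine computes the byte-wise difference of two 8x8
pixel blocks into 64 signed 16-bit words. A plain-C equivalent (my
paraphrase of what the asm does, not code from ffmpeg):

#include <stdint.h>

/* Scalar reference: block receives 64 int16 differences of two 8x8
   pixel blocks whose rows are 'stride' bytes apart. */
void diff_pixels_c(char *block, const uint8_t *s1, const uint8_t *s2,
                   long stride)
{
    int16_t *b = (int16_t *)block;
    int i, j;
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            b[8 * i + j] = (int16_t)(s1[j] - s2[j]);
        s1 += stride;
        s2 += stride;
    }
}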
This is the intrinsic version:
#include <stdint.h>
#include <mmintrin.h>

void diff_pixels_mmx3(char *block, const uint8_t *s1, const uint8_t *s2,
                      long stride)
{
    long offset = -128;
    block += 64;
    __m64 mm7 = _mm_setzero_si64();
    do {
        __m64 mm0 = *(__m64*)s1;
        __m64 mm2 = *(__m64*)s2;
        __m64 mm1 = mm0;
        __m64 mm3 = mm2;
        mm0 = _mm_unpacklo_pi8(mm0, mm7);
        mm1 = _mm_unpackhi_pi8(mm1, mm7);
        mm2 = _mm_unpacklo_pi8(mm2, mm7);
        mm3 = _mm_unpackhi_pi8(mm3, mm7);
        mm0 = _mm_sub_pi16(mm0, mm2);
        mm1 = _mm_sub_pi16(mm1, mm3);
        *(__m64*)(block+offset) = mm0;
        *(__m64*)(block+offset+8) = mm1;
        s1 += stride;
        s2 += stride;
        offset += 16;
    } while (offset < 0);
}
compiles to:
00000000000000c0 <diff_pixels_mmx3>:
c0: 53 push %rbx
c1: 0f ef e4 pxor %mm4,%mm4
c4: 48 c7 c0 80 ff ff ff mov $0xffffffffffffff80,%rax
cb: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
d0: 0f 6f 0e movq (%rsi),%mm1
d3: 48 01 ce add %rcx,%rsi
d6: 0f 6f 02 movq (%rdx),%mm0
d9: 48 01 ca add %rcx,%rdx
dc: 0f 6f d1 movq %mm1,%mm2
df: 0f 6f d9 movq %mm1,%mm3
e2: 0f 6f c8 movq %mm0,%mm1
e5: 0f 68 c4 punpckhbw %mm4,%mm0
e8: 0f 60 d4 punpcklbw %mm4,%mm2
eb: 0f 68 dc punpckhbw %mm4,%mm3
ee: 0f 60 cc punpcklbw %mm4,%mm1
f1: 0f f9 d8 psubw %mm0,%mm3
f4: 0f f9 d1 psubw %mm1,%mm2
f7: 0f 7f 5c 24 f0 movq %mm3,-0x10(%rsp)
fc: 0f 7f 54 24 f8 movq %mm2,-0x8(%rsp)
101: 48 8b 5c 24 f8 mov -0x8(%rsp),%rbx
106: 48 89 5c 38 40 mov %rbx,0x40(%rax,%rdi,1)
10b: 48 8b 5c 24 f0 mov -0x10(%rsp),%rbx
110: 48 89 5c 38 48 mov %rbx,0x48(%rax,%rdi,1)
115: 48 83 c0 10 add $0x10,%rax
119: 75 b5 jne d0 <diff_pixels_mmx3+0x10>
11b: 5b pop %rbx
11c: c3 retq
Flags used are: -O2 -march=k8
Compiler: gcc 4.2.3 (gentoo) x86_64
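For reference, a listing like the above can be reproduced with something
like the following (assuming the source file is named mmx.c, as the
mmx.c:20 references in the RTL dump below suggest):

gcc -O2 -march=k8 -c mmx.c
objdump -d mmx.o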
As you can see, in the intrinsic version gcc moves the MMX register to
the stack, reloads from the stack and writes to the destination. Why?
I don't know whether earlier gcc 4.2 versions produced such stupid code.
Compiling as 32-bit does something similarly stupid, though there gcc
reloads into an MMX register...
(As a side note: why does gcc want to use rbx, requiring the push and
pop? I think other registers are free...)
bye,
--
(°= =°)
//\ Prakash Punnoor /\\
V_/ \_V
* Re: gcc 4.2.3 and MMX to mem move oddity
2008-02-23 7:51 ` Prakash Punnoor
@ 2008-02-23 15:31 ` Uros Bizjak
From: Uros Bizjak @ 2008-02-23 15:31 UTC (permalink / raw)
To: Prakash Punnoor; +Cc: gcc
Prakash Punnoor wrote:
>> Why is movaps (SSE, floating point data) used as the store instead of
>> movdqa (SSE2, integer data)? Bug or feature? It is used even when
>> compiling with -O0.
>>
>
> Testing further: the -march=k8 flag seems to cause this. Leaving it
> out, movdqa is used, so I guess it is a feature.
>
This is a feature, X86_TUNE_SSE_TYPELESS_STORES. It is faster on K8.
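To illustrate the point (an added sketch, not from the original
message): for aligned 16-byte data the two instructions perform the same
store, and movaps encodes one byte shorter than movdqa (no 0x66 prefix,
as visible in the listing above), so the tuning can freely substitute
it. At the source level the two spellings look like this, assuming the
SSE2 cast intrinsics are available:

#include <emmintrin.h>

/* Both functions store the same 16 aligned bytes. With
   X86_TUNE_SSE_TYPELESS_STORES active (e.g. -march=k8), gcc may emit
   movaps even for the nominally integer store below. */
void store_integer(__m128i *p, __m128i v)
{
    _mm_store_si128(p, v);                /* movdqa without the tuning */
}

void store_typeless(float *p, __m128i v)
{
    _mm_store_ps(p, _mm_castsi128_ps(v)); /* movaps */
}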
Regarding the MMX stuff, I have just committed a patch to 4.4 that
should fix your (and similar) problems. Unfortunately, it won't be
backported, as this is a kind of tricky register-allocator fine-tuning.
Uros.
* Re: gcc 4.2.3 and MMX to mem move oddity
2008-02-23 7:46 ` Prakash Punnoor
@ 2008-02-23 7:51 ` Prakash Punnoor
2008-02-23 15:31 ` Uros Bizjak
From: Prakash Punnoor @ 2008-02-23 7:51 UTC (permalink / raw)
To: gcc; +Cc: Uros Bizjak
On Saturday 23 February 2008, Prakash Punnoor wrote:
> On Saturday 23 February 2008, Uros Bizjak wrote:
> > Hello!
> >
> > > f7: 0f 7f 5c 24 f0 movq %mm3,-0x10(%rsp)
> > > fc: 0f 7f 54 24 f8 movq %mm2,-0x8(%rsp)
> > > 101: 48 8b 5c 24 f8 mov -0x8(%rsp),%rbx
> > > 106: 48 89 5c 38 40 mov %rbx,0x40(%rax,%rdi,1)
> > > 10b: 48 8b 5c 24 f0 mov -0x10(%rsp),%rbx
> > > 110: 48 89 5c 38 48 mov %rbx,0x48(%rax,%rdi,1)
> > >
> > > As you can see, in the intrinsic version gcc moves the MMX register
> > > to the stack, reloads from the stack and writes to the destination.
> > > Why?
> > >
> > > I don't know whether earlier gcc 4.2 versions produced such stupid
> > > code. Compiling as 32-bit does something similarly stupid, though
> > > there gcc reloads into an MMX register...
> >
> > This is a variant of "Strange code for MMX register moves" [1] or its
> > dupe "mmx and movd/movq on x86_64" [2]. Since touching an %mm register
> > switches the x87 register stack to MMX mode, we penalize MMX moves
> > severely in order to prevent gcc from ever allocating %mm for DImode
> > moves unless really necessary.
>
> [...]
>
> Just as a side note: the equivalent SSE2 code looks fine, but I have a
> question regarding the store instruction used:
>
> #include <stdint.h>
> #include <emmintrin.h>
>
> void diff_pixels_mmx4(char *block, const uint8_t *s1, const uint8_t *s2,
>                       long stride)
> {
>     long offset = -128;
>     block += 64;
>     __m128i mm7 = _mm_setzero_si128();
>     do {
>         __m128i mm0 = *(__m128i*)s1;
>         __m128i mm2 = *(__m128i*)s2;
>         __m128i mm1 = mm0;
>         __m128i mm3 = mm2;
>         mm0 = _mm_unpacklo_epi8(mm0, mm7);
>         mm1 = _mm_unpackhi_epi8(mm1, mm7);
>         mm2 = _mm_unpacklo_epi8(mm2, mm7);
>         mm3 = _mm_unpackhi_epi8(mm3, mm7);
>         mm0 = _mm_sub_epi16(mm0, mm2);
>         mm1 = _mm_sub_epi16(mm1, mm3);
>         *(__m128i*)(block+offset) = mm0;
>         *(__m128i*)(block+offset+16) = mm1;
>         s1 += stride;
>         s2 += stride;
>         offset += 32;
>     } while (offset < 0);
> }
>
> generated assembly (-O2 -march=k8):
>
> 0000000000000050 <diff_pixels_mmx4>:
> 50: 66 0f ef e4 pxor %xmm4,%xmm4
> 54: 48 c7 c0 80 ff ff ff mov $0xffffffffffffff80,%rax
> 5b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
> 60: 66 0f 6f 0e movdqa (%rsi),%xmm1
> 64: 48 01 ce add %rcx,%rsi
> 67: 66 0f 6f 02 movdqa (%rdx),%xmm0
> 6b: 48 01 ca add %rcx,%rdx
> 6e: 66 0f 6f d1 movdqa %xmm1,%xmm2
> 72: 66 0f 6f d8 movdqa %xmm0,%xmm3
> 76: 66 0f 68 cc punpckhbw %xmm4,%xmm1
> 7a: 66 0f 60 d4 punpcklbw %xmm4,%xmm2
> 7e: 66 0f 60 dc punpcklbw %xmm4,%xmm3
> 82: 66 0f 68 c4 punpckhbw %xmm4,%xmm0
> 86: 66 0f f9 d3 psubw %xmm3,%xmm2
> 8a: 0f 29 54 38 40 movaps %xmm2,0x40(%rax,%rdi,1)
> 8f: 66 0f f9 c8 psubw %xmm0,%xmm1
> 93: 0f 29 4c 38 50 movaps %xmm1,0x50(%rax,%rdi,1)
> 98: 48 83 c0 20 add $0x20,%rax
> 9c: 75 c2 jne 60 <diff_pixels_mmx4+0x10>
> 9e: f3 c3 repz retq
>
> Why is movaps (SSE, floating point data) used as the store instead of
> movdqa (SSE2, integer data)? Bug or feature? It is used even when
> compiling with -O0.
Testing further: the -march=k8 flag seems to cause this. Leaving it out,
movdqa is used, so I guess it is a feature.
--
(°= =°)
//\ Prakash Punnoor /\\
V_/ \_V
* Re: gcc 4.2.3 and MMX to mem move oddity
2008-02-23 7:11 Uros Bizjak
@ 2008-02-23 7:46 ` Prakash Punnoor
2008-02-23 7:51 ` Prakash Punnoor
From: Prakash Punnoor @ 2008-02-23 7:46 UTC (permalink / raw)
To: Uros Bizjak; +Cc: GCC
On Saturday 23 February 2008, Uros Bizjak wrote:
> Hello!
>
> > f7: 0f 7f 5c 24 f0 movq %mm3,-0x10(%rsp)
> > fc: 0f 7f 54 24 f8 movq %mm2,-0x8(%rsp)
> > 101: 48 8b 5c 24 f8 mov -0x8(%rsp),%rbx
> > 106: 48 89 5c 38 40 mov %rbx,0x40(%rax,%rdi,1)
> > 10b: 48 8b 5c 24 f0 mov -0x10(%rsp),%rbx
> > 110: 48 89 5c 38 48 mov %rbx,0x48(%rax,%rdi,1)
> >
> > As you can see, in the intrinsic version gcc moves the MMX register
> > to the stack, reloads from the stack and writes to the destination.
> > Why?
> >
> > I don't know whether earlier gcc 4.2 versions produced such stupid
> > code. Compiling as 32-bit does something similarly stupid, though
> > there gcc reloads into an MMX register...
>
> This is a variant of "Strange code for MMX register moves" [1] or its
> dupe "mmx and movd/movq on x86_64" [2]. Since touching an %mm register
> switches the x87 register stack to MMX mode, we penalize MMX moves
> severely in order to prevent gcc from ever allocating %mm for DImode
> moves unless really necessary.
[...]
Just as a side note: the equivalent SSE2 code looks fine, but I have a
question regarding the store instruction used:
#include <stdint.h>
#include <emmintrin.h>

void diff_pixels_mmx4(char *block, const uint8_t *s1, const uint8_t *s2,
                      long stride)
{
    long offset = -128;
    block += 64;
    __m128i mm7 = _mm_setzero_si128();
    do {
        __m128i mm0 = *(__m128i*)s1;
        __m128i mm2 = *(__m128i*)s2;
        __m128i mm1 = mm0;
        __m128i mm3 = mm2;
        mm0 = _mm_unpacklo_epi8(mm0, mm7);
        mm1 = _mm_unpackhi_epi8(mm1, mm7);
        mm2 = _mm_unpacklo_epi8(mm2, mm7);
        mm3 = _mm_unpackhi_epi8(mm3, mm7);
        mm0 = _mm_sub_epi16(mm0, mm2);
        mm1 = _mm_sub_epi16(mm1, mm3);
        *(__m128i*)(block+offset) = mm0;
        *(__m128i*)(block+offset+16) = mm1;
        s1 += stride;
        s2 += stride;
        offset += 32;
    } while (offset < 0);
}
generated assembly (-O2 -march=k8):
0000000000000050 <diff_pixels_mmx4>:
50: 66 0f ef e4 pxor %xmm4,%xmm4
54: 48 c7 c0 80 ff ff ff mov $0xffffffffffffff80,%rax
5b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
60: 66 0f 6f 0e movdqa (%rsi),%xmm1
64: 48 01 ce add %rcx,%rsi
67: 66 0f 6f 02 movdqa (%rdx),%xmm0
6b: 48 01 ca add %rcx,%rdx
6e: 66 0f 6f d1 movdqa %xmm1,%xmm2
72: 66 0f 6f d8 movdqa %xmm0,%xmm3
76: 66 0f 68 cc punpckhbw %xmm4,%xmm1
7a: 66 0f 60 d4 punpcklbw %xmm4,%xmm2
7e: 66 0f 60 dc punpcklbw %xmm4,%xmm3
82: 66 0f 68 c4 punpckhbw %xmm4,%xmm0
86: 66 0f f9 d3 psubw %xmm3,%xmm2
8a: 0f 29 54 38 40 movaps %xmm2,0x40(%rax,%rdi,1)
8f: 66 0f f9 c8 psubw %xmm0,%xmm1
93: 0f 29 4c 38 50 movaps %xmm1,0x50(%rax,%rdi,1)
98: 48 83 c0 20 add $0x20,%rax
9c: 75 c2 jne 60 <diff_pixels_mmx4+0x10>
9e: f3 c3 repz retq
Why is movaps (SSE, floating point data) used as the store instead of
movdqa (SSE2, integer data)? Bug or feature? It is used even when
compiling with -O0.
Regards,
--
(°= =°)
//\ Prakash Punnoor /\\
V_/ \_V
* Re: gcc 4.2.3 and MMX to mem move oddity
@ 2008-02-23 7:11 Uros Bizjak
2008-02-23 7:46 ` Prakash Punnoor
From: Uros Bizjak @ 2008-02-23 7:11 UTC (permalink / raw)
To: GCC; +Cc: Prakash Punnoor
Hello!
> f7: 0f 7f 5c 24 f0 movq %mm3,-0x10(%rsp)
> fc: 0f 7f 54 24 f8 movq %mm2,-0x8(%rsp)
> 101: 48 8b 5c 24 f8 mov -0x8(%rsp),%rbx
> 106: 48 89 5c 38 40 mov %rbx,0x40(%rax,%rdi,1)
> 10b: 48 8b 5c 24 f0 mov -0x10(%rsp),%rbx
> 110: 48 89 5c 38 48 mov %rbx,0x48(%rax,%rdi,1)
> As you can see, in the intrinsic version gcc moves the MMX register to
> the stack, reloads from the stack and writes to the destination. Why?
>
> I don't know whether earlier gcc 4.2 versions produced such stupid
> code. Compiling as 32-bit does something similarly stupid, though there
> gcc reloads into an MMX register...
This is a variant of "Strange code for MMX register moves" [1] or its
dupe "mmx and movd/movq on x86_64" [2]. Since touching an %mm register
switches the x87 register stack to MMX mode, we penalize MMX moves
severely in order to prevent gcc from ever allocating %mm for DImode
moves unless really necessary.
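(To illustrate why the penalty exists, a minimal sketch of my own, not
from the message: the %mm registers alias the x87 stack, so any code
that touches them has to issue emms, via _mm_empty(), before x87
floating-point code can run again. The function name here is
hypothetical.)

#include <stdint.h>
#include <mmintrin.h>

/* Touching any %mm register puts the FPU into MMX mode; _mm_empty()
   emits emms, resetting the x87 tag word so later x87 code is safe. */
void add_bytes_mmx(uint8_t *dst, const uint8_t *src)
{
    __m64 a = *(const __m64 *)src;
    __m64 b = *(const __m64 *)dst;
    *(__m64 *)dst = _mm_add_pi8(a, b);
    _mm_empty();
}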
OTOH, in your particular case:
(insn 42 40 43 3 mmx.c:20 (set (mem:V2SI (plus:DI (reg:DI 65 [ ivtmp.43 ])
                (const_int -64 [0xffffffffffffffc0])) [0 S8 A64])
        (subreg:V2SI (reg:V4HI 90) 0)) 866 {*movv2si_internal_rex64}
    (expr_list:REG_DEAD (reg:V4HI 90)
        (nil)))
The subreg in this pattern confuses the register allocator into creating:
(insn 68 36 38 3
    /usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.3.0/include/mmintrin.h:389
    (set (mem/c:V4HI (plus:DI (reg/f:DI 7 sp)
                (const_int -8 [0xfffffffffffffff8])) [2 S8 A8])
        (reg:V4HI 32 mm3)) 865 {*movv4hi_internal_rex64} (nil))

(insn 70 69 42 3 mmx.c:20 (set (reg:V2SI 3 bx)
        (mem/c:V2SI (plus:DI (reg/f:DI 7 sp)
                (const_int -8 [0xfffffffffffffff8])) [2 S8 A8])) 866
    {*movv2si_internal_rex64} (nil))

(insn:HI 42 70 71 3 mmx.c:20 (set (mem:V2SI (plus:DI (reg:DI 5 di
                    [orig:65 ivtmp.43 ] [65])
                (const_int -64 [0xffffffffffffffc0])) [0 S8 A64])
        (reg:V2SI 3 bx)) 866 {*movv2si_internal_rex64} (nil))
(For 32-bit targets, %mm is allocated, since no integer register can
hold a 64-bit value.)
I'll see what can be done here to tie V4HI and V2SI together.
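(For illustration, approximately how the types involved are defined in
gcc's mmintrin.h; a sketch of mine, and the exact definitions may differ
between gcc versions:)

/* __m64 is a vector of two ints, i.e. V2SI mode, while the 16-bit
   arithmetic builtins work on four shorts, i.e. V4HI mode: */
typedef int   __m64  __attribute__ ((__vector_size__ (8)));
typedef short __v4hi __attribute__ ((__vector_size__ (8)));

/* So _mm_sub_pi16 produces a V4HI value, but the assignment through an
   __m64 lvalue is a V2SI store, which is where the
   (subreg:V2SI (reg:V4HI 90) 0) in insn 42 above comes from. */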
[1] http://gcc.gnu.org/bugzilla/show_bug.cgi?id=22076
[2] http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34256
Uros.