public inbox for gcc@gcc.gnu.org
 help / color / mirror / Atom feed
* gcc 4.2.3 and MMX to mem move oddity
@ 2008-02-22 22:07 Prakash Punnoor
  0 siblings, 0 replies; 5+ messages in thread
From: Prakash Punnoor @ 2008-02-22 22:07 UTC (permalink / raw)
  To: gcc

[-- Attachment #1: Type: text/plain, Size: 5090 bytes --]

Hi,

I am playing with the following code (from ffmpeg), translated to intrinsics:


Original code:

#include <stdint.h>

#define MOVQ_ZERO(regd)  __asm __volatile ("pxor %%" #regd ", %%" #regd ::)

void diff_pixels_mmx(char *block, const uint8_t *s1, const uint8_t *s2,
                     long stride)
{
    long offset = -128;
    MOVQ_ZERO(mm7);
    do {
        asm volatile(
            "movq (%0), %%mm0         \n\t"
            "movq (%1), %%mm2         \n\t"
            "movq %%mm0, %%mm1        \n\t"
            "movq %%mm2, %%mm3        \n\t"
            "punpcklbw %%mm7, %%mm0   \n\t"
            "punpckhbw %%mm7, %%mm1   \n\t"
            "punpcklbw %%mm7, %%mm2   \n\t"
            "punpckhbw %%mm7, %%mm3   \n\t"
            "psubw %%mm2, %%mm0       \n\t"
            "psubw %%mm3, %%mm1       \n\t"
            "movq %%mm0, (%2, %3)     \n\t"
            "movq %%mm1, 8(%2, %3)    \n\t"
            : : "r" (s1), "r" (s2), "r" (block+64),  "r" (offset)
            : "memory");
        s1 += stride;
        s2 += stride;
        offset += 16;
    } while (offset < 0);
}

compiles to:

0000000000000000 <diff_pixels_mmx>:
   0:   0f ef ff                pxor   %mm7,%mm7
   3:   48 c7 c0 80 ff ff ff    mov    $0xffffffffffffff80,%rax
   a:   48 83 c7 40             add    $0x40,%rdi
   e:   66 90                   xchg   %ax,%ax
  10:   0f 6f 06                movq   (%rsi),%mm0
  13:   0f 6f 12                movq   (%rdx),%mm2
  16:   0f 6f c8                movq   %mm0,%mm1
  19:   0f 6f da                movq   %mm2,%mm3
  1c:   0f 60 c7                punpcklbw %mm7,%mm0
  1f:   0f 68 cf                punpckhbw %mm7,%mm1
  22:   0f 60 d7                punpcklbw %mm7,%mm2
  25:   0f 68 df                punpckhbw %mm7,%mm3
  28:   0f f9 c2                psubw  %mm2,%mm0
  2b:   0f f9 cb                psubw  %mm3,%mm1
  2e:   0f 7f 04 07             movq   %mm0,(%rdi,%rax,1)
  32:   0f 7f 4c 07 08          movq   %mm1,0x8(%rdi,%rax,1)
  37:   48 01 ce                add    %rcx,%rsi
  3a:   48 01 ca                add    %rcx,%rdx
  3d:   48 83 c0 10             add    $0x10,%rax
  41:   75 cd                   jne    10 <diff_pixels_mmx+0x10>
  43:   f3 c3                   repz retq
  45:   66 66 2e 0f 1f 84 00    nopw   %cs:0x0(%rax,%rax,1)
  4c:   00 00 00 00


This is the intrinsic version:

#include <stdint.h>
#include <mmintrin.h>
void diff_pixels_mmx3(char *block, const uint8_t *s1, const uint8_t *s2,
                      long stride)
{
	
	long offset = -128;
	block+=64;
	__m64 mm7 = _mm_setzero_si64();
	do {
		__m64 mm0 = *(__m64*)s1;
		__m64 mm2 = *(__m64*)s2;
		__m64 mm1 = mm0;
		__m64 mm3 = mm2;
		mm0 = _mm_unpacklo_pi8(mm0, mm7);
		mm1 = _mm_unpackhi_pi8(mm1, mm7);
		mm2 = _mm_unpacklo_pi8(mm2, mm7);
		mm3 = _mm_unpackhi_pi8(mm3, mm7);
		mm0 = _mm_sub_pi16(mm0, mm2);
		mm1 = _mm_sub_pi16(mm1, mm3);
		*(__m64*)(block+offset) = mm0;
		*(__m64*)(block+offset+8) = mm1;
		s1 += stride;
		s2 += stride;
		offset +=16;
	} while (offset < 0);
}

compiles to:
00000000000000c0 <diff_pixels_mmx3>:
  c0:   53                      push   %rbx
  c1:   0f ef e4                pxor   %mm4,%mm4
  c4:   48 c7 c0 80 ff ff ff    mov    $0xffffffffffffff80,%rax
  cb:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  d0:   0f 6f 0e                movq   (%rsi),%mm1
  d3:   48 01 ce                add    %rcx,%rsi
  d6:   0f 6f 02                movq   (%rdx),%mm0
  d9:   48 01 ca                add    %rcx,%rdx
  dc:   0f 6f d1                movq   %mm1,%mm2
  df:   0f 6f d9                movq   %mm1,%mm3
  e2:   0f 6f c8                movq   %mm0,%mm1
  e5:   0f 68 c4                punpckhbw %mm4,%mm0
  e8:   0f 60 d4                punpcklbw %mm4,%mm2
  eb:   0f 68 dc                punpckhbw %mm4,%mm3
  ee:   0f 60 cc                punpcklbw %mm4,%mm1
  f1:   0f f9 d8                psubw  %mm0,%mm3
  f4:   0f f9 d1                psubw  %mm1,%mm2
  f7:   0f 7f 5c 24 f0          movq   %mm3,-0x10(%rsp)
  fc:   0f 7f 54 24 f8          movq   %mm2,-0x8(%rsp)
 101:   48 8b 5c 24 f8          mov    -0x8(%rsp),%rbx
 106:   48 89 5c 38 40          mov    %rbx,0x40(%rax,%rdi,1)
 10b:   48 8b 5c 24 f0          mov    -0x10(%rsp),%rbx
 110:   48 89 5c 38 48          mov    %rbx,0x48(%rax,%rdi,1)
 115:   48 83 c0 10             add    $0x10,%rax
 119:   75 b5                   jne    d0 <diff_pixels_mmx3+0x10>
 11b:   5b                      pop    %rbx
 11c:   c3                      retq


Flags used are: -O2 -march=k8
Compiler: gcc 4.2.3 (gentoo) x86_64

As you can see, in the intrinsic version gcc moves the mmx register to the stack, 
reloads from the stack and writes to the destination. Why?

I don't know whether earlier gcc 4.2 versions produced such stupid code.
Compiling as 32-bit produces similar stupidity, though gcc reloads into an mmx 
register...

(As a side note: why does gcc want to use rbx, requiring a push and pop? I 
think other registers are free...)

bye,
-- 
(°=                 =°)
//\ Prakash Punnoor /\\
V_/                 \_V

[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: gcc 4.2.3 and MMX to mem move oddity
  2008-02-23  7:51   ` Prakash Punnoor
@ 2008-02-23 15:31     ` Uros Bizjak
  0 siblings, 0 replies; 5+ messages in thread
From: Uros Bizjak @ 2008-02-23 15:31 UTC (permalink / raw)
  To: Prakash Punnoor; +Cc: gcc

Prakash Punnoor wrote:

>> Why is movaps (SSE, floating point data) instead of movdqa (SSE2, integer
>> data) used as the store? Bug or feature? Even when compiled with -O0 it is used.
>>     
>
> Testing further: The -march=k8 seems to cause this. Leaving it out, movdqa is 
> used, so I guess it is a feature.
>   

This is a feature, X86_TUNE_SSE_TYPELESS_STORES. It is faster on K8.
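
As an aside, a minimal sketch (not taken from the gcc sources; assumes GNU
inline asm and SSE2) showing that the two instructions store the same 16
bytes, so only the encoding (and on some CPUs the store's execution domain)
differs:

#include <emmintrin.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    __m128i v = _mm_set1_epi16(0x1234);
    __attribute__((aligned(16))) char a[16], b[16];

    /* Force the two store encodings explicitly. */
    __asm__ volatile ("movdqa %1, %0" : "=m" (*(__m128i *)a) : "x" (v));
    __asm__ volatile ("movaps %1, %0" : "=m" (*(__m128i *)b) : "x" (v));

    /* Both are aligned 16-byte stores of the same register contents. */
    printf("%s\n", memcmp(a, b, 16) == 0 ? "identical" : "different");
    return 0;
}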

Regarding MMX stuff, I have just committed the patch to 4.4 that should 
fix your (and similar) problems. Unfortunately, it won't be backported, 
as this is kind of tricky register allocator fine-tuning.

Uros.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: gcc 4.2.3 and MMX to mem move oddity
  2008-02-23  7:46 ` Prakash Punnoor
@ 2008-02-23  7:51   ` Prakash Punnoor
  2008-02-23 15:31     ` Uros Bizjak
  0 siblings, 1 reply; 5+ messages in thread
From: Prakash Punnoor @ 2008-02-23  7:51 UTC (permalink / raw)
  To: gcc; +Cc: Uros Bizjak

[-- Attachment #1: Type: text/plain, Size: 3655 bytes --]

On the day of Saturday 23 February 2008 Prakash Punnoor hast written:
> On the day of Saturday 23 February 2008 Uros Bizjak hast written:
> > Hello!
> >
> > >   f7:   0f 7f 5c 24 f0          movq   %mm3,-0x10(%rsp)
> > >   fc:   0f 7f 54 24 f8          movq   %mm2,-0x8(%rsp)
> > >  101:   48 8b 5c 24 f8          mov    -0x8(%rsp),%rbx
> > >  106:   48 89 5c 38 40          mov    %rbx,0x40(%rax,%rdi,1)
> > >  10b:   48 8b 5c 24 f0          mov    -0x10(%rsp),%rbx
> > >  110:   48 89 5c 38 48          mov    %rbx,0x48(%rax,%rdi,1)
> > >
> > > As you can see, in the intrinsic version gcc moves the mmx register to the
> > > stack, reloads from the stack and writes to the destination. Why?
> > >
> > > I don't know whether earlier gcc 4.2 versions produced such stupid
> > > code. Compiling as 32-bit produces similar stupidity, though gcc reloads into an
> > > mmx register...
> >
> > This is a variant of "Strange code for MMX register moves" [1] or its
> > dupe "mmx and movd/movq on x86_64" [2]. Since touching a %mm register
> > switches the x87 register stack to MMX mode, we penalize mmx moves severely
> > in order to prevent gcc from ever allocating %mm for DImode moves unless
> > really necessary.
>
> [...]
>
> Just as a side note: the equivalent SSE2 code looks fine, but I have a
> question regarding the store instruction used:
>
> #include <emmintrin.h>
> void diff_pixels_mmx4(char *block, const uint8_t *s1, const uint8_t *s2,
> long stride)
> {
>
> 	long offset = -128;
> 	block+=64;
> 	__m128i mm7 = _mm_setzero_si128();
> 	do {
> 		__m128i mm0 = *(__m128i*)s1;
> 		__m128i mm2 = *(__m128i*)s2;
> 		__m128i mm1 = mm0;
> 		__m128i mm3 = mm2;
> 		mm0 = _mm_unpacklo_epi8(mm0, mm7);
> 		mm1 = _mm_unpackhi_epi8(mm1, mm7);
> 		mm2 = _mm_unpacklo_epi8(mm2, mm7);
> 		mm3 = _mm_unpackhi_epi8(mm3, mm7);
> 		mm0 = _mm_sub_epi16(mm0, mm2);
> 		mm1 = _mm_sub_epi16(mm1, mm3);
> 		*(__m128i*)(block+offset) = mm0;
> 		*(__m128i*)(block+offset+16) = mm1;
> 		s1 += stride;
> 		s2 += stride;
> 		offset +=32;
> 	} while (offset < 0);
> }
>
> generated assembly (-O2 -march=k8):
>
> 0000000000000050 <diff_pixels_mmx4>:
>   50:   66 0f ef e4             pxor   %xmm4,%xmm4
>   54:   48 c7 c0 80 ff ff ff    mov    $0xffffffffffffff80,%rax
>   5b:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
>   60:   66 0f 6f 0e             movdqa (%rsi),%xmm1
>   64:   48 01 ce                add    %rcx,%rsi
>   67:   66 0f 6f 02             movdqa (%rdx),%xmm0
>   6b:   48 01 ca                add    %rcx,%rdx
>   6e:   66 0f 6f d1             movdqa %xmm1,%xmm2
>   72:   66 0f 6f d8             movdqa %xmm0,%xmm3
>   76:   66 0f 68 cc             punpckhbw %xmm4,%xmm1
>   7a:   66 0f 60 d4             punpcklbw %xmm4,%xmm2
>   7e:   66 0f 60 dc             punpcklbw %xmm4,%xmm3
>   82:   66 0f 68 c4             punpckhbw %xmm4,%xmm0
>   86:   66 0f f9 d3             psubw  %xmm3,%xmm2
>   8a:   0f 29 54 38 40          movaps %xmm2,0x40(%rax,%rdi,1)
>   8f:   66 0f f9 c8             psubw  %xmm0,%xmm1
>   93:   0f 29 4c 38 50          movaps %xmm1,0x50(%rax,%rdi,1)
>   98:   48 83 c0 20             add    $0x20,%rax
>   9c:   75 c2                   jne    60 <diff_pixels_mmx4+0x10>
>   9e:   f3 c3                   repz retq
>
> Why is movaps (SSE, floating point data) instead of movdqa (SSE2, integer
> data) used as the store? Bug or feature? Even when compiled with -O0 it is used.

Testing further: The -march=k8 seems to cause this. Leaving it out, movdqa is 
used, so I guess it is a feature.

-- 
(°=                 =°)
//\ Prakash Punnoor /\\
V_/                 \_V

[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: gcc 4.2.3 and MMX to mem move oddity
  2008-02-23  7:11 Uros Bizjak
@ 2008-02-23  7:46 ` Prakash Punnoor
  2008-02-23  7:51   ` Prakash Punnoor
  0 siblings, 1 reply; 5+ messages in thread
From: Prakash Punnoor @ 2008-02-23  7:46 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: GCC

[-- Attachment #1: Type: text/plain, Size: 3348 bytes --]

On the day of Saturday 23 February 2008 Uros Bizjak hast written:
> Hello!
>
> >   f7:   0f 7f 5c 24 f0          movq   %mm3,-0x10(%rsp)
> >   fc:   0f 7f 54 24 f8          movq   %mm2,-0x8(%rsp)
> >  101:   48 8b 5c 24 f8          mov    -0x8(%rsp),%rbx
> >  106:   48 89 5c 38 40          mov    %rbx,0x40(%rax,%rdi,1)
> >  10b:   48 8b 5c 24 f0          mov    -0x10(%rsp),%rbx
> >  110:   48 89 5c 38 48          mov    %rbx,0x48(%rax,%rdi,1)
> >
> > As you can see, in the intrinsic version gcc moves the mmx register to the
> > stack, reloads from the stack and writes to the destination. Why?
> >
> > I don't know whether earlier gcc 4.2 versions produced such stupid code.
> > Compiling as 32-bit produces similar stupidity, though gcc reloads into an mmx
> > register...
>
> This is a variant of "Strange code for MMX register moves" [1] or its
> dupe "mmx and movd/movq on x86_64" [2]. Since touching a %mm register
> switches the x87 register stack to MMX mode, we penalize mmx moves severely
> in order to prevent gcc from ever allocating %mm for DImode moves unless
> really necessary.

[...]

Just as a side note: the equivalent SSE2 code looks fine, but I have a question 
regarding the store instruction used:

#include <stdint.h>
#include <emmintrin.h>
void diff_pixels_mmx4(char *block, const uint8_t *s1, const uint8_t *s2, long 
                      stride)
{
	
	long offset = -128;
	block+=64;
	__m128i mm7 = _mm_setzero_si128();
	do {
		__m128i mm0 = *(__m128i*)s1;
		__m128i mm2 = *(__m128i*)s2;
		__m128i mm1 = mm0;
		__m128i mm3 = mm2;
		mm0 = _mm_unpacklo_epi8(mm0, mm7);
		mm1 = _mm_unpackhi_epi8(mm1, mm7);
		mm2 = _mm_unpacklo_epi8(mm2, mm7);
		mm3 = _mm_unpackhi_epi8(mm3, mm7);
		mm0 = _mm_sub_epi16(mm0, mm2);
		mm1 = _mm_sub_epi16(mm1, mm3);
		*(__m128i*)(block+offset) = mm0;
		*(__m128i*)(block+offset+16) = mm1;
		s1 += stride;
		s2 += stride;
		offset +=32;
	} while (offset < 0);
}

generated assembly (-O2 -march=k8):

0000000000000050 <diff_pixels_mmx4>:
  50:   66 0f ef e4             pxor   %xmm4,%xmm4
  54:   48 c7 c0 80 ff ff ff    mov    $0xffffffffffffff80,%rax
  5b:   0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  60:   66 0f 6f 0e             movdqa (%rsi),%xmm1
  64:   48 01 ce                add    %rcx,%rsi
  67:   66 0f 6f 02             movdqa (%rdx),%xmm0
  6b:   48 01 ca                add    %rcx,%rdx
  6e:   66 0f 6f d1             movdqa %xmm1,%xmm2
  72:   66 0f 6f d8             movdqa %xmm0,%xmm3
  76:   66 0f 68 cc             punpckhbw %xmm4,%xmm1
  7a:   66 0f 60 d4             punpcklbw %xmm4,%xmm2
  7e:   66 0f 60 dc             punpcklbw %xmm4,%xmm3
  82:   66 0f 68 c4             punpckhbw %xmm4,%xmm0
  86:   66 0f f9 d3             psubw  %xmm3,%xmm2
  8a:   0f 29 54 38 40          movaps %xmm2,0x40(%rax,%rdi,1)
  8f:   66 0f f9 c8             psubw  %xmm0,%xmm1
  93:   0f 29 4c 38 50          movaps %xmm1,0x50(%rax,%rdi,1)
  98:   48 83 c0 20             add    $0x20,%rax
  9c:   75 c2                   jne    60 <diff_pixels_mmx4+0x10>
  9e:   f3 c3                   repz retq

Why is movaps (SSE, floating point data) instead of movdqa (SSE2, integer 
data) used as the store? Bug or feature? Even when compiled with -O0 it is used.

Regards,
-- 
(°=                 =°)
//\ Prakash Punnoor /\\
V_/                 \_V

[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: gcc 4.2.3 and MMX to mem move oddity
@ 2008-02-23  7:11 Uros Bizjak
  2008-02-23  7:46 ` Prakash Punnoor
  0 siblings, 1 reply; 5+ messages in thread
From: Uros Bizjak @ 2008-02-23  7:11 UTC (permalink / raw)
  To: GCC; +Cc: Prakash Punnoor

Hello!

>   f7:   0f 7f 5c 24 f0          movq   %mm3,-0x10(%rsp)
>   fc:   0f 7f 54 24 f8          movq   %mm2,-0x8(%rsp)
>  101:   48 8b 5c 24 f8          mov    -0x8(%rsp),%rbx
>  106:   48 89 5c 38 40          mov    %rbx,0x40(%rax,%rdi,1)
>  10b:   48 8b 5c 24 f0          mov    -0x10(%rsp),%rbx
>  110:   48 89 5c 38 48          mov    %rbx,0x48(%rax,%rdi,1)

> As you can see, in the intrinsic version gcc moves the mmx register to the stack, 
> reloads from the stack and writes to the destination. Why?
>
> I don't know whether earlier gcc 4.2 versions produced such stupid code.
> Compiling as 32-bit produces similar stupidity, though gcc reloads into an mmx 
> register...

This is a variant of "Strange code for MMX register moves" [1] or its 
dupe "mmx and movd/movq on x86_64" [2]. Since touching a %mm register 
switches the x87 register stack to MMX mode, we penalize mmx moves severely 
in order to prevent gcc from ever allocating %mm for DImode moves unless 
really necessary.
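
As a small illustration of that constraint (a sketch assuming x86_64, where
_mm_cvtm64_si64 is available; not taken from ffmpeg or gcc): any function
that touches MMX has to execute emms before x87 floating point may be used
again, which is what makes %mm so unattractive for plain 64-bit moves:

#include <mmintrin.h>
#include <stdio.h>

/* The %mm registers alias the x87 stack, so MMX code must end with emms. */
static double mmx_then_fp(const __m64 *p)
{
    __m64 v = _mm_add_pi16(p[0], p[1]);   /* FPU is now in MMX mode         */
    long long r = _mm_cvtm64_si64(v);     /* copy the result out to a GPR   */
    _mm_empty();                          /* emms: restore the x87 tag word */
    return (double)r;                     /* x87/SSE FP is safe again here  */
}

int main(void)
{
    __m64 in[2] = { _mm_set1_pi16(1), _mm_set1_pi16(2) };
    printf("%f\n", mmx_then_fp(in));
    return 0;
}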

OTOH, in your particular case:

(insn 42 40 43 3 mmx.c:20 (set (mem:V2SI (plus:DI (reg:DI 65 [ ivtmp.43 ])
                (const_int -64 [0xffffffffffffffc0])) [0 S8 A64])
        (subreg:V2SI (reg:V4HI 90) 0)) 866 {*movv2si_internal_rex64} 
(expr_list:REG_DEAD (reg:V4HI 90)
        (nil)))

The subreg in this pattern confuses the RA into creating:

(insn 68 36 38 3 
/usr/local/lib/gcc/x86_64-unknown-linux-gnu/4.3.0/include/mmintrin.h:389 
(set (mem/c:V4HI (plus:DI (reg/f:DI 7 sp)
                (const_int -8 [0xfffffffffffffff8])) [2 S8 A8])
        (reg:V4HI 32 mm3)) 865 {*movv4hi_internal_rex64} (nil))

(insn 70 69 42 3 mmx.c:20 (set (reg:V2SI 3 bx)
        (mem/c:V2SI (plus:DI (reg/f:DI 7 sp)
                (const_int -8 [0xfffffffffffffff8])) [2 S8 A8])) 866 
{*movv2si_internal_rex64} (nil))

(insn:HI 42 70 71 3 mmx.c:20 (set (mem:V2SI (plus:DI (reg:DI 5 di 
[orig:65 ivtmp.43 ] [65])
                (const_int -64 [0xffffffffffffffc0])) [0 S8 A64])
        (reg:V2SI 3 bx)) 866 {*movv2si_internal_rex64} (nil))


(For 32-bit targets, %mm is allocated, since no integer register can hold a 
64-bit value.)

I'll see what can be done here to tie V4HI and V2SI together.

[1] http://gcc.gnu.org/bugzilla/show_bug.cgi?id=22076
[2] http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34256

Uros.

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2008-02-23 15:31 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-02-22 22:07 gcc 4.2.3 and MMX to mem move oddity Prakash Punnoor
2008-02-23  7:11 Uros Bizjak
2008-02-23  7:46 ` Prakash Punnoor
2008-02-23  7:51   ` Prakash Punnoor
2008-02-23 15:31     ` Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).