* GCC gives major speed up with "-pg" flag for SIMD code
@ 2009-08-27 17:18 Tuomas Tonteri
0 siblings, 0 replies; only message in thread
From: Tuomas Tonteri @ 2009-08-27 17:18 UTC (permalink / raw)
To: gcc-help
After adding an SSE code path to a C++ ray tracer program, I noticed that
some parts of the program run at almost 2x the speed when I compile with
the "-pg" profiling flag. Before these additions, "-pg" produced a
performance hit in the total program execution time, as expected.
Clean vectorized code is achieved through a C++ class that overloads the
arithmetic operators on the basic SSE type __m128, as follows:
// Four packed single-precision floats held in one SSE register, with
// arithmetic operators overloaded so vector code reads like scalar code.
class f32x4
{
public:
    // Adds v to this value lane by lane and returns *this.
    const f32x4& operator+=(const f32x4& v)
    {
        m = _mm_add_ps(m, v.m);
        return *this;
    }
    /*... overload more operators ... */

    // The same 16 bytes, viewed either as one SSE register or as four
    // individual floats. Reading the member that was not last written relies
    // on the compiler's union type-punning support.
    union
    {
        __m128 m;
        float f[4];
    };
};
Here is an example of a function that takes almost half as long to
execute when compiled with "-pg":
/* C++ */
f32x4b pintersect4(f32x4& a, p3_f32x4& raydir, Vector3& cameyepos,
                   p3_f32x4& hitpoint, f32x4& distance, p3_f32x4& normal)
{
    // Four-wide ray/sphere test. bpart, c, pos and rad are not declared in
    // this snippet (presumably members of the enclosing class -- confirm).
    f32x4 b = Dot(raydir, bpart);
    f32x4 disc = b*b - a*c;

    // Lanes with a positive discriminant can hit; bail out if there are none.
    f32x4b hit = disc > 0.0f;
    if (ForWhich(hit) == 0)
    {
        return hit;
    }

    f32x4 root = Sqrt(disc);
    f32x4 t = -0.5f*((b+root)/a);

    // Keep only lanes where the sphere center is in front of the camera
    // surface (t above 1); bail out if that leaves nothing.
    hit = hit && t > 1.0f;
    if (ForWhich(hit) == 0)
    {
        return hit;
    }

    // Outputs are written for all four lanes, not only the ones set in hit.
    distance = a*t;                     // sqrt(a) would give the real distance
    hitpoint = (t*raydir) + cameyepos;
    normal = (hitpoint - pos) / rad;
    return hit;
}
I had the same result with both GCC 4.3.2 and 4.4.1. The program is
compiled with "-O3 -march=core2", and lowering the -O level doesn't
affect the behaviour. The speed difference is measured by reading the
x86 rdtsc cycle counter, and the CPU frequency is kept constant. This is
an x86_64 platform.
I am seeking advice on what could cause this.
Here is the assembly output of that function with profiling enabled.
The main difference I can notice between this and the one without
profiling is that different addresses are being used when loading the
XMM (SSE) registers.
# Compiler output (AT&T syntax) for the profiled build: a frame pointer is set
# up and mcount is called right after the prologue.
# Pointer registers read in this listing: %rdi, %rsi, %rdx, %rcx, %r8, %r9,
# plus one more pointer fetched from 16(%rbp) into %rax.
# NOTE(review): presumably %rdi = this and the others are the six reference
# parameters in SysV AMD64 order -- confirm against the C++ declaration.
# The 16-byte result is returned as two 8-byte halves in %xmm0 and %xmm1.
_ZN6Sphere11pintersect4ERN6veclib5f32x4ERN5n_std9cvalarrayIS1_Lm3EEERNS4_IfLm3EEES6_S2_S6_:
.LFB4330:
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
pushq %rbp
.cfi_def_cfa_offset 16
movq %rsp, %rbp
.cfi_offset 6, -16
.cfi_def_cfa_register 6
# Reserve 368 bytes of locals; every local below is addressed off %rbp.
subq $368, %rsp
call mcount
# Three scalar coefficients from 184/188/192(%rdi) -> xmm2, xmm0, xmm3.
movss 184(%rdi), %xmm2
movss 188(%rdi), %xmm0
movss 192(%rdi), %xmm3
# First scalar three-term sum of products:
#   8(%rdx)*xmm3 + 4(%rdx)*xmm0 + (%rdx)*xmm2  -> -16(%rbp)
movss 8(%rdx), %xmm1
movss 4(%rdx), %xmm4
leaq 16(%rdx), %r10
mulss %xmm0, %xmm4
mulss %xmm3, %xmm1
# Seventh pointer argument; used as a store destination near the end.
movq 16(%rbp), %rax
addss %xmm4, %xmm1
movss (%rdx), %xmm4
mulss %xmm2, %xmm4
addss %xmm4, %xmm1
movss %xmm1, -16(%rbp)
# Same sum for the three floats starting at 16(%rdx) -> -12(%rbp).
movss 8(%r10), %xmm1
movss 4(%r10), %xmm4
mulss %xmm3, %xmm1
mulss %xmm0, %xmm4
leaq 32(%rdx), %r10
addss %xmm4, %xmm1
movss 16(%rdx), %xmm4
mulss %xmm2, %xmm4
addss %xmm4, %xmm1
movss %xmm1, -12(%rbp)
# Same sum for the three floats starting at 32(%rdx) -> -8(%rbp).
movss 8(%r10), %xmm1
movss 4(%r10), %xmm4
mulss %xmm3, %xmm1
mulss %xmm0, %xmm4
leaq 48(%rdx), %r10
addss %xmm4, %xmm1
movss 32(%rdx), %xmm4
mulss %xmm2, %xmm4
addss %xmm4, %xmm1
movss %xmm1, -8(%rbp)
# Fourth sum, for the floats starting at 48(%rdx) -> -4(%rbp). The coefficient
# registers are multiplied in place here, so they are consumed; xmm3 is then
# cleared and serves as the all-zero vector from here on.
mulss 8(%r10), %xmm3
mulss 4(%r10), %xmm0
mulss 48(%rdx), %xmm2
addss %xmm3, %xmm0
xorps %xmm3, %xmm3
addss %xmm2, %xmm0
movss %xmm0, -4(%rbp)
# Gather the four scalar results into xmm0 (two 8-byte loads), load the 16
# aligned bytes at (%rsi) into xmm4, and broadcast the float at 180(%rdi).
xorps %xmm0, %xmm0
movaps (%rsi), %xmm4
movss 180(%rdi), %xmm2
movlps -16(%rbp), %xmm0
shufps $0, %xmm2, %xmm2
movhps -8(%rbp), %xmm0
# xmm1 = xmm0*xmm0 - xmm4*broadcast   (matches D = b*b - a*c in the C++)
mulps %xmm4, %xmm2
movaps %xmm0, %xmm1
mulps %xmm0, %xmm1
subps %xmm2, %xmm1
# xmm2 = per-lane mask of (0 < xmm1).
movaps %xmm3, %xmm2
cmpltps %xmm1, %xmm2
# The mask is spilled to the stack, reloaded as two 64-bit integers
# (%r10 = low half, %rsi = high half) and then rebuilt in xmm2.
movlps %xmm2, -96(%rbp)
movhps %xmm2, -88(%rbp)
movq -96(%rbp), %r10
movq -88(%rbp), %rsi
movd %r10, %xmm5
shufps $0xe4, %xmm3, %xmm5
movaps %xmm5, %xmm2
movd %rsi, %xmm5
movlhps %xmm5, %xmm2
# No lane set: leave through .L98, which returns the mask held in %r10:%rsi.
movmskps %xmm2, %r11d
testl %r11d, %r11d
je .L98
# xmm1 = ((sqrt(xmm1) + xmm0) / xmm4) * .LC9, then xmm0 = mask of (.LC10 < xmm1).
# NOTE(review): .LC9/.LC10 are not shown; presumably -0.5f and 1.0f as in the
# C++ source -- confirm.
sqrtps %xmm1, %xmm1
addps %xmm0, %xmm1
movaps .LC10(%rip), %xmm0
divps %xmm4, %xmm1
mulps .LC9(%rip), %xmm1
cmpltps %xmm1, %xmm0
# Second mask goes through memory twice before being ANDed into the first.
movlps %xmm0, -144(%rbp)
movhps %xmm0, -136(%rbp)
movq -144(%rbp), %rsi
movq %rsi, -48(%rbp)
movq -136(%rbp), %rsi
movq %rsi, -40(%rbp)
andps -48(%rbp), %xmm2
# Combined mask: spilled, reloaded into %r10:%rsi, kept at -32/-24(%rbp) for
# the final return, and rebuilt in xmm0 for the emptiness test.
movlps %xmm2, -192(%rbp)
movhps %xmm2, -184(%rbp)
movq -192(%rbp), %r10
movq -184(%rbp), %rsi
movd %r10, %xmm2
movd %rsi, %xmm5
shufps $0xe4, %xmm3, %xmm2
movq %r10, -32(%rbp)
movaps %xmm2, %xmm0
movq %rsi, -24(%rbp)
movlhps %xmm5, %xmm0
movmskps %xmm0, %r11d
testl %r11d, %r11d
je .L98
# Store xmm4*xmm1 to the 16 bytes at (%r9).
mulps %xmm1, %xmm4
movaps %xmm4, (%r9)
# xmm1 = xmm1 * 16 bytes at (%rdx) + broadcast of the float at (%rcx).
mulps (%rdx), %xmm1
movss (%rcx), %xmm0
# %rdx now points at a 48-byte local; the copy at .L88 is skipped when %r8
# already points at that local.
leaq -256(%rbp), %rdx
shufps $0, %xmm0, %xmm0
cmpq %rdx, %r8
addps %xmm0, %xmm1
je .L87
.L88:
# Write 48 bytes to (%r8): xmm1, then the locals at -240(%rbp) and -224(%rbp).
# NOTE(review): no store to -240..-209(%rbp) appears anywhere in this listing,
# so the last 32 bytes are copied from stack slots this code never wrote.
movaps %xmm1, (%r8)
movaps %xmm3, %xmm0
movlps -240(%rbp), %xmm0
movhps -232(%rbp), %xmm0
movlps %xmm0, 16(%r8)
movhps %xmm0, 24(%r8)
movlps -224(%rbp), %xmm3
movhps -216(%rbp), %xmm3
movlps %xmm3, 32(%r8)
movhps %xmm3, 40(%r8)
.L87:
.L90:
.L92:
# xmm1 = (xmm1 - broadcast 8(%rdi)) / broadcast 140(%rdi) -> -352(%rbp).
movss 8(%rdi), %xmm0
movss 140(%rdi), %xmm2
shufps $0, %xmm0, %xmm0
shufps $0, %xmm2, %xmm2
subps %xmm0, %xmm1
leaq -352(%rbp), %rdx
divps %xmm2, %xmm1
xorps %xmm0, %xmm0
movlps %xmm1, -352(%rbp)
# NOTE(review): -288..-257(%rbp) are read here and a few lines below, but no
# store to them appears in this listing. The two divps that follow therefore
# operate on whatever bytes were left on the stack; if those happen to be
# denormal or NaN patterns the divides may be far slower. This may be relevant
# to the reported timing difference -- unverified.
movlps -288(%rbp), %xmm0
movhps %xmm1, -344(%rbp)
movhps -280(%rbp), %xmm0
# Flags from this compare are consumed by "je .L94" below; the SSE
# instructions in between do not modify EFLAGS.
cmpq %rdx, %rax
divps %xmm2, %xmm0
movlps %xmm0, -336(%rbp)
movhps %xmm0, -328(%rbp)
xorps %xmm0, %xmm0
movlps -272(%rbp), %xmm0
movhps -264(%rbp), %xmm0
divps %xmm2, %xmm0
movlps %xmm0, -320(%rbp)
movhps %xmm0, -312(%rbp)
je .L94
.L95:
# Copy the 48-byte local at -352(%rbp) to (%rax), 8 bytes at a time.
xorps %xmm0, %xmm0
movlps -352(%rbp), %xmm0
movhps -344(%rbp), %xmm0
movlps %xmm0, (%rax)
movhps %xmm0, 8(%rax)
xorps %xmm0, %xmm0
movlps -336(%rbp), %xmm0
movhps -328(%rbp), %xmm0
movlps %xmm0, 16(%rax)
movhps %xmm0, 24(%rax)
xorps %xmm0, %xmm0
movlps -320(%rbp), %xmm0
movhps -312(%rbp), %xmm0
movlps %xmm0, 32(%rax)
movhps %xmm0, 40(%rax)
.L94:
.L96:
# Return the combined mask saved at -32/-24(%rbp): low half in xmm0, high
# half in xmm1.
movq -32(%rbp), %rax
movq %rax, -64(%rbp)
movq -24(%rbp), %rax
movq -64(%rbp), %xmm0
movq %rax, -56(%rbp)
movd %rax, %xmm1
leave
ret
.p2align 4,,10
.p2align 3
.L98:
# Early exit: return the mask currently held in %r10 (low) and %rsi (high).
movq %r10, -64(%rbp)
movq %rsi, -56(%rbp)
movd %r10, %xmm0
movd %rsi, %xmm1
leave
ret
.cfi_endproc
And here is the same thing without profiling:
# Compiler output (AT&T syntax) for the same symbol without profiling: no
# frame pointer and no mcount call; all locals are addressed off %rsp.
# Pointer registers read in this listing: %rdi, %rsi, %rdx, %rcx, %r8, %r9,
# plus one more pointer fetched from 264(%rsp) into %rax.
# NOTE(review): presumably %rdi = this and the others are the six reference
# parameters in SysV AMD64 order -- confirm against the C++ declaration.
# This listing contains no call instruction and addresses memory as far as
# 104 bytes below %rsp, which is inside the 128-byte red zone the SysV ABI
# gives leaf functions.
_ZN6Sphere11pintersect4ERN6veclib5f32x4ERN5n_std9cvalarrayIS1_Lm3EEERNS4_IfLm3EEES6_S2_S6_:
.LFB4330:
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
subq $256, %rsp
.cfi_def_cfa_offset 264
# First scalar three-term sum of products, with coefficients from
# 184/188/192(%rdi):
#   8(%rdx)*xmm3 + 4(%rdx)*xmm0 + (%rdx)*xmm2  -> 232(%rsp)
leaq 16(%rdx), %r10
movss 8(%rdx), %xmm1
movss 4(%rdx), %xmm4
movss 184(%rdi), %xmm2
movss 188(%rdi), %xmm0
movss 192(%rdi), %xmm3
mulss %xmm0, %xmm4
mulss %xmm3, %xmm1
# Seventh pointer argument; used as a store destination near the end.
movq 264(%rsp), %rax
addss %xmm4, %xmm1
movss (%rdx), %xmm4
mulss %xmm2, %xmm4
addss %xmm4, %xmm1
movss %xmm1, 232(%rsp)
# Same sum for the three floats starting at 16(%rdx) -> 236(%rsp).
movss 8(%r10), %xmm1
movss 4(%r10), %xmm4
mulss %xmm3, %xmm1
mulss %xmm0, %xmm4
leaq 32(%rdx), %r10
addss %xmm4, %xmm1
movss 16(%rdx), %xmm4
mulss %xmm2, %xmm4
addss %xmm4, %xmm1
movss %xmm1, 236(%rsp)
# Same sum for the three floats starting at 32(%rdx) -> 240(%rsp).
movss 8(%r10), %xmm1
movss 4(%r10), %xmm4
mulss %xmm3, %xmm1
mulss %xmm0, %xmm4
leaq 48(%rdx), %r10
addss %xmm4, %xmm1
movss 32(%rdx), %xmm4
mulss %xmm2, %xmm4
addss %xmm4, %xmm1
movss %xmm1, 240(%rsp)
# Fourth sum, for the floats starting at 48(%rdx) -> 244(%rsp). The coefficient
# registers are multiplied in place here, so they are consumed; xmm3 is then
# cleared and serves as the all-zero vector from here on.
mulss 8(%r10), %xmm3
mulss 4(%r10), %xmm0
mulss 48(%rdx), %xmm2
addss %xmm3, %xmm0
xorps %xmm3, %xmm3
addss %xmm2, %xmm0
# Broadcast the float at 180(%rdi), multiply by the 16 aligned bytes at
# (%rsi), and gather the four scalar results into xmm0.
movss 180(%rdi), %xmm2
movss %xmm0, 244(%rsp)
shufps $0, %xmm2, %xmm2
movaps (%rsi), %xmm4
xorps %xmm0, %xmm0
mulps %xmm4, %xmm2
movlps 232(%rsp), %xmm0
movhps 240(%rsp), %xmm0
# xmm1 = xmm0*xmm0 - xmm4*broadcast   (matches D = b*b - a*c in the C++)
movaps %xmm0, %xmm1
mulps %xmm0, %xmm1
subps %xmm2, %xmm1
# xmm2 = per-lane mask of (0 < xmm1).
movaps %xmm3, %xmm2
cmpltps %xmm1, %xmm2
# The mask is spilled to the stack, reloaded as two 64-bit integers
# (%r10 = low half, %rsi = high half) and then rebuilt in xmm2.
movlps %xmm2, 152(%rsp)
movhps %xmm2, 160(%rsp)
movq 152(%rsp), %r10
movq 160(%rsp), %rsi
movd %r10, %xmm5
shufps $0xe4, %xmm3, %xmm5
movaps %xmm5, %xmm2
movd %rsi, %xmm5
movlhps %xmm5, %xmm2
# No lane set: leave through .L98, which returns the mask held in %r10:%rsi.
movmskps %xmm2, %r11d
testl %r11d, %r11d
je .L98
# xmm1 = ((sqrt(xmm1) + xmm0) / xmm4) * .LC9, then xmm0 = mask of (.LC10 < xmm1).
# NOTE(review): .LC9/.LC10 are not shown; presumably -0.5f and 1.0f as in the
# C++ source -- confirm.
sqrtps %xmm1, %xmm1
addps %xmm0, %xmm1
movaps .LC10(%rip), %xmm0
divps %xmm4, %xmm1
mulps .LC9(%rip), %xmm1
cmpltps %xmm1, %xmm0
# Second mask goes through memory twice before being ANDed into the first.
movlps %xmm0, 104(%rsp)
movhps %xmm0, 112(%rsp)
movq 104(%rsp), %rsi
movq %rsi, 200(%rsp)
movq 112(%rsp), %rsi
movq %rsi, 208(%rsp)
andps 200(%rsp), %xmm2
# Combined mask: spilled, reloaded into %r10:%rsi, kept at 216/224(%rsp) for
# the final return, and rebuilt in xmm0 for the emptiness test.
movlps %xmm2, 56(%rsp)
movhps %xmm2, 64(%rsp)
movq 56(%rsp), %r10
movq 64(%rsp), %rsi
movd %r10, %xmm2
movd %rsi, %xmm5
shufps $0xe4, %xmm3, %xmm2
movq %r10, 216(%rsp)
movaps %xmm2, %xmm0
movq %rsi, 224(%rsp)
movlhps %xmm5, %xmm0
movmskps %xmm0, %r11d
testl %r11d, %r11d
je .L98
# Store xmm4*xmm1 to the 16 bytes at (%r9).
mulps %xmm1, %xmm4
movaps %xmm4, (%r9)
# xmm1 = xmm1 * 16 bytes at (%rdx) + broadcast of the float at (%rcx).
mulps (%rdx), %xmm1
movss (%rcx), %xmm0
# %rdx now points at a 48-byte local starting at -8(%rsp); the copy at .L88
# is skipped when %r8 already points at that local.
leaq -8(%rsp), %rdx
shufps $0, %xmm0, %xmm0
cmpq %rdx, %r8
addps %xmm0, %xmm1
je .L87
.L88:
# Write 48 bytes to (%r8): xmm1, then the locals at 8(%rsp) and 24(%rsp).
# NOTE(review): no store to 8..39(%rsp) appears anywhere in this listing, so
# the last 32 bytes are copied from stack slots this code never wrote.
movaps %xmm3, %xmm0
movaps %xmm1, (%r8)
movlps 8(%rsp), %xmm0
movlps 24(%rsp), %xmm3
movhps 16(%rsp), %xmm0
movhps 32(%rsp), %xmm3
movlps %xmm0, 16(%r8)
movhps %xmm0, 24(%r8)
movlps %xmm3, 32(%r8)
movhps %xmm3, 40(%r8)
.L87:
.L90:
.L92:
# xmm1 = (xmm1 - broadcast 8(%rdi)) / broadcast 140(%rdi) -> -104(%rsp).
movss 8(%rdi), %xmm0
movss 140(%rdi), %xmm2
shufps $0, %xmm0, %xmm0
shufps $0, %xmm2, %xmm2
subps %xmm0, %xmm1
leaq -104(%rsp), %rdx
divps %xmm2, %xmm1
xorps %xmm0, %xmm0
movlps %xmm1, -104(%rsp)
# NOTE(review): -40..-9(%rsp) are read here and a few lines below, but no
# store to them appears in this listing. The two divps that follow therefore
# operate on whatever bytes were left below the stack pointer; if those happen
# to be denormal or NaN patterns the divides may be far slower. This may be
# relevant to the reported timing difference -- unverified.
movlps -40(%rsp), %xmm0
movhps %xmm1, -96(%rsp)
movhps -32(%rsp), %xmm0
# Flags from this compare are consumed by "je .L94" below; the SSE
# instructions in between do not modify EFLAGS.
cmpq %rdx, %rax
divps %xmm2, %xmm0
movlps %xmm0, -88(%rsp)
movhps %xmm0, -80(%rsp)
xorps %xmm0, %xmm0
movlps -24(%rsp), %xmm0
movhps -16(%rsp), %xmm0
divps %xmm2, %xmm0
movlps %xmm0, -72(%rsp)
movhps %xmm0, -64(%rsp)
je .L94
.L95:
# Copy the 48-byte local at -104(%rsp) to (%rax), 8 bytes at a time.
xorps %xmm0, %xmm0
movlps -104(%rsp), %xmm0
movhps -96(%rsp), %xmm0
movlps %xmm0, (%rax)
movhps %xmm0, 8(%rax)
xorps %xmm0, %xmm0
movlps -88(%rsp), %xmm0
movhps -80(%rsp), %xmm0
movlps %xmm0, 16(%rax)
movhps %xmm0, 24(%rax)
xorps %xmm0, %xmm0
movlps -72(%rsp), %xmm0
movhps -64(%rsp), %xmm0
movlps %xmm0, 32(%rax)
movhps %xmm0, 40(%rax)
.L94:
# Return the combined mask saved at 216/224(%rsp): low half in xmm0, high
# half in xmm1.
movq 216(%rsp), %rax
movq %rax, 184(%rsp)
movq 224(%rsp), %rax
movq 184(%rsp), %xmm0
movq %rax, 192(%rsp)
movd %rax, %xmm1
addq $256, %rsp
ret
.p2align 4,,10
.p2align 3
.L98:
.L96:
# Early exit: return the mask currently held in %r10 (low) and %rsi (high).
movq %r10, 184(%rsp)
movq %rsi, 192(%rsp)
movd %r10, %xmm0
movd %rsi, %xmm1
addq $256, %rsp
ret
.cfi_endproc
--
Tuomas Tonteri
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2009-08-27 8:17 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-08-27 17:18 GCC gives major speed up with "-pg" flag for SIMD code Tuomas Tonteri
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).