[Bug c++/12902] Invalid assembly generated when using SSE / xmmintrin.h

public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed

From: "kbowers at lanl dot gov" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug c++/12902] Invalid assembly generated when using SSE / xmmintrin.h
Date: Wed, 05 Nov 2003 05:51:00 -0000	[thread overview]
Message-ID: <20031105055156.11722.qmail@sources.redhat.com> (raw)
In-Reply-To: <20031105013127.12902.kbowers@lanl.gov>

PLEASE REPLY TO gcc-bugzilla@gcc.gnu.org ONLY, *NOT* gcc-bugs@gcc.gnu.org.

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12902



------- Additional Comments From kbowers at lanl dot gov  2003-11-05 05:51 -------
Subject: Re:  Invalid assembly generated when using SSE / xmmintrin.h

pinskia at gcc dot gnu dot org wrote:
> PLEASE REPLY TO gcc-bugzilla@gcc.gnu.org ONLY, *NOT* gcc-bugs@gcc.gnu.org.
> 
> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12902
> 
> 
> pinskia at gcc dot gnu dot org changed:
> 
>            What    |Removed                     |Added
> ----------------------------------------------------------------------------
>              Status|UNCONFIRMED                 |WAITING
> 
> 
> ------- Additional Comments From pinskia at gcc dot gnu dot org  2003-11-05 04:08 -------
> The "a" in the asm corresponds to the variable "a" in foo, not the a in inlined function, swizzle.
> So the movaps is coming from:
>   c.v = _mm_loadl_pi(c.v,((__m64 *)a0)+1); <--- 8(%eax)
>
> Is that really your problem, or is because main's stack is unaligned and you inlined functions into 
> main that use sse?

I don't think this is a stack alignment problem. The memory pointed to 
by "swizzle:a0"/"foo:a" is not on the stack and is 16-byte aligned (a0 
is the pointer "a" in foo). Also, _mm_loadl_pi is intended for 8-byte 
loads from 8-byte-aligned memory, which ((__m64 *)a0)+1 is.

I think gcc has a bug in the implementation of the 
__builtin_ia32_loadlps and __builtin_ia32_loadhps intrinsics when 
dealing with xmm registers stored in a stack temporary.

Here is foo again with the b1 swizzle / math removed.

void foo( const a_t *a, const b_t *b, int n ) {
   vector4 ai, a0, a1, a2, b0, v0, v1, v2;
   __m128 *p0, *p1, *p2, *p3;

   for(;n;n--,a+=4) {
     swizzle(a,a+1,a+2,a+3,ai,a0,a1,a2);
     p0 = (__m128 *)(b + ai.i[0]);
     p1 = (__m128 *)(b + ai.i[1]);
     p2 = (__m128 *)(b + ai.i[2]);
     p3 = (__m128 *)(b + ai.i[3]);
     swizzle(p0++,p1++,p2++,p3++,b0,v0,v1,v2);
     b0.v = _mm_add_ps(_mm_add_ps(b0.v,_mm_mul_ps(a1.v,v0.v)),
            _mm_mul_ps(a2.v,_mm_add_ps(v1.v,_mm_mul_ps(a1.v,v2.v))));
   }
}

Here is the relevant assembly snippet:

% gcc-3.3.2 -S -fverbose-asm -O -msse bug_12902_followup.cpp
% cat bug_12902_followup.s

... snip to around line 45 ...

movaps	-40(%ebp), %xmm3	#  <variable>.v,  <anonymous>
movlps	(%esi), %xmm3	# * a,  <anonymous>
movlps	8(%esi), %xmm6	#  <anonymous>
movhps	16(%esi), %xmm3	#  <anonymous>
movhps	24(%esi), %xmm6	#  <anonymous>
movaps	%xmm6, %xmm4	#  <anonymous>,  <anonymous>
movaps	%xmm3, %xmm1	#  <anonymous>,  <anonymous>
movlps	32(%esi), %xmm1	#  <anonymous>
movaps	%xmm6, %xmm2	#  <anonymous>,  <anonymous>
movlps	40(%esi), %xmm2	#  <anonymous>
movhps	48(%esi), %xmm1	#  <anonymous>
movhps	56(%esi), %xmm2	#  <anonymous>
movaps	%xmm3, %xmm0	#  <anonymous>

... snip ...

This is the correct assembly and the function does not crash. I also 
have much more complex versions of the loop which do not crash (in those 
cases, I guess I got lucky with the register spills).

The third instruction above corresponds to the line you cited:

c.v = _mm_loadl_pi(c.v,((__m64 *)a0)+1);

I've tried to figure out what gcc is doing in the original foo but I'm 
not sure I follow:

I think I've figured out the problem. Here is a translation of the first 
swizzle assembly from the original foo():

#  a.v = _mm_loadl_pi(a.v, (__m64 *)a0);
*  a.v is in xmm3
	movlps	(%eax), %xmm3

#  c.v = _mm_loadl_pi(c.v,((__m64 *)a0)+1); ... FAULTS
*  c.v is in -200(%ebp)
	movaps	8(%eax), %xmm0
	movlps	%xmm0, -200(%ebp)

#  a.v = _mm_loadh_pi(a.v, (__m64 *)a1);
*  a.v is in xmm3
	movhps	16(%eax), %xmm3

#  c.v = _mm_loadh_pi(c.v,((__m64 *)a1)+1); ... WOULD FAULT
*  c.v is in -200(%ebp)
	movaps	24(%eax), %xmm2
	movhps	%xmm2, -200(%ebp)

#  b.v = a.v;
*  b.v is in xmm3 (a.v and b.v are copies)

#  d.v = c.v;
*  d.v is in xmm4 (d.v and c.v are copies)
	movaps	-200(%ebp), %xmm4

# t   = _mm_loadl_pi(b.v, (__m64 *)a2);
* t is in xmm1
	movaps	%xmm3, %xmm1
	movlps	32(%eax), %xmm1

# u   = _mm_loadl_pi(d.v,((__m64 *)a2)+1);
* u is in xmm2
	movaps	%xmm4, %xmm2
	movlps	40(%eax), %xmm2

# t   = _mm_loadh_pi(t, (__m64 *)a3);
* t is in xmm1
	movhps	48(%eax), %xmm1

# u   = _mm_loadh_pi(u,((__m64 *)a3)+1);
* u is in xmm2
	movhps	56(%eax), %xmm2

# a.v = _mm_shuffle_ps(a.v,t,0x88);
* a.v is stored in "foo:ai"
* b.v no longer shares xmm3 with a.v
	movaps	%xmm3, %xmm0
	shufps	$136, %xmm1, %xmm0
	movaps	%xmm0, -40(%ebp)

# b.v = _mm_shuffle_ps(b.v,t,0xdd);
* b.v is stored in "foo:a0"
	shufps	$221, %xmm1, %xmm3
	movaps	%xmm3, -216(%ebp)

# c.v = _mm_shuffle_ps(c.v,u,0x88);
* c.v is stored in "foo:a1"
* d.v no longer shares xmm4 with c.v
	movaps	%xmm4, %xmm0
	shufps	$136, %xmm2, %xmm0
	movaps	%xmm0, -200(%ebp)
# d.v = _mm_shuffle_ps(d.v,u,0xdd);
* d.v is stored in "foo:a2"
	shufps	$221, %xmm2, %xmm4
	movaps	%xmm4, -232(%ebp)

_mm_loadl_pi (__builtin_ia32_loadlps) and _mm_loadh_pi 
(__builtin_ia32_loadhps) seem to work fine when doing a m64 -> xmm 
register transaction. However, when doing a m64 -> xmm stack temporary, 
they emit invalid assembly; that is where the two faulting instructions 
occur. The above assembly sequence makes sense if the following changes 
are made:

#  c.v = _mm_loadl_pi(c.v,((__m64 *)a0)+1); ... FIXED
*  c.v is in -200(%ebp)
	movaps  -200(%ebp), %xmm0 # Retrieve c.v from stack
	movlps	8(%eax), %xmm0    # A valid 64-bit load
	movaps	%xmm0, -200(%ebp) # Store modified c.v onto stack

and

#  c.v = _mm_loadh_pi(c.v,((__m64 *)a1)+1); ... FIXED
*  c.v is in -200(%ebp)
	movaps  -200(%ebp), %xmm2 # Retrieve c.v from stack
	movhps	24(%eax), %xmm2   # A valid 64-bit load
	movaps	%xmm2, -200(%ebp) # Store modified c.v onto stack

Thanks for your prompt reply.

next prev parent reply	other threads:[~2003-11-05  5:51 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2003-11-05  1:31 [Bug c++/12902] New: " kbowers at lanl dot gov
2003-11-05  4:08 ` [Bug c++/12902] " pinskia at gcc dot gnu dot org
2003-11-05  5:51 ` kbowers at lanl dot gov [this message]
2003-11-06 18:53 ` kbowers at lanl dot gov
2003-11-07  1:02 ` kbowers at lanl dot gov
2003-11-07 10:42 ` kbowers at lanl dot gov
2003-12-09 18:01 ` kbowers at lanl dot gov
2003-12-09 20:14 ` [Bug target/12902] " dhazeghi at yahoo dot com
2003-12-09 20:17 ` dhazeghi at yahoo dot com
2003-12-11 16:07 ` bangerth at dealii dot org
2004-12-13 20:54 ` bangerth at dealii dot org
2004-12-14 10:54 ` uros at kss-loka dot si
2005-01-05  9:43 ` [Bug target/12902] [4.0 Regression] " uros at kss-loka dot si
2005-01-05 12:14 ` rth at gcc dot gnu dot org
2005-01-05 19:14 ` cvs-commit at gcc dot gnu dot org
2005-01-05 20:04 ` rth at gcc dot gnu dot org
2005-01-06  8:25 ` uros at kss-loka dot si

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20031105055156.11722.qmail@sources.redhat.com \
    --to=gcc-bugzilla@gcc.gnu.org \
    --cc=gcc-bugs@gcc.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).