From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 23079 invoked by alias); 10 Oct 2002 19:26:01 -0000 Mailing-List: contact gcc-prs-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Archive: List-Post: List-Help: Sender: gcc-prs-owner@gcc.gnu.org Received: (qmail 23064 invoked by uid 71); 10 Oct 2002 19:26:01 -0000 Date: Thu, 10 Oct 2002 12:26:00 -0000 Message-ID: <20021010192601.23063.qmail@sources.redhat.com> To: nobody@gcc.gnu.org Cc: gcc-prs@gcc.gnu.org, From: Jan Hubicka Subject: Re: optimization/8049: SSE unaligned vector stores crash with -O0 Reply-To: Jan Hubicka X-SW-Source: 2002-10/txt/msg00418.txt.bz2 List-Id: The following reply was made to PR optimization/8049; it has been noted by GNATS. From: Jan Hubicka To: Ian Ollmann Cc: Jan Hubicka , hubicka@gcc.gnu.org, gcc-bugs@gcc.gnu.org, gcc-prs@gcc.gnu.org, gcc-gnats@gcc.gnu.org Subject: Re: optimization/8049: SSE unaligned vector stores crash with -O0 Date: Thu, 10 Oct 2002 21:20:58 +0200 > On Thu, 10 Oct 2002, Jan Hubicka wrote: > > > > The original example is one. The three buffers passed into MatrixMultiply > > > happen to be aligned on my system. The actual crash happens when the stack > > > copy of C1 is loaded, before the _mm_store_ps (uninlined) function is > > > called. > > > > This looks strange. All stores to C1 work properly and then you get > > a movaps crash when you load it to store into output? > > I believe it is the output that is misaligned (destination of store) > > because the destination array is misaligned from the caller. > > Hmm, perhaps I got confused between the Intel and ATT argument ordering > scheme. Perhaps this is actually a store to the stack from the earlier > mm_add_ps and gdb misreported the line from the source? What do you think? > > (gdb) run > Program received signal SIGSEGV, Segmentation Fault > 0x0804888b in MatrixMultiply() (A = 0xbffff9d0, B=0xbffff990, C=0xbffff950) > at main.c:74 > > 74 _mm_store_ps( C + 0, C1 ); //.... 
> > > 0x8048884 : movaps 0xffffff68 (%ebp),%xmm0 > 0x804888b : movaps %xmm0, 0x4 (%esp, 1) > 0x8048890 : call 0x8048930 <_mm_store_ps> This really is a load; however, it was stored previously as: > 0x80484c9 : movaps %xmm0,0xffffff68(%ebp) And it didn't generate a trap, so ebp was aligned at that time I suppose, so it is really strange to see that it is not in this case. Perhaps it is _mm_store_ps that messed up the stack and restored ebp incorrectly? Honza > > (gdb) info registers > > esp 0xbffff800 > ebp 0xbffff928 > > > > Dump of assembler code for function MatrixMultiply: > 0x8048418 : push %ebp > 0x8048419 : mov %esp,%ebp > 0x804841b : sub $0x128,%esp > 0x8048421 : mov 0x8(%ebp),%eax > 0x8048424 : mov %eax,(%esp,1) > 0x8048427 : call 0x8048925 <_mm_load_ps> > 0x804842c : movaps %xmm0,0xffffffe8(%ebp) > 0x8048430 : mov 0x8(%ebp),%eax > 0x8048433 : add $0x10,%eax > 0x8048436 : mov %eax,(%esp,1) > 0x8048439 : call 0x8048925 <_mm_load_ps> > 0x804843e : movaps %xmm0,0xffffffd8(%ebp) > 0x8048442 : mov 0x8(%ebp),%eax > 0x8048445 : add $0x20,%eax > 0x8048448 : mov %eax,(%esp,1) > 0x804844b : call 0x8048925 <_mm_load_ps> > 0x8048450 : movaps %xmm0,0xffffffc8(%ebp) > 0x8048454 : mov 0x8(%ebp),%eax > 0x8048457 : add $0x30,%eax > 0x804845a : mov %eax,(%esp,1) > 0x804845d : call 0x8048925 <_mm_load_ps> > 0x8048462 : movaps %xmm0,0xffffffb8(%ebp) > 0x8048466 : mov 0xc(%ebp),%eax > 0x8048469 : mov %eax,(%esp,1) > 0x804846c : call 0x8048925 <_mm_load_ps> > 0x8048471 : movaps %xmm0,0xffffffa8(%ebp) > 0x8048475 : mov 0xc(%ebp),%eax > 0x8048478 : add $0x10,%eax > 0x804847b : mov %eax,(%esp,1) > 0x804847e : call 0x8048925 <_mm_load_ps> > 0x8048483 : movaps %xmm0,0xffffff98(%ebp) > 0x8048487 : mov 0xc(%ebp),%eax > 0x804848a : add $0x20,%eax > 0x804848d : mov %eax,(%esp,1) > 0x8048490 : call 0x8048925 <_mm_load_ps> > 0x8048495 : movaps %xmm0,0xffffff88(%ebp) > 0x8048499 : mov 0xc(%ebp),%eax > 0x804849c : add $0x30,%eax > 0x804849f : mov %eax,(%esp,1) > 0x80484a2 : call 0x8048925 
<_mm_load_ps> > 0x80484a7 : movaps %xmm0,0xffffff78(%ebp) > 0x80484ae : movaps 0xffffffe8(%ebp),%xmm0 > 0x80484b2 : shufps $0x0,0xffffffe8(%ebp),%xmm0 > 0x80484b7 : movaps %xmm0,(%esp,1) > 0x80484bb : movaps 0xffffffa8(%ebp),%xmm0 > 0x80484bf : movaps %xmm0,0x10(%esp,1) > 0x80484c4 : call 0x8048905 <_mm_mul_ps> > 0x80484c9 : movaps %xmm0,0xffffff68(%ebp) > 0x80484d0 : movaps 0xffffffd8(%ebp),%xmm0 > 0x80484d4 : shufps $0x0,0xffffffd8(%ebp),%xmm0 > 0x80484d9 : movaps %xmm0,(%esp,1) > 0x80484dd : movaps 0xffffff98(%ebp),%xmm0 > 0x80484e1 : movaps %xmm0,0x10(%esp,1) > 0x80484e6 : call 0x8048905 <_mm_mul_ps> > 0x80484eb : movaps %xmm0,0xffffff58(%ebp) > 0x80484f2 : movaps 0xffffffc8(%ebp),%xmm0 > 0x80484f6 : shufps $0x0,0xffffffc8(%ebp),%xmm0 > 0x80484fb : movaps %xmm0,(%esp,1) > 0x80484ff : movaps 0xffffff88(%ebp),%xmm0 > 0x8048503 : movaps %xmm0,0x10(%esp,1) > 0x8048508 : call 0x8048905 <_mm_mul_ps> > 0x804850d : movaps %xmm0,0xffffff48(%ebp) > 0x8048514 : movaps 0xffffffb8(%ebp),%xmm0 > 0x8048518 : shufps $0x0,0xffffffb8(%ebp),%xmm0 > 0x804851d : movaps %xmm0,(%esp,1) > 0x8048521 : movaps 0xffffff78(%ebp),%xmm0 > 0x8048528 : movaps %xmm0,0x10(%esp,1) > 0x804852d : call 0x8048905 <_mm_mul_ps> > 0x8048532 : movaps %xmm0,0xffffff38(%ebp) > 0x8048539 : movaps 0xffffffe8(%ebp),%xmm0 > 0x804853d : shufps $0x55,0xffffffe8(%ebp),%xmm0 > 0x8048542 : movaps %xmm0,(%esp,1) > 0x8048546 : movaps 0xffffffa8(%ebp),%xmm0 > 0x804854a : movaps %xmm0,0x10(%esp,1) > 0x804854f : call 0x8048905 <_mm_mul_ps> > 0x8048554 : movaps %xmm0,0xffffff28(%ebp) > 0x804855b : movaps 0xffffffd8(%ebp),%xmm0 > 0x804855f : shufps $0x55,0xffffffd8(%ebp),%xmm0 > 0x8048564 : movaps %xmm0,(%esp,1) > 0x8048568 : movaps 0xffffff98(%ebp),%xmm0 > 0x804856c : movaps %xmm0,0x10(%esp,1) > 0x8048571 : call 0x8048905 <_mm_mul_ps> > 0x8048576 : movaps %xmm0,0xffffff18(%ebp) > 0x804857d : movaps 0xffffffc8(%ebp),%xmm0 > 0x8048581 : shufps $0x55,0xffffffc8(%ebp),%xmm0 > 0x8048586 : movaps %xmm0,(%esp,1) > 0x804858a : 
movaps 0xffffff88(%ebp),%xmm0 > 0x804858e : movaps %xmm0,0x10(%esp,1) > 0x8048593 : call 0x8048905 <_mm_mul_ps> > 0x8048598 : movaps %xmm0,0xffffff08(%ebp) > 0x804859f : movaps 0xffffffb8(%ebp),%xmm0 > 0x80485a3 : shufps $0x55,0xffffffb8(%ebp),%xmm0 > 0x80485a8 : movaps %xmm0,(%esp,1) > 0x80485ac : movaps 0xffffff78(%ebp),%xmm0 > 0x80485b3 : movaps %xmm0,0x10(%esp,1) > 0x80485b8 : call 0x8048905 <_mm_mul_ps> > 0x80485bd : movaps %xmm0,0xfffffef8(%ebp) > 0x80485c4 : movaps 0xffffff68(%ebp),%xmm0 > 0x80485cb : movaps %xmm0,(%esp,1) > 0x80485cf : movaps 0xffffff28(%ebp),%xmm0 > 0x80485d6 : movaps %xmm0,0x10(%esp,1) > 0x80485db : call 0x80488e5 <_mm_add_ps> > 0x80485e0 : movaps %xmm0,0xffffff68(%ebp) > 0x80485e7 : movaps 0xffffff58(%ebp),%xmm0 > 0x80485ee : movaps %xmm0,(%esp,1) > 0x80485f2 : movaps 0xffffff18(%ebp),%xmm0 > 0x80485f9 : movaps %xmm0,0x10(%esp,1) > 0x80485fe : call 0x80488e5 <_mm_add_ps> > 0x8048603 : movaps %xmm0,0xffffff58(%ebp) > 0x804860a : movaps 0xffffff48(%ebp),%xmm0 > 0x8048611 : movaps %xmm0,(%esp,1) > 0x8048615 : movaps 0xffffff08(%ebp),%xmm0 > 0x804861c : movaps %xmm0,0x10(%esp,1) > 0x8048621 : call 0x80488e5 <_mm_add_ps> > 0x8048626 : movaps %xmm0,0xffffff48(%ebp) > 0x804862d : movaps 0xffffff38(%ebp),%xmm0 > 0x8048634 : movaps %xmm0,(%esp,1) > 0x8048638 : movaps 0xfffffef8(%ebp),%xmm0 > 0x804863f : movaps %xmm0,0x10(%esp,1) > 0x8048644 : call 0x80488e5 <_mm_add_ps> > 0x8048649 : movaps %xmm0,0xffffff38(%ebp) > 0x8048650 : movaps 0xffffffe8(%ebp),%xmm0 > 0x8048654 : shufps $0xaa,0xffffffe8(%ebp),%xmm0 > 0x8048659 : movaps %xmm0,(%esp,1) > 0x804865d : movaps 0xffffffa8(%ebp),%xmm0 > 0x8048661 : movaps %xmm0,0x10(%esp,1) > 0x8048666 : call 0x8048905 <_mm_mul_ps> > 0x804866b : movaps %xmm0,0xffffff28(%ebp) > 0x8048672 : movaps 0xffffffd8(%ebp),%xmm0 > 0x8048676 : shufps $0xaa,0xffffffd8(%ebp),%xmm0 > 0x804867b : movaps %xmm0,(%esp,1) > 0x804867f : movaps 0xffffff98(%ebp),%xmm0 > 0x8048683 : movaps %xmm0,0x10(%esp,1) > 0x8048688 : call 0x8048905 
<_mm_mul_ps> > 0x804868d : movaps %xmm0,0xffffff18(%ebp) > 0x8048694 : movaps 0xffffffc8(%ebp),%xmm0 > 0x8048698 : shufps $0xaa,0xffffffc8(%ebp),%xmm0 > 0x804869d : movaps %xmm0,(%esp,1) > 0x80486a1 : movaps 0xffffff88(%ebp),%xmm0 > 0x80486a5 : movaps %xmm0,0x10(%esp,1) > 0x80486aa : call 0x8048905 <_mm_mul_ps> > 0x80486af : movaps %xmm0,0xffffff08(%ebp) > 0x80486b6 : movaps 0xffffffb8(%ebp),%xmm0 > 0x80486ba : shufps $0xaa,0xffffffb8(%ebp),%xmm0 > 0x80486bf : movaps %xmm0,(%esp,1) > 0x80486c3 : movaps 0xffffff78(%ebp),%xmm0 > 0x80486ca : movaps %xmm0,0x10(%esp,1) > 0x80486cf : call 0x8048905 <_mm_mul_ps> > 0x80486d4 : movaps %xmm0,0xfffffef8(%ebp) > 0x80486db : movaps 0xffffff68(%ebp),%xmm0 > 0x80486e2 : movaps %xmm0,(%esp,1) > 0x80486e6 : movaps 0xffffff28(%ebp),%xmm0 > 0x80486ed : movaps %xmm0,0x10(%esp,1) > 0x80486f2 : call 0x80488e5 <_mm_add_ps> > 0x80486f7 : movaps %xmm0,0xffffff68(%ebp) > 0x80486fe : movaps 0xffffff58(%ebp),%xmm0 > 0x8048705 : movaps %xmm0,(%esp,1) > 0x8048709 : movaps 0xffffff18(%ebp),%xmm0 > 0x8048710 : movaps %xmm0,0x10(%esp,1) > 0x8048715 : call 0x80488e5 <_mm_add_ps> > 0x804871a : movaps %xmm0,0xffffff58(%ebp) > 0x8048721 : movaps 0xffffff48(%ebp),%xmm0 > 0x8048728 : movaps %xmm0,(%esp,1) > 0x804872c : movaps 0xffffff08(%ebp),%xmm0 > 0x8048733 : movaps %xmm0,0x10(%esp,1) > 0x8048738 : call 0x80488e5 <_mm_add_ps> > 0x804873d : movaps %xmm0,0xffffff48(%ebp) > 0x8048744 : movaps 0xffffff38(%ebp),%xmm0 > 0x804874b : movaps %xmm0,(%esp,1) > 0x804874f : movaps 0xfffffef8(%ebp),%xmm0 > 0x8048756 : movaps %xmm0,0x10(%esp,1) > 0x804875b : call 0x80488e5 <_mm_add_ps> > 0x8048760 : movaps %xmm0,0xffffff38(%ebp) > 0x8048767 : movaps 0xffffffe8(%ebp),%xmm0 > 0x804876b : shufps $0xff,0xffffffe8(%ebp),%xmm0 > 0x8048770 : movaps %xmm0,(%esp,1) > 0x8048774 : movaps 0xffffffa8(%ebp),%xmm0 > 0x8048778 : movaps %xmm0,0x10(%esp,1) > 0x804877d : call 0x8048905 <_mm_mul_ps> > 0x8048782 : movaps %xmm0,0xffffff28(%ebp) > 0x8048789 : movaps 
0xffffffd8(%ebp),%xmm0 > 0x804878d : shufps $0xff,0xffffffd8(%ebp),%xmm0 > 0x8048792 : movaps %xmm0,(%esp,1) > 0x8048796 : movaps 0xffffff98(%ebp),%xmm0 > 0x804879a : movaps %xmm0,0x10(%esp,1) > 0x804879f : call 0x8048905 <_mm_mul_ps> > 0x80487a4 : movaps %xmm0,0xffffff18(%ebp) > 0x80487ab : movaps 0xffffffc8(%ebp),%xmm0 > 0x80487af : shufps $0xff,0xffffffc8(%ebp),%xmm0 > 0x80487b4 : movaps %xmm0,(%esp,1) > 0x80487b8 : movaps 0xffffff88(%ebp),%xmm0 > 0x80487bc : movaps %xmm0,0x10(%esp,1) > 0x80487c1 : call 0x8048905 <_mm_mul_ps> > 0x80487c6 : movaps %xmm0,0xffffff08(%ebp) > 0x80487cd : movaps 0xffffffb8(%ebp),%xmm0 > 0x80487d1 : shufps $0xff,0xffffffb8(%ebp),%xmm0 > 0x80487d6 : movaps %xmm0,(%esp,1) > 0x80487da : movaps 0xffffff78(%ebp),%xmm0 > 0x80487e1 : movaps %xmm0,0x10(%esp,1) > 0x80487e6 : call 0x8048905 <_mm_mul_ps> > 0x80487eb : movaps %xmm0,0xfffffef8(%ebp) > 0x80487f2 : movaps 0xffffff68(%ebp),%xmm0 > 0x80487f9 : movaps %xmm0,(%esp,1) > 0x80487fd : movaps 0xffffff28(%ebp),%xmm0 > 0x8048804 : movaps %xmm0,0x10(%esp,1) > 0x8048809 : call 0x80488e5 <_mm_add_ps> > 0x804880e : movaps %xmm0,0xffffff68(%ebp) > 0x8048815 : movaps 0xffffff58(%ebp),%xmm0 > 0x804881c : movaps %xmm0,(%esp,1) > 0x8048820 : movaps 0xffffff18(%ebp),%xmm0 > 0x8048827 : movaps %xmm0,0x10(%esp,1) > 0x804882c : call 0x80488e5 <_mm_add_ps> > 0x8048831 : movaps %xmm0,0xffffff58(%ebp) > 0x8048838 : movaps 0xffffff48(%ebp),%xmm0 > 0x804883f : movaps %xmm0,(%esp,1) > 0x8048843 : movaps 0xffffff08(%ebp),%xmm0 > 0x804884a : movaps %xmm0,0x10(%esp,1) > 0x804884f : call 0x80488e5 <_mm_add_ps> > 0x8048854 : movaps %xmm0,0xffffff48(%ebp) > 0x804885b : movaps 0xffffff38(%ebp),%xmm0 > 0x8048862 : movaps %xmm0,(%esp,1) > 0x8048866 : movaps 0xfffffef8(%ebp),%xmm0 > 0x804886d : movaps %xmm0,0x10(%esp,1) > 0x8048872 : call 0x80488e5 <_mm_add_ps> > 0x8048877 : movaps %xmm0,0xffffff38(%ebp) > 0x804887e : mov 0x10(%ebp),%eax > 0x8048881 : mov %eax,(%esp,1) > 0x8048884 : movaps 0xffffff68(%ebp),%xmm0 > 
0x804888b : movaps %xmm0,0x4(%esp,1) > 0x8048890 : call 0x8048930 <_mm_store_ps> > 0x8048895 : mov 0x10(%ebp),%eax > 0x8048898 : add $0x10,%eax > 0x804889b : mov %eax,(%esp,1) > 0x804889e : movaps 0xffffff58(%ebp),%xmm0 > 0x80488a5 : movaps %xmm0,0x4(%esp,1) > 0x80488aa : call 0x8048930 <_mm_store_ps> > 0x80488af : mov 0x10(%ebp),%eax > 0x80488b2 : add $0x20,%eax > 0x80488b5 : mov %eax,(%esp,1) > 0x80488b8 : movaps 0xffffff48(%ebp),%xmm0 > 0x80488bf : movaps %xmm0,0x4(%esp,1) > 0x80488c4 : call 0x8048930 <_mm_store_ps> > 0x80488c9 : mov 0x10(%ebp),%eax > 0x80488cc : add $0x30,%eax > 0x80488cf : mov %eax,(%esp,1) > 0x80488d2 : movaps 0xffffff38(%ebp),%xmm0 > 0x80488d9 : movaps %xmm0,0x4(%esp,1) > 0x80488de : call 0x8048930 <_mm_store_ps> > 0x80488e3 : leave > 0x80488e4 : ret > End of assembler dump. > > > --------------------------------------------------- > Ian Ollmann, Ph.D. iano@cco.caltech.edu > --------------------------------------------------- >