c++/7582: Intel intrinsics cause segfault with gcc 3.1.1 and 3.2

public inbox for gcc-prs@sourceware.org
help / color / mirror / Atom feed

* c++/7582: Intel intrinsics cause segfault with gcc 3.1.1 and 3.2
@ 2002-08-12 16:36 dholm
  0 siblings, 0 replies; 2+ messages in thread
From: dholm @ 2002-08-12 16:36 UTC (permalink / raw)
  To: gcc-gnats


>Number:         7582
>Category:       c++
>Synopsis:       Intel intrinsics cause segfault with gcc 3.1.1 and 3.2
>Confidential:   no
>Severity:       critical
>Priority:       medium
>Responsible:    unassigned
>State:          open
>Class:          sw-bug
>Submitter-Id:   net
>Arrival-Date:   Mon Aug 12 16:16:01 PDT 2002
>Closed-Date:
>Last-Modified:
>Originator:     David Holm
>Release:        gcc version 3.2 2002-07-26 (prerelease)
>Organization:
>Environment:
Gentoo Linux 1.4, Pentium 3 (Coppermine)
>Description:
The following code executes perfectly when compiled with the Intel C++ Compiler v6.0 but segfaults when compiled with gcc 3.1.1 or 3.2 (2002-07-26).
It's compiled with "g++ (-g3) -Wall -msse intrin.cpp -o intrin" and runs without any output.
g++ gives no warnings during compilation.
intrin segfaults on this line "_mm_stream_ps((float*) dst, xmm0);"

"g++ -v" returns:
Reading specs from /usr/lib/gcc-lib/i686-pc-linux-gnu/3.2/specs
Configured with: /var/tmp/portage/gcc-3.2_pre/work/gcc-3.2/configure --prefix=/usr --mandir=/usr/share/man --infodir=/usr/share/info --enable-shared --host=i686-pc-linux-gnu --build=i686-pc-linux-gnu --target=i686-pc-linux-gnu --with-system-zlib --enable-languages=c,c++,ada,f77,objc,java --enable-threads=posix --enable-long-long --disable-checking --enable-cstdio=stdio --enable-clocale=generic --enable-version-specific-runtime-libs --with-gxx-include-dir=/usr/include/g++-v32 --with-local-prefix=/usr/local --enable-shared --enable-nls --without-included-gettext
Thread model: posix
gcc version 3.2 2002-07-26 (prerelease)

I haven't got 3.1.1 anymore, so I can't give you the -v output from it.
>How-To-Repeat:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <mmintrin.h>
#include <xmmintrin.h>

#define small_memcpy(dst,src,n) \
        register unsigned long int dummy; \
        asm volatile ( \
                "rep; movsb\n\t" \
                :"=&D"(dst), "=&S"(src), "=&c"(dummy) \
        :"0" (dst), "1" (src),"2" (n) \
                : "memory");


/**
 * SIMD Optimized memcpy's are graciously borrowed from DirectFB.
 */

#  define SSE_MMREG_SIZE 16
#  define MIN_LEN 0x40  /* 64-byte blocks */

void *memcpy_sse( void *dst, const void *src, size_t len )
{
        void *retval = dst;
        size_t i;

        _mm_prefetch((char*) src, _MM_HINT_NTA);
        _mm_prefetch((char*) src + 64, _MM_HINT_NTA);
        _mm_prefetch((char*) src + 128, _MM_HINT_NTA);
        _mm_prefetch((char*) src + 192, _MM_HINT_NTA);
        _mm_prefetch((char*) src + 256, _MM_HINT_NTA);

        if (len >= MIN_LEN)
        {
                register unsigned long int delta;
                delta = ((unsigned long int) dst) & (SSE_MMREG_SIZE - 1);
                if (delta)
                {
                        delta = SSE_MMREG_SIZE - delta;
                        len -= delta;
                        small_memcpy(dst, src, delta);
                }
                i = len >> 6;
                len &= 63;

                if (((unsigned long) src) & 15)
                        for (; i > 0; i--)
                        {
                                __m128 xmm0, xmm1, xmm2, xmm3;
                                _mm_prefetch((char*) src + 320, _MM_HINT_NTA);
                                xmm0 = _mm_loadu_ps((float*) src);
                                xmm1 = _mm_loadu_ps((float*) src + 4);
                                xmm2 = _mm_loadu_ps((float*) src + 8);
                                xmm3 = _mm_loadu_ps((float*) src + 12);
                                _mm_stream_ps((float*) dst, xmm0);
                                _mm_stream_ps((float*) dst + 4, xmm1);
                                _mm_stream_ps((float*) dst + 8, xmm2);
                                _mm_stream_ps((float*) dst + 12, xmm3);
#ifdef __GNUC__
                                (char*) src += 64;
                                (char*) dst += 64;
#else
                                src += 64;
                                dst += 64;
#endif
                        }
                else
                        for (; i > 0; i--)
                        {
                                __m128 xmm0, xmm1, xmm2, xmm3;
                                _mm_prefetch((char*) src + 320, _MM_HINT_NTA);
                                xmm0 = _mm_load_ps((float*) src);
                                xmm1 = _mm_load_ps((float*) src + 4);
                                xmm2 = _mm_load_ps((float*) src + 8);
                                xmm3 = _mm_load_ps((float*) src + 12);
                                _mm_stream_ps((float*) dst, xmm0);
                                _mm_stream_ps((float*) dst + 4, xmm1);
                                _mm_stream_ps((float*) dst + 8, xmm2);
                                _mm_stream_ps((float*) dst + 12, xmm3);
#ifdef __GNUC__
                                (char*) src += 64;
                                (char*) dst += 64;
#else
                                src += 64;
                                dst += 64;
#endif
                        }
        }

        if (len)
                memcpy(dst, src, len);

        return retval;
}

int main(void)
{
        char *tmp1, *tmp2;

        (void*) tmp1 = malloc(1024 * 1024 * 10);
        (void*) tmp2 = malloc(1024 * 1024 * 10);

        memcpy_sse(tmp1, tmp2, 1024 * 1024 * 10);

        free(tmp1);
        free(tmp2);

        return 0;
}
>Fix:

>Release-Note:
>Audit-Trail:
>Unformatted:


^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: c++/7582: Intel intrinsics cause segfault with gcc 3.1.1 and 3.2
@ 2002-08-13  3:58 Tim Prince
  0 siblings, 0 replies; 2+ messages in thread
From: Tim Prince @ 2002-08-13  3:58 UTC (permalink / raw)
  To: nobody; +Cc: gcc-prs

The following reply was made to PR c++/7582; it has been noted by GNATS.

From: Tim Prince <tprince@computer.org>
To: dholm@telia.com, gcc-gnats@gcc.gnu.org
Cc:  
Subject: Re: c++/7582: Intel intrinsics cause segfault with gcc 3.1.1 and 3.2
Date: Mon, 12 Aug 2002 22:52:16 -0700

 On Monday 12 August 2002 16:07, dholm@telia.com wrote:
 > >Number:         7582
 > >Category:       c++
 > >Synopsis:       Intel intrinsics cause segfault with gcc 3.1.1 and 3.2
 > >Confidential:   no
 > >Severity:       critical
 > >Priority:       medium
 > >Responsible:    unassigned
 > >State:          open
 > >Class:          sw-bug
 > >Submitter-Id:   net
 > >Arrival-Date:   Mon Aug 12 16:16:01 PDT 2002
 > >Closed-Date:
 > >Last-Modified:
 > >Originator:     David Holm
 > >Release:        gcc version 3.2 2002-07-26 (prerelease)
 > >Organization:
 > >Environment:
 >
 > Gentoo Linux 1.4, Pentium 3 (Coppermine)
 >
 > >Description:
 >
 > The following code executes perfectly when compiled with the Intel C++
 > Compiler v6.0 but segfaults when compiled with gcc 3.1.1 or 3.2
 > (2002-07-26). It's compiled with "g++ (-g3) -Wall -msse intrin.cpp -o
 > intrin" and runs without any output. g++ gives no warnings during
 > compilation.
 > intrin segfaults on this line "_mm_stream_ps((float*) dst, xmm0);"
 >
 > "g++ -v" returns:
 > Reading specs from /usr/lib/gcc-lib/i686-pc-linux-gnu/3.2/specs
 > Configured with: /var/tmp/portage/gcc-3.2_pre/work/gcc-3.2/configure
 > --prefix=/usr --mandir=/usr/share/man --infodir=/usr/share/info
 > --enable-shared --host=i686-pc-linux-gnu --build=i686-pc-linux-gnu
 > --target=i686-pc-linux-gnu --with-system-zlib
 > --enable-languages=c,c++,ada,f77,objc,java --enable-threads=posix
 > --enable-long-long --disable-checking --enable-cstdio=stdio
 > --enable-clocale=generic --enable-version-specific-runtime-libs
 > --with-gxx-include-dir=/usr/include/g++-v32 --with-local-prefix=/usr/local
 > --enable-shared --enable-nls --without-included-gettext Thread model: posix
 > gcc version 3.2 2002-07-26 (prerelease)
 >
 > I haven't got 3.1.1 anymore, so I can't give you the -v output from it.
 >
 > >How-To-Repeat:
 >
 > #include <stdio.h>
 > #include <string.h>
 > #include <stdlib.h>
 > #include <mmintrin.h>
 > #include <xmmintrin.h>
 >
 > #define small_memcpy(dst,src,n) \
 >         register unsigned long int dummy; \
 >         asm volatile ( \
 >                 "rep; movsb\n\t" \
 >
 >                 :"=&D"(dst), "=&S"(src), "=&c"(dummy) \
 >         :
 >         :"0" (dst), "1" (src),"2" (n) \
 >         :
 >                 : "memory");
 >
 > /**
 >  * SIMD Optimized memcpy's are graciously borrowed from DirectFB.
 >  */
 >
 > #  define SSE_MMREG_SIZE 16
 > #  define MIN_LEN 0x40  /* 64-byte blocks */
 >
 > void *memcpy_sse( void *dst, const void *src, size_t len )
 > {
 >         void *retval = dst;
 >         size_t i;
 >
 >         _mm_prefetch((char*) src, _MM_HINT_NTA);
 >         _mm_prefetch((char*) src + 64, _MM_HINT_NTA);
 >         _mm_prefetch((char*) src + 128, _MM_HINT_NTA);
 >         _mm_prefetch((char*) src + 192, _MM_HINT_NTA);
 >         _mm_prefetch((char*) src + 256, _MM_HINT_NTA);
 >
 >         if (len >= MIN_LEN)
 >         {
 >                 register unsigned long int delta;
 >                 delta = ((unsigned long int) dst) & (SSE_MMREG_SIZE - 1);
 >                 if (delta)
 >                 {
 >                         delta = SSE_MMREG_SIZE - delta;
 >                         len -= delta;
 >                         small_memcpy(dst, src, delta);
 >                 }
 >                 i = len >> 6;
 >                 len &= 63;
 >
 >                 if (((unsigned long) src) & 15)
 >                         for (; i > 0; i--)
 >                         {
 >                                 __m128 xmm0, xmm1, xmm2, xmm3;
 >                                 _mm_prefetch((char*) src + 320,
 > _MM_HINT_NTA); xmm0 = _mm_loadu_ps((float*) src); xmm1 =
 > _mm_loadu_ps((float*) src + 4); xmm2 = _mm_loadu_ps((float*) src + 8); xmm3
 > = _mm_loadu_ps((float*) src + 12); _mm_stream_ps((float*) dst, xmm0);
 > _mm_stream_ps((float*) dst + 4, xmm1); _mm_stream_ps((float*) dst + 8,
 > xmm2); _mm_stream_ps((float*) dst + 12, xmm3); #ifdef __GNUC__
 >                                 (char*) src += 64;
 >                                 (char*) dst += 64;
 > #else
 >                                 src += 64;
 >                                 dst += 64;
 > #endif
 >                         }
 >                 else
 >                         for (; i > 0; i--)
 >                         {
 >                                 __m128 xmm0, xmm1, xmm2, xmm3;
 >                                 _mm_prefetch((char*) src + 320,
 > _MM_HINT_NTA); xmm0 = _mm_load_ps((float*) src);
 >                                 xmm1 = _mm_load_ps((float*) src + 4);
 >                                 xmm2 = _mm_load_ps((float*) src + 8);
 >                                 xmm3 = _mm_load_ps((float*) src + 12);
 >                                 _mm_stream_ps((float*) dst, xmm0);
 >                                 _mm_stream_ps((float*) dst + 4, xmm1);
 >                                 _mm_stream_ps((float*) dst + 8, xmm2);
 >                                 _mm_stream_ps((float*) dst + 12, xmm3);
 > #ifdef __GNUC__
 >                                 (char*) src += 64;
 >                                 (char*) dst += 64;
 > #else
 >                                 src += 64;
 >                                 dst += 64;
 > #endif
 >                         }
 >         }
 >
 >         if (len)
 >                 memcpy(dst, src, len);
 >
 >         return retval;
 > }
 >
 > int main(void)
 > {
 >         char *tmp1, *tmp2;
 >
 >         (void*) tmp1 = malloc(1024 * 1024 * 10);
 >         (void*) tmp2 = malloc(1024 * 1024 * 10);
 >
 >         memcpy_sse(tmp1, tmp2, 1024 * 1024 * 10);
 >
 >         free(tmp1);
 >         free(tmp2);
 >
 >         return 0;
 > }
 >
 > >Fix:
 > >
 > >Release-Note:
 > >Audit-Trail:
 > >Unformatted:
 You start off with no attempt to get aligned storage for tmp1 and tmp2.  In 
 an ideal world, malloc would take care of this, but gcc doesn't take 
 responsibility for which malloc you use.  I take it you are using whatever 
 glibc gives you.  Mine gives me 8-byte alignment, but not the required 
 16-byte alignment.  Since you didn't mention whether you stepped into 
 your code with your favorite debugger to check for such problems, some of us 
 may assume you haven't begun to do your homework.  If you did use a strategy 
 to assure alignment, you haven't informed us what it might be.  
 You've gone out of your way to obscure your code, yet you ignore what seems 
 most evident.
 As I understand it, the preference for the Intel compiler would be to use the 
 special aligned entry point _mm_malloc(), in order to make your code portable 
 to Windows, so you are lucky it works with icc.
 -- 
 Tim Prince


^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2002-08-13  6:06 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2002-08-12 16:36 c++/7582: Intel intrinsics cause segfault with gcc 3.1.1 and 3.2 dholm
2002-08-13  3:58 Tim Prince

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).