From: "mathog at caltech dot edu"
To: gcc-bugs@gcc.gnu.org
Subject: [Bug target/46716] [4.3/4.4/4.5/4.6 Regression] bad code generated with -mno-sse2 -m64
Date: Tue, 30 Nov 2010 17:50:00 -0000

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=46716

--- Comment #5 from David Mathog 2010-11-30 17:25:01 UTC ---

A (long) side note on how I found this bug, in partial answer to the obvious question: why would anybody run with -mno-sse2 on an X86_64 platform?

We have a cluster of Athlon MP machines, and one of the applications that runs there is Sean Eddy's HMMER, which is used to search a database called PFAMDIR. With version 3 of that software, PFAMDIR changed format to work only with the newer software. HMMER 3 has a reference (portable) version and an SSE (really SSE2) version. I found that the reference version did not give exactly the same answers as the SSE version, so it was not going to be possible to refine the reference version for that platform and still get the exact same results as everybody else; meanwhile the SSE2 version could not run on the target, since that processor has no SSE2.

Since I need to make this work on those old machines, I wrote an SSE2 emulator, which is a replacement emmintrin.h (latest version here):

http://saf.bio.caltech.edu/pub/software/linux_or_unix_tools/soft_emmintrin.h

It is used instead of the native SSE2 header (in theory) by dropping it into a directory as emmintrin.h and putting "-I. -mno-sse2 -DSOFT_SSE2" on the gcc compile line. The idea is to get the SSE2 version running on the target and then optimize from that version for the target, retaining the same numerical results along the way. Since there was a lot of recompiling/retesting during development, I used my fastest machine, which happened to be an Opteron running an X86_64 OS. To exercise the software SSE2 code, all of the SSE2 tests in the gcc testsuite were run, and these triggered the present bug because of the default implicit -m64.

I have also done some preliminary work on a soft_xmmintrin.h, but I have my doubts that it is possible to use it successfully in combination with the gcc vector extension, since many strange things happen when -mno-sse is added to the command line. It seems that the gcc vector extension is very much intertwined with SSE on X86 platforms and perhaps cannot be fully separated from it (a point that is not made at all clear in the documentation).
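To make the drop-in usage described above concrete, here is a minimal sketch of a test compile. It is my own illustration, not part of the bug report: the file name mytest.c is made up, and it assumes soft_emmintrin.h provides these particular intrinsics.

/* mytest.c -- hypothetical example, not from the bug report.
 * With soft_emmintrin.h copied into the current directory as emmintrin.h,
 * the compile line described above would be roughly:
 *
 *   gcc -I. -mno-sse2 -DSOFT_SSE2 -o mytest mytest.c
 *
 * The same source also builds against the native header on an SSE2-capable
 * machine, which is what allows the two builds to be compared for
 * identical numerical results.
 */
#include <stdio.h>
#include <emmintrin.h>

int main(void)
{
    __m128i a = _mm_set_epi32(1, 2, 3, 4);
    __m128i b = _mm_set_epi32(10, 20, 30, 40);
    __m128i c = _mm_add_epi32(a, b);      /* element-wise 32 bit add */
    int out[4];

    _mm_storeu_si128((__m128i *)out, c);  /* unaligned store to plain ints */
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
    return 0;
}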
Additionally, with -msse -mno-sse2 -m32 and optimization levels above -O0, a complex expression like this one, when used in a real program (with multiple _mm functions called; the problem does not show up in the testsuite with single _mm function calls):

#define EMMMIN(a,b) ((a)<(b)?(a):(b))
#define EMM_UINT1(a) ((unsigned char *)&(a))

/* vector operation: returns the minimum of each pair of the 16 8 bit
   unsigned integers from __A, __B */
static __inline __m128i __attribute__((__always_inline__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  __v16qi __tmp={
    EMMMIN(EMM_UINT1(__A)[ 0],EMM_UINT1(__B)[ 0]),
    EMMMIN(EMM_UINT1(__A)[ 1],EMM_UINT1(__B)[ 1]),
    EMMMIN(EMM_UINT1(__A)[ 2],EMM_UINT1(__B)[ 2]),
    EMMMIN(EMM_UINT1(__A)[ 3],EMM_UINT1(__B)[ 3]),
    EMMMIN(EMM_UINT1(__A)[ 4],EMM_UINT1(__B)[ 4]),
    EMMMIN(EMM_UINT1(__A)[ 5],EMM_UINT1(__B)[ 5]),
    EMMMIN(EMM_UINT1(__A)[ 6],EMM_UINT1(__B)[ 6]),
    EMMMIN(EMM_UINT1(__A)[ 7],EMM_UINT1(__B)[ 7]),
    EMMMIN(EMM_UINT1(__A)[ 8],EMM_UINT1(__B)[ 8]),
    EMMMIN(EMM_UINT1(__A)[ 9],EMM_UINT1(__B)[ 9]),
    EMMMIN(EMM_UINT1(__A)[10],EMM_UINT1(__B)[10]),
    EMMMIN(EMM_UINT1(__A)[11],EMM_UINT1(__B)[11]),
    EMMMIN(EMM_UINT1(__A)[12],EMM_UINT1(__B)[12]),
    EMMMIN(EMM_UINT1(__A)[13],EMM_UINT1(__B)[13]),
    EMMMIN(EMM_UINT1(__A)[14],EMM_UINT1(__B)[14]),
    EMMMIN(EMM_UINT1(__A)[15],EMM_UINT1(__B)[15])};
  return (__m128i)__tmp;
}

often results in this sort of compiler error:

./msvfilter.c:208: error: unable to find a register to spill in class 'GENERAL_REGS'
./msvfilter.c:208: error: this is the insn:
(insn 1944 1943 1945 46 ../../easel/emmintrin.h:2348 (set (strict_low_part (subreg:HI (reg:TI 1239) 0))
        (mem:HI (reg/f:SI 96 [ pretmp.1031 ]) [13 S2 A16])) 47 {*movstricthi_1} (nil))
./msvfilter.c:208: confused by earlier errors, bailing out

Simpler functions (fewer vector elements, less logic) did not do this, although it may be that they would have, had I been able to get past the first error. This is, I suspect, again related to an implicit use of SSE2 registers even though -mno-sse2 had been specified. Then again, this type of error shows up even when -m32 is specified, so maybe it has a different origin than the -m64 problem above. In any case, rewriting the expressions as follows seems to have eliminated the problem even at -O4; the primary change was replacing the vector {} initializer notation with individual assignments that set the (same) values.
typedef union {
    __m128i            vi;
    __m128d            vd;
    __m128             vf;
    double             f8[2];
    float              f4[4];
    long long          i8[2];
    int                i4[4];
    short              i2[8];
    char               i1[16];
    unsigned long long u8[2];
    unsigned int       u4[4];
    unsigned short     u2[8];
    unsigned char      u1[16];
} __uni16;

#define EMM_UINT1(a) (((__uni16)(a)).u1)
#define EMMMIN(a,b) ((a)<(b)?(a):(b))

/* vector operation: returns the minimum of each pair of the 16 8 bit
   unsigned integers from __A, __B */
static __inline __m128i __attribute__((__always_inline__))
_mm_min_epu8 (__m128i __A, __m128i __B)
{
  __uni16 __tmp;
  __tmp.u1[ 0] = EMMMIN(EMM_UINT1(__A)[ 0],EMM_UINT1(__B)[ 0]);
  __tmp.u1[ 1] = EMMMIN(EMM_UINT1(__A)[ 1],EMM_UINT1(__B)[ 1]);
  __tmp.u1[ 2] = EMMMIN(EMM_UINT1(__A)[ 2],EMM_UINT1(__B)[ 2]);
  __tmp.u1[ 3] = EMMMIN(EMM_UINT1(__A)[ 3],EMM_UINT1(__B)[ 3]);
  __tmp.u1[ 4] = EMMMIN(EMM_UINT1(__A)[ 4],EMM_UINT1(__B)[ 4]);
  __tmp.u1[ 5] = EMMMIN(EMM_UINT1(__A)[ 5],EMM_UINT1(__B)[ 5]);
  __tmp.u1[ 6] = EMMMIN(EMM_UINT1(__A)[ 6],EMM_UINT1(__B)[ 6]);
  __tmp.u1[ 7] = EMMMIN(EMM_UINT1(__A)[ 7],EMM_UINT1(__B)[ 7]);
  __tmp.u1[ 8] = EMMMIN(EMM_UINT1(__A)[ 8],EMM_UINT1(__B)[ 8]);
  __tmp.u1[ 9] = EMMMIN(EMM_UINT1(__A)[ 9],EMM_UINT1(__B)[ 9]);
  __tmp.u1[10] = EMMMIN(EMM_UINT1(__A)[10],EMM_UINT1(__B)[10]);
  __tmp.u1[11] = EMMMIN(EMM_UINT1(__A)[11],EMM_UINT1(__B)[11]);
  __tmp.u1[12] = EMMMIN(EMM_UINT1(__A)[12],EMM_UINT1(__B)[12]);
  __tmp.u1[13] = EMMMIN(EMM_UINT1(__A)[13],EMM_UINT1(__B)[13]);
  __tmp.u1[14] = EMMMIN(EMM_UINT1(__A)[14],EMM_UINT1(__B)[14]);
  __tmp.u1[15] = EMMMIN(EMM_UINT1(__A)[15],EMM_UINT1(__B)[15]);
  return __tmp.vi;
}
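For what it is worth, the byte-wise logic of a rewrite like this can be checked in isolation with a small standalone test. The sketch below is my own illustration, not code from soft_emmintrin.h: it declares __m128i locally with GCC's vector_size extension so that no emmintrin.h (real or soft) is needed, uses a reduced form of the union above, and the file and function names are made up. It does not reproduce the -mno-sse2 conditions discussed here; it only checks the element-by-element minimum against a plain scalar reference.

/* standalone_min_test.c -- hypothetical sketch, compile e.g. with
 *   gcc -O2 -o standalone_min_test standalone_min_test.c
 */
#include <stdio.h>
#include <string.h>

/* local 128-bit vector type via GCC's vector_size extension */
typedef long long __m128i __attribute__ ((__vector_size__ (16)));

typedef union {          /* reduced form of the __uni16 union above */
    __m128i       vi;
    unsigned char u1[16];
} uni16;

#define EMMMIN(a,b) ((a)<(b)?(a):(b))

/* element-by-element replacement, same shape as the rewrite above,
   expressed as a loop for brevity */
static __m128i soft_min_epu8 (__m128i A, __m128i B)
{
    uni16 a, b, t;
    int i;
    a.vi = A;
    b.vi = B;
    for (i = 0; i < 16; i++)
        t.u1[i] = EMMMIN(a.u1[i], b.u1[i]);
    return t.vi;
}

int main (void)
{
    unsigned char x[16], y[16], expect[16];
    uni16 ax, ay, r;
    int i, ok = 1;

    for (i = 0; i < 16; i++) {          /* arbitrary test pattern */
        x[i] = (unsigned char)(i * 17);
        y[i] = (unsigned char)(255 - i * 13);
        expect[i] = EMMMIN(x[i], y[i]); /* scalar reference */
    }

    memcpy(ax.u1, x, 16);
    memcpy(ay.u1, y, 16);
    r.vi = soft_min_epu8(ax.vi, ay.vi);

    for (i = 0; i < 16; i++)
        if (r.u1[i] != expect[i])
            ok = 0;

    if (ok)
        puts("min_epu8 emulation OK");
    else
        puts("min_epu8 emulation MISMATCH");
    return ok ? 0 : 1;
}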