From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 14866 invoked by alias); 26 Aug 2005 02:26:20 -0000 Mailing-List: contact gcc-bugs-help@gcc.gnu.org; run by ezmlm Precedence: bulk List-Archive: List-Post: List-Help: Sender: gcc-bugs-owner@gcc.gnu.org Received: (qmail 14797 invoked by uid 48); 26 Aug 2005 02:26:09 -0000 Date: Fri, 26 Aug 2005 02:32:00 -0000 From: "chen at sys dot wakayama-u dot ac dot jp" To: gcc-bugs@gcc.gnu.org Message-ID: <20050826022604.23570.chen@sys.wakayama-u.ac.jp> Reply-To: gcc-bugzilla@gcc.gnu.org Subject: [Bug c/23570] New: Internal compiler error X-Bugzilla-Reason: CC X-SW-Source: 2005-08/txt/msg02969.txt.bz2 List-Id: The compiler gives an internal compiler error when I try to compile my program with -O2. If I compile with -O1, it's OK. % gcc -O2 -msse2 a.c a.c: In function 'ludcompf': a.c:505: internal compiler error: in merge_assigned_reloads, at reload1.c:6091 Please submit a full bug report, with preprocessed source if appropriate. See <URL:http://gcc.gnu.org/bugs.html> for instructions. gcc -v Using built-in specs. 
Target: i686-pc-linux-gnu Configured with: ../gcc-4.0.2/configure --prefix=/usr --libexecdir=/usr/lib --enable-shared --enable-threads=posix --enable-__cxa_atexit --enable-clocale=gnu --enable-libada --enable-languages=c,ada,c++,f95,java,objc,treelang Thread model: posix gcc version 4.0.2 20050825 (prerelease) /* a.c */ extern int printf (__const char *__restrict __format, ...); extern double fabs (double __x) __attribute__ ((__nothrow__)) __attribute__ ((__const__)); extern double __fabs (double __x) __attribute__ ((__nothrow__)) __attribute__ ((__const__)); typedef float __v4sf __attribute__ ((__vector_size__ (16))); typedef float __m128 __attribute__ ((__vector_size__ (16))); static __inline __m128 _mm_setzero_ps (void) { return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f }; } static __inline __m128 _mm_max_ps (__m128 __A, __m128 __B) { return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B); } static __inline __m128 _mm_cmpeq_ps (__m128 __A, __m128 __B) { return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B); } static __inline __m128 _mm_set1_ps (float __F) { return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F }; } static __inline __m128 _mm_and_ps (__m128 __A, __m128 __B) { return __builtin_ia32_andps (__A, __B); } static __inline __m128 _mm_loadu_ps (float const *__P) { return (__m128) __builtin_ia32_loadups (__P); } static __inline __m128 _mm_setr_ps (float __Z, float __Y, float __X, float __W) { return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W }; } static __inline void _mm_storeu_ps (float *__P, __m128 __A) { __builtin_ia32_storeups (__P, (__v4sf)__A); } static __inline __m128 _mm_add_ps (__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B); } static __inline __m128 _mm_sub_ps (__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B); } static __inline __m128 _mm_mul_ps (__m128 __A, __m128 __B) { return (__m128)__builtin_ia32_mulps ((__v4sf)__A, 
(__v4sf)__B); } typedef double __v2df __attribute__ ((__vector_size__ (16))); typedef long long __v2di __attribute__ ((__vector_size__ (16))); typedef int __v4si __attribute__ ((__vector_size__ (16))); typedef __v2di __m128i; typedef __v2df __m128d; static __inline __m128d _mm_set1_pd (double __F) { return __extension__ (__m128d){ __F, __F }; } static __inline __m128d _mm_setr_pd (double __W, double __X) { return __extension__ (__m128d){ __W, __X }; } static __inline __m128d _mm_loadu_pd (double const *__P) { return __builtin_ia32_loadupd (__P); } static __inline void _mm_storeu_pd (double *__P, __m128d __A) { __builtin_ia32_storeupd (__P, __A); } static __inline __m128d _mm_set_sd (double __F) { return __extension__ (__m128d){ __F, 0 }; } static __inline __m128d _mm_load_sd (double const *__P) { return _mm_set_sd (*__P); } static __inline __m128d _mm_and_pd (__m128d __A, __m128d __B) { return __builtin_ia32_andpd (__A, __B); } static __inline __m128d se2_abssd(__m128d a) { static const union { __m128d m; unsigned int i[4]; } u = { .i[0] = 0xffffffffUL, .i[1] = 0x7fffffffUL, .i[2] = 0xffffffffUL, .i[3] = 0xffffffffUL }; __m128d msk = u.m; return (__m128d)_mm_and_pd(a, msk); } static __inline __m128d _mm_add_pd (__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B); } static __inline __m128d _mm_sub_pd (__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B); } static __inline __m128d _mm_mul_pd (__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B); } static __inline __m128d _mm_mul_sd (__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B); } static __inline __m128d _mm_max_pd (__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B); } static __inline __m128d _mm_unpackhi_pd (__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B); } static 
__inline __m128d _mm_cmpeq_pd (__m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B); } static __inline int _mm_comilt_sd (__m128d __A, __m128d __B) { return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B); } static __inline __m128i _mm_add_epi32 (__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B); } static __inline __m128i _mm_and_si128 (__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B); } static __inline __m128i _mm_andnot_si128 (__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B); } static __inline __m128i _mm_or_si128 (__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B); } typedef union { __m128 xmm; __m128i xmmi; __m128d xmmd; long long di[2]; unsigned long long udi[4]; int si[4]; unsigned int usi[4]; short hi[8]; unsigned short uhi[8]; signed char qi[16]; unsigned char uqi[16]; double df[2]; float sf[4]; } __attribute__ ((aligned(16))) um128; static __inline __m128 se_absps(__m128 a) { static const union { __m128 m; unsigned int i[4]; } u = { .i[0] = 0x7fffffffUL, .i[1] = 0x7fffffffUL, .i[2] = 0x7fffffffUL, .i[3] = 0x7fffffffUL }; __m128 msk = u.m; return (__m128)_mm_and_ps(a, msk); } static __inline __m128d se2_abspd(__m128d a) { static const union { __m128d m; unsigned int i[4]; } u = { .i[0] = 0xffffffffUL, .i[1] = 0x7fffffffUL, .i[2] = 0xffffffffUL, .i[3] = 0x7fffffffUL }; __m128 msk = u.m; return (__m128d)_mm_and_pd(a, msk); } static void swap_index(int *prow, int n1, int n2) { int *p1 = prow + n1; int *p2 = prow + n2; n1 = *p1; n2 = *p2; *p1 = n2; *p2 = n1; } static int sse2_max_abs_index(double *v, int step, int n) { __m128d m1, mm; __m128i mi1, mim, mi, msk; um128 u; double *v2end; int step2, n2; static const um128 i0i1 = { .si[0]=0, .si[1]=0, .si[2]=1, .si[3] = 0 }; static const um128 i1i1 = { .si[0]=2, .si[1]=0, .si[2]=2, .si[3] = 0 }; 
for (n2 = 0; n2 < n; ++n2) printf("%f ", v[step * n2]); printf("\n"); if (n <= 1) return 0; step2 = step + step; v2end = v + (n / 2) * step2; mm = se2_abspd(_mm_setr_pd(v[0], v[step])); v += step2; mi1 = i1i1.xmmi; mim = mi = i0i1.xmmi; while (v < v2end) { mi = _mm_add_epi32(mi, mi1); m1 = se2_abspd(_mm_setr_pd(v[0], v[step])); v += step2; mm = _mm_max_pd(mm, m1); msk = (__m128i)_mm_cmpeq_pd(m1, mm); mim = _mm_or_si128(_mm_and_si128(msk, mi), _mm_andnot_si128(msk, mim)); } if (n & 1) { mi = _mm_add_epi32(mi, mi1); m1 = se2_abssd(_mm_load_sd(v)); mm = _mm_max_pd(mm, m1); msk = (__m128i)_mm_cmpeq_pd(m1, mm); mim = _mm_or_si128(_mm_and_si128(msk, mi), _mm_andnot_si128(msk, mim)); } m1 = _mm_unpackhi_pd(mm, mm); u.xmmi = mim; if (_mm_comilt_sd(mm, m1)) return u.si[2]; return u.si[0]; } static void sse2_add_row(double *dst, double *src, double k, int n) { double *dst2end = dst + (n / 2) * 2; __m128d mk = _mm_set1_pd(k); while (dst < dst2end) { __m128d s = _mm_loadu_pd(src); __m128d d = _mm_loadu_pd(dst); s = _mm_mul_pd(s, mk); d = _mm_add_pd(d, s); _mm_storeu_pd(dst, d); src += 2; dst += 2; } if (n & 1) { dst[0] += k * src[0]; } } static void sse2_swap_row(double *r1, double *r2, int n) { double *r12end = r1 + (n / 2) * 2; while (r1 < r12end) { __m128d v1 = _mm_loadu_pd(r1); __m128d v2 = _mm_loadu_pd(r2); _mm_storeu_pd(r1, v2); _mm_storeu_pd(r2, v1); r1 += 2; r2 += 2; } if (n & 1) { double t = *r1; *r1 = *r2; *r2 = t; } } static int sse_max_abs_indexf(float *v, int step, int n) { __m128 m1, mm; __m128i mi1, mim, mi, msk; um128 u, ui; float *v4end, t; int n4, step2, step3, step4; static const um128 i0123 = { .si[0]=0, .si[1]=1, .si[2]=2, .si[3]=3 }; static const um128 i1111 = { .si[0]=4, .si[1]=4, .si[2]=4, .si[3]=4 }; if (n <= 1) return 0; n4 = (n / 4) * 4; mi1 = i1111.xmmi; mim = mi = i0123.xmmi; mm = _mm_setzero_ps(); if (n4 > 0) { step2 = step + step; step3 = step2 + step; step4 = step2 + step2; v4end = v + n4 * step; mm = se_absps(_mm_setr_ps(v[0], v[step], 
v[step2], v[step3])); v += step4; mi = _mm_add_epi32(mi, mi1); while (v < v4end) { m1 = se_absps(_mm_setr_ps(v[0], v[step], v[step2], v[step3])); mm = _mm_max_ps(mm, m1); msk = (__m128i)_mm_cmpeq_ps(m1, mm); mim = _mm_or_si128(_mm_and_si128(msk, mi), _mm_andnot_si128(msk, mim)); v += step4; mi = _mm_add_epi32(mi, mi1); } } n4 = n - n4; if (n4) { int i; u.xmm = _mm_setzero_ps(); for (i = 0; i < n4; ++i) { u.sf[i] = v[0]; v += step; } m1 = se_absps(u.xmm); mm = _mm_max_ps(mm, m1); msk = (__m128i)_mm_cmpeq_ps(m1, mm); mim = _mm_or_si128(_mm_and_si128(msk, mi), _mm_andnot_si128(msk, mim)); } ui.xmmi = mim; u.xmm = mm; t = u.sf[0]; n = 0; if (u.sf[1] > t) { t = u.sf[1]; n = 1; } if (u.sf[2] > t) { t = u.sf[2]; n = 2; } if (u.sf[3] > t) { t = u.sf[3]; n = 3; } return ui.si[n]; } static void sse_add_rowf(float *dst, float *src, float k, int n) { int n4 = (n / 4) * 4; int i; float *dst4end = dst + n4; __m128 mk = _mm_set1_ps(k); while (dst < dst4end) { __m128 s = _mm_loadu_ps(src); __m128 d = _mm_loadu_ps(dst); s = _mm_mul_ps(s, mk); d = _mm_add_ps(d, s); _mm_storeu_ps(dst, d); src += 4; dst += 4; } n4 = n - n4; for (i = 0; i < n4; ++i) { dst[i] += k * src[i]; } } static void sse_swap_rowf(float *r1, float *r2, int n) { int i; int n4 = (n / 4) * 4; float *r14end = r1 + n4; while (r1 < r14end) { __m128 v1 = _mm_loadu_ps(r1); __m128 v2 = _mm_loadu_ps(r2); _mm_storeu_ps(r1, v2); _mm_storeu_ps(r2, v1); r1 += 4; r2 += 4; } r14end = r1 + n - n4; while (r1 < r14end) { float t = *r1; *r1 = *r2; *r2 = t; r1++; r2++; } } int ludcompd(double *m, int nw, int *prow, int n) { int i, s = 0; double *pm; for (i = 0; i < n; ++i) prow[i] = i; printf("ludcompd(): SSE2 code is used.\n"); for (i = 0, pm = m; i < n - 1; ++i, pm += nw) { int vi = sse2_max_abs_index(pm + i, nw, n - i); double r, *pt; int j; if (vi != 0) { sse2_swap_row(pm, pm + vi * nw, nw); swap_index(prow, i, i + vi); s = 1 - s; } r = pm[i]; for (j = i + 1, pt = pm + nw; j < n; ++j, pt += nw) { double k = pt[i] / r; pt[i] = k; 
sse2_add_row(pt + i + 1, pm + i + 1, -k, n - i - 1); } } return s; } int ludcompf(float *m, int nw, int *prow, int n) { int i, s = 0; float *pm; for (i = 0; i < n; ++i) prow[i] = i; printf("ludcompf(): SSE2 code is used.\n"); for (i = 0, pm = m; i < n - 1; ++i, pm += nw) { int vi = sse_max_abs_indexf(pm + i, nw, n - i); float r, *pt; int j; if (vi != 0) { sse_swap_rowf(pm, pm + vi * nw, nw); swap_index(prow, i, i + vi); s = 1 - s; } r = pm[i]; for (j = i + 1, pt = pm + nw; j < n; ++j, pt += nw) { float k = pt[i] / r; pt[i] = k; sse_add_rowf(pt + i + 1, pm + i + 1, -k, n - i - 1); } } return s; } void test_ludcompd(void) { static double m[4][4] = { { 1, 2, 3, 4 }, { 4, 2, 1, 7 }, { 5, 6, 10, 78 }, { 3, 2, 1, 0 } }; int p[4]; printf("%d\n", ludcompd(&m[0][0], 4, p, 4)); printf("%d %d %d %d\n", p[0], p[1], p[2], p[3]); printf("%1.3f %1.3f %1.3f %1.3f\n", m[0][0], m[0][1], m[0][2], m[0][3]); printf("%1.3f %1.3f %1.3f %1.3f\n", m[1][0], m[1][1], m[1][2], m[1][3]); printf("%1.3f %1.3f %1.3f %1.3f\n", m[2][0], m[2][1], m[2][2], m[2][3]); printf("%1.3f %1.3f %1.3f %1.3f\n", m[3][0], m[3][1], m[3][2], m[3][3]); } void test_ludcompf(void) { static float m[4][4] = { { 1, 2, 3, 4 }, { 4, 2, 1, 7 }, { 5, 6, 10, 78 }, { 3, 2, 1, 0 } }; int p[4]; printf("%d\n", ludcompf(&m[0][0], 4, p, 4)); printf("%d %d %d %d\n", p[0], p[1], p[2], p[3]); printf("%1.3f %1.3f %1.3f %1.3f\n", m[0][0], m[0][1], m[0][2], m[0][3]); printf("%1.3f %1.3f %1.3f %1.3f\n", m[1][0], m[1][1], m[1][2], m[1][3]); printf("%1.3f %1.3f %1.3f %1.3f\n", m[2][0], m[2][1], m[2][2], m[2][3]); printf("%1.3f %1.3f %1.3f %1.3f\n", m[3][0], m[3][1], m[3][2], m[3][3]); } int main() { test_ludcompd(); test_ludcompf(); return 0; } -- Summary: Internal compiler error Product: gcc Version: 4.0.2 Status: UNCONFIRMED Severity: normal Priority: P2 Component: c AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: chen at sys dot wakayama-u dot ac dot jp CC: gcc-bugs at gcc dot gnu dot org GCC build triplet: i686-pc-linux-gnu 
GCC host triplet: i686-pc-linux-gnu GCC target triplet: i686-pc-linux-gnu http://gcc.gnu.org/bugzilla/show_bug.cgi?id=23570