* [x86, merge] Replace builtins with vector extensions
@ 2014-11-11 15:11 Marc Glisse
2014-11-12 8:14 ` Uros Bizjak
0 siblings, 1 reply; 2+ messages in thread
From: Marc Glisse @ 2014-11-11 15:11 UTC (permalink / raw)
To: gcc-patches; +Cc: ubizjak
[-- Attachment #1: Type: TEXT/PLAIN, Size: 2816 bytes --]
Hello,
Here is the combined patch and ChangeLog. I'll run one last regression test
just before committing. OK for trunk?
2014-11-12 Marc Glisse <marc.glisse@inria.fr>
gcc/
* config/i386/xmmintrin.h (_mm_add_ps, _mm_sub_ps, _mm_mul_ps,
_mm_div_ps, _mm_store_ss, _mm_cvtss_f32): Use vector extensions
instead of builtins.
* config/i386/emmintrin.h (__v2du, __v4su, __v8hu, __v16qu): New
typedefs.
(_mm_sqrt_sd): Fix comment.
(_mm_add_epi8, _mm_add_epi16, _mm_add_epi32, _mm_add_epi64,
_mm_sub_epi8, _mm_sub_epi16, _mm_sub_epi32, _mm_sub_epi64,
_mm_mullo_epi16, _mm_cmpeq_epi8, _mm_cmpeq_epi16, _mm_cmpeq_epi32,
_mm_cmplt_epi8, _mm_cmplt_epi16, _mm_cmplt_epi32, _mm_cmpgt_epi8,
_mm_cmpgt_epi16, _mm_cmpgt_epi32, _mm_and_si128, _mm_or_si128,
_mm_xor_si128, _mm_store_sd, _mm_cvtsd_f64, _mm_storeh_pd,
_mm_cvtsi128_si64, _mm_cvtsi128_si64x, _mm_add_pd, _mm_sub_pd,
_mm_mul_pd, _mm_div_pd, _mm_storel_epi64, _mm_movepi64_pi64):
Use vector extensions instead of builtins.
* config/i386/smmintrin.h (_mm_cmpeq_epi64, _mm_cmpgt_epi64,
_mm_mullo_epi32): Likewise.
* config/i386/avxintrin.h (__v4du, __v8su, __v16hu, __v32qu):
New typedefs.
(_mm256_add_pd, _mm256_add_ps, _mm256_div_pd, _mm256_div_ps,
_mm256_mul_pd, _mm256_mul_ps, _mm256_sub_pd, _mm256_sub_ps):
Use vector extensions instead of builtins.
* config/i386/avx2intrin.h (_mm256_cmpeq_epi8, _mm256_cmpeq_epi16,
_mm256_cmpeq_epi32, _mm256_cmpeq_epi64, _mm256_cmpgt_epi8,
_mm256_cmpgt_epi16, _mm256_cmpgt_epi32, _mm256_cmpgt_epi64,
_mm256_and_si256, _mm256_or_si256, _mm256_xor_si256, _mm256_add_epi8,
_mm256_add_epi16, _mm256_add_epi32, _mm256_add_epi64,
_mm256_mullo_epi16, _mm256_mullo_epi32, _mm256_sub_epi8,
_mm256_sub_epi16, _mm256_sub_epi32, _mm256_sub_epi64): Likewise.
* config/i386/avx512fintrin.h (__v8du, __v16su, __v32hu, __v64qu):
New typedefs.
(_mm512_or_si512, _mm512_or_epi32, _mm512_or_epi64, _mm512_xor_si512,
_mm512_xor_epi32, _mm512_xor_epi64, _mm512_and_si512,
_mm512_and_epi32, _mm512_and_epi64, _mm512_mullo_epi32,
_mm512_add_epi64, _mm512_sub_epi64, _mm512_add_epi32,
_mm512_sub_epi32, _mm512_add_pd, _mm512_add_ps, _mm512_sub_pd,
_mm512_sub_ps, _mm512_mul_pd, _mm512_mul_ps, _mm512_div_pd,
_mm512_div_ps): Use vector extensions instead of builtins.
* config/i386/avx512bwintrin.h (_mm512_mullo_epi16, _mm512_add_epi8,
_mm512_sub_epi8, _mm512_sub_epi16, _mm512_add_epi16): Likewise.
* config/i386/avx512dqintrin.h (_mm512_mullo_epi64): Likewise.
* config/i386/avx512vldqintrin.h (_mm256_mullo_epi64, _mm_mullo_epi64):
Likewise.
gcc/testsuite/
* gcc.target/i386/intrinsics_opt-1.c: New testcase.
* gcc.target/i386/intrinsics_opt-2.c: Likewise.
* gcc.target/i386/intrinsics_opt-3.c: Likewise.
* gcc.target/i386/intrinsics_opt-4.c: Likewise.
--
Marc Glisse
[-- Attachment #2: Type: TEXT/PLAIN, Size: 40669 bytes --]
diff -ru -N -x .svn trunk/gcc/config/i386/avx2intrin.h intrin/gcc/config/i386/avx2intrin.h
--- trunk/gcc/config/i386/avx2intrin.h 2014-04-01 07:34:06.335878860 +0200
+++ intrin/gcc/config/i386/avx2intrin.h 2014-11-10 21:56:37.040719810 +0100
@@ -104,28 +104,28 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi8 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_paddb256 ((__v32qi)__A, (__v32qi)__B);
+ return (__m256i) ((__v32qu)__A + (__v32qu)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi16 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_paddw256 ((__v16hi)__A, (__v16hi)__B);
+ return (__m256i) ((__v16hu)__A + (__v16hu)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi32 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_paddd256 ((__v8si)__A, (__v8si)__B);
+ return (__m256i) ((__v8su)__A + (__v8su)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_epi64 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_paddq256 ((__v4di)__A, (__v4di)__B);
+ return (__m256i) ((__v4du)__A + (__v4du)__B);
}
extern __inline __m256i
@@ -178,7 +178,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_and_si256 (__m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_andsi256 ((__v4di)__A, (__v4di)__B);
+ return (__m256i) ((__v4du)__A & (__v4du)__B);
}
extern __inline __m256i
@@ -230,59 +230,56 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_pcmpeqb256 ((__v32qi)__A, (__v32qi)__B);
+ return (__m256i) ((__v32qi)__A == (__v32qi)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_pcmpeqw256 ((__v16hi)__A, (__v16hi)__B);
+ return (__m256i) ((__v16hi)__A == (__v16hi)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_pcmpeqd256 ((__v8si)__A, (__v8si)__B);
+ return (__m256i) ((__v8si)__A == (__v8si)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_pcmpeqq256 ((__v4di)__A, (__v4di)__B);
+ return (__m256i) ((__v4di)__A == (__v4di)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_pcmpgtb256 ((__v32qi)__A,
- (__v32qi)__B);
+ return (__m256i) ((__v32qi)__A > (__v32qi)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_pcmpgtw256 ((__v16hi)__A,
- (__v16hi)__B);
+ return (__m256i) ((__v16hi)__A > (__v16hi)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_pcmpgtd256 ((__v8si)__A,
- (__v8si)__B);
+ return (__m256i) ((__v8si)__A > (__v8si)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_pcmpgtq256 ((__v4di)__A, (__v4di)__B);
+ return (__m256i) ((__v4di)__A > (__v4di)__B);
}
extern __inline __m256i
@@ -555,14 +552,14 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi16 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_pmullw256 ((__v16hi)__A, (__v16hi)__B);
+ return (__m256i) ((__v16hu)__A * (__v16hu)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi32 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_pmulld256 ((__v8si)__A, (__v8si)__B);
+ return (__m256i) ((__v8su)__A * (__v8su)__B);
}
extern __inline __m256i
@@ -576,7 +573,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_or_si256 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_por256 ((__v4di)__A, (__v4di)__B);
+ return (__m256i) ((__v4du)__A | (__v4du)__B);
}
extern __inline __m256i
@@ -785,28 +782,28 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi8 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_psubb256 ((__v32qi)__A, (__v32qi)__B);
+ return (__m256i) ((__v32qu)__A - (__v32qu)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi16 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_psubw256 ((__v16hi)__A, (__v16hi)__B);
+ return (__m256i) ((__v16hu)__A - (__v16hu)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi32 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_psubd256 ((__v8si)__A, (__v8si)__B);
+ return (__m256i) ((__v8su)__A - (__v8su)__B);
}
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_epi64 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_psubq256 ((__v4di)__A, (__v4di)__B);
+ return (__m256i) ((__v4du)__A - (__v4du)__B);
}
extern __inline __m256i
@@ -897,7 +894,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_si256 (__m256i __A, __m256i __B)
{
- return (__m256i)__builtin_ia32_pxor256 ((__v4di)__A, (__v4di)__B);
+ return (__m256i) ((__v4du)__A ^ (__v4du)__B);
}
extern __inline __m256i
diff -ru -N -x .svn trunk/gcc/config/i386/avx512bwintrin.h intrin/gcc/config/i386/avx512bwintrin.h
--- trunk/gcc/config/i386/avx512bwintrin.h 2014-11-02 23:20:47.754876185 +0100
+++ intrin/gcc/config/i386/avx512bwintrin.h 2014-11-08 07:06:53.896984426 +0100
@@ -464,11 +464,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mullo_epi16 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) -1);
+ return (__m512i) ((__v32hu) __A * (__v32hu) __B);
}
extern __inline __m512i
@@ -673,11 +669,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_add_epi8 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
- (__v64qi) __B,
- (__v64qi)
- _mm512_setzero_qi (),
- (__mmask64) -1);
+ return (__m512i) ((__v64qu) __A + (__v64qu) __B);
}
extern __inline __m512i
@@ -706,11 +698,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sub_epi8 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
- (__v64qi) __B,
- (__v64qi)
- _mm512_setzero_qi (),
- (__mmask64) -1);
+ return (__m512i) ((__v64qu) __A - (__v64qu) __B);
}
extern __inline __m512i
@@ -904,11 +892,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sub_epi16 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) -1);
+ return (__m512i) ((__v32hu) __A - (__v32hu) __B);
}
extern __inline __m512i
@@ -1003,11 +987,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_add_epi16 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi)
- _mm512_setzero_hi (),
- (__mmask32) -1);
+ return (__m512i) ((__v32hu) __A + (__v32hu) __B);
}
extern __inline __m512i
diff -ru -N -x .svn trunk/gcc/config/i386/avx512dqintrin.h intrin/gcc/config/i386/avx512dqintrin.h
--- trunk/gcc/config/i386/avx512dqintrin.h 2014-11-02 23:20:47.754876185 +0100
+++ intrin/gcc/config/i386/avx512dqintrin.h 2014-11-08 07:02:56.623860549 +0100
@@ -225,11 +225,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mullo_epi64 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) -1);
+ return (__m512i) ((__v8du) __A * (__v8du) __B);
}
extern __inline __m512i
diff -ru -N -x .svn trunk/gcc/config/i386/avx512fintrin.h intrin/gcc/config/i386/avx512fintrin.h
--- trunk/gcc/config/i386/avx512fintrin.h 2014-08-23 13:18:11.915096696 +0200
+++ intrin/gcc/config/i386/avx512fintrin.h 2014-11-09 13:23:54.095388098 +0100
@@ -38,9 +38,13 @@
typedef double __v8df __attribute__ ((__vector_size__ (64)));
typedef float __v16sf __attribute__ ((__vector_size__ (64)));
typedef long long __v8di __attribute__ ((__vector_size__ (64)));
+typedef unsigned long long __v8du __attribute__ ((__vector_size__ (64)));
typedef int __v16si __attribute__ ((__vector_size__ (64)));
+typedef unsigned int __v16su __attribute__ ((__vector_size__ (64)));
typedef short __v32hi __attribute__ ((__vector_size__ (64)));
+typedef unsigned short __v32hu __attribute__ ((__vector_size__ (64)));
typedef char __v64qi __attribute__ ((__vector_size__ (64)));
+typedef unsigned char __v64qu __attribute__ ((__vector_size__ (64)));
/* The Intel API is flexible enough that we must allow aliasing with other
vector types, and their scalar components. */
@@ -515,11 +519,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mullo_epi32 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si)
- _mm512_undefined_si512 (),
- (__mmask16) -1);
+ return (__m512i) ((__v16su) __A * (__v16su) __B);
}
extern __inline __m512i
@@ -642,11 +642,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_add_epi64 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di)
- _mm512_undefined_si512 (),
- (__mmask8) -1);
+ return (__m512i) ((__v8du) __A + (__v8du) __B);
}
extern __inline __m512i
@@ -674,11 +670,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sub_epi64 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di)
- _mm512_undefined_pd (),
- (__mmask8) -1);
+ return (__m512i) ((__v8du) __A - (__v8du) __B);
}
extern __inline __m512i
@@ -802,11 +794,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_add_epi32 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si)
- _mm512_undefined_si512 (),
- (__mmask16) -1);
+ return (__m512i) ((__v16su) __A + (__v16su) __B);
}
extern __inline __m512i
@@ -865,11 +853,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sub_epi32 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si)
- _mm512_undefined_si512 (),
- (__mmask16) -1);
+ return (__m512i) ((__v16su) __A - (__v16su) __B);
}
extern __inline __m512i
@@ -6797,22 +6781,14 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_or_si512 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si)
- _mm512_undefined_si512 (),
- (__mmask16) -1);
+ return (__m512i) ((__v16su) __A | (__v16su) __B);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_or_epi32 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si)
- _mm512_undefined_si512 (),
- (__mmask16) -1);
+ return (__m512i) ((__v16su) __A | (__v16su) __B);
}
extern __inline __m512i
@@ -6840,11 +6816,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_or_epi64 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di)
- _mm512_undefined_si512 (),
- (__mmask8) -1);
+ return (__m512i) ((__v8du) __A | (__v8du) __B);
}
extern __inline __m512i
@@ -6872,22 +6844,14 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_xor_si512 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si)
- _mm512_undefined_si512 (),
- (__mmask16) -1);
+ return (__m512i) ((__v16su) __A ^ (__v16su) __B);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_xor_epi32 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si)
- _mm512_undefined_si512 (),
- (__mmask16) -1);
+ return (__m512i) ((__v16su) __A ^ (__v16su) __B);
}
extern __inline __m512i
@@ -6915,11 +6879,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_xor_epi64 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di)
- _mm512_undefined_si512 (),
- (__mmask8) -1);
+ return (__m512i) ((__v8du) __A ^ (__v8du) __B);
}
extern __inline __m512i
@@ -7128,22 +7088,14 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_and_si512 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si)
- _mm512_undefined_si512 (),
- (__mmask16) -1);
+ return (__m512i) ((__v16su) __A & (__v16su) __B);
}
extern __inline __m512i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_and_epi32 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A,
- (__v16si) __B,
- (__v16si)
- _mm512_undefined_si512 (),
- (__mmask16) -1);
+ return (__m512i) ((__v16su) __A & (__v16su) __B);
}
extern __inline __m512i
@@ -7171,11 +7123,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_and_epi64 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __A,
- (__v8di) __B,
- (__v8di)
- _mm512_undefined_si512 (),
- (__mmask8) -1);
+ return (__m512i) ((__v8du) __A & (__v8du) __B);
}
extern __inline __m512i
@@ -10749,12 +10697,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_add_pd (__m512d __A, __m512d __B)
{
- return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_undefined_pd (),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d) ((__v8df)__A + (__v8df)__B);
}
extern __inline __m512d
@@ -10784,12 +10727,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_add_ps (__m512 __A, __m512 __B)
{
- return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_undefined_ps (),
- (__mmask16) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512) ((__v16sf)__A + (__v16sf)__B);
}
extern __inline __m512
@@ -10819,12 +10757,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sub_pd (__m512d __A, __m512d __B)
{
- return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_undefined_pd (),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d) ((__v8df)__A - (__v8df)__B);
}
extern __inline __m512d
@@ -10854,12 +10787,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_sub_ps (__m512 __A, __m512 __B)
{
- return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_undefined_ps (),
- (__mmask16) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512) ((__v16sf)__A - (__v16sf)__B);
}
extern __inline __m512
@@ -10889,12 +10817,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mul_pd (__m512d __A, __m512d __B)
{
- return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_undefined_pd (),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d) ((__v8df)__A * (__v8df)__B);
}
extern __inline __m512d
@@ -10924,12 +10847,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_mul_ps (__m512 __A, __m512 __B)
{
- return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_undefined_ps (),
- (__mmask16) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512) ((__v16sf)__A * (__v16sf)__B);
}
extern __inline __m512
@@ -10959,12 +10877,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_div_pd (__m512d __M, __m512d __V)
{
- return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
- (__v8df) __V,
- (__v8df)
- _mm512_undefined_pd (),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512d) ((__v8df)__M / (__v8df)__V);
}
extern __inline __m512d
@@ -10994,12 +10907,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm512_div_ps (__m512 __A, __m512 __B)
{
- return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_undefined_ps (),
- (__mmask16) -1,
- _MM_FROUND_CUR_DIRECTION);
+ return (__m512) ((__v16sf)__A / (__v16sf)__B);
}
extern __inline __m512
diff -ru -N -x .svn trunk/gcc/config/i386/avx512vldqintrin.h intrin/gcc/config/i386/avx512vldqintrin.h
--- trunk/gcc/config/i386/avx512vldqintrin.h 2014-11-10 13:09:34.727589835 +0100
+++ intrin/gcc/config/i386/avx512vldqintrin.h 2014-11-11 15:30:32.090155679 +0100
@@ -544,11 +544,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mullo_epi64 (__m256i __A, __m256i __B)
{
- return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
- (__v4di) __B,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) -1);
+ return (__m256i) ((__v4du) __A * (__v4du) __B);
}
extern __inline __m256i
@@ -577,11 +573,7 @@
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi64 (__m128i __A, __m128i __B)
{
- return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
- (__v2di) __B,
- (__v2di)
- _mm_setzero_di (),
- (__mmask8) -1);
+ return (__m128i) ((__v2du) __A * (__v2du) __B);
}
extern __inline __m128i
diff -ru -N -x .svn trunk/gcc/config/i386/avxintrin.h intrin/gcc/config/i386/avxintrin.h
--- trunk/gcc/config/i386/avxintrin.h 2014-04-01 07:34:06.339879012 +0200
+++ intrin/gcc/config/i386/avxintrin.h 2014-11-08 06:55:45.615292438 +0100
@@ -41,9 +41,13 @@
typedef double __v4df __attribute__ ((__vector_size__ (32)));
typedef float __v8sf __attribute__ ((__vector_size__ (32)));
typedef long long __v4di __attribute__ ((__vector_size__ (32)));
+typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
typedef int __v8si __attribute__ ((__vector_size__ (32)));
+typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
typedef short __v16hi __attribute__ ((__vector_size__ (32)));
+typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
typedef char __v32qi __attribute__ ((__vector_size__ (32)));
+typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
/* The Intel API is flexible enough that we must allow aliasing with other
vector types, and their scalar components. */
@@ -124,13 +128,13 @@
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_pd (__m256d __A, __m256d __B)
{
- return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
+ return (__m256d) ((__v4df)__A + (__v4df)__B);
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_add_ps (__m256 __A, __m256 __B)
{
- return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
+ return (__m256) ((__v8sf)__A + (__v8sf)__B);
}
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -218,13 +222,13 @@
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_pd (__m256d __A, __m256d __B)
{
- return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
+ return (__m256d) ((__v4df)__A / (__v4df)__B);
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_div_ps (__m256 __A, __m256 __B)
{
- return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
+ return (__m256) ((__v8sf)__A / (__v8sf)__B);
}
/* Dot product instructions with mask-defined summing and zeroing parts
@@ -295,13 +299,13 @@
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_pd (__m256d __A, __m256d __B)
{
- return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
+ return (__m256d) ((__v4df)__A * (__v4df)__B);
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mul_ps (__m256 __A, __m256 __B)
{
- return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
+ return (__m256) ((__v8sf)__A * (__v8sf)__B);
}
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -343,13 +347,13 @@
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_pd (__m256d __A, __m256d __B)
{
- return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
+ return (__m256d) ((__v4df)__A - (__v4df)__B);
}
extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sub_ps (__m256 __A, __m256 __B)
{
- return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
+ return (__m256) ((__v8sf)__A - (__v8sf)__B);
}
extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
diff -ru -N -x .svn trunk/gcc/config/i386/emmintrin.h intrin/gcc/config/i386/emmintrin.h
--- trunk/gcc/config/i386/emmintrin.h 2014-04-01 07:34:06.335878860 +0200
+++ intrin/gcc/config/i386/emmintrin.h 2014-11-09 13:43:42.213157229 +0100
@@ -39,9 +39,13 @@
/* SSE2 */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
typedef int __v4si __attribute__ ((__vector_size__ (16)));
+typedef unsigned int __v4su __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__ ((__vector_size__ (16)));
+typedef unsigned short __v8hu __attribute__ ((__vector_size__ (16)));
typedef char __v16qi __attribute__ ((__vector_size__ (16)));
+typedef unsigned char __v16qu __attribute__ ((__vector_size__ (16)));
/* The Intel API is flexible enough that we must allow aliasing with other
vector types, and their scalar components. */
@@ -168,13 +172,13 @@
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_sd (double *__P, __m128d __A)
{
- *__P = __builtin_ia32_vec_ext_v2df (__A, 0);
+ *__P = ((__v2df)__A)[0];
}
extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsd_f64 (__m128d __A)
{
- return __builtin_ia32_vec_ext_v2df (__A, 0);
+ return ((__v2df)__A)[0];
}
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -187,7 +191,7 @@
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storeh_pd (double *__P, __m128d __A)
{
- *__P = __builtin_ia32_vec_ext_v2df (__A, 1);
+ *__P = ((__v2df)__A)[1];
}
/* Store the lower DPFP value across two words.
@@ -222,21 +226,21 @@
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64 (__m128i __A)
{
- return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
+ return ((__v2di)__A)[0];
}
/* Microsoft intrinsic. */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi128_si64x (__m128i __A)
{
- return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
+ return ((__v2di)__A)[0];
}
#endif
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pd (__m128d __A, __m128d __B)
{
- return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
+ return (__m128d) ((__v2df)__A + (__v2df)__B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -248,7 +252,7 @@
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pd (__m128d __A, __m128d __B)
{
- return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
+ return (__m128d) ((__v2df)__A - (__v2df)__B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -260,7 +264,7 @@
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_pd (__m128d __A, __m128d __B)
{
- return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
+ return (__m128d) ((__v2df)__A * (__v2df)__B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -272,7 +276,7 @@
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_pd (__m128d __A, __m128d __B)
{
- return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
+ return (__m128d) ((__v2df)__A / (__v2df)__B);
}
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -287,7 +291,7 @@
return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
}
-/* Return pair {sqrt (A[0), B[1]}. */
+/* Return pair {sqrt (B[0]), A[1]}. */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sqrt_sd (__m128d __A, __m128d __B)
{
@@ -715,13 +719,13 @@
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_storel_epi64 (__m128i *__P, __m128i __B)
{
- *(long long *)__P = __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
+ *(long long *)__P = ((__v2di)__B)[0];
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movepi64_pi64 (__m128i __B)
{
- return (__m64) __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
+ return (__m64) ((__v2di)__B)[0];
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1006,25 +1010,25 @@
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi8 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B);
+ return (__m128i) ((__v16qu)__A + (__v16qu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi16 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B);
+ return (__m128i) ((__v8hu)__A + (__v8hu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi32 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B);
+ return (__m128i) ((__v4su)__A + (__v4su)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi64 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B);
+ return (__m128i) ((__v2du)__A + (__v2du)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1054,25 +1058,25 @@
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi8 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B);
+ return (__m128i) ((__v16qu)__A - (__v16qu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi16 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_psubw128 ((__v8hi)__A, (__v8hi)__B);
+ return (__m128i) ((__v8hu)__A - (__v8hu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi32 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, (__v4si)__B);
+ return (__m128i) ((__v4su)__A - (__v4su)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi64 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_psubq128 ((__v2di)__A, (__v2di)__B);
+ return (__m128i) ((__v2du)__A - (__v2du)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1114,7 +1118,7 @@
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi16 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B);
+ return (__m128i) ((__v8hu)__A * (__v8hu)__B);
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1247,7 +1251,7 @@
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si128 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B);
+ return (__m128i) ((__v2du)__A & (__v2du)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1259,67 +1263,67 @@
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si128 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B);
+ return (__m128i) ((__v2du)__A | (__v2du)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si128 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_pxor128 ((__v2di)__A, (__v2di)__B);
+ return (__m128i) ((__v2du)__A ^ (__v2du)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_pcmpeqb128 ((__v16qi)__A, (__v16qi)__B);
+ return (__m128i) ((__v16qi)__A == (__v16qi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_pcmpeqw128 ((__v8hi)__A, (__v8hi)__B);
+ return (__m128i) ((__v8hi)__A == (__v8hi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_pcmpeqd128 ((__v4si)__A, (__v4si)__B);
+ return (__m128i) ((__v4si)__A == (__v4si)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi8 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__B, (__v16qi)__A);
+ return (__m128i) ((__v16qi)__A < (__v16qi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi16 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__B, (__v8hi)__A);
+ return (__m128i) ((__v8hi)__A < (__v8hi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmplt_epi32 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__B, (__v4si)__A);
+ return (__m128i) ((__v4si)__A < (__v4si)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__A, (__v16qi)__B);
+ return (__m128i) ((__v16qi)__A > (__v16qi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__A, (__v8hi)__B);
+ return (__m128i) ((__v8hi)__A > (__v8hi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__A, (__v4si)__B);
+ return (__m128i) ((__v4si)__A > (__v4si)__B);
}
#ifdef __OPTIMIZE__
diff -ru -N -x .svn trunk/gcc/config/i386/smmintrin.h intrin/gcc/config/i386/smmintrin.h
--- trunk/gcc/config/i386/smmintrin.h 2014-01-03 11:39:01.159907676 +0100
+++ intrin/gcc/config/i386/smmintrin.h 2014-11-09 13:45:01.300194921 +0100
@@ -267,7 +267,7 @@
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_epi64 (__m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_pcmpeqq ((__v2di)__X, (__v2di)__Y);
+ return (__m128i) ((__v2di)__X == (__v2di)__Y);
}
/* Min/max packed integer instructions. */
@@ -325,7 +325,7 @@
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi32 (__m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_pmulld128 ((__v4si)__X, (__v4si)__Y);
+ return (__m128i) ((__v4su)__X * (__v4su)__Y);
}
/* Packed integer 32-bit multiplication of 2 pairs of operands
@@ -795,7 +795,7 @@
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_pcmpgtq ((__v2di)__X, (__v2di)__Y);
+ return (__m128i) ((__v2di)__X > (__v2di)__Y);
}
#ifdef __DISABLE_SSE4_2__
diff -ru -N -x .svn trunk/gcc/config/i386/xmmintrin.h intrin/gcc/config/i386/xmmintrin.h
--- trunk/gcc/config/i386/xmmintrin.h 2014-06-29 20:12:33.675949023 +0200
+++ intrin/gcc/config/i386/xmmintrin.h 2014-10-17 19:53:06.711416455 +0200
@@ -180,25 +180,25 @@
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_ps (__m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
+ return (__m128) ((__v4sf)__A + (__v4sf)__B);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_ps (__m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
+ return (__m128) ((__v4sf)__A - (__v4sf)__B);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_ps (__m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
+ return (__m128) ((__v4sf)__A * (__v4sf)__B);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_div_ps (__m128 __A, __m128 __B)
{
- return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
+ return (__m128) ((__v4sf)__A / (__v4sf)__B);
}
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -957,13 +957,13 @@
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_store_ss (float *__P, __m128 __A)
{
- *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
+ *__P = ((__v4sf)__A)[0];
}
extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_f32 (__m128 __A)
{
- return __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
+ return ((__v4sf)__A)[0];
}
/* Store four SPFP values. The address must be 16-byte aligned. */
diff -ru -N -x .svn trunk/gcc/testsuite/gcc.target/i386/intrinsics_opt-1.c intrin/gcc/testsuite/gcc.target/i386/intrinsics_opt-1.c
--- trunk/gcc/testsuite/gcc.target/i386/intrinsics_opt-1.c 1970-01-01 01:00:00.000000000 +0100
+++ intrin/gcc/testsuite/gcc.target/i386/intrinsics_opt-1.c 2014-10-10 14:07:06.983348081 +0200
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mfma" } */
+
+#include <emmintrin.h>
+
+__m128d myfma(__m128d x, __m128d y, __m128d z){
+ __m128d m = _mm_mul_pd (x, y);
+ return _mm_add_pd (m, z);
+}
+
+/* { dg-final { scan-assembler "vfmadd" } } */
diff -ru -N -x .svn trunk/gcc/testsuite/gcc.target/i386/intrinsics_opt-2.c intrin/gcc/testsuite/gcc.target/i386/intrinsics_opt-2.c
--- trunk/gcc/testsuite/gcc.target/i386/intrinsics_opt-2.c 1970-01-01 01:00:00.000000000 +0100
+++ intrin/gcc/testsuite/gcc.target/i386/intrinsics_opt-2.c 2014-10-10 14:07:06.983348081 +0200
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-O -ffast-math -msse2 -fdump-tree-optimized" } */
+
+#include <emmintrin.h>
+
+int f(__m128d x){
+ x = _mm_sub_pd (x, x);
+ x = _mm_mul_pd (x, x);
+ double r = 42;
+ _mm_storeh_pd (&r, x);
+ int z = r == 0;
+ return __builtin_constant_p (z) && z;
+}
+
+/* { dg-final { scan-tree-dump "return 1;" "optimized" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
diff -ru -N -x .svn trunk/gcc/testsuite/gcc.target/i386/intrinsics_opt-3.c intrin/gcc/testsuite/gcc.target/i386/intrinsics_opt-3.c
--- trunk/gcc/testsuite/gcc.target/i386/intrinsics_opt-3.c 1970-01-01 01:00:00.000000000 +0100
+++ intrin/gcc/testsuite/gcc.target/i386/intrinsics_opt-3.c 2014-10-12 22:28:46.603705558 +0200
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O -msse2" } */
+
+#include <emmintrin.h>
+
+double f(){
+ __m128d x = _mm_set1_pd (0.);
+ double r = 42;
+ _mm_storeh_pd (&r, x);
+ return r;
+}
+
+/* { dg-final { scan-assembler-not "unpckhpd" } } */
diff -ru -N -x .svn trunk/gcc/testsuite/gcc.target/i386/intrinsics_opt-4.c intrin/gcc/testsuite/gcc.target/i386/intrinsics_opt-4.c
--- trunk/gcc/testsuite/gcc.target/i386/intrinsics_opt-4.c 1970-01-01 01:00:00.000000000 +0100
+++ intrin/gcc/testsuite/gcc.target/i386/intrinsics_opt-4.c 2014-10-12 22:28:52.699931988 +0200
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O -ffast-math -msse2" } */
+
+#include <emmintrin.h>
+
+__m128d f(__m128d x, __m128d y, __m128d z){
+ y = _mm_add_pd (x, y);
+ y = _mm_add_pd (z, y);
+ return _mm_sub_pd (y, x);
+}
+
+/* { dg-final { scan-assembler-not "subpd" } } */
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [x86, merge] Replace builtins with vector extensions
2014-11-11 15:11 [x86, merge] Replace builtins with vector extensions Marc Glisse
@ 2014-11-12 8:14 ` Uros Bizjak
0 siblings, 0 replies; 2+ messages in thread
From: Uros Bizjak @ 2014-11-12 8:14 UTC (permalink / raw)
To: Marc Glisse; +Cc: gcc-patches, Kirill Yukhin
On Tue, Nov 11, 2014 at 4:09 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
> Hello,
>
> here is the combined patch+ChangeLog. I'll run a last regtest just before
> committing. Ok for trunk?
>
> 2014-11-12 Marc Glisse <marc.glisse@inria.fr>
>
> gcc/
> * config/i386/xmmintrin.h (_mm_add_ps, _mm_sub_ps, _mm_mul_ps,
> _mm_div_ps, _mm_store_ss, _mm_cvtss_f32): Use vector extensions
> instead of builtins.
> * config/i386/emmintrin.h (__v2du, __v4su, __v8hu, __v16qu): New
> typedefs.
> (_mm_sqrt_sd): Fix comment.
> (_mm_add_epi8, _mm_add_epi16, _mm_add_epi32, _mm_add_epi64,
> _mm_sub_epi8, _mm_sub_epi16, _mm_sub_epi32, _mm_sub_epi64,
> _mm_mullo_epi16, _mm_cmpeq_epi8, _mm_cmpeq_epi16, _mm_cmpeq_epi32,
> _mm_cmplt_epi8, _mm_cmplt_epi16, _mm_cmplt_epi32, _mm_cmpgt_epi8,
> _mm_cmpgt_epi16, _mm_cmpgt_epi32, _mm_and_si128, _mm_or_si128,
> _mm_xor_si128, _mm_store_sd, _mm_cvtsd_f64, _mm_storeh_pd,
> _mm_cvtsi128_si64, _mm_cvtsi128_si64x, _mm_add_pd, _mm_sub_pd,
> _mm_mul_pd, _mm_div_pd, _mm_storel_epi64, _mm_movepi64_pi64):
> Use vector extensions instead of builtins.
> * config/i386/smmintrin.h (_mm_cmpeq_epi64, _mm_cmpgt_epi64,
> _mm_mullo_epi32): Likewise.
> * config/i386/avxintrin.h (__v4du, __v8su, __v16hu, __v32qu):
> New typedefs.
> (_mm256_add_pd, _mm256_add_ps, _mm256_div_pd, _mm256_div_ps,
> _mm256_mul_pd, _mm256_mul_ps, _mm256_sub_pd, _mm256_sub_ps):
> Use vector extensions instead of builtins.
> * config/i386/avx2intrin.h (_mm256_cmpeq_epi8, _mm256_cmpeq_epi16,
> _mm256_cmpeq_epi32, _mm256_cmpeq_epi64, _mm256_cmpgt_epi8,
> _mm256_cmpgt_epi16, _mm256_cmpgt_epi32, _mm256_cmpgt_epi64,
> _mm256_and_si256, _mm256_or_si256, _mm256_xor_si256, _mm256_add_epi8,
> _mm256_add_epi16, _mm256_add_epi32, _mm256_add_epi64,
> _mm256_mullo_epi16, _mm256_mullo_epi32, _mm256_sub_epi8,
> _mm256_sub_epi16, _mm256_sub_epi32, _mm256_sub_epi64): Likewise.
> * config/i386/avx512fintrin.h (__v8du, __v16su, __v32hu, __v64qu):
> New typedefs.
> (_mm512_or_si512, _mm512_or_epi32, _mm512_or_epi64, _mm512_xor_si512,
> _mm512_xor_epi32, _mm512_xor_epi64, _mm512_and_si512,
> _mm512_and_epi32, _mm512_and_epi64, _mm512_mullo_epi32,
> _mm512_add_epi64, _mm512_sub_epi64, _mm512_add_epi32,
> _mm512_sub_epi32, _mm512_add_pd, _mm512_add_ps, _mm512_sub_pd,
> _mm512_sub_ps, _mm512_mul_pd, _mm512_mul_ps, _mm512_div_pd,
> _mm512_div_ps): Use vector extensions instead of builtins.
> * config/i386/avx512bwintrin.h (_mm512_mullo_epi16, _mm512_add_epi8,
> _mm512_sub_epi8, _mm512_sub_epi16, _mm512_add_epi16): Likewise.
> * config/i386/avx512dqintrin.h (_mm512_mullo_epi64): Likewise.
> * config/i386/avx512vldqintrin.h (_mm256_mullo_epi64, _mm_mullo_epi64):
> Likewise.
>
> gcc/testsuite/
> * gcc.target/i386/intrinsics_opt-1.c: New testcase.
> * gcc.target/i386/intrinsics_opt-2.c: Likewise.
> * gcc.target/i386/intrinsics_opt-3.c: Likewise.
> * gcc.target/i386/intrinsics_opt-4.c: Likewise.
OK for mainline.
Thanks,
Uros.
> --
> Marc Glisse
> diff -ru -N -x .svn trunk/gcc/config/i386/avx2intrin.h intrin/gcc/config/i386/avx2intrin.h
> --- trunk/gcc/config/i386/avx2intrin.h 2014-04-01 07:34:06.335878860 +0200
> +++ intrin/gcc/config/i386/avx2intrin.h 2014-11-10 21:56:37.040719810 +0100
> @@ -104,28 +104,28 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_add_epi8 (__m256i __A, __m256i __B)
> {
> - return (__m256i)__builtin_ia32_paddb256 ((__v32qi)__A, (__v32qi)__B);
> + return (__m256i) ((__v32qu)__A + (__v32qu)__B);
> }
>
> extern __inline __m256i
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_add_epi16 (__m256i __A, __m256i __B)
> {
> - return (__m256i)__builtin_ia32_paddw256 ((__v16hi)__A, (__v16hi)__B);
> + return (__m256i) ((__v16hu)__A + (__v16hu)__B);
> }
>
> extern __inline __m256i
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_add_epi32 (__m256i __A, __m256i __B)
> {
> - return (__m256i)__builtin_ia32_paddd256 ((__v8si)__A, (__v8si)__B);
> + return (__m256i) ((__v8su)__A + (__v8su)__B);
> }
>
> extern __inline __m256i
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_add_epi64 (__m256i __A, __m256i __B)
> {
> - return (__m256i)__builtin_ia32_paddq256 ((__v4di)__A, (__v4di)__B);
> + return (__m256i) ((__v4du)__A + (__v4du)__B);
> }
>
> extern __inline __m256i
> @@ -178,7 +178,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_and_si256 (__m256i __A, __m256i __B)
> {
> - return (__m256i) __builtin_ia32_andsi256 ((__v4di)__A, (__v4di)__B);
> + return (__m256i) ((__v4du)__A & (__v4du)__B);
> }
>
> extern __inline __m256i
> @@ -230,59 +230,56 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
> {
> - return (__m256i)__builtin_ia32_pcmpeqb256 ((__v32qi)__A, (__v32qi)__B);
> + return (__m256i) ((__v32qi)__A == (__v32qi)__B);
> }
>
> extern __inline __m256i
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
> {
> - return (__m256i)__builtin_ia32_pcmpeqw256 ((__v16hi)__A, (__v16hi)__B);
> + return (__m256i) ((__v16hi)__A == (__v16hi)__B);
> }
>
> extern __inline __m256i
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
> {
> - return (__m256i)__builtin_ia32_pcmpeqd256 ((__v8si)__A, (__v8si)__B);
> + return (__m256i) ((__v8si)__A == (__v8si)__B);
> }
>
> extern __inline __m256i
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
> {
> - return (__m256i)__builtin_ia32_pcmpeqq256 ((__v4di)__A, (__v4di)__B);
> + return (__m256i) ((__v4di)__A == (__v4di)__B);
> }
>
> extern __inline __m256i
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
> {
> - return (__m256i)__builtin_ia32_pcmpgtb256 ((__v32qi)__A,
> - (__v32qi)__B);
> + return (__m256i) ((__v32qi)__A > (__v32qi)__B);
> }
>
> extern __inline __m256i
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
> {
> - return (__m256i)__builtin_ia32_pcmpgtw256 ((__v16hi)__A,
> - (__v16hi)__B);
> + return (__m256i) ((__v16hi)__A > (__v16hi)__B);
> }
>
> extern __inline __m256i
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
> {
> - return (__m256i)__builtin_ia32_pcmpgtd256 ((__v8si)__A,
> - (__v8si)__B);
> + return (__m256i) ((__v8si)__A > (__v8si)__B);
> }
>
> extern __inline __m256i
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
> {
> - return (__m256i)__builtin_ia32_pcmpgtq256 ((__v4di)__A, (__v4di)__B);
> + return (__m256i) ((__v4di)__A > (__v4di)__B);
> }
>
> extern __inline __m256i
> @@ -555,14 +552,14 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_mullo_epi16 (__m256i __A, __m256i __B)
> {
> - return (__m256i)__builtin_ia32_pmullw256 ((__v16hi)__A, (__v16hi)__B);
> + return (__m256i) ((__v16hu)__A * (__v16hu)__B);
> }
>
> extern __inline __m256i
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_mullo_epi32 (__m256i __A, __m256i __B)
> {
> - return (__m256i)__builtin_ia32_pmulld256 ((__v8si)__A, (__v8si)__B);
> + return (__m256i) ((__v8su)__A * (__v8su)__B);
> }
>
> extern __inline __m256i
> @@ -576,7 +573,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_or_si256 (__m256i __A, __m256i __B)
> {
> - return (__m256i)__builtin_ia32_por256 ((__v4di)__A, (__v4di)__B);
> + return (__m256i) ((__v4du)__A | (__v4du)__B);
> }
>
> extern __inline __m256i
> @@ -785,28 +782,28 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_sub_epi8 (__m256i __A, __m256i __B)
> {
> - return (__m256i)__builtin_ia32_psubb256 ((__v32qi)__A, (__v32qi)__B);
> + return (__m256i) ((__v32qu)__A - (__v32qu)__B);
> }
>
> extern __inline __m256i
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_sub_epi16 (__m256i __A, __m256i __B)
> {
> - return (__m256i)__builtin_ia32_psubw256 ((__v16hi)__A, (__v16hi)__B);
> + return (__m256i) ((__v16hu)__A - (__v16hu)__B);
> }
>
> extern __inline __m256i
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_sub_epi32 (__m256i __A, __m256i __B)
> {
> - return (__m256i)__builtin_ia32_psubd256 ((__v8si)__A, (__v8si)__B);
> + return (__m256i) ((__v8su)__A - (__v8su)__B);
> }
>
> extern __inline __m256i
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_sub_epi64 (__m256i __A, __m256i __B)
> {
> - return (__m256i)__builtin_ia32_psubq256 ((__v4di)__A, (__v4di)__B);
> + return (__m256i) ((__v4du)__A - (__v4du)__B);
> }
>
> extern __inline __m256i
> @@ -897,7 +894,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_xor_si256 (__m256i __A, __m256i __B)
> {
> - return (__m256i)__builtin_ia32_pxor256 ((__v4di)__A, (__v4di)__B);
> + return (__m256i) ((__v4du)__A ^ (__v4du)__B);
> }
>
> extern __inline __m256i
> diff -ru -N -x .svn trunk/gcc/config/i386/avx512bwintrin.h intrin/gcc/config/i386/avx512bwintrin.h
> --- trunk/gcc/config/i386/avx512bwintrin.h 2014-11-02 23:20:47.754876185 +0100
> +++ intrin/gcc/config/i386/avx512bwintrin.h 2014-11-08 07:06:53.896984426 +0100
> @@ -464,11 +464,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_mullo_epi16 (__m512i __A, __m512i __B)
> {
> - return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
> - (__v32hi) __B,
> - (__v32hi)
> - _mm512_setzero_hi (),
> - (__mmask32) -1);
> + return (__m512i) ((__v32hu) __A * (__v32hu) __B);
> }
>
> extern __inline __m512i
> @@ -673,11 +669,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_add_epi8 (__m512i __A, __m512i __B)
> {
> - return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
> - (__v64qi) __B,
> - (__v64qi)
> - _mm512_setzero_qi (),
> - (__mmask64) -1);
> + return (__m512i) ((__v64qu) __A + (__v64qu) __B);
> }
>
> extern __inline __m512i
> @@ -706,11 +698,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_sub_epi8 (__m512i __A, __m512i __B)
> {
> - return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
> - (__v64qi) __B,
> - (__v64qi)
> - _mm512_setzero_qi (),
> - (__mmask64) -1);
> + return (__m512i) ((__v64qu) __A - (__v64qu) __B);
> }
>
> extern __inline __m512i
> @@ -904,11 +892,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_sub_epi16 (__m512i __A, __m512i __B)
> {
> - return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
> - (__v32hi) __B,
> - (__v32hi)
> - _mm512_setzero_hi (),
> - (__mmask32) -1);
> + return (__m512i) ((__v32hu) __A - (__v32hu) __B);
> }
>
> extern __inline __m512i
> @@ -1003,11 +987,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_add_epi16 (__m512i __A, __m512i __B)
> {
> - return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
> - (__v32hi) __B,
> - (__v32hi)
> - _mm512_setzero_hi (),
> - (__mmask32) -1);
> + return (__m512i) ((__v32hu) __A + (__v32hu) __B);
> }
>
> extern __inline __m512i
> diff -ru -N -x .svn trunk/gcc/config/i386/avx512dqintrin.h intrin/gcc/config/i386/avx512dqintrin.h
> --- trunk/gcc/config/i386/avx512dqintrin.h 2014-11-02 23:20:47.754876185 +0100
> +++ intrin/gcc/config/i386/avx512dqintrin.h 2014-11-08 07:02:56.623860549 +0100
> @@ -225,11 +225,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_mullo_epi64 (__m512i __A, __m512i __B)
> {
> - return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A,
> - (__v8di) __B,
> - (__v8di)
> - _mm512_setzero_si512 (),
> - (__mmask8) -1);
> + return (__m512i) ((__v8du) __A * (__v8du) __B);
> }
>
> extern __inline __m512i
> diff -ru -N -x .svn trunk/gcc/config/i386/avx512fintrin.h intrin/gcc/config/i386/avx512fintrin.h
> --- trunk/gcc/config/i386/avx512fintrin.h 2014-08-23 13:18:11.915096696 +0200
> +++ intrin/gcc/config/i386/avx512fintrin.h 2014-11-09 13:23:54.095388098 +0100
> @@ -38,9 +38,13 @@
> typedef double __v8df __attribute__ ((__vector_size__ (64)));
> typedef float __v16sf __attribute__ ((__vector_size__ (64)));
> typedef long long __v8di __attribute__ ((__vector_size__ (64)));
> +typedef unsigned long long __v8du __attribute__ ((__vector_size__ (64)));
> typedef int __v16si __attribute__ ((__vector_size__ (64)));
> +typedef unsigned int __v16su __attribute__ ((__vector_size__ (64)));
> typedef short __v32hi __attribute__ ((__vector_size__ (64)));
> +typedef unsigned short __v32hu __attribute__ ((__vector_size__ (64)));
> typedef char __v64qi __attribute__ ((__vector_size__ (64)));
> +typedef unsigned char __v64qu __attribute__ ((__vector_size__ (64)));
>
> /* The Intel API is flexible enough that we must allow aliasing with other
> vector types, and their scalar components. */
> @@ -515,11 +519,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_mullo_epi32 (__m512i __A, __m512i __B)
> {
> - return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
> - (__v16si) __B,
> - (__v16si)
> - _mm512_undefined_si512 (),
> - (__mmask16) -1);
> + return (__m512i) ((__v16su) __A * (__v16su) __B);
> }
>
> extern __inline __m512i
> @@ -642,11 +642,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_add_epi64 (__m512i __A, __m512i __B)
> {
> - return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
> - (__v8di) __B,
> - (__v8di)
> - _mm512_undefined_si512 (),
> - (__mmask8) -1);
> + return (__m512i) ((__v8du) __A + (__v8du) __B);
> }
>
> extern __inline __m512i
> @@ -674,11 +670,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_sub_epi64 (__m512i __A, __m512i __B)
> {
> - return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
> - (__v8di) __B,
> - (__v8di)
> - _mm512_undefined_pd (),
> - (__mmask8) -1);
> + return (__m512i) ((__v8du) __A - (__v8du) __B);
> }
>
> extern __inline __m512i
> @@ -802,11 +794,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_add_epi32 (__m512i __A, __m512i __B)
> {
> - return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
> - (__v16si) __B,
> - (__v16si)
> - _mm512_undefined_si512 (),
> - (__mmask16) -1);
> + return (__m512i) ((__v16su) __A + (__v16su) __B);
> }
>
> extern __inline __m512i
> @@ -865,11 +853,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_sub_epi32 (__m512i __A, __m512i __B)
> {
> - return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
> - (__v16si) __B,
> - (__v16si)
> - _mm512_undefined_si512 (),
> - (__mmask16) -1);
> + return (__m512i) ((__v16su) __A - (__v16su) __B);
> }
>
> extern __inline __m512i
> @@ -6797,22 +6781,14 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_or_si512 (__m512i __A, __m512i __B)
> {
> - return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A,
> - (__v16si) __B,
> - (__v16si)
> - _mm512_undefined_si512 (),
> - (__mmask16) -1);
> + return (__m512i) ((__v16su) __A | (__v16su) __B);
> }
>
> extern __inline __m512i
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_or_epi32 (__m512i __A, __m512i __B)
> {
> - return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A,
> - (__v16si) __B,
> - (__v16si)
> - _mm512_undefined_si512 (),
> - (__mmask16) -1);
> + return (__m512i) ((__v16su) __A | (__v16su) __B);
> }
>
> extern __inline __m512i
> @@ -6840,11 +6816,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_or_epi64 (__m512i __A, __m512i __B)
> {
> - return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __A,
> - (__v8di) __B,
> - (__v8di)
> - _mm512_undefined_si512 (),
> - (__mmask8) -1);
> + return (__m512i) ((__v8du) __A | (__v8du) __B);
> }
>
> extern __inline __m512i
> @@ -6872,22 +6844,14 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_xor_si512 (__m512i __A, __m512i __B)
> {
> - return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A,
> - (__v16si) __B,
> - (__v16si)
> - _mm512_undefined_si512 (),
> - (__mmask16) -1);
> + return (__m512i) ((__v16su) __A ^ (__v16su) __B);
> }
>
> extern __inline __m512i
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_xor_epi32 (__m512i __A, __m512i __B)
> {
> - return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A,
> - (__v16si) __B,
> - (__v16si)
> - _mm512_undefined_si512 (),
> - (__mmask16) -1);
> + return (__m512i) ((__v16su) __A ^ (__v16su) __B);
> }
>
> extern __inline __m512i
> @@ -6915,11 +6879,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_xor_epi64 (__m512i __A, __m512i __B)
> {
> - return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A,
> - (__v8di) __B,
> - (__v8di)
> - _mm512_undefined_si512 (),
> - (__mmask8) -1);
> + return (__m512i) ((__v8du) __A ^ (__v8du) __B);
> }
>
> extern __inline __m512i
> @@ -7128,22 +7088,14 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_and_si512 (__m512i __A, __m512i __B)
> {
> - return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A,
> - (__v16si) __B,
> - (__v16si)
> - _mm512_undefined_si512 (),
> - (__mmask16) -1);
> + return (__m512i) ((__v16su) __A & (__v16su) __B);
> }
>
> extern __inline __m512i
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_and_epi32 (__m512i __A, __m512i __B)
> {
> - return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A,
> - (__v16si) __B,
> - (__v16si)
> - _mm512_undefined_si512 (),
> - (__mmask16) -1);
> + return (__m512i) ((__v16su) __A & (__v16su) __B);
> }
>
> extern __inline __m512i
> @@ -7171,11 +7123,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_and_epi64 (__m512i __A, __m512i __B)
> {
> - return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __A,
> - (__v8di) __B,
> - (__v8di)
> - _mm512_undefined_si512 (),
> - (__mmask8) -1);
> + return (__m512i) ((__v8du) __A & (__v8du) __B);
> }
>
> extern __inline __m512i
> @@ -10749,12 +10697,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_add_pd (__m512d __A, __m512d __B)
> {
> - return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
> - (__v8df) __B,
> - (__v8df)
> - _mm512_undefined_pd (),
> - (__mmask8) -1,
> - _MM_FROUND_CUR_DIRECTION);
> + return (__m512d) ((__v8df)__A + (__v8df)__B);
> }
>
> extern __inline __m512d
> @@ -10784,12 +10727,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_add_ps (__m512 __A, __m512 __B)
> {
> - return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
> - (__v16sf) __B,
> - (__v16sf)
> - _mm512_undefined_ps (),
> - (__mmask16) -1,
> - _MM_FROUND_CUR_DIRECTION);
> + return (__m512) ((__v16sf)__A + (__v16sf)__B);
> }
>
> extern __inline __m512
> @@ -10819,12 +10757,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_sub_pd (__m512d __A, __m512d __B)
> {
> - return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
> - (__v8df) __B,
> - (__v8df)
> - _mm512_undefined_pd (),
> - (__mmask8) -1,
> - _MM_FROUND_CUR_DIRECTION);
> + return (__m512d) ((__v8df)__A - (__v8df)__B);
> }
>
> extern __inline __m512d
> @@ -10854,12 +10787,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_sub_ps (__m512 __A, __m512 __B)
> {
> - return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
> - (__v16sf) __B,
> - (__v16sf)
> - _mm512_undefined_ps (),
> - (__mmask16) -1,
> - _MM_FROUND_CUR_DIRECTION);
> + return (__m512) ((__v16sf)__A - (__v16sf)__B);
> }
>
> extern __inline __m512
> @@ -10889,12 +10817,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_mul_pd (__m512d __A, __m512d __B)
> {
> - return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
> - (__v8df) __B,
> - (__v8df)
> - _mm512_undefined_pd (),
> - (__mmask8) -1,
> - _MM_FROUND_CUR_DIRECTION);
> + return (__m512d) ((__v8df)__A * (__v8df)__B);
> }
>
> extern __inline __m512d
> @@ -10924,12 +10847,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_mul_ps (__m512 __A, __m512 __B)
> {
> - return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
> - (__v16sf) __B,
> - (__v16sf)
> - _mm512_undefined_ps (),
> - (__mmask16) -1,
> - _MM_FROUND_CUR_DIRECTION);
> + return (__m512) ((__v16sf)__A * (__v16sf)__B);
> }
>
> extern __inline __m512
> @@ -10959,12 +10877,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_div_pd (__m512d __M, __m512d __V)
> {
> - return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
> - (__v8df) __V,
> - (__v8df)
> - _mm512_undefined_pd (),
> - (__mmask8) -1,
> - _MM_FROUND_CUR_DIRECTION);
> + return (__m512d) ((__v8df)__M / (__v8df)__V);
> }
>
> extern __inline __m512d
> @@ -10994,12 +10907,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm512_div_ps (__m512 __A, __m512 __B)
> {
> - return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
> - (__v16sf) __B,
> - (__v16sf)
> - _mm512_undefined_ps (),
> - (__mmask16) -1,
> - _MM_FROUND_CUR_DIRECTION);
> + return (__m512) ((__v16sf)__A / (__v16sf)__B);
> }
>
> extern __inline __m512
> diff -ru -N -x .svn trunk/gcc/config/i386/avx512vldqintrin.h intrin/gcc/config/i386/avx512vldqintrin.h
> --- trunk/gcc/config/i386/avx512vldqintrin.h 2014-11-10 13:09:34.727589835 +0100
> +++ intrin/gcc/config/i386/avx512vldqintrin.h 2014-11-11 15:30:32.090155679 +0100
> @@ -544,11 +544,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm256_mullo_epi64 (__m256i __A, __m256i __B)
> {
> - return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
> - (__v4di) __B,
> - (__v4di)
> - _mm256_setzero_si256 (),
> - (__mmask8) -1);
> + return (__m256i) ((__v4du) __A * (__v4du) __B);
> }
>
> extern __inline __m256i
> @@ -577,11 +573,7 @@
> __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> _mm_mullo_epi64 (__m128i __A, __m128i __B)
> {
> - return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
> - (__v2di) __B,
> - (__v2di)
> - _mm_setzero_di (),
> - (__mmask8) -1);
> + return (__m128i) ((__v2du) __A * (__v2du) __B);
> }
>
> extern __inline __m128i
> diff -ru -N -x .svn trunk/gcc/config/i386/avxintrin.h intrin/gcc/config/i386/avxintrin.h
> --- trunk/gcc/config/i386/avxintrin.h 2014-04-01 07:34:06.339879012 +0200
> +++ intrin/gcc/config/i386/avxintrin.h 2014-11-08 06:55:45.615292438 +0100
> @@ -41,9 +41,13 @@
> typedef double __v4df __attribute__ ((__vector_size__ (32)));
> typedef float __v8sf __attribute__ ((__vector_size__ (32)));
> typedef long long __v4di __attribute__ ((__vector_size__ (32)));
> +typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
> typedef int __v8si __attribute__ ((__vector_size__ (32)));
> +typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
> typedef short __v16hi __attribute__ ((__vector_size__ (32)));
> +typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
> typedef char __v32qi __attribute__ ((__vector_size__ (32)));
> +typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
>
> /* The Intel API is flexible enough that we must allow aliasing with other
> vector types, and their scalar components. */
> @@ -124,13 +128,13 @@
> extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm256_add_pd (__m256d __A, __m256d __B)
> {
> - return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
> + return (__m256d) ((__v4df)__A + (__v4df)__B);
> }
>
> extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm256_add_ps (__m256 __A, __m256 __B)
> {
> - return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
> + return (__m256) ((__v8sf)__A + (__v8sf)__B);
> }
>
> extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> @@ -218,13 +222,13 @@
> extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm256_div_pd (__m256d __A, __m256d __B)
> {
> - return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
> + return (__m256d) ((__v4df)__A / (__v4df)__B);
> }
>
> extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm256_div_ps (__m256 __A, __m256 __B)
> {
> - return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
> + return (__m256) ((__v8sf)__A / (__v8sf)__B);
> }
>
> /* Dot product instructions with mask-defined summing and zeroing parts
> @@ -295,13 +299,13 @@
> extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm256_mul_pd (__m256d __A, __m256d __B)
> {
> - return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
> + return (__m256d) ((__v4df)__A * (__v4df)__B);
> }
>
> extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm256_mul_ps (__m256 __A, __m256 __B)
> {
> - return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
> + return (__m256) ((__v8sf)__A * (__v8sf)__B);
> }
>
> extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> @@ -343,13 +347,13 @@
> extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm256_sub_pd (__m256d __A, __m256d __B)
> {
> - return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
> + return (__m256d) ((__v4df)__A - (__v4df)__B);
> }
>
> extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm256_sub_ps (__m256 __A, __m256 __B)
> {
> - return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
> + return (__m256) ((__v8sf)__A - (__v8sf)__B);
> }
>
> extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> diff -ru -N -x .svn trunk/gcc/config/i386/emmintrin.h intrin/gcc/config/i386/emmintrin.h
> --- trunk/gcc/config/i386/emmintrin.h 2014-04-01 07:34:06.335878860 +0200
> +++ intrin/gcc/config/i386/emmintrin.h 2014-11-09 13:43:42.213157229 +0100
> @@ -39,9 +39,13 @@
> /* SSE2 */
> typedef double __v2df __attribute__ ((__vector_size__ (16)));
> typedef long long __v2di __attribute__ ((__vector_size__ (16)));
> +typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
> typedef int __v4si __attribute__ ((__vector_size__ (16)));
> +typedef unsigned int __v4su __attribute__ ((__vector_size__ (16)));
> typedef short __v8hi __attribute__ ((__vector_size__ (16)));
> +typedef unsigned short __v8hu __attribute__ ((__vector_size__ (16)));
> typedef char __v16qi __attribute__ ((__vector_size__ (16)));
> +typedef unsigned char __v16qu __attribute__ ((__vector_size__ (16)));
>
> /* The Intel API is flexible enough that we must allow aliasing with other
> vector types, and their scalar components. */
> @@ -168,13 +172,13 @@
> extern __inline void __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_store_sd (double *__P, __m128d __A)
> {
> - *__P = __builtin_ia32_vec_ext_v2df (__A, 0);
> + *__P = ((__v2df)__A)[0];
> }
>
> extern __inline double __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_cvtsd_f64 (__m128d __A)
> {
> - return __builtin_ia32_vec_ext_v2df (__A, 0);
> + return ((__v2df)__A)[0];
> }
>
> extern __inline void __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> @@ -187,7 +191,7 @@
> extern __inline void __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_storeh_pd (double *__P, __m128d __A)
> {
> - *__P = __builtin_ia32_vec_ext_v2df (__A, 1);
> + *__P = ((__v2df)__A)[1];
> }
>
> /* Store the lower DPFP value across two words.
> @@ -222,21 +226,21 @@
> extern __inline long long __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_cvtsi128_si64 (__m128i __A)
> {
> - return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
> + return ((__v2di)__A)[0];
> }
>
> /* Microsoft intrinsic. */
> extern __inline long long __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_cvtsi128_si64x (__m128i __A)
> {
> - return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
> + return ((__v2di)__A)[0];
> }
> #endif
>
> extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_add_pd (__m128d __A, __m128d __B)
> {
> - return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
> + return (__m128d) ((__v2df)__A + (__v2df)__B);
> }
>
> extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> @@ -248,7 +252,7 @@
> extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_sub_pd (__m128d __A, __m128d __B)
> {
> - return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
> + return (__m128d) ((__v2df)__A - (__v2df)__B);
> }
>
> extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> @@ -260,7 +264,7 @@
> extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_mul_pd (__m128d __A, __m128d __B)
> {
> - return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
> + return (__m128d) ((__v2df)__A * (__v2df)__B);
> }
>
> extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> @@ -272,7 +276,7 @@
> extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_div_pd (__m128d __A, __m128d __B)
> {
> - return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
> + return (__m128d) ((__v2df)__A / (__v2df)__B);
> }
>
> extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> @@ -287,7 +291,7 @@
> return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
> }
>
> -/* Return pair {sqrt (A[0), B[1]}. */
> +/* Return pair {sqrt (B[0]), A[1]}. */
> extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_sqrt_sd (__m128d __A, __m128d __B)
> {
> @@ -715,13 +719,13 @@
> extern __inline void __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_storel_epi64 (__m128i *__P, __m128i __B)
> {
> - *(long long *)__P = __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
> + *(long long *)__P = ((__v2di)__B)[0];
> }
>
> extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_movepi64_pi64 (__m128i __B)
> {
> - return (__m64) __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
> + return (__m64) ((__v2di)__B)[0];
> }
>
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> @@ -1006,25 +1010,25 @@
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_add_epi8 (__m128i __A, __m128i __B)
> {
> - return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B);
> + return (__m128i) ((__v16qu)__A + (__v16qu)__B);
> }
>
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_add_epi16 (__m128i __A, __m128i __B)
> {
> - return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B);
> + return (__m128i) ((__v8hu)__A + (__v8hu)__B);
> }
>
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_add_epi32 (__m128i __A, __m128i __B)
> {
> - return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B);
> + return (__m128i) ((__v4su)__A + (__v4su)__B);
> }
>
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_add_epi64 (__m128i __A, __m128i __B)
> {
> - return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B);
> + return (__m128i) ((__v2du)__A + (__v2du)__B);
> }
>
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> @@ -1054,25 +1058,25 @@
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_sub_epi8 (__m128i __A, __m128i __B)
> {
> - return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B);
> + return (__m128i) ((__v16qu)__A - (__v16qu)__B);
> }
>
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_sub_epi16 (__m128i __A, __m128i __B)
> {
> - return (__m128i)__builtin_ia32_psubw128 ((__v8hi)__A, (__v8hi)__B);
> + return (__m128i) ((__v8hu)__A - (__v8hu)__B);
> }
>
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_sub_epi32 (__m128i __A, __m128i __B)
> {
> - return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, (__v4si)__B);
> + return (__m128i) ((__v4su)__A - (__v4su)__B);
> }
>
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_sub_epi64 (__m128i __A, __m128i __B)
> {
> - return (__m128i)__builtin_ia32_psubq128 ((__v2di)__A, (__v2di)__B);
> + return (__m128i) ((__v2du)__A - (__v2du)__B);
> }
>
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> @@ -1114,7 +1118,7 @@
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_mullo_epi16 (__m128i __A, __m128i __B)
> {
> - return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B);
> + return (__m128i) ((__v8hu)__A * (__v8hu)__B);
> }
>
> extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> @@ -1247,7 +1251,7 @@
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_and_si128 (__m128i __A, __m128i __B)
> {
> - return (__m128i)__builtin_ia32_pand128 ((__v2di)__A, (__v2di)__B);
> + return (__m128i) ((__v2du)__A & (__v2du)__B);
> }
>
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> @@ -1259,67 +1263,67 @@
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_or_si128 (__m128i __A, __m128i __B)
> {
> - return (__m128i)__builtin_ia32_por128 ((__v2di)__A, (__v2di)__B);
> + return (__m128i) ((__v2du)__A | (__v2du)__B);
> }
>
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_xor_si128 (__m128i __A, __m128i __B)
> {
> - return (__m128i)__builtin_ia32_pxor128 ((__v2di)__A, (__v2di)__B);
> + return (__m128i) ((__v2du)__A ^ (__v2du)__B);
> }
>
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_cmpeq_epi8 (__m128i __A, __m128i __B)
> {
> - return (__m128i)__builtin_ia32_pcmpeqb128 ((__v16qi)__A, (__v16qi)__B);
> + return (__m128i) ((__v16qi)__A == (__v16qi)__B);
> }
>
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_cmpeq_epi16 (__m128i __A, __m128i __B)
> {
> - return (__m128i)__builtin_ia32_pcmpeqw128 ((__v8hi)__A, (__v8hi)__B);
> + return (__m128i) ((__v8hi)__A == (__v8hi)__B);
> }
>
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_cmpeq_epi32 (__m128i __A, __m128i __B)
> {
> - return (__m128i)__builtin_ia32_pcmpeqd128 ((__v4si)__A, (__v4si)__B);
> + return (__m128i) ((__v4si)__A == (__v4si)__B);
> }
>
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_cmplt_epi8 (__m128i __A, __m128i __B)
> {
> - return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__B, (__v16qi)__A);
> + return (__m128i) ((__v16qi)__A < (__v16qi)__B);
> }
>
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_cmplt_epi16 (__m128i __A, __m128i __B)
> {
> - return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__B, (__v8hi)__A);
> + return (__m128i) ((__v8hi)__A < (__v8hi)__B);
> }
>
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_cmplt_epi32 (__m128i __A, __m128i __B)
> {
> - return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__B, (__v4si)__A);
> + return (__m128i) ((__v4si)__A < (__v4si)__B);
> }
>
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_cmpgt_epi8 (__m128i __A, __m128i __B)
> {
> - return (__m128i)__builtin_ia32_pcmpgtb128 ((__v16qi)__A, (__v16qi)__B);
> + return (__m128i) ((__v16qi)__A > (__v16qi)__B);
> }
>
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_cmpgt_epi16 (__m128i __A, __m128i __B)
> {
> - return (__m128i)__builtin_ia32_pcmpgtw128 ((__v8hi)__A, (__v8hi)__B);
> + return (__m128i) ((__v8hi)__A > (__v8hi)__B);
> }
>
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_cmpgt_epi32 (__m128i __A, __m128i __B)
> {
> - return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__A, (__v4si)__B);
> + return (__m128i) ((__v4si)__A > (__v4si)__B);
> }
>
> #ifdef __OPTIMIZE__
> diff -ru -N -x .svn trunk/gcc/config/i386/smmintrin.h intrin/gcc/config/i386/smmintrin.h
> --- trunk/gcc/config/i386/smmintrin.h 2014-01-03 11:39:01.159907676 +0100
> +++ intrin/gcc/config/i386/smmintrin.h 2014-11-09 13:45:01.300194921 +0100
> @@ -267,7 +267,7 @@
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_cmpeq_epi64 (__m128i __X, __m128i __Y)
> {
> - return (__m128i) __builtin_ia32_pcmpeqq ((__v2di)__X, (__v2di)__Y);
> + return (__m128i) ((__v2di)__X == (__v2di)__Y);
> }
>
> /* Min/max packed integer instructions. */
> @@ -325,7 +325,7 @@
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_mullo_epi32 (__m128i __X, __m128i __Y)
> {
> - return (__m128i) __builtin_ia32_pmulld128 ((__v4si)__X, (__v4si)__Y);
> + return (__m128i) ((__v4su)__X * (__v4su)__Y);
> }
>
> /* Packed integer 32-bit multiplication of 2 pairs of operands
> @@ -795,7 +795,7 @@
> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
> {
> - return (__m128i) __builtin_ia32_pcmpgtq ((__v2di)__X, (__v2di)__Y);
> + return (__m128i) ((__v2di)__X > (__v2di)__Y);
> }
>
> #ifdef __DISABLE_SSE4_2__
> diff -ru -N -x .svn trunk/gcc/config/i386/xmmintrin.h intrin/gcc/config/i386/xmmintrin.h
> --- trunk/gcc/config/i386/xmmintrin.h 2014-06-29 20:12:33.675949023 +0200
> +++ intrin/gcc/config/i386/xmmintrin.h 2014-10-17 19:53:06.711416455 +0200
> @@ -180,25 +180,25 @@
> extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_add_ps (__m128 __A, __m128 __B)
> {
> - return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
> + return (__m128) ((__v4sf)__A + (__v4sf)__B);
> }
>
> extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_sub_ps (__m128 __A, __m128 __B)
> {
> - return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
> + return (__m128) ((__v4sf)__A - (__v4sf)__B);
> }
>
> extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_mul_ps (__m128 __A, __m128 __B)
> {
> - return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
> + return (__m128) ((__v4sf)__A * (__v4sf)__B);
> }
>
> extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_div_ps (__m128 __A, __m128 __B)
> {
> - return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
> + return (__m128) ((__v4sf)__A / (__v4sf)__B);
> }
>
> extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> @@ -957,13 +957,13 @@
> extern __inline void __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_store_ss (float *__P, __m128 __A)
> {
> - *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
> + *__P = ((__v4sf)__A)[0];
> }
>
> extern __inline float __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
> _mm_cvtss_f32 (__m128 __A)
> {
> - return __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
> + return ((__v4sf)__A)[0];
> }
>
> /* Store four SPFP values. The address must be 16-byte aligned. */
> diff -ru -N -x .svn trunk/gcc/testsuite/gcc.target/i386/intrinsics_opt-1.c intrin/gcc/testsuite/gcc.target/i386/intrinsics_opt-1.c
> --- trunk/gcc/testsuite/gcc.target/i386/intrinsics_opt-1.c 1970-01-01 01:00:00.000000000 +0100
> +++ intrin/gcc/testsuite/gcc.target/i386/intrinsics_opt-1.c 2014-10-10 14:07:06.983348081 +0200
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mfma" } */
> +
> +#include <emmintrin.h>
> +
> +__m128d myfma(__m128d x, __m128d y, __m128d z){
> + __m128d m = _mm_mul_pd (x, y);
> + return _mm_add_pd (m, z);
> +}
> +
> +/* { dg-final { scan-assembler "vfmadd" } } */
> diff -ru -N -x .svn trunk/gcc/testsuite/gcc.target/i386/intrinsics_opt-2.c intrin/gcc/testsuite/gcc.target/i386/intrinsics_opt-2.c
> --- trunk/gcc/testsuite/gcc.target/i386/intrinsics_opt-2.c 1970-01-01 01:00:00.000000000 +0100
> +++ intrin/gcc/testsuite/gcc.target/i386/intrinsics_opt-2.c 2014-10-10 14:07:06.983348081 +0200
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O -ffast-math -msse2 -fdump-tree-optimized" } */
> +
> +#include <emmintrin.h>
> +
> +int f(__m128d x){
> + x = _mm_sub_pd (x, x);
> + x = _mm_mul_pd (x, x);
> + double r = 42;
> + _mm_storeh_pd (&r, x);
> + int z = r == 0;
> + return __builtin_constant_p (z) && z;
> +}
> +
> +/* { dg-final { scan-tree-dump "return 1;" "optimized" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */
> diff -ru -N -x .svn trunk/gcc/testsuite/gcc.target/i386/intrinsics_opt-3.c intrin/gcc/testsuite/gcc.target/i386/intrinsics_opt-3.c
> --- trunk/gcc/testsuite/gcc.target/i386/intrinsics_opt-3.c 1970-01-01 01:00:00.000000000 +0100
> +++ intrin/gcc/testsuite/gcc.target/i386/intrinsics_opt-3.c 2014-10-12 22:28:46.603705558 +0200
> @@ -0,0 +1,13 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O -msse2" } */
> +
> +#include <emmintrin.h>
> +
> +double f(){
> + __m128d x = _mm_set1_pd (0.);
> + double r = 42;
> + _mm_storeh_pd (&r, x);
> + return r;
> +}
> +
> +/* { dg-final { scan-assembler-not "unpckhpd" } } */
> diff -ru -N -x .svn trunk/gcc/testsuite/gcc.target/i386/intrinsics_opt-4.c intrin/gcc/testsuite/gcc.target/i386/intrinsics_opt-4.c
> --- trunk/gcc/testsuite/gcc.target/i386/intrinsics_opt-4.c 1970-01-01 01:00:00.000000000 +0100
> +++ intrin/gcc/testsuite/gcc.target/i386/intrinsics_opt-4.c 2014-10-12 22:28:52.699931988 +0200
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O -ffast-math -msse2" } */
> +
> +#include <emmintrin.h>
> +
> +__m128d f(__m128d x, __m128d y, __m128d z){
> + y = _mm_add_pd (x, y);
> + y = _mm_add_pd (z, y);
> + return _mm_sub_pd (y, x);
> +}
> +
> +/* { dg-final { scan-assembler-not "subpd" } } */
>
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2014-11-12 7:26 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-11-11 15:11 [x86, merge] Replace builtins with vector extensions Marc Glisse
2014-11-12 8:14 ` Uros Bizjak
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).