* [rs6000] PR89338, PR89339: Fix compat vector intrinsics for BE and 32-bit
@ 2019-02-19 16:10 Paul Clarke
2019-02-19 17:01 ` Segher Boessenkool
0 siblings, 1 reply; 2+ messages in thread
From: Paul Clarke @ 2019-02-19 16:10 UTC (permalink / raw)
To: gcc-patches, Segher Boessenkool
Test FAILS: sse2-cvtpd2dq-1, sse2-cvtpd2ps, sse2-cvttpd2dq on powerpc64
(big-endian).
_mm_cvtpd_epi32, _mm_cvtpd_ps, _mm_cvttpd_epi32: Type conversion from
vector doubleword type to vector word type leaves the results in even
lanes in big-endian mode.
Test FAILS: sse-cvtss2si-1, sse-cvtss2si-2, sse-movmskb-1 on powerpc
(32-bit big-endian).
Incorrect type for interpreting the result from mfvsrd instruction leads
to incorrect results. Also, mfvsrd instruction only works as expected in
64-bit mode or for 32-bit quantities in 32-bit mode. A more general,
if slower, solution is needed for 32-bit mode.
2019-02-19 Paul A. Clarke <pc@us.ibm.com>
[gcc]
* config/rs6000/emmintrin.h (_mm_cvtpd_epi32): Fix big endian.
(_mm_cvtpd_ps): Likewise.
(_mm_cvttpd_epi32): Likewise.
PR89338
* config/rs6000/xmmintrin.h (_mm_cvtss_si32): Fix type mismatch.
(_mm_cvtss_si64): Fix type mismatch and 32-bit.
PR89339
* config/rs6000/xmmintrin.h (_mm_movemask_pi8): Fix 32-bit.
Index: gcc/config/rs6000/emmintrin.h
===================================================================
diff --git a/trunk/gcc/config/rs6000/emmintrin.h b/trunk/gcc/config/rs6000/emmintrin.h
--- a/trunk/gcc/config/rs6000/emmintrin.h (revision 268997)
+++ b/trunk/gcc/config/rs6000/emmintrin.h (working copy)
@@ -887,7 +887,11 @@ _mm_cvtpd_epi32 (__m128d __A)
: );
#ifdef _ARCH_PWR8
+#ifdef __LITTLE_ENDIAN__
temp = vec_mergeo (temp, temp);
+#else
+ temp = vec_mergee (temp, temp);
+#endif
result = (__v4si) vec_vpkudum ((__vector long long) temp,
(__vector long long) vzero);
#else
@@ -922,7 +926,11 @@ _mm_cvtpd_ps (__m128d __A)
: );
#ifdef _ARCH_PWR8
+#ifdef __LITTLE_ENDIAN__
temp = vec_mergeo (temp, temp);
+#else
+ temp = vec_mergee (temp, temp);
+#endif
result = (__v4sf) vec_vpkudum ((__vector long long) temp,
(__vector long long) vzero);
#else
@@ -951,7 +959,11 @@ _mm_cvttpd_epi32 (__m128d __A)
: );
#ifdef _ARCH_PWR8
+#ifdef __LITTLE_ENDIAN__
temp = vec_mergeo (temp, temp);
+#else
+ temp = vec_mergee (temp, temp);
+#endif
result = (__v4si) vec_vpkudum ((__vector long long) temp,
(__vector long long) vzero);
#else
Index: gcc/config/rs6000/xmmintrin.h
===================================================================
diff --git a/trunk/gcc/config/rs6000/xmmintrin.h b/trunk/gcc/config/rs6000/xmmintrin.h
--- a/trunk/gcc/config/rs6000/xmmintrin.h (revision 268997)
+++ b/trunk/gcc/config/rs6000/xmmintrin.h (working copy)
@@ -905,7 +905,7 @@ _mm_cvtss_f32 (__m128 __A)
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si32 (__m128 __A)
{
- __m64 res = 0;
+ int res;
#ifdef _ARCH_PWR8
double dtmp;
__asm__(
@@ -938,8 +938,8 @@ _mm_cvt_ss2si (__m128 __A)
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64 (__m128 __A)
{
- __m64 res = 0;
-#ifdef _ARCH_PWR8
+ long long res;
+#if defined (_ARCH_PWR8) && defined (__powerpc64__)
double dtmp;
__asm__(
#ifdef __LITTLE_ENDIAN__
@@ -1577,6 +1577,7 @@ _m_pminub (__m64 __A, __m64 __B)
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
+#ifdef __powerpc64__
unsigned long long p =
#ifdef __LITTLE_ENDIAN__
0x0008101820283038UL; // permute control for sign bits
@@ -1584,6 +1585,18 @@ _mm_movemask_pi8 (__m64 __A)
0x3830282018100800UL; // permute control for sign bits
#endif
return __builtin_bpermd (p, __A);
+#else
+ vector unsigned char A = (vector unsigned char)
+ (vector unsigned long long) { 0, __A };
+ vector unsigned char mask = {
+ 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00,
+ 0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40
+ };
+ vector unsigned long long r = (vector unsigned long long)
+ vec_bperm (A, mask);
+ return r[0];
+#endif
+
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
^ permalink raw reply [flat|nested] 2+ messages in thread
* Re: [rs6000] PR89338, PR89339: Fix compat vector intrinsics for BE and 32-bit
2019-02-19 16:10 [rs6000] PR89338, PR89339: Fix compat vector intrinsics for BE and 32-bit Paul Clarke
@ 2019-02-19 17:01 ` Segher Boessenkool
0 siblings, 0 replies; 2+ messages in thread
From: Segher Boessenkool @ 2019-02-19 17:01 UTC (permalink / raw)
To: Paul Clarke; +Cc: gcc-patches
Hi Paul,
On Tue, Feb 19, 2019 at 10:10:38AM -0600, Paul Clarke wrote:
> Incorrect type for interpreting the result from mfvsrd instruction leads
> to incorrect results. Also, mfvsrd instruction only works as expected in
> 64-bit mode or for 32-bit quantities in 32-bit mode. A more general,
> if slower, solution is needed for 32-bit mode.
You cannot use 64-bit registers in 32-bit mode on Linux, yes.
> @@ -1577,6 +1577,7 @@ _m_pminub (__m64 __A, __m64 __B)
> extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> _mm_movemask_pi8 (__m64 __A)
> {
> +#ifdef __powerpc64__
> unsigned long long p =
> #ifdef __LITTLE_ENDIAN__
> 0x0008101820283038UL; // permute control for sign bits
> @@ -1584,6 +1585,18 @@ _mm_movemask_pi8 (__m64 __A)
> 0x3830282018100800UL; // permute control for sign bits
> #endif
> return __builtin_bpermd (p, __A);
> +#else
> + vector unsigned char A = (vector unsigned char)
> + (vector unsigned long long) { 0, __A };
> + vector unsigned char mask = {
> + 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00,
> + 0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40
> + };
> + vector unsigned long long r = (vector unsigned long long)
> + vec_bperm (A, mask);
> + return r[0];
> +#endif
Wow, how inelegant. Not that splitting the word into two and doing two
__builtin_bpermd will be much better :-/
Okay for trunk. Thanks!
Segher
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2019-02-19 17:01 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-02-19 16:10 [rs6000] PR89338, PR89339: Fix compat vector intrinsics for BE and 32-bit Paul Clarke
2019-02-19 17:01 ` Segher Boessenkool
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).