public inbox for gcc-bugs@sourceware.org help / color / mirror / Atom feed
From: "john_platts at hotmail dot com" <gcc-bugzilla@gcc.gnu.org> To: gcc-bugs@gcc.gnu.org Subject: [Bug target/109069] Vector truncation test program produces incorrect result on big-endian powerpc64-linux-gnu with -mcpu=power10 -O2 Date: Thu, 09 Mar 2023 15:41:36 +0000 [thread overview] Message-ID: <bug-109069-4-eqzgnH0hkq@http.gcc.gnu.org/bugzilla/> (raw) In-Reply-To: <bug-109069-4@http.gcc.gnu.org/bugzilla/> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109069 --- Comment #3 from John Platts <john_platts at hotmail dot com> --- Here is another test program that reproduces the vector truncation test issue: #pragma push_macro("vector") #pragma push_macro("pixel") #pragma push_macro("bool") #undef vector #undef pixel #undef bool #include <altivec.h> #pragma pop_macro("vector") #pragma pop_macro("pixel") #pragma pop_macro("bool") #include <stdint.h> #include <stddef.h> #include <iostream> #include <string_view> #include <limits> #include <type_traits> template<size_t LaneSize> struct AltivecTypes { }; template<> struct AltivecTypes<1> { using UnsignedLaneT = unsigned char; using SignedLaneT = signed char; using UnsignedVectT = __vector unsigned char; using SignedVectT = __vector signed char; using BoolVectT = __vector __bool char; }; template<> struct AltivecTypes<2> { using UnsignedLaneT = unsigned short; using SignedLaneT = signed short; using UnsignedVectT = __vector unsigned short; using SignedVectT = __vector signed short; using BoolVectT = __vector __bool short; }; template<> struct AltivecTypes<4> { using UnsignedLaneT = unsigned int; using SignedLaneT = signed int; using FloatLaneT = float; using UnsignedVectT = __vector unsigned int; using SignedVectT = __vector signed int; using BoolVectT = __vector __bool int; using FloatVectT = __vector float; }; template<> struct AltivecTypes<8> { using UnsignedLaneT = unsigned long long; using SignedLaneT = signed long long; using FloatLaneT = double; using UnsignedVectT = __vector unsigned long long; using SignedVectT 
= __vector signed long long; using BoolVectT = __vector __bool long long; using FloatVectT = __vector double; }; template<class T, bool = std::is_signed_v<T>, bool = std::is_integral_v<T>, bool = std::is_floating_point_v<T>, class = void> struct MakeAltivecVectorType { }; template<class T> struct MakeAltivecVectorType<T, true, true, false, std::void_t<typename AltivecTypes<sizeof(T)>::SignedVectT>> { using type = typename AltivecTypes<sizeof(T)>::SignedVectT; }; template<class T> struct MakeAltivecVectorType<T, false, true, false, std::void_t<typename AltivecTypes<sizeof(T)>::UnsignedVectT>> { using type = typename AltivecTypes<sizeof(T)>::UnsignedVectT; }; template<class T> struct MakeAltivecVectorType<T, true, false, true, std::void_t<typename AltivecTypes<sizeof(T)>::FloatVectT>> { using type = typename AltivecTypes<sizeof(T)>::FloatVectT; }; template<class T> using AltivecVectorType = typename MakeAltivecVectorType<T>::type; template<size_t N, class T, std::enable_if_t<(sizeof(T) * N <= 8)>* = nullptr> AltivecVectorType<T> LoadVector(const T* __restrict__ src) { using Bits = typename AltivecTypes<(sizeof(T) * N)>::UnsignedLaneT; Bits bits; __builtin_memcpy(&bits, src, sizeof(T) * N); return reinterpret_cast<AltivecVectorType<T>>(vec_splats(bits)); } template<size_t N, class T, std::enable_if_t<(sizeof(T) * N == 16)>* = nullptr> AltivecVectorType<T> LoadVector(const T* __restrict__ src) { using LaneT = std::decay_t<decltype(std::declval<AltivecVectorType<T>>()[0])>; typedef LaneT LoadRawT __attribute__((__vector_size__(16), __aligned__(16), __may_alias__)); const LoadRawT* __restrict__ p = reinterpret_cast<const LoadRawT*>(src); return reinterpret_cast<AltivecVectorType<T>>(*p); } template<size_t N, class T, std::enable_if_t<(sizeof(T) * N <= 8)>* = nullptr> void StoreVector(T* __restrict__ dest, AltivecVectorType<T> vect) { using Bits = typename AltivecTypes<(sizeof(T) * N)>::UnsignedLaneT; typedef Bits BitsVectT __attribute__((__vector_size__(16))); const Bits 
bits = reinterpret_cast<BitsVectT>(vect)[0]; __builtin_memcpy(dest, &bits, sizeof(T) * N); } template<size_t N, class T, std::enable_if_t<(sizeof(T) * N == 16)>* = nullptr> void StoreVector(T* __restrict__ dest, AltivecVectorType<T> vect) { using LaneT = std::decay_t<decltype(std::declval<AltivecVectorType<T>>()[0])>; typedef LaneT StoreRawT __attribute__((__vector_size__(16), __aligned__(16), __may_alias__)); StoreRawT* __restrict__ p = reinterpret_cast<StoreRawT*>(dest); *p = reinterpret_cast<StoreRawT>(vect); } template<class T, size_t N, class T2> AltivecVectorType<T> Iota(const T2 first) { using TU = std::make_unsigned_t<T>; alignas(16) T lanes[N]; for(size_t i = 0; i < N; i++) { lanes[i] = static_cast<T>( (static_cast<TU>(i) + static_cast<TU>(first)) & std::numeric_limits<TU>::max()); } return LoadVector<N>(lanes); } template<class T> AltivecVectorType<T> LoadTestVectToTruncate() { return Iota<T, 1>(uint32_t{0xFA578D00u}); } template<class FromV> AltivecVectorType<uint8_t> AltivecTruncateSingleLaneVectToU8(FromV vect) { using FromLaneT = std::decay_t<decltype(std::declval<FromV>()[0])>; constexpr size_t sizeOfFromLane = sizeof(FromLaneT); static_assert(sizeOfFromLane >= 2, "sizeOfFromLane >= 2 must be true"); #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ return reinterpret_cast<__vector unsigned char>(vect); #else return reinterpret_cast<__vector unsigned char>( vec_sld(vect, vect, sizeof(FromLaneT) - sizeof(unsigned char))); #endif } static __vector unsigned char TruncateU64TestVectToU8() { __vector unsigned char v = AltivecTruncateSingleLaneVectToU8(LoadTestVectToTruncate<uint64_t>()); return v; } static __vector unsigned char TruncateU32TestVectToU8() { __vector unsigned char v = AltivecTruncateSingleLaneVectToU8(LoadTestVectToTruncate<uint32_t>()); return v; } static __vector unsigned char TruncateU16TestVectToU8() { __vector unsigned char v = AltivecTruncateSingleLaneVectToU8(LoadTestVectToTruncate<uint16_t>()); return v; } static __vector unsigned char 
TruncateU64TestVectToU8_2() { __vector unsigned long long u64_v = LoadTestVectToTruncate<uint64_t>(); __asm__("" : "+wa" (u64_v)); return AltivecTruncateSingleLaneVectToU8(u64_v); } static __vector unsigned char TruncateU32TestVectToU8_2() { __vector unsigned int u32_v = LoadTestVectToTruncate<uint32_t>(); __asm__("" : "+wa" (u32_v)); return AltivecTruncateSingleLaneVectToU8(u32_v); } static __vector unsigned char TruncateU16TestVectToU8_2() { __vector unsigned short u16_v = LoadTestVectToTruncate<uint16_t>(); __asm__("" : "+wa" (u16_v)); return AltivecTruncateSingleLaneVectToU8(u16_v); } using namespace std::string_view_literals; template<size_t N> inline void PrintUCharValuesToCout(const unsigned char (&vals)[N]) { using namespace std::string_view_literals; for(size_t i = 0; i < N; i++) { if(i != 0) std::cout << ", "sv; std::cout << static_cast<uint16_t>(vals[i]); } } static void DoTruncateTest(std::string_view testName, AltivecVectorType<uint8_t> (*truncateFunc)()) { alignas(16) unsigned char vals[16]; StoreVector<16>(vals, truncateFunc()); std::cout << "Result of "sv << testName << "():\n {"sv; PrintUCharValuesToCout(vals); std::cout << "}\n"sv; } #define DO_TRUNCATE_TEST(testName) \ DoTruncateTest(#testName ""sv, testName) int main(int argc, char** argv) { DO_TRUNCATE_TEST(TruncateU16TestVectToU8); DO_TRUNCATE_TEST(TruncateU32TestVectToU8); DO_TRUNCATE_TEST(TruncateU64TestVectToU8); DO_TRUNCATE_TEST(TruncateU16TestVectToU8_2); DO_TRUNCATE_TEST(TruncateU32TestVectToU8_2); DO_TRUNCATE_TEST(TruncateU64TestVectToU8_2); return 0; } Here is the expected output of running the above test program on big-endian POWER8/POWER9/POWER10: Result of TruncateU16TestVectToU8(): {0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141} Result of TruncateU32TestVectToU8(): {0, 250, 87, 141, 0, 250, 87, 141, 0, 250, 87, 141, 0, 250, 87, 141} Result of TruncateU64TestVectToU8(): {0, 0, 0, 0, 0, 250, 87, 141, 0, 0, 0, 0, 0, 250, 87, 141} Result of TruncateU16TestVectToU8_2(): 
{0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141} Result of TruncateU32TestVectToU8_2(): {0, 250, 87, 141, 0, 250, 87, 141, 0, 250, 87, 141, 0, 250, 87, 141} Result of TruncateU64TestVectToU8_2(): {0, 0, 0, 0, 0, 250, 87, 141, 0, 0, 0, 0, 0, 250, 87, 141} Here is the actual output of running the above test program on big-endian POWER10 when compiled with the -O2 -mcpu=power10 options: Result of TruncateU16TestVectToU8(): {141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0} Result of TruncateU32TestVectToU8(): {250, 87, 141, 0, 250, 87, 141, 0, 250, 87, 141, 0, 250, 87, 141, 0} Result of TruncateU64TestVectToU8(): {0, 0, 0, 0, 0, 250, 87, 141, 0, 0, 0, 0, 0, 250, 87, 141} Result of TruncateU16TestVectToU8_2(): {0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141} Result of TruncateU32TestVectToU8_2(): {0, 250, 87, 141, 0, 250, 87, 141, 0, 250, 87, 141, 0, 250, 87, 141} Result of TruncateU64TestVectToU8_2(): {0, 0, 0, 0, 0, 250, 87, 141, 0, 0, 0, 0, 0, 250, 87, 141} Here is the assembly code that is generated for the TruncateU16TestVectToU8(), TruncateU16TestVectToU8_2(), TruncateU32TestVectToU8(), and TruncateU32TestVectToU8_2() functions when the above program is compiled with the -mcpu=power10 -O2 options: .L._ZL23TruncateU16TestVectToU8v: .LFB2028: .cfi_startproc xxspltiw 34,2365623552 blr .L._ZL25TruncateU16TestVectToU8_2v: .LFB2031: .cfi_startproc xxspltiw 34,2365623552 vsldoi 2,2,2,1 blr .L._ZL23TruncateU32TestVectToU8v: .LFB2027: .cfi_startproc xxspltiw 34,4200041728 blr .L._ZL25TruncateU32TestVectToU8_2v: .LFB2030: .cfi_startproc xxspltiw 34,4200041728 vsldoi 2,2,2,3 blr The only difference between the TruncateU16TestVectToU8() and TruncateU16TestVectToU8_2() functions is that there is an __asm__("" : "+wa" (u16_v)) inline assembly statement in between the LoadTestVectToTruncate<uint16_t>() and AltivecTruncateSingleLaneVectToU8(u16_v) calls. 
The inline assembly statement in TruncateU16TestVectToU8_2() doesn't change u16_v, but tells the GCC optimizer that u16_v might not be constant, forcing GCC to generate the vsldoi instruction in TruncateU16TestVectToU8_2(). There are similar differences between the TruncateU32TestVectToU8() and TruncateU32TestVectToU8_2() functions on big-endian PPC64. GCC is incorrectly optimizing the TruncateU16TestVectToU8() and TruncateU32TestVectToU8() functions above when the above code is compiled with the -mcpu=power10 -O2 options on the big-endian powerpc64-linux-gnu target.
next prev parent reply other threads:[~2023-03-09 15:41 UTC|newest] Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top 2023-03-08 21:00 [Bug target/109069] New: " john_platts at hotmail dot com 2023-03-08 21:17 ` [Bug target/109069] " john_platts at hotmail dot com 2023-03-09 12:45 ` linkw at gcc dot gnu.org 2023-03-09 15:41 ` john_platts at hotmail dot com [this message] 2023-03-09 15:53 ` john_platts at hotmail dot com 2023-03-09 16:45 ` john_platts at hotmail dot com 2023-03-10 7:03 ` linkw at gcc dot gnu.org 2023-03-10 9:58 ` [Bug target/109069] [12/13 Regression] Vector truncation test program produces incorrect result since r12-6537-g080a06fcb076b3 linkw at gcc dot gnu.org 2023-03-15 10:02 ` rguenth at gcc dot gnu.org 2023-04-26 5:22 ` [Bug target/109069] [12/13/14 " cvs-commit at gcc dot gnu.org 2023-05-08 12:26 ` [Bug target/109069] [12/13 " rguenth at gcc dot gnu.org 2023-05-09 5:18 ` cvs-commit at gcc dot gnu.org 2023-05-09 8:29 ` cvs-commit at gcc dot gnu.org 2023-05-09 8:33 ` linkw at gcc dot gnu.org
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=bug-109069-4-eqzgnH0hkq@http.gcc.gnu.org/bugzilla/ \ --to=gcc-bugzilla@gcc.gnu.org \ --cc=gcc-bugs@gcc.gnu.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link. Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).