From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id A3D343858D33; Thu, 9 Mar 2023 15:41:36 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org A3D343858D33 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1678376496; bh=13qxHDylQ7jxYKt1sylqzL1H4DtTFH2yYpvB6uwEW/Q=; h=From:To:Subject:Date:In-Reply-To:References:From; b=V2/Nh1CNzoTnYo1T3B/NaXjQg48RQ52dCBM2IS4iv2MYHZK4P6aFP8TDa7qn/GQfV JrtEWhFnnya7CNAYkxGT5h6aHrI3OT8qTpJFf+MuU/CbZmhelm/hLyIiOh8mA+ZrLK fIxYCfDsjyz6/NgROBEe3DaCLhFabQfxnMlLD1FU= From: "john_platts at hotmail dot com" To: gcc-bugs@gcc.gnu.org Subject: [Bug target/109069] Vector truncation test program produces incorrect result on big-endian powerpc64-linux-gnu with -mcpu=power10 -O2 Date: Thu, 09 Mar 2023 15:41:36 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: changed X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: target X-Bugzilla-Version: 12.1.0 X-Bugzilla-Keywords: wrong-code X-Bugzilla-Severity: normal X-Bugzilla-Who: john_platts at hotmail dot com X-Bugzilla-Status: UNCONFIRMED X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: Message-ID: In-Reply-To: References: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 List-Id: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109069 --- Comment #3 from John Platts --- Here is another test program that reproduces the vector truncation test issue: #pragma push_macro("vector") #pragma push_macro("pixel") #pragma push_macro("bool") #undef vector #undef pixel #undef bool #include #pragma pop_macro("vector") #pragma pop_macro("pixel") #pragma pop_macro("bool") #include #include #include #include #include #include template struct 
AltivecTypes { }; template<> struct AltivecTypes<1> { using UnsignedLaneT =3D unsigned char; using SignedLaneT =3D signed char; using UnsignedVectT =3D __vector unsigned char; using SignedVectT =3D __vector signed char; using BoolVectT =3D __vector __bool char; }; template<> struct AltivecTypes<2> { using UnsignedLaneT =3D unsigned short; using SignedLaneT =3D signed short; using UnsignedVectT =3D __vector unsigned short; using SignedVectT =3D __vector signed short; using BoolVectT =3D __vector __bool short; }; template<> struct AltivecTypes<4> { using UnsignedLaneT =3D unsigned int; using SignedLaneT =3D signed int; using FloatLaneT =3D float; using UnsignedVectT =3D __vector unsigned int; using SignedVectT =3D __vector signed int; using BoolVectT =3D __vector __bool int; using FloatVectT =3D __vector float; }; template<> struct AltivecTypes<8> { using UnsignedLaneT =3D unsigned long long; using SignedLaneT =3D signed long long; using FloatLaneT =3D double; using UnsignedVectT =3D __vector unsigned long long; using SignedVectT =3D __vector signed long long; using BoolVectT =3D __vector __bool long long; using FloatVectT =3D __vector double; }; template, bool =3D std::is_integral_v= , bool =3D std::is_floating_point_v, class =3D void> struct MakeAltivecVectorType { }; template struct MakeAltivecVectorType::SignedVectT>> { using type =3D typename AltivecTypes::SignedVectT; }; template struct MakeAltivecVectorType::UnsignedVectT>> { using type =3D typename AltivecTypes::UnsignedVectT; }; template struct MakeAltivecVectorType::FloatVectT>> { using type =3D typename AltivecTypes::FloatVectT; }; template using AltivecVectorType =3D typename MakeAltivecVectorType::type; template* =3D n= ullptr> AltivecVectorType LoadVector(const T* __restrict__ src) { using Bits =3D typename AltivecTypes<(sizeof(T) * N)>::UnsignedLaneT; Bits bits; __builtin_memcpy(&bits, src, sizeof(T) * N); return reinterpret_cast>(vec_splats(bits)); } template* = =3D nullptr> AltivecVectorType 
LoadVector(const T* __restrict__ src) { using LaneT =3D std::decay_t>()[0])>; typedef LaneT LoadRawT __attribute__((__vector_size__(16), __aligned__(16), __may_alias__)); const LoadRawT* __restrict__ p =3D reinterpret_cast(src); return reinterpret_cast>(*p); } template* =3D n= ullptr> void StoreVector(T* __restrict__ dest, AltivecVectorType vect) { using Bits =3D typename AltivecTypes<(sizeof(T) * N)>::UnsignedLaneT; typedef Bits BitsVectT __attribute__((__vector_size__(16))); const Bits bits =3D reinterpret_cast(vect)[0]; __builtin_memcpy(dest, &bits, sizeof(T) * N); } template* = =3D nullptr> void StoreVector(T* __restrict__ dest, AltivecVectorType vect) { using LaneT =3D std::decay_t>()[0])>; typedef LaneT StoreRawT __attribute__((__vector_size__(16), __aligned__(16), __may_alias__)); StoreRawT* __restrict__ p =3D reinterpret_cast(dest); *p =3D reinterpret_cast(vect); } template AltivecVectorType Iota(const T2 first) { using TU =3D std::make_unsigned_t; alignas(16) T lanes[N]; for(size_t i =3D 0; i < N; i++) { lanes[i] =3D static_cast( (static_cast(i) + static_cast(first)) & std::numeric_limits::max()); } return LoadVector(lanes); } template AltivecVectorType LoadTestVectToTruncate() { return Iota(uint32_t{0xFA578D00u}); } template AltivecVectorType AltivecTruncateSingleLaneVectToU8(FromV vect) { using FromLaneT =3D std::decay_t()[0])>; constexpr size_t sizeOfFromLane =3D sizeof(FromLaneT); static_assert(sizeOfFromLane >=3D 2, "sizeOfFromLane >=3D 2 must be true"= ); #if __BYTE_ORDER__ =3D=3D __ORDER_LITTLE_ENDIAN__ return reinterpret_cast<__vector unsigned char>(vect); #else return reinterpret_cast<__vector unsigned char>( vec_sld(vect, vect, sizeof(FromLaneT) - sizeof(unsigned char))); #endif } static __vector unsigned char TruncateU64TestVectToU8() { __vector unsigned char v =3D AltivecTruncateSingleLaneVectToU8(LoadTestVectToTruncate()); return v; } static __vector unsigned char TruncateU32TestVectToU8() { __vector unsigned char v =3D 
AltivecTruncateSingleLaneVectToU8(LoadTestVectToTruncate()); return v; } static __vector unsigned char TruncateU16TestVectToU8() { __vector unsigned char v =3D AltivecTruncateSingleLaneVectToU8(LoadTestVectToTruncate()); return v; } static __vector unsigned char TruncateU64TestVectToU8_2() { __vector unsigned long long u64_v =3D LoadTestVectToTruncate(= ); __asm__("" : "+wa" (u64_v)); return AltivecTruncateSingleLaneVectToU8(u64_v); } static __vector unsigned char TruncateU32TestVectToU8_2() { __vector unsigned int u32_v =3D LoadTestVectToTruncate(); __asm__("" : "+wa" (u32_v)); return AltivecTruncateSingleLaneVectToU8(u32_v); } static __vector unsigned char TruncateU16TestVectToU8_2() { __vector unsigned short u16_v =3D LoadTestVectToTruncate(); __asm__("" : "+wa" (u16_v)); return AltivecTruncateSingleLaneVectToU8(u16_v); } using namespace std::string_view_literals; template inline void PrintUCharValuesToCout(const unsigned char (&vals)[N]) { using namespace std::string_view_literals; for(size_t i =3D 0; i < N; i++) { if(i !=3D 0) std::cout << ", "sv; std::cout << static_cast(vals[i]); } } static void DoTruncateTest(std::string_view testName, AltivecVectorType (*truncateFunc)()) { alignas(16) unsigned char vals[16]; StoreVector<16>(vals, truncateFunc()); std::cout << "Result of "sv << testName << "():\n {"sv; PrintUCharValuesToCout(vals); std::cout << "}\n"sv; } #define DO_TRUNCATE_TEST(testName) \ DoTruncateTest(#testName ""sv, testName) int main(int argc, char** argv) { DO_TRUNCATE_TEST(TruncateU16TestVectToU8); DO_TRUNCATE_TEST(TruncateU32TestVectToU8); DO_TRUNCATE_TEST(TruncateU64TestVectToU8); DO_TRUNCATE_TEST(TruncateU16TestVectToU8_2); DO_TRUNCATE_TEST(TruncateU32TestVectToU8_2); DO_TRUNCATE_TEST(TruncateU64TestVectToU8_2); return 0; } Here is the expected output of running the above test program on big-endian POWER8/POWER9/POWER10: Result of TruncateU16TestVectToU8(): {0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141} Result of 
TruncateU32TestVectToU8(): {0, 250, 87, 141, 0, 250, 87, 141, 0, 250, 87, 141, 0, 250, 87, 141} Result of TruncateU64TestVectToU8(): {0, 0, 0, 0, 0, 250, 87, 141, 0, 0, 0, 0, 0, 250, 87, 141} Result of TruncateU16TestVectToU8_2(): {0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141} Result of TruncateU32TestVectToU8_2(): {0, 250, 87, 141, 0, 250, 87, 141, 0, 250, 87, 141, 0, 250, 87, 141} Result of TruncateU64TestVectToU8_2(): {0, 0, 0, 0, 0, 250, 87, 141, 0, 0, 0, 0, 0, 250, 87, 141} Here is the actual output of running the above test program on big-endian POWER10 when compiled with the -O2 -mcpu=power10 options: Result of TruncateU16TestVectToU8(): {141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0} Result of TruncateU32TestVectToU8(): {250, 87, 141, 0, 250, 87, 141, 0, 250, 87, 141, 0, 250, 87, 141, 0} Result of TruncateU64TestVectToU8(): {0, 0, 0, 0, 0, 250, 87, 141, 0, 0, 0, 0, 0, 250, 87, 141} Result of TruncateU16TestVectToU8_2(): {0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141} Result of TruncateU32TestVectToU8_2(): {0, 250, 87, 141, 0, 250, 87, 141, 0, 250, 87, 141, 0, 250, 87, 141} Result of TruncateU64TestVectToU8_2(): {0, 0, 0, 0, 0, 250, 87, 141, 0, 0, 0, 0, 0, 250, 87, 141} Here is the assembly code that is generated for the TruncateU16TestVectToU8(), TruncateU16TestVectToU8_2(), TruncateU32TestVectToU8(), and TruncateU32TestVectToU8_2() functions when the above program is compiled with the -mcpu=power10 -O2 options: .L._ZL23TruncateU16TestVectToU8v: .LFB2028: .cfi_startproc xxspltiw 34,2365623552 blr .L._ZL25TruncateU16TestVectToU8_2v: .LFB2031: .cfi_startproc xxspltiw 34,2365623552 vsldoi 2,2,2,1 blr .L._ZL23TruncateU32TestVectToU8v: .LFB2027: .cfi_startproc xxspltiw 34,4200041728 blr .L._ZL25TruncateU32TestVectToU8_2v: .LFB2030: .cfi_startproc xxspltiw 34,4200041728 vsldoi 2,2,2,3 blr The only difference between the TruncateU16TestVectToU8() and TruncateU16TestVectToU8_2() functions is that there is 
an __asm__("" : "+wa" (u16_v)) inline assembly statement in between the LoadTestVectToTruncate() and AltivecTruncateSingleLaneVectToU8(u16_v) calls. The inline assembly statement in TruncateU16TestVectToU8_2() doesn't change u16_v, but tells the GCC optimizer that u16_v might not be constant, forcing GCC to generate the vsldoi instruction in TruncateU16TestVectToU8_2(). There are similar differences between the TruncateU32TestVectToU8() and TruncateU32TestVectToU8_2() functions on big-endian PPC64. GCC is incorrectly optimizing the TruncateU16TestVectToU8() and TruncateU32TestVectToU8() functions above when the above code is compiled with the -mcpu=power10 -O2 options on the big-endian powerpc64-linux-gnu target.