From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-bugzilla@gcc.gnu.org>
Received: by sourceware.org (Postfix, from userid 48)
	id A3D343858D33; Thu,  9 Mar 2023 15:41:36 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org A3D343858D33
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1678376496;
	bh=13qxHDylQ7jxYKt1sylqzL1H4DtTFH2yYpvB6uwEW/Q=;
	h=From:To:Subject:Date:In-Reply-To:References:From;
	b=V2/Nh1CNzoTnYo1T3B/NaXjQg48RQ52dCBM2IS4iv2MYHZK4P6aFP8TDa7qn/GQfV
	 JrtEWhFnnya7CNAYkxGT5h6aHrI3OT8qTpJFf+MuU/CbZmhelm/hLyIiOh8mA+ZrLK
	 fIxYCfDsjyz6/NgROBEe3DaCLhFabQfxnMlLD1FU=
From: "john_platts at hotmail dot com" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug target/109069] Vector truncation test program produces
 incorrect result on big-endian powerpc64-linux-gnu with -mcpu=power10 -O2
Date: Thu, 09 Mar 2023 15:41:36 +0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: changed
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: target
X-Bugzilla-Version: 12.1.0
X-Bugzilla-Keywords: wrong-code
X-Bugzilla-Severity: normal
X-Bugzilla-Who: john_platts at hotmail dot com
X-Bugzilla-Status: UNCONFIRMED
X-Bugzilla-Resolution: 
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags: 
X-Bugzilla-Changed-Fields: 
Message-ID: <bug-109069-4-eqzgnH0hkq@http.gcc.gnu.org/bugzilla/>
In-Reply-To: <bug-109069-4@http.gcc.gnu.org/bugzilla/>
References: <bug-109069-4@http.gcc.gnu.org/bugzilla/>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
List-Id: <gcc-bugs.sourceware.org>

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D109069
--- Comment #3 from John Platts <john_platts at hotmail dot com> ---
Here is another test program that reproduces the vector truncation test iss=
ue:
// Save the current vector/pixel/bool macro definitions, undefine them
// while <altivec.h> is included, then restore the saved state.
// NOTE(review): presumably done so the header's own definitions of these
// names do not leak into the rest of the file -- confirm.
#pragma push_macro("vector")
#pragma push_macro("pixel")
#pragma push_macro("bool")

#undef vector
#undef pixel
#undef bool

#include <altivec.h>

#pragma pop_macro("vector")
#pragma pop_macro("pixel")
#pragma pop_macro("bool")

#include <stdint.h>
#include <stddef.h>
#include <iostream>
#include <string_view>
#include <limits>
#include <type_traits>

// Traits keyed by lane size in bytes. The primary template is empty; the
// explicit specializations for 1, 2, 4 and 8 supply the lane and vector
// type aliases.
template<size_t LaneSize>
struct AltivecTypes {
};

// 1-byte lanes: char-based scalar, vector and bool-vector types.
// No floating-point aliases are provided at this size.
template<>
struct AltivecTypes<1> {
  using UnsignedLaneT =3D unsigned char;
  using SignedLaneT =3D signed char;
  using UnsignedVectT =3D __vector unsigned char;
  using SignedVectT =3D __vector signed char;
  using BoolVectT =3D __vector __bool char;
};

// 2-byte lanes: short-based scalar, vector and bool-vector types.
// No floating-point aliases are provided at this size.
template<>
struct AltivecTypes<2> {
  using UnsignedLaneT =3D unsigned short;
  using SignedLaneT =3D signed short;
  using UnsignedVectT =3D __vector unsigned short;
  using SignedVectT =3D __vector signed short;
  using BoolVectT =3D __vector __bool short;
};

// 4-byte lanes: int-based integer types plus float as the
// floating-point lane and vector type.
template<>
struct AltivecTypes<4> {
  using UnsignedLaneT =3D unsigned int;
  using SignedLaneT =3D signed int;
  using FloatLaneT =3D float;
  using UnsignedVectT =3D __vector unsigned int;
  using SignedVectT =3D __vector signed int;
  using BoolVectT =3D __vector __bool int;
  using FloatVectT =3D __vector float;
};

// 8-byte lanes: long long-based integer types plus double as the
// floating-point lane and vector type.
template<>
struct AltivecTypes<8> {
  using UnsignedLaneT =3D unsigned long long;
  using SignedLaneT =3D signed long long;
  using FloatLaneT =3D double;
  using UnsignedVectT =3D __vector unsigned long long;
  using SignedVectT =3D __vector signed long long;
  using BoolVectT =3D __vector __bool long long;
  using FloatVectT =3D __vector double;
};

// Maps a scalar type T to an AltiVec vector type. The three defaulted
// bool parameters carry is_signed, is_integral and is_floating_point of
// T; the trailing class parameter is the std::void_t detection slot used
// by the partial specializations. The primary template has no `type`
// member.
template<class T, bool =3D std::is_signed_v<T>, bool =3D std::is_integral_v=
<T>,
                  bool =3D std::is_floating_point_v<T>, class =3D void>
struct MakeAltivecVectorType {
};

// Signed integral T: picks SignedVectT for sizeof(T). Only matches when
// AltivecTypes<sizeof(T)> actually declares SignedVectT.
template<class T>
struct MakeAltivecVectorType<T, true, true, false,
  std::void_t<typename AltivecTypes<sizeof(T)>::SignedVectT>> {
  using type =3D typename AltivecTypes<sizeof(T)>::SignedVectT;
};

// Unsigned integral T: picks UnsignedVectT for sizeof(T). Only matches
// when AltivecTypes<sizeof(T)> actually declares UnsignedVectT.
template<class T>
struct MakeAltivecVectorType<T, false, true, false,
  std::void_t<typename AltivecTypes<sizeof(T)>::UnsignedVectT>> {
  using type =3D typename AltivecTypes<sizeof(T)>::UnsignedVectT;
};

// Floating-point T: picks FloatVectT for sizeof(T). Only matches when
// AltivecTypes<sizeof(T)> declares FloatVectT (sizes 4 and 8 here).
template<class T>
struct MakeAltivecVectorType<T, true, false, true,
  std::void_t<typename AltivecTypes<sizeof(T)>::FloatVectT>> {
  using type =3D typename AltivecTypes<sizeof(T)>::FloatVectT;
};

// Convenience alias: the AltiVec vector type whose lanes are of type T.
template<class T>
using AltivecVectorType =3D typename MakeAltivecVectorType<T>::type;

// Partial-vector load, enabled when N lanes of T occupy at most 8 bytes.
// Copies those sizeof(T) * N bytes from src into one unsigned integer of
// the same size, broadcasts that integer to every lane with vec_splats,
// and reinterprets the result as the vector type for T.
// The total byte count must be 1, 2, 4 or 8, because AltivecTypes is only
// specialized for those sizes.
template<size_t N, class T, std::enable_if_t<(sizeof(T) * N <=3D 8)>* =3D n=
ullptr>
AltivecVectorType<T> LoadVector(const T* __restrict__ src) {
  using Bits =3D typename AltivecTypes<(sizeof(T) * N)>::UnsignedLaneT;
  Bits bits;
  // memcpy rather than a pointer cast: no alignment or aliasing demands
  // are placed on src.
  __builtin_memcpy(&bits, src, sizeof(T) * N);
  return reinterpret_cast<AltivecVectorType<T>>(vec_splats(bits));
}

// Full-vector load, enabled when N lanes of T fill exactly 16 bytes.
// Reads through a pointer to a 16-byte vector typedef that is declared
// 16-byte aligned and may_alias, so src is expected to be 16-byte
// aligned.
template<size_t N, class T, std::enable_if_t<(sizeof(T) * N =3D=3D 16)>* =
=3D nullptr>
AltivecVectorType<T> LoadVector(const T* __restrict__ src) {
  // Lane type as seen by indexing the vector type for T.
  using LaneT =3D
std::decay_t<decltype(std::declval<AltivecVectorType<T>>()[0])>;
  typedef LaneT LoadRawT
    __attribute__((__vector_size__(16), __aligned__(16), __may_alias__));
  const LoadRawT* __restrict__ p =3D reinterpret_cast<const LoadRawT*>(src);
  return reinterpret_cast<AltivecVectorType<T>>(*p);
}

// Partial-vector store, enabled when N lanes of T occupy at most 8 bytes.
// Views vect as a vector of unsigned integers that are each
// sizeof(T) * N bytes wide, takes element 0, and copies its bytes to
// dest. The total byte count must be 1, 2, 4 or 8 (the sizes for which
// AltivecTypes is specialized).
template<size_t N, class T, std::enable_if_t<(sizeof(T) * N <=3D 8)>* =3D n=
ullptr>
void StoreVector(T* __restrict__ dest, AltivecVectorType<T> vect) {
  using Bits =3D typename AltivecTypes<(sizeof(T) * N)>::UnsignedLaneT;
  typedef Bits BitsVectT __attribute__((__vector_size__(16)));
  const Bits bits =3D reinterpret_cast<BitsVectT>(vect)[0];
  __builtin_memcpy(dest, &bits, sizeof(T) * N);
}

// Full-vector store, enabled when N lanes of T fill exactly 16 bytes.
// Writes through a pointer to a 16-byte vector typedef that is declared
// 16-byte aligned and may_alias, so dest is expected to be 16-byte
// aligned.
template<size_t N, class T, std::enable_if_t<(sizeof(T) * N =3D=3D 16)>* =
=3D nullptr>
void StoreVector(T* __restrict__ dest, AltivecVectorType<T> vect) {
  // Lane type as seen by indexing the vector type for T.
  using LaneT =3D
std::decay_t<decltype(std::declval<AltivecVectorType<T>>()[0])>;
  typedef LaneT StoreRawT
    __attribute__((__vector_size__(16), __aligned__(16), __may_alias__));
  StoreRawT* __restrict__ p =3D reinterpret_cast<StoreRawT*>(dest);
  *p =3D reinterpret_cast<StoreRawT>(vect);
}

// Builds a vector from N consecutive lane values first, first + 1, ...
// Each value is computed in the unsigned counterpart of T and then
// converted to T; the lanes are staged in a 16-byte aligned array and
// loaded with LoadVector<N>. T must be an integer type, since
// std::make_unsigned_t<T> is used.
template<class T, size_t N, class T2>
AltivecVectorType<T> Iota(const T2 first) {
    using TU =3D std::make_unsigned_t<T>;

    alignas(16) T lanes[N];
    for(size_t i =3D 0; i < N; i++) {
        // Both operands are narrowed to TU first. The mask with TU's
        // maximum keeps the sum inside TU's range even if the addition
        // was carried out in a wider promoted type.
        lanes[i] =3D static_cast<T>(
            (static_cast<TU>(i) + static_cast<TU>(first)) &
            std::numeric_limits<TU>::max());
    }

    return LoadVector<N>(lanes);
}

// Test input for the truncation checks: Iota with a single lane (N is 1)
// starting at 0xFA578D00, narrowed to T inside Iota.
template<class T>
AltivecVectorType<T> LoadTestVectToTruncate() {
    return Iota<T, 1>(uint32_t{0xFA578D00u});
}

// Reinterprets a vector with lanes of at least 2 bytes as a vector of
// unsigned char, arranged so that the least significant byte of the
// first lane lands in byte 0 of the result.
// This is the operation the bug report exercises: on big-endian the
// vec_sld rotation must be kept, also when the input is a constant.
template<class FromV>
AltivecVectorType<uint8_t> AltivecTruncateSingleLaneVectToU8(FromV vect) {
  using FromLaneT =3D std::decay_t<decltype(std::declval<FromV>()[0])>;
  constexpr size_t sizeOfFromLane =3D sizeof(FromLaneT);
  static_assert(sizeOfFromLane >=3D 2, "sizeOfFromLane >=3D 2 must be true"=
);

#if __BYTE_ORDER__ =3D=3D __ORDER_LITTLE_ENDIAN__
  // Little-endian: a plain reinterpretation is used, no byte movement.
  return reinterpret_cast<__vector unsigned char>(vect);
#else
  // Big-endian: rotate the 16 bytes left by (lane size - 1) using vec_sld
  // with the same vector as both inputs.
  return reinterpret_cast<__vector unsigned char>(
    vec_sld(vect, vect, sizeof(FromLaneT) - sizeof(unsigned char)));
#endif
}

// Direct variant for uint64_t lanes: the test vector is passed straight
// to the truncation helper, with nothing hiding its value from the
// optimizer.
static __vector unsigned char TruncateU64TestVectToU8() {
    __vector unsigned char v =3D
      AltivecTruncateSingleLaneVectToU8(LoadTestVectToTruncate<uint64_t>());
    return v;
}

// Direct variant for uint32_t lanes: the test vector is passed straight
// to the truncation helper, with nothing hiding its value from the
// optimizer. One of the two functions the report shows miscompiled.
static __vector unsigned char TruncateU32TestVectToU8() {
    __vector unsigned char v =3D
      AltivecTruncateSingleLaneVectToU8(LoadTestVectToTruncate<uint32_t>());
    return v;
}

// Direct variant for uint16_t lanes: the test vector is passed straight
// to the truncation helper, with nothing hiding its value from the
// optimizer. One of the two functions the report shows miscompiled.
static __vector unsigned char TruncateU16TestVectToU8() {
    __vector unsigned char v =3D
      AltivecTruncateSingleLaneVectToU8(LoadTestVectToTruncate<uint16_t>());
    return v;
}

// Barrier variant for uint64_t lanes: same computation as
// TruncateU64TestVectToU8, but the vector passes through an empty inline
// asm before it is truncated.
static __vector unsigned char TruncateU64TestVectToU8_2() {
    __vector unsigned long long u64_v =3D LoadTestVectToTruncate<uint64_t>(=
);
    // Empty asm with a read-write "+wa" operand: it does not change
    // u64_v, but the compiler can no longer assume the value is constant.
    __asm__(""
            : "+wa" (u64_v));

    return AltivecTruncateSingleLaneVectToU8(u64_v);
}

// Barrier variant for uint32_t lanes: same computation as
// TruncateU32TestVectToU8, but the vector passes through an empty inline
// asm before it is truncated.
static __vector unsigned char TruncateU32TestVectToU8_2() {
    __vector unsigned int u32_v =3D LoadTestVectToTruncate<uint32_t>();
    // Empty asm with a read-write "+wa" operand: it does not change
    // u32_v, but the compiler can no longer assume the value is constant.
    __asm__(""
            : "+wa" (u32_v));

    return AltivecTruncateSingleLaneVectToU8(u32_v);
}

// Barrier variant for uint16_t lanes: same computation as
// TruncateU16TestVectToU8, but the vector passes through an empty inline
// asm before it is truncated.
static __vector unsigned char TruncateU16TestVectToU8_2() {
    __vector unsigned short u16_v =3D LoadTestVectToTruncate<uint16_t>();
    // Empty asm with a read-write "+wa" operand: it does not change
    // u16_v, but the compiler can no longer assume the value is constant.
    __asm__(""
            : "+wa" (u16_v));

    return AltivecTruncateSingleLaneVectToU8(u16_v);
}

// Makes the ""sv string_view literal suffix available at file scope; it
// is used by DoTruncateTest and the DO_TRUNCATE_TEST macro.
using namespace std::string_view_literals;

// Writes the N array elements to std::cout as decimal numbers separated
// by ", ". No surrounding braces or trailing newline are written.
template<size_t N>
inline void PrintUCharValuesToCout(const unsigned char (&vals)[N]) {
   using namespace std::string_view_literals;
   for(size_t i =3D 0; i < N; i++) {
       // Separator goes before every element except the first.
       if(i !=3D 0)
           std::cout << ", "sv;

       // Widen to uint16_t so the stream prints a number, not a char.
       std::cout << static_cast<uint16_t>(vals[i]);
   }
}

// Runs one truncation function, stores its 16-byte result to an aligned
// buffer and prints it as "Result of <testName>():" followed by the 16
// byte values in braces.
//   testName     - label printed in the report line
//   truncateFunc - function returning the vector of unsigned char to dump
static void DoTruncateTest(std::string_view testName,
                    AltivecVectorType<uint8_t> (*truncateFunc)()) {
  // 16-byte alignment is required by the full-vector StoreVector overload.
  alignas(16) unsigned char vals[16];
  StoreVector<16>(vals, truncateFunc());
  std::cout << "Result of "sv << testName << "():\n {"sv;
  PrintUCharValuesToCout(vals);
  std::cout << "}\n"sv;
}

// Calls DoTruncateTest with the function's name, stringized and turned
// into a string_view literal, together with the function itself.
#define DO_TRUNCATE_TEST(testName) \
  DoTruncateTest(#testName ""sv, testName)

// Runs the three direct truncation tests followed by the three
// asm-barrier variants, printing each result. argc and argv are unused.
// Always returns 0; a miscompile shows up as a difference between the
// printed output of a direct test and its _2 counterpart.
int main(int argc, char** argv) {
    DO_TRUNCATE_TEST(TruncateU16TestVectToU8);
    DO_TRUNCATE_TEST(TruncateU32TestVectToU8);
    DO_TRUNCATE_TEST(TruncateU64TestVectToU8);
    DO_TRUNCATE_TEST(TruncateU16TestVectToU8_2);
    DO_TRUNCATE_TEST(TruncateU32TestVectToU8_2);
    DO_TRUNCATE_TEST(TruncateU64TestVectToU8_2);
    return 0;
}

Here is the expected output of running the above test program on big-endian
POWER8/POWER9/POWER10:
Result of TruncateU16TestVectToU8():
 {0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141}
Result of TruncateU32TestVectToU8():
 {0, 250, 87, 141, 0, 250, 87, 141, 0, 250, 87, 141, 0, 250, 87, 141}
Result of TruncateU64TestVectToU8():
 {0, 0, 0, 0, 0, 250, 87, 141, 0, 0, 0, 0, 0, 250, 87, 141}
Result of TruncateU16TestVectToU8_2():
 {0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141}
Result of TruncateU32TestVectToU8_2():
 {0, 250, 87, 141, 0, 250, 87, 141, 0, 250, 87, 141, 0, 250, 87, 141}
Result of TruncateU64TestVectToU8_2():
 {0, 0, 0, 0, 0, 250, 87, 141, 0, 0, 0, 0, 0, 250, 87, 141}

Here is the actual output of running the above test program on big-endian
POWER10 when compiled with the -O2 -mcpu=3Dpower10 options:
Result of TruncateU16TestVectToU8():
 {141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0}
Result of TruncateU32TestVectToU8():
 {250, 87, 141, 0, 250, 87, 141, 0, 250, 87, 141, 0, 250, 87, 141, 0}
Result of TruncateU64TestVectToU8():
 {0, 0, 0, 0, 0, 250, 87, 141, 0, 0, 0, 0, 0, 250, 87, 141}
Result of TruncateU16TestVectToU8_2():
 {0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141, 0, 141}
Result of TruncateU32TestVectToU8_2():
 {0, 250, 87, 141, 0, 250, 87, 141, 0, 250, 87, 141, 0, 250, 87, 141}
Result of TruncateU64TestVectToU8_2():
 {0, 0, 0, 0, 0, 250, 87, 141, 0, 0, 0, 0, 0, 250, 87, 141}

Here is the assembly code that is generated for the TruncateU16TestVectToU8=
(),
TruncateU16TestVectToU8_2(), TruncateU32TestVectToU8(), and
TruncateU32TestVectToU8_2() functions when the above program is compiled wi=
th
the -mcpu=3Dpower10 -O2 options:
.L._ZL23TruncateU16TestVectToU8v:
.LFB2028:
        .cfi_startproc
        xxspltiw 34,2365623552
        blr

.L._ZL25TruncateU16TestVectToU8_2v:
.LFB2031:
        .cfi_startproc
        xxspltiw 34,2365623552
        vsldoi 2,2,2,1
        blr

.L._ZL23TruncateU32TestVectToU8v:
.LFB2027:
        .cfi_startproc
        xxspltiw 34,4200041728
        blr

.L._ZL25TruncateU32TestVectToU8_2v:
.LFB2030:
        .cfi_startproc
        xxspltiw 34,4200041728
        vsldoi 2,2,2,3
        blr

The only difference between the TruncateU16TestVectToU8() and
TruncateU16TestVectToU8_2() functions is that there is an __asm__("" : "+wa"
(u16_v)) inline assembly statement in between the
LoadTestVectToTruncate<uint16_t>() and AltivecTruncateSingleLaneVectToU8(u1=
6_v)
calls.

The inline assembly statement in TruncateU16TestVectToU8_2() doesn't change
u16_v, but tells the GCC optimizer that u16_v might not be constant, forcing
GCC to generate the vsldoi instruction in TruncateU16TestVectToU8_2().

There are similar differences between the TruncateU32TestVectToU8() and
TruncateU32TestVectToU8_2() functions on big-endian PPC64.

GCC is incorrectly optimizing the TruncateU16TestVectToU8() and
TruncateU32TestVectToU8() functions above when the above code is compiled w=
ith
the -mcpu=3Dpower10 -O2 options on the big-endian powerpc64-linux-gnu targe=
t.=