From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id 60B0A3858C5E; Thu, 9 Mar 2023 15:53:41 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 60B0A3858C5E DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1678377221; bh=sfl3KF2AIG+B3qifQOIYdkeH/ek+xf4MXCRkC8bMQco=; h=From:To:Subject:Date:In-Reply-To:References:From; b=pmUinD6Nw0mcLP69Da6D0Tj9PmA8zgL7fCQDE27N0zGVhRYwgOw6wjSjyJRyHWx8E QiyzGy6e0bbyLfGVKSJIcznpP8eenU6L131Wao5hiaTd9/AMrWghVM+qWaw9EeYwS+ WFrd3EAQ2ESg/0h+EkBzbnMUGOyS27yZ8tCj8dwg= From: "john_platts at hotmail dot com" To: gcc-bugs@gcc.gnu.org Subject: [Bug target/109069] Vector truncation test program produces incorrect result on big-endian powerpc64-linux-gnu with -mcpu=power10 -O2 Date: Thu, 09 Mar 2023 15:53:41 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: changed X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: target X-Bugzilla-Version: 12.1.0 X-Bugzilla-Keywords: wrong-code X-Bugzilla-Severity: normal X-Bugzilla-Who: john_platts at hotmail dot com X-Bugzilla-Status: UNCONFIRMED X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: Message-ID: In-Reply-To: References: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 List-Id: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109069 --- Comment #4 from John Platts --- Here is another test program that exposes the optimization bug with applying the vec_sld operation to a constant vector (which generates incorrect results on both big-endian and little-endian POWER10 when compiled with the -O2 -mcpu=power10 options with GCC 12.1.0): #pragma push_macro("vector") #pragma push_macro("pixel") #pragma push_macro("bool") #undef vector 
#undef pixel #undef bool #include #pragma pop_macro("vector") #pragma pop_macro("pixel") #pragma pop_macro("bool") #include #include #include #include template struct AltivecTypes { }; template<> struct AltivecTypes<1> { using UnsignedLaneT =3D unsigned char; using SignedLaneT =3D signed char; using UnsignedVectT =3D __vector unsigned char; using SignedVectT =3D __vector signed char; using BoolVectT =3D __vector __bool char; }; template<> struct AltivecTypes<2> { using UnsignedLaneT =3D unsigned short; using SignedLaneT =3D signed short; using UnsignedVectT =3D __vector unsigned short; using SignedVectT =3D __vector signed short; using BoolVectT =3D __vector __bool short; }; template<> struct AltivecTypes<4> { using UnsignedLaneT =3D unsigned int; using SignedLaneT =3D signed int; using FloatLaneT =3D float; using UnsignedVectT =3D __vector unsigned int; using SignedVectT =3D __vector signed int; using BoolVectT =3D __vector __bool int; using FloatVectT =3D __vector float; }; template<> struct AltivecTypes<8> { using UnsignedLaneT =3D unsigned long long; using SignedLaneT =3D signed long long; using FloatLaneT =3D double; using UnsignedVectT =3D __vector unsigned long long; using SignedVectT =3D __vector signed long long; using BoolVectT =3D __vector __bool long long; using FloatVectT =3D __vector double; }; template, bool =3D std::is_integral_v= , bool =3D std::is_floating_point_v, class =3D void> struct MakeAltivecVectorType { }; template struct MakeAltivecVectorType::SignedVectT>> { using type =3D typename AltivecTypes::SignedVectT; }; template struct MakeAltivecVectorType::UnsignedVectT>> { using type =3D typename AltivecTypes::UnsignedVectT; }; template struct MakeAltivecVectorType::FloatVectT>> { using type =3D typename AltivecTypes::FloatVectT; }; template using AltivecVectorType =3D typename MakeAltivecVectorType::type; template* =3D n= ullptr> AltivecVectorType LoadVector(const T* __restrict__ src) { using Bits =3D typename AltivecTypes<(sizeof(T) * 
N)>::UnsignedLaneT; Bits bits; __builtin_memcpy(&bits, src, sizeof(T) * N); return reinterpret_cast>(vec_splats(bits)); } template* = =3D nullptr> AltivecVectorType LoadVector(const T* __restrict__ src) { using LaneT =3D std::decay_t>()[0])>; typedef LaneT LoadRawT __attribute__((__vector_size__(16), __aligned__(16), __may_alias__)); const LoadRawT* __restrict__ p =3D reinterpret_cast(src); return reinterpret_cast>(*p); } template AltivecVectorType Iota(const T2 first) { using TU =3D std::make_unsigned_t; alignas(16) T lanes[N]; for(size_t i =3D 0; i < N; i++) { lanes[i] =3D static_cast( (static_cast(i) + static_cast(first)) & std::numeric_limits::max()); } return LoadVector(lanes); } template AltivecVectorType LoadTestVectToShift() { return Iota(uint32_t{0xFA578D00u}); } template AltivecVectorType DoVectorShiftToU8(FromV vect) { using FromLaneT =3D std::decay_t()[0])>; constexpr size_t sizeOfFromLane =3D sizeof(FromLaneT); static_assert(sizeOfFromLane >=3D 2, "sizeOfFromLane >=3D 2 must be true"= ); return reinterpret_cast<__vector unsigned char>( vec_sld(vect, vect, sizeof(FromLaneT) - sizeof(unsigned char))); } __vector unsigned char U64ShiftedVect() { __vector unsigned char v =3D DoVectorShiftToU8(LoadTestVectToShift()); return v; } __vector unsigned char U32ShiftedVect() { __vector unsigned char v =3D DoVectorShiftToU8(LoadTestVectToShift()); return v; } __vector unsigned char U16ShiftedVect() { __vector unsigned char v =3D DoVectorShiftToU8(LoadTestVectToShift()); return v; } __vector unsigned char U64ShiftedVect_2() { __vector unsigned long long u64_v =3D LoadTestVectToShift(); __asm__("" : "+wa" (u64_v)); return DoVectorShiftToU8(u64_v); } __vector unsigned char U32ShiftedVect_2() { __vector unsigned int u32_v =3D LoadTestVectToShift(); __asm__("" : "+wa" (u32_v)); return DoVectorShiftToU8(u32_v); } __vector unsigned char U16ShiftedVect_2() { __vector unsigned short u16_v =3D LoadTestVectToShift(); __asm__("" : "+wa" (u16_v)); return 
DoVectorShiftToU8(u16_v); } Here is the assembly code that is generated when the above code is compiled with the -O2 -mcpu=power10 options on the powerpc64-linux-gnu target: .file "vsx_power10_shift_test_snippet_030923.cpp" .machine power10 .section ".text" .align 2 .p2align 4,,15 .globl _Z14U64ShiftedVectv .section ".opd","aw" .align 3 _Z14U64ShiftedVectv: .quad .L._Z14U64ShiftedVectv,.TOC.@tocbase,0 .previous .type _Z14U64ShiftedVectv, @function .L._Z14U64ShiftedVectv: .LFB206: .cfi_startproc plxv 34,.LC0@pcrel vsldoi 2,2,2,7 blr .long 0 .byte 0,9,0,0,0,0,0,0 .cfi_endproc .LFE206: .size _Z14U64ShiftedVectv,.-.L._Z14U64ShiftedVectv .align 2 .p2align 4,,15 .globl _Z14U32ShiftedVectv .section ".opd","aw" .align 3 _Z14U32ShiftedVectv: .quad .L._Z14U32ShiftedVectv,.TOC.@tocbase,0 .previous .type _Z14U32ShiftedVectv, @function .L._Z14U32ShiftedVectv: .LFB207: .cfi_startproc xxspltiw 34,4200041728 blr .long 0 .byte 0,9,0,0,0,0,0,0 .cfi_endproc .LFE207: .size _Z14U32ShiftedVectv,.-.L._Z14U32ShiftedVectv .align 2 .p2align 4,,15 .globl _Z14U16ShiftedVectv .section ".opd","aw" .align 3 _Z14U16ShiftedVectv: .quad .L._Z14U16ShiftedVectv,.TOC.@tocbase,0 .previous .type _Z14U16ShiftedVectv, @function .L._Z14U16ShiftedVectv: .LFB208: .cfi_startproc xxspltiw 34,2365623552 blr .long 0 .byte 0,9,0,0,0,0,0,0 .cfi_endproc .LFE208: .size _Z14U16ShiftedVectv,.-.L._Z14U16ShiftedVectv .align 2 .p2align 4,,15 .globl _Z16U64ShiftedVect_2v .section ".opd","aw" .align 3 _Z16U64ShiftedVect_2v: .quad .L._Z16U64ShiftedVect_2v,.TOC.@tocbase,0 .previous .type _Z16U64ShiftedVect_2v, @function .L._Z16U64ShiftedVect_2v: .LFB209: .cfi_startproc plxv 34,.LC0@pcrel vsldoi 2,2,2,7 blr .long 0 .byte 0,9,0,0,0,0,0,0 .cfi_endproc .LFE209: .size _Z16U64ShiftedVect_2v,.-.L._Z16U64ShiftedVect_2v .align 2 .p2align 4,,15 .globl _Z16U32ShiftedVect_2v .section ".opd","aw" .align 3 _Z16U32ShiftedVect_2v: .quad .L._Z16U32ShiftedVect_2v,.TOC.@tocbase,0 .previous .type _Z16U32ShiftedVect_2v, @function 
.L._Z16U32ShiftedVect_2v: .LFB210: .cfi_startproc xxspltiw 34,4200041728 vsldoi 2,2,2,3 blr .long 0 .byte 0,9,0,0,0,0,0,0 .cfi_endproc .LFE210: .size _Z16U32ShiftedVect_2v,.-.L._Z16U32ShiftedVect_2v .align 2 .p2align 4,,15 .globl _Z16U16ShiftedVect_2v .section ".opd","aw" .align 3 _Z16U16ShiftedVect_2v: .quad .L._Z16U16ShiftedVect_2v,.TOC.@tocbase,0 .previous .type _Z16U16ShiftedVect_2v, @function .L._Z16U16ShiftedVect_2v: .LFB211: .cfi_startproc xxspltiw 34,2365623552 vsldoi 2,2,2,1 blr .long 0 .byte 0,9,0,0,0,0,0,0 .cfi_endproc .LFE211: .size _Z16U16ShiftedVect_2v,.-.L._Z16U16ShiftedVect_2v .section .rodata.cst16,"aM",@progbits,16 .align 4 .LC0: .quad 4200041728 .quad 4200041728 .ident "GCC: (Ubuntu 12.1.0-2ubuntu1~22.04) 12.1.0" Here is the assembly code that is generated when the above code is compiled with the -O2 -mcpu=power10 options on the powerpc64le-linux-gnu target: .file "vsx_power10_shift_test_snippet_030923.cpp" .machine power10 .abiversion 2 .section ".text" .align 2 .p2align 4,,15 .globl _Z14U64ShiftedVectv .type _Z14U64ShiftedVectv, @function _Z14U64ShiftedVectv: .LFB206: .cfi_startproc .localentry _Z14U64ShiftedVectv,1 plxv 34,.LC0@pcrel vsldoi 2,2,2,7 blr .long 0 .byte 0,9,0,0,0,0,0,0 .cfi_endproc .LFE206: .size _Z14U64ShiftedVectv,.-_Z14U64ShiftedVectv .align 2 .p2align 4,,15 .globl _Z14U32ShiftedVectv .type _Z14U32ShiftedVectv, @function _Z14U32ShiftedVectv: .LFB207: .cfi_startproc .localentry _Z14U32ShiftedVectv,1 xxspltiw 34,4200041728 blr .long 0 .byte 0,9,0,0,0,0,0,0 .cfi_endproc .LFE207: .size _Z14U32ShiftedVectv,.-_Z14U32ShiftedVectv .align 2 .p2align 4,,15 .globl _Z14U16ShiftedVectv .type _Z14U16ShiftedVectv, @function _Z14U16ShiftedVectv: .LFB208: .cfi_startproc .localentry _Z14U16ShiftedVectv,1 xxspltiw 34,2365623552 blr .long 0 .byte 0,9,0,0,0,0,0,0 .cfi_endproc .LFE208: .size _Z14U16ShiftedVectv,.-_Z14U16ShiftedVectv .align 2 .p2align 4,,15 .globl _Z16U64ShiftedVect_2v .type _Z16U64ShiftedVect_2v, @function _Z16U64ShiftedVect_2v: 
.LFB209: .cfi_startproc .localentry _Z16U64ShiftedVect_2v,1 plxv 34,.LC0@pcrel vsldoi 2,2,2,7 blr .long 0 .byte 0,9,0,0,0,0,0,0 .cfi_endproc .LFE209: .size _Z16U64ShiftedVect_2v,.-_Z16U64ShiftedVect_2v .align 2 .p2align 4,,15 .globl _Z16U32ShiftedVect_2v .type _Z16U32ShiftedVect_2v, @function _Z16U32ShiftedVect_2v: .LFB210: .cfi_startproc .localentry _Z16U32ShiftedVect_2v,1 xxspltiw 34,4200041728 vsldoi 2,2,2,3 blr .long 0 .byte 0,9,0,0,0,0,0,0 .cfi_endproc .LFE210: .size _Z16U32ShiftedVect_2v,.-_Z16U32ShiftedVect_2v .align 2 .p2align 4,,15 .globl _Z16U16ShiftedVect_2v .type _Z16U16ShiftedVect_2v, @function _Z16U16ShiftedVect_2v: .LFB211: .cfi_startproc .localentry _Z16U16ShiftedVect_2v,1 xxspltiw 34,2365623552 vsldoi 2,2,2,1 blr .long 0 .byte 0,9,0,0,0,0,0,0 .cfi_endproc .LFE211: .size _Z16U16ShiftedVect_2v,.-_Z16U16ShiftedVect_2v .section .rodata.cst16,"aM",@progbits,16 .align 4 .LC0: .quad 4200041728 .quad 4200041728 .ident "GCC: (Ubuntu 12.1.0-2ubuntu1~22.04) 12.1.0" .section .note.GNU-stack,"",@progbits=