From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id EE84B3858289; Tue, 9 Aug 2022 07:50:46 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org EE84B3858289 From: "malat at debian dot org" To: gcc-bugs@gcc.gnu.org Subject: [Bug tree-optimization/106322] tree-vectorize: Wrong code at O2 level (-fno-tree-vectorize is working) Date: Tue, 09 Aug 2022 07:50:46 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: changed X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: tree-optimization X-Bugzilla-Version: 12.1.0 X-Bugzilla-Keywords: X-Bugzilla-Severity: normal X-Bugzilla-Who: malat at debian dot org X-Bugzilla-Status: WAITING X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: Message-ID: In-Reply-To: References: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 X-BeenThere: gcc-bugs@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-bugs mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 09 Aug 2022 07:50:47 -0000 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D106322 --- Comment #19 from Mathieu Malaterre --- Without hwy dependency: % more Makefile bytes.cc demo.cc :::::::::::::: Makefile :::::::::::::: CXXFLAGS :=3D -O2 demo: demo.o bytes.o $(CXX) $(CXXFLAGS) -o $@ $^ clean: rm -f bytes.o demo.o :::::::::::::: bytes.cc :::::::::::::: #include bool BytesEqual(const void *bytes1, const void *bytes2, const size_t size) { return memcmp(bytes1, bytes2, size) =3D=3D 0; } :::::::::::::: demo.cc :::::::::::::: #include #include #include #include #include #include #define HWY_ALIGNMENT 64 constexpr size_t kAlignment =3D HWY_ALIGNMENT; constexpr size_t kAlias =3D kAlignment * 4; bool BytesEqual(const void *p1, const void *p2, const size_t size); namespace hwy { namespace N_EMU128 { template struct Vec128 { T raw[16 / sizeof(T)] =3D {}; }; } // namespace N_EMU128 } // namespace hwy template static void Store(const hwy::N_EMU128::Vec128 v, T *__restrict__ aligned) { __builtin_memcpy(aligned, v.raw, sizeof(T) * N); } template static hwy::N_EMU128::Vec128 Load(const T *__restrict__ aligned) { hwy::N_EMU128::Vec128 v; __builtin_memcpy(v.raw, aligned, sizeof(T) * N); return v; } template static hwy::N_EMU128::Vec128 MulHigh(hwy::N_EMU128::Vec128 a, const hwy::N_EMU128::Vec128 b) { for (size_t i =3D 0; i < N; ++i) { // Cast to uint32_t first to prevent overflow. Otherwise the result of // uint16_t * uint16_t is in "int" which may overflow. In practice the // result is the same but this way it is also defined. a.raw[i] =3D static_cast( (static_cast(a.raw[i]) * static_cast(b.raw[i]))= >> 16); } return a; } #define HWY_ASSERT(condition) assert((condition)) #define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (ali= gn)) #pragma pack(push, 1) struct AllocationHeader { void *allocated; size_t payload_size; }; #pragma pack(pop) static void FreeAlignedBytes(const void *aligned_pointer) { HWY_ASSERT(aligned_pointer !=3D nullptr); if (aligned_pointer =3D=3D nullptr) return; const uintptr_t payload =3D reinterpret_cast(aligned_pointer); HWY_ASSERT(payload % kAlignment =3D=3D 0); const AllocationHeader *header =3D reinterpret_cast(payload) - 1; free(header->allocated); } class AlignedFreer { public: template void operator()(T *aligned_pointer) const { FreeAlignedBytes(aligned_pointer); } }; template using AlignedFreeUniquePtr =3D std::unique_ptr; static inline constexpr size_t ShiftCount(size_t n) { return (n <=3D 1) ? 0 : 1 + ShiftCount(n / 2); } namespace { static size_t NextAlignedOffset() { static std::atomic next{0}; constexpr uint32_t kGroups =3D kAlias / kAlignment; const uint32_t group =3D next.fetch_add(1, std::memory_order_relaxed) % kGroups; const size_t offset =3D kAlignment * group; HWY_ASSERT((offset % kAlignment =3D=3D 0) && offset <=3D kAlias); // std::cerr << "O: " << offset << std::endl; return offset; } } // namespace static void *AllocateAlignedBytes(const size_t payload_size) { HWY_ASSERT(payload_size !=3D 0); // likely a bug in caller if (payload_size >=3D std::numeric_limits::max() / 2) { HWY_ASSERT(false && "payload_size too large"); return nullptr; } size_t offset =3D NextAlignedOffset(); // What: | misalign | unused | AllocationHeader |payload // Size: |<=3D kAlias | offset |payload_size // ^allocated.^aligned.^header............^payload // The header must immediately precede payload, which must remain aligned. // To avoid wasting space, the header resides at the end of `unused`, // which therefore cannot be empty (offset =3D=3D 0). if (offset =3D=3D 0) { offset =3D kAlignment; // =3D RoundUpTo(sizeof(AllocationHeader), kAlig= nment) static_assert(sizeof(AllocationHeader) <=3D kAlignment, "Else: round up= "); } const size_t allocated_size =3D kAlias + offset + payload_size; void *allocated =3D malloc(allocated_size); HWY_ASSERT(allocated !=3D nullptr); if (allocated =3D=3D nullptr) return nullptr; // Always round up even if already aligned - we already asked for kAlias // extra bytes and there's no way to give them back. uintptr_t aligned =3D reinterpret_cast(allocated) + kAlias; static_assert((kAlias & (kAlias - 1)) =3D=3D 0, "kAlias must be a power o= f 2"); static_assert(kAlias >=3D kAlignment, "Cannot align to more than kAlias"); aligned &=3D ~(kAlias - 1); const uintptr_t payload =3D aligned + offset; // still aligned // Stash `allocated` and payload_size inside header for FreeAlignedBytes(= ). // The allocated_size can be reconstructed from the payload_size. AllocationHeader *header =3D reinterpret_cast(payload= ) - 1; header->allocated =3D allocated; header->payload_size =3D payload_size; //printf("%d-byte aligned addr: %p\n", kAlignment, reinterpret_cast(payload)); return HWY_ASSUME_ALIGNED(reinterpret_cast(payload), kAlignment); } template static T *AllocateAlignedItems(size_t items) { constexpr size_t size =3D sizeof(T); constexpr bool is_pow2 =3D (size & (size - 1)) =3D=3D 0; constexpr size_t bits =3D ShiftCount(size); static_assert(!is_pow2 || (1ull << bits) =3D=3D size, "ShiftCount is inco= rrect"); const size_t bytes =3D is_pow2 ? items << bits : items * size; const size_t check =3D is_pow2 ? bytes >> bits : bytes / size; if (check !=3D items) { return nullptr; // overflowed } return static_cast(AllocateAlignedBytes(bytes)); } template static AlignedFreeUniquePtr AllocateAligned(const size_t items) { return AlignedFreeUniquePtr(AllocateAlignedItems(items), AlignedFreer()); } int main() { AlignedFreeUniquePtr in_lanes =3D AllocateAligned(2= ); uint16_t expected_lanes[2]; in_lanes[0] =3D 65535; in_lanes[1] =3D 32767; expected_lanes[0] =3D 65534; expected_lanes[1] =3D 16383; hwy::N_EMU128::Vec128 v =3D Load(in_lanes.get()= ); hwy::N_EMU128::Vec128 actual =3D MulHigh(v, v); { auto actual_lanes =3D AllocateAligned(2); Store(actual, actual_lanes.get()); const uint8_t *expected_array =3D reinterpret_cast(expected_lanes); const uint8_t *actual_array =3D reinterpret_cast(actual_lanes.get()); for (size_t i =3D 0; i < 2; ++i) { const uint8_t *expected_ptr =3D expected_array + i * 2; const uint8_t *actual_ptr =3D actual_array + i * 2; #if 1 // trigger bug if (!BytesEqual(expected_ptr, actual_ptr, 2)) { #else // no bug if (std::memcmp(expected_ptr, actual_ptr, 2) !=3D 0) { #endif abort(); } } } }=