public inbox for gcc-bugs@sourceware.org help / color / mirror / Atom feed
From: "malat at debian dot org" <gcc-bugzilla@gcc.gnu.org> To: gcc-bugs@gcc.gnu.org Subject: [Bug tree-optimization/106322] tree-vectorize: Wrong code at O2 level (-fno-tree-vectorize is working) Date: Tue, 09 Aug 2022 07:50:46 +0000 [thread overview] Message-ID: <bug-106322-4-4pkd4aWHDT@http.gcc.gnu.org/bugzilla/> (raw) In-Reply-To: <bug-106322-4@http.gcc.gnu.org/bugzilla/> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106322 --- Comment #19 from Mathieu Malaterre <malat at debian dot org> --- Without hwy dependency: % more Makefile bytes.cc demo.cc :::::::::::::: Makefile :::::::::::::: CXXFLAGS := -O2 demo: demo.o bytes.o $(CXX) $(CXXFLAGS) -o $@ $^ clean: rm -f bytes.o demo.o :::::::::::::: bytes.cc :::::::::::::: #include <cstring> bool BytesEqual(const void *bytes1, const void *bytes2, const size_t size) { return memcmp(bytes1, bytes2, size) == 0; } :::::::::::::: demo.cc :::::::::::::: #include <atomic> #include <cassert> #include <cstdlib> #include <cstring> #include <limits> #include <memory> #define HWY_ALIGNMENT 64 constexpr size_t kAlignment = HWY_ALIGNMENT; constexpr size_t kAlias = kAlignment * 4; bool BytesEqual(const void *p1, const void *p2, const size_t size); namespace hwy { namespace N_EMU128 { template <typename T, size_t N = 16 / sizeof(T)> struct Vec128 { T raw[16 / sizeof(T)] = {}; }; } // namespace N_EMU128 } // namespace hwy template <typename T, size_t N> static void Store(const hwy::N_EMU128::Vec128<T, N> v, T *__restrict__ aligned) { __builtin_memcpy(aligned, v.raw, sizeof(T) * N); } template <typename T, size_t N> static hwy::N_EMU128::Vec128<T, N> Load(const T *__restrict__ aligned) { hwy::N_EMU128::Vec128<T, N> v; __builtin_memcpy(v.raw, aligned, sizeof(T) * N); return v; } template <size_t N> static hwy::N_EMU128::Vec128<uint16_t, N> MulHigh(hwy::N_EMU128::Vec128<uint16_t, N> a, const hwy::N_EMU128::Vec128<uint16_t, N> b) { for (size_t i = 0; i < N; ++i) { // Cast to uint32_t first to prevent overflow. 
Otherwise the result of // uint16_t * uint16_t is in "int" which may overflow. In practice the // result is the same but this way it is also defined. a.raw[i] = static_cast<uint16_t>( (static_cast<uint32_t>(a.raw[i]) * static_cast<uint32_t>(b.raw[i])) >> 16); } return a; } #define HWY_ASSERT(condition) assert((condition)) #define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align)) #pragma pack(push, 1) struct AllocationHeader { void *allocated; size_t payload_size; }; #pragma pack(pop) static void FreeAlignedBytes(const void *aligned_pointer) { HWY_ASSERT(aligned_pointer != nullptr); if (aligned_pointer == nullptr) return; const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer); HWY_ASSERT(payload % kAlignment == 0); const AllocationHeader *header = reinterpret_cast<const AllocationHeader *>(payload) - 1; free(header->allocated); } class AlignedFreer { public: template <typename T> void operator()(T *aligned_pointer) const { FreeAlignedBytes(aligned_pointer); } }; template <typename T> using AlignedFreeUniquePtr = std::unique_ptr<T, AlignedFreer>; static inline constexpr size_t ShiftCount(size_t n) { return (n <= 1) ? 
0 : 1 + ShiftCount(n / 2); } namespace { static size_t NextAlignedOffset() { static std::atomic<uint32_t> next{0}; constexpr uint32_t kGroups = kAlias / kAlignment; const uint32_t group = next.fetch_add(1, std::memory_order_relaxed) % kGroups; const size_t offset = kAlignment * group; HWY_ASSERT((offset % kAlignment == 0) && offset <= kAlias); // std::cerr << "O: " << offset << std::endl; return offset; } } // namespace static void *AllocateAlignedBytes(const size_t payload_size) { HWY_ASSERT(payload_size != 0); // likely a bug in caller if (payload_size >= std::numeric_limits<size_t>::max() / 2) { HWY_ASSERT(false && "payload_size too large"); return nullptr; } size_t offset = NextAlignedOffset(); // What: | misalign | unused | AllocationHeader |payload // Size: |<= kAlias | offset |payload_size // ^allocated.^aligned.^header............^payload // The header must immediately precede payload, which must remain aligned. // To avoid wasting space, the header resides at the end of `unused`, // which therefore cannot be empty (offset == 0). if (offset == 0) { offset = kAlignment; // = RoundUpTo(sizeof(AllocationHeader), kAlignment) static_assert(sizeof(AllocationHeader) <= kAlignment, "Else: round up"); } const size_t allocated_size = kAlias + offset + payload_size; void *allocated = malloc(allocated_size); HWY_ASSERT(allocated != nullptr); if (allocated == nullptr) return nullptr; // Always round up even if already aligned - we already asked for kAlias // extra bytes and there's no way to give them back. uintptr_t aligned = reinterpret_cast<uintptr_t>(allocated) + kAlias; static_assert((kAlias & (kAlias - 1)) == 0, "kAlias must be a power of 2"); static_assert(kAlias >= kAlignment, "Cannot align to more than kAlias"); aligned &= ~(kAlias - 1); const uintptr_t payload = aligned + offset; // still aligned // Stash `allocated` and payload_size inside header for FreeAlignedBytes(). // The allocated_size can be reconstructed from the payload_size. 
AllocationHeader *header = reinterpret_cast<AllocationHeader *>(payload) - 1; header->allocated = allocated; header->payload_size = payload_size; //printf("%d-byte aligned addr: %p\n", kAlignment, reinterpret_cast<void*>(payload)); return HWY_ASSUME_ALIGNED(reinterpret_cast<void *>(payload), kAlignment); } template <typename T> static T *AllocateAlignedItems(size_t items) { constexpr size_t size = sizeof(T); constexpr bool is_pow2 = (size & (size - 1)) == 0; constexpr size_t bits = ShiftCount(size); static_assert(!is_pow2 || (1ull << bits) == size, "ShiftCount is incorrect"); const size_t bytes = is_pow2 ? items << bits : items * size; const size_t check = is_pow2 ? bytes >> bits : bytes / size; if (check != items) { return nullptr; // overflowed } return static_cast<T *>(AllocateAlignedBytes(bytes)); } template <typename T> static AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items) { return AlignedFreeUniquePtr<T[]>(AllocateAlignedItems<T>(items), AlignedFreer()); } int main() { AlignedFreeUniquePtr<uint16_t[]> in_lanes = AllocateAligned<uint16_t>(2); uint16_t expected_lanes[2]; in_lanes[0] = 65535; in_lanes[1] = 32767; expected_lanes[0] = 65534; expected_lanes[1] = 16383; hwy::N_EMU128::Vec128<uint16_t, 2> v = Load<uint16_t, 2>(in_lanes.get()); hwy::N_EMU128::Vec128<uint16_t, 2> actual = MulHigh(v, v); { auto actual_lanes = AllocateAligned<uint16_t>(2); Store(actual, actual_lanes.get()); const uint8_t *expected_array = reinterpret_cast<const uint8_t *>(expected_lanes); const uint8_t *actual_array = reinterpret_cast<const uint8_t *>(actual_lanes.get()); for (size_t i = 0; i < 2; ++i) { const uint8_t *expected_ptr = expected_array + i * 2; const uint8_t *actual_ptr = actual_array + i * 2; #if 1 // trigger bug if (!BytesEqual(expected_ptr, actual_ptr, 2)) { #else // no bug if (std::memcmp(expected_ptr, actual_ptr, 2) != 0) { #endif abort(); } } } }
next prev parent reply other threads:[~2022-08-09 7:50 UTC|newest] Thread overview: 56+ messages / expand[flat|nested] mbox.gz Atom feed top 2022-07-16 9:48 [Bug c++/106322] New: i386: Wrong code at O2 level (O0 / O1 are working) malat at debian dot org 2022-07-16 9:55 ` [Bug c++/106322] " malat at debian dot org 2022-07-16 10:00 ` malat at debian dot org 2022-07-16 10:00 ` malat at debian dot org 2022-07-16 10:02 ` malat at debian dot org 2022-07-16 10:02 ` malat at debian dot org 2022-07-16 10:07 ` malat at debian dot org 2022-07-16 10:15 ` malat at debian dot org 2022-07-17 20:20 ` [Bug target/106322] " pinskia at gcc dot gnu.org 2022-07-18 8:48 ` marxin at gcc dot gnu.org 2022-07-18 14:40 ` malat at debian dot org 2022-07-19 7:58 ` ubizjak at gmail dot com 2022-08-03 8:41 ` malat at debian dot org 2022-08-03 12:31 ` [Bug tree-optimization/106322] 32bits / tree-vectorize: Wrong code at O2 level (-fno-tree-vectorize is working) malat at debian dot org 2022-08-03 12:32 ` malat at debian dot org 2022-08-03 12:33 ` malat at debian dot org 2022-08-05 13:14 ` [Bug tree-optimization/106322] " malat at debian dot org 2022-08-08 7:12 ` malat at debian dot org 2022-08-08 7:20 ` malat at debian dot org 2022-08-08 10:00 ` malat at debian dot org 2022-08-09 7:50 ` malat at debian dot org [this message] 2022-08-09 12:36 ` marxin at gcc dot gnu.org 2022-08-09 12:58 ` malat at debian dot org 2022-08-09 13:00 ` ubizjak at gmail dot com 2022-08-09 13:03 ` malat at debian dot org 2022-08-09 13:04 ` marxin at gcc dot gnu.org 2022-08-09 13:05 ` malat at debian dot org 2022-08-09 13:11 ` [Bug tree-optimization/106322] [12/13 Regression] tree-vectorize: Wrong code at O2 level (-fno-tree-vectorize is working) since r12-2404-ga1d27560770818c5 marxin at gcc dot gnu.org 2022-08-09 13:12 ` marxin at gcc dot gnu.org 2022-08-09 13:26 ` linkw at gcc dot gnu.org 2022-08-09 13:29 ` marxin at gcc dot gnu.org 2022-08-09 13:30 ` malat at debian dot org 2022-08-09 13:34 ` malat at debian dot org 
2022-08-09 13:40 ` linkw at gcc dot gnu.org 2022-08-09 13:48 ` rguenth at gcc dot gnu.org 2022-08-09 13:53 ` malat at debian dot org 2022-08-09 13:56 ` malat at debian dot org 2022-08-09 14:01 ` malat at debian dot org 2022-08-09 15:28 ` pinskia at gcc dot gnu.org 2022-08-10 5:25 ` linkw at gcc dot gnu.org 2022-08-10 5:34 ` linkw at gcc dot gnu.org 2022-08-10 6:03 ` pinskia at gcc dot gnu.org 2022-08-10 6:24 ` linkw at gcc dot gnu.org 2022-08-10 9:47 ` linkw at gcc dot gnu.org 2022-08-10 12:32 ` rguenth at gcc dot gnu.org 2022-08-10 12:36 ` rguenth at gcc dot gnu.org 2022-08-11 1:18 ` linkw at gcc dot gnu.org 2022-08-15 6:51 ` linkw at gcc dot gnu.org 2022-08-16 5:50 ` cvs-commit at gcc dot gnu.org 2022-08-24 2:31 ` [Bug tree-optimization/106322] [12 " cvs-commit at gcc dot gnu.org 2022-08-24 2:53 ` linkw at gcc dot gnu.org 2022-08-24 6:51 ` rguenth at gcc dot gnu.org 2022-09-27 14:14 ` malat at debian dot org 2022-09-27 14:18 ` malat at debian dot org 2022-09-28 6:11 ` malat at debian dot org 2022-09-28 6:26 ` linkw at gcc dot gnu.org
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=bug-106322-4-4pkd4aWHDT@http.gcc.gnu.org/bugzilla/ \ --to=gcc-bugzilla@gcc.gnu.org \ --cc=gcc-bugs@gcc.gnu.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link. Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox, as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).