[Bug tree-optimization/106322] tree-vectorize: Wrong code at O2 level (-fno-tree-vectorize is working)

public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed

From: "malat at debian dot org" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug tree-optimization/106322] tree-vectorize: Wrong code at O2 level (-fno-tree-vectorize is working)
Date: Tue, 09 Aug 2022 07:50:46 +0000	[thread overview]
Message-ID: <bug-106322-4-4pkd4aWHDT@http.gcc.gnu.org/bugzilla/> (raw)
In-Reply-To: <bug-106322-4@http.gcc.gnu.org/bugzilla/>

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106322

--- Comment #19 from Mathieu Malaterre <malat at debian dot org> ---
Without hwy dependency:

 % more Makefile bytes.cc demo.cc
::::::::::::::
Makefile
::::::::::::::
CXXFLAGS := -O2

demo: demo.o bytes.o
        $(CXX) $(CXXFLAGS) -o $@ $^

clean:
        rm -f bytes.o demo.o
::::::::::::::
bytes.cc
::::::::::::::
#include <cstring>

bool BytesEqual(const void *bytes1, const void *bytes2, const size_t size) {
  return memcmp(bytes1, bytes2, size) == 0;
}
::::::::::::::
demo.cc
::::::::::::::
#include <atomic>
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <memory>

#define HWY_ALIGNMENT 64
constexpr size_t kAlignment = HWY_ALIGNMENT;
constexpr size_t kAlias = kAlignment * 4;

bool BytesEqual(const void *p1, const void *p2, const size_t size);

namespace hwy {
namespace N_EMU128 {
template <typename T, size_t N = 16 / sizeof(T)> struct Vec128 {
  T raw[16 / sizeof(T)] = {};
};
} // namespace N_EMU128
} // namespace hwy

template <typename T, size_t N>
static void Store(const hwy::N_EMU128::Vec128<T, N> v,
                  T *__restrict__ aligned) {
  __builtin_memcpy(aligned, v.raw, sizeof(T) * N);
}

template <typename T, size_t N>
static hwy::N_EMU128::Vec128<T, N> Load(const T *__restrict__ aligned) {
  hwy::N_EMU128::Vec128<T, N> v;
  __builtin_memcpy(v.raw, aligned, sizeof(T) * N);
  return v;
}

template <size_t N>
static hwy::N_EMU128::Vec128<uint16_t, N>
MulHigh(hwy::N_EMU128::Vec128<uint16_t, N> a,
        const hwy::N_EMU128::Vec128<uint16_t, N> b) {
  for (size_t i = 0; i < N; ++i) {
    // Cast to uint32_t first to prevent overflow. Otherwise the result of
    // uint16_t * uint16_t is in "int" which may overflow. In practice the
    // result is the same but this way it is also defined.
    a.raw[i] = static_cast<uint16_t>(
        (static_cast<uint32_t>(a.raw[i]) * static_cast<uint32_t>(b.raw[i])) >>
        16);
  }
  return a;
}

#define HWY_ASSERT(condition) assert((condition))
#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))

#pragma pack(push, 1)
struct AllocationHeader {
  void *allocated;
  size_t payload_size;
};
#pragma pack(pop)

static void FreeAlignedBytes(const void *aligned_pointer) {
  HWY_ASSERT(aligned_pointer != nullptr);
  if (aligned_pointer == nullptr)
    return;

  const uintptr_t payload = reinterpret_cast<uintptr_t>(aligned_pointer);
  HWY_ASSERT(payload % kAlignment == 0);
  const AllocationHeader *header =
      reinterpret_cast<const AllocationHeader *>(payload) - 1;

  free(header->allocated);
}

class AlignedFreer {
public:
  template <typename T> void operator()(T *aligned_pointer) const {
    FreeAlignedBytes(aligned_pointer);
  }
};

template <typename T>
using AlignedFreeUniquePtr = std::unique_ptr<T, AlignedFreer>;

static inline constexpr size_t ShiftCount(size_t n) {
  return (n <= 1) ? 0 : 1 + ShiftCount(n / 2);
}

namespace {
static size_t NextAlignedOffset() {
  static std::atomic<uint32_t> next{0};
  constexpr uint32_t kGroups = kAlias / kAlignment;
  const uint32_t group = next.fetch_add(1, std::memory_order_relaxed) %
kGroups;
  const size_t offset = kAlignment * group;
  HWY_ASSERT((offset % kAlignment == 0) && offset <= kAlias);
  //  std::cerr << "O: " << offset << std::endl;
  return offset;
}
} // namespace

static void *AllocateAlignedBytes(const size_t payload_size) {
  HWY_ASSERT(payload_size != 0); // likely a bug in caller
  if (payload_size >= std::numeric_limits<size_t>::max() / 2) {
    HWY_ASSERT(false && "payload_size too large");
    return nullptr;
  }

  size_t offset = NextAlignedOffset();

  // What: | misalign | unused | AllocationHeader |payload
  // Size: |<= kAlias | offset                    |payload_size
  //       ^allocated.^aligned.^header............^payload
  // The header must immediately precede payload, which must remain aligned.
  // To avoid wasting space, the header resides at the end of `unused`,
  // which therefore cannot be empty (offset == 0).
  if (offset == 0) {
    offset = kAlignment; // = RoundUpTo(sizeof(AllocationHeader), kAlignment)
    static_assert(sizeof(AllocationHeader) <= kAlignment, "Else: round up");
  }

  const size_t allocated_size = kAlias + offset + payload_size;
  void *allocated = malloc(allocated_size);
  HWY_ASSERT(allocated != nullptr);
  if (allocated == nullptr)
    return nullptr;
  // Always round up even if already aligned - we already asked for kAlias
  // extra bytes and there's no way to give them back.
  uintptr_t aligned = reinterpret_cast<uintptr_t>(allocated) + kAlias;
  static_assert((kAlias & (kAlias - 1)) == 0, "kAlias must be a power of 2");
  static_assert(kAlias >= kAlignment, "Cannot align to more than kAlias");
  aligned &= ~(kAlias - 1);

  const uintptr_t payload = aligned + offset; // still aligned

  // Stash `allocated` and payload_size inside header for FreeAlignedBytes().
  // The allocated_size can be reconstructed from the payload_size.
  AllocationHeader *header = reinterpret_cast<AllocationHeader *>(payload) - 1;
  header->allocated = allocated;
  header->payload_size = payload_size;

  //printf("%d-byte aligned addr: %p\n", kAlignment,
reinterpret_cast<void*>(payload));
  return HWY_ASSUME_ALIGNED(reinterpret_cast<void *>(payload), kAlignment);
}

template <typename T> static T *AllocateAlignedItems(size_t items) {
  constexpr size_t size = sizeof(T);

  constexpr bool is_pow2 = (size & (size - 1)) == 0;
  constexpr size_t bits = ShiftCount(size);
  static_assert(!is_pow2 || (1ull << bits) == size, "ShiftCount is incorrect");

  const size_t bytes = is_pow2 ? items << bits : items * size;
  const size_t check = is_pow2 ? bytes >> bits : bytes / size;
  if (check != items) {
    return nullptr; // overflowed
  }
  return static_cast<T *>(AllocateAlignedBytes(bytes));
}

template <typename T>
static AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items) {
  return AlignedFreeUniquePtr<T[]>(AllocateAlignedItems<T>(items),
                                   AlignedFreer());
}

int main() {
  AlignedFreeUniquePtr<uint16_t[]> in_lanes = AllocateAligned<uint16_t>(2);
  uint16_t expected_lanes[2];
  in_lanes[0] = 65535;
  in_lanes[1] = 32767;
  expected_lanes[0] = 65534;
  expected_lanes[1] = 16383;
  hwy::N_EMU128::Vec128<uint16_t, 2> v = Load<uint16_t, 2>(in_lanes.get());
  hwy::N_EMU128::Vec128<uint16_t, 2> actual = MulHigh(v, v);
  {
    auto actual_lanes = AllocateAligned<uint16_t>(2);
    Store(actual, actual_lanes.get());
    const uint8_t *expected_array =
        reinterpret_cast<const uint8_t *>(expected_lanes);
    const uint8_t *actual_array =
        reinterpret_cast<const uint8_t *>(actual_lanes.get());
    for (size_t i = 0; i < 2; ++i) {
      const uint8_t *expected_ptr = expected_array + i * 2;
      const uint8_t *actual_ptr = actual_array + i * 2;
#if 1
      // trigger bug
      if (!BytesEqual(expected_ptr, actual_ptr, 2)) {
#else
      // no bug
      if (std::memcmp(expected_ptr, actual_ptr, 2) != 0) {
#endif
        abort();
      }
    }
  }
}

next prev parent reply	other threads:[~2022-08-09  7:50 UTC|newest]

Thread overview: 56+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-07-16  9:48 [Bug c++/106322] New: i386: Wrong code at O2 level (O0 / O1 are working) malat at debian dot org
2022-07-16  9:55 ` [Bug c++/106322] " malat at debian dot org
2022-07-16 10:00 ` malat at debian dot org
2022-07-16 10:00 ` malat at debian dot org
2022-07-16 10:02 ` malat at debian dot org
2022-07-16 10:02 ` malat at debian dot org
2022-07-16 10:07 ` malat at debian dot org
2022-07-16 10:15 ` malat at debian dot org
2022-07-17 20:20 ` [Bug target/106322] " pinskia at gcc dot gnu.org
2022-07-18  8:48 ` marxin at gcc dot gnu.org
2022-07-18 14:40 ` malat at debian dot org
2022-07-19  7:58 ` ubizjak at gmail dot com
2022-08-03  8:41 ` malat at debian dot org
2022-08-03 12:31 ` [Bug tree-optimization/106322] 32bits / tree-vectorize: Wrong code at O2 level (-fno-tree-vectorize is working) malat at debian dot org
2022-08-03 12:32 ` malat at debian dot org
2022-08-03 12:33 ` malat at debian dot org
2022-08-05 13:14 ` [Bug tree-optimization/106322] " malat at debian dot org
2022-08-08  7:12 ` malat at debian dot org
2022-08-08  7:20 ` malat at debian dot org
2022-08-08 10:00 ` malat at debian dot org
2022-08-09  7:50 ` malat at debian dot org [this message]
2022-08-09 12:36 ` marxin at gcc dot gnu.org
2022-08-09 12:58 ` malat at debian dot org
2022-08-09 13:00 ` ubizjak at gmail dot com
2022-08-09 13:03 ` malat at debian dot org
2022-08-09 13:04 ` marxin at gcc dot gnu.org
2022-08-09 13:05 ` malat at debian dot org
2022-08-09 13:11 ` [Bug tree-optimization/106322] [12/13 Regression] tree-vectorize: Wrong code at O2 level (-fno-tree-vectorize is working) since r12-2404-ga1d27560770818c5 marxin at gcc dot gnu.org
2022-08-09 13:12 ` marxin at gcc dot gnu.org
2022-08-09 13:26 ` linkw at gcc dot gnu.org
2022-08-09 13:29 ` marxin at gcc dot gnu.org
2022-08-09 13:30 ` malat at debian dot org
2022-08-09 13:34 ` malat at debian dot org
2022-08-09 13:40 ` linkw at gcc dot gnu.org
2022-08-09 13:48 ` rguenth at gcc dot gnu.org
2022-08-09 13:53 ` malat at debian dot org
2022-08-09 13:56 ` malat at debian dot org
2022-08-09 14:01 ` malat at debian dot org
2022-08-09 15:28 ` pinskia at gcc dot gnu.org
2022-08-10  5:25 ` linkw at gcc dot gnu.org
2022-08-10  5:34 ` linkw at gcc dot gnu.org
2022-08-10  6:03 ` pinskia at gcc dot gnu.org
2022-08-10  6:24 ` linkw at gcc dot gnu.org
2022-08-10  9:47 ` linkw at gcc dot gnu.org
2022-08-10 12:32 ` rguenth at gcc dot gnu.org
2022-08-10 12:36 ` rguenth at gcc dot gnu.org
2022-08-11  1:18 ` linkw at gcc dot gnu.org
2022-08-15  6:51 ` linkw at gcc dot gnu.org
2022-08-16  5:50 ` cvs-commit at gcc dot gnu.org
2022-08-24  2:31 ` [Bug tree-optimization/106322] [12 " cvs-commit at gcc dot gnu.org
2022-08-24  2:53 ` linkw at gcc dot gnu.org
2022-08-24  6:51 ` rguenth at gcc dot gnu.org
2022-09-27 14:14 ` malat at debian dot org
2022-09-27 14:18 ` malat at debian dot org
2022-09-28  6:11 ` malat at debian dot org
2022-09-28  6:26 ` linkw at gcc dot gnu.org

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=bug-106322-4-4pkd4aWHDT@http.gcc.gnu.org/bugzilla/ \
    --to=gcc-bugzilla@gcc.gnu.org \
    --cc=gcc-bugs@gcc.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).