From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-bugzilla@gcc.gnu.org>
Received: by sourceware.org (Postfix, from userid 48)
 id EE84B3858289; Tue,  9 Aug 2022 07:50:46 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org EE84B3858289
From: "malat at debian dot org" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug tree-optimization/106322] tree-vectorize: Wrong code at O2
 level (-fno-tree-vectorize is working)
Date: Tue, 09 Aug 2022 07:50:46 +0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: changed
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: tree-optimization
X-Bugzilla-Version: 12.1.0
X-Bugzilla-Keywords: 
X-Bugzilla-Severity: normal
X-Bugzilla-Who: malat at debian dot org
X-Bugzilla-Status: WAITING
X-Bugzilla-Resolution: 
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags: 
X-Bugzilla-Changed-Fields: 
Message-ID: <bug-106322-4-4pkd4aWHDT@http.gcc.gnu.org/bugzilla/>
In-Reply-To: <bug-106322-4@http.gcc.gnu.org/bugzilla/>
References: <bug-106322-4@http.gcc.gnu.org/bugzilla/>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
X-BeenThere: gcc-bugs@gcc.gnu.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Gcc-bugs mailing list <gcc-bugs.gcc.gnu.org>
List-Unsubscribe: <https://gcc.gnu.org/mailman/options/gcc-bugs>,
 <mailto:gcc-bugs-request@gcc.gnu.org?subject=unsubscribe>
List-Archive: <https://gcc.gnu.org/pipermail/gcc-bugs/>
List-Post: <mailto:gcc-bugs@gcc.gnu.org>
List-Help: <mailto:gcc-bugs-request@gcc.gnu.org?subject=help>
List-Subscribe: <https://gcc.gnu.org/mailman/listinfo/gcc-bugs>,
 <mailto:gcc-bugs-request@gcc.gnu.org?subject=subscribe>
X-List-Received-Date: Tue, 09 Aug 2022 07:50:47 -0000

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D106322
--- Comment #19 from Mathieu Malaterre <malat at debian dot org> ---
Without hwy dependency:

 % more Makefile bytes.cc demo.cc
::::::::::::::
Makefile
::::::::::::::
# Build the reproducer at -O2, the level named in the bug title.
CXXFLAGS :=3D -O2

# Link the objects built from demo.cc and bytes.cc into `demo`.
demo: demo.o bytes.o
        $(CXX) $(CXXFLAGS) -o $@ $^

# Remove the intermediate object files.
clean:
        rm -f bytes.o demo.o
::::::::::::::
bytes.cc
::::::::::::::
#include <cstring>

// Returns true when the first `size` bytes at `bytes1` and `bytes2`
// compare equal according to memcmp. Defined in bytes.cc; demo.cc only
// declares it.
bool BytesEqual(const void *bytes1, const void *bytes2, const size_t size) {
  return memcmp(bytes1, bytes2, size) =3D=3D 0;
}
::::::::::::::
demo.cc
::::::::::::::
#include <atomic>
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <memory>

// Alignment, in bytes, of the payload pointers handed out by
// AllocateAlignedBytes.
#define HWY_ALIGNMENT 64
constexpr size_t kAlignment =3D HWY_ALIGNMENT;
// Four times kAlignment. AllocateAlignedBytes requests this many extra
// bytes and rounds the raw malloc pointer to a multiple of it.
constexpr size_t kAlias =3D kAlignment * 4;

bool BytesEqual(const void *p1, const void *p2, const size_t size);

namespace hwy {
namespace N_EMU128 {
// Plain-array stand-in for a vector type: `raw` always holds
// 16 / sizeof(T) zero-initialized lanes of type T. N defaults to that
// lane count; Store, Load and MulHigh only touch the first N lanes.
template <typename T, size_t N =3D 16 / sizeof(T)> struct Vec128 {
  T raw[16 / sizeof(T)] =3D {};
};
} // namespace N_EMU128
} // namespace hwy

// Copies the first N lanes of `v` (sizeof(T) * N bytes) to `aligned`.
// `v` is taken by value; `aligned` is declared __restrict__.
template <typename T, size_t N>
static void Store(const hwy::N_EMU128::Vec128<T, N> v,
                  T *__restrict__ aligned) {
  __builtin_memcpy(aligned, v.raw, sizeof(T) * N);
}

// Returns a vector whose first N lanes are copied from `aligned`
// (sizeof(T) * N bytes); any remaining lanes keep their zero default.
template <typename T, size_t N>
static hwy::N_EMU128::Vec128<T, N> Load(const T *__restrict__ aligned) {
  hwy::N_EMU128::Vec128<T, N> v;
  __builtin_memcpy(v.raw, aligned, sizeof(T) * N);
  return v;
}

// For each of the first N lanes, replaces a.raw[i] with the upper 16 bits
// of the 32-bit product a.raw[i] * b.raw[i]. `a` is a by-value copy, so
// the caller's vector is untouched; the modified copy is returned.
template <size_t N>
static hwy::N_EMU128::Vec128<uint16_t, N>
MulHigh(hwy::N_EMU128::Vec128<uint16_t, N> a,
        const hwy::N_EMU128::Vec128<uint16_t, N> b) {
  for (size_t i =3D 0; i < N; ++i) {
    // Cast to uint32_t first to prevent overflow. Otherwise the result of
    // uint16_t * uint16_t is in "int" which may overflow. In practice the
    // result is the same but this way it is also defined.
    a.raw[i] =3D static_cast<uint16_t>(
        (static_cast<uint32_t>(a.raw[i]) * static_cast<uint32_t>(b.raw[i]))=
 >>
        16);
  }
  return a;
}

// HWY_ASSERT forwards to assert(), so it is a no-op when NDEBUG is defined.
#define HWY_ASSERT(condition) assert((condition))
// Yields `ptr` while telling the compiler it is aligned to `align` bytes.
#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (ali=
gn))

// Bookkeeping record that AllocateAlignedBytes writes directly before each
// payload and FreeAlignedBytes reads back. Packed to 1-byte alignment.
#pragma pack(push, 1)
struct AllocationHeader {
  // Pointer originally returned by malloc; this is what gets freed.
  void *allocated;
  // Number of payload bytes the caller asked for.
  size_t payload_size;
};
#pragma pack(pop)

// Releases a block whose payload pointer is `aligned_pointer`.
// Asserts that the pointer is non-null and a multiple of kAlignment; when
// asserts are compiled out a null pointer is silently ignored. The raw
// malloc pointer is read from the AllocationHeader located directly
// before the payload and passed to free().
static void FreeAlignedBytes(const void *aligned_pointer) {
  HWY_ASSERT(aligned_pointer !=3D nullptr);
  if (aligned_pointer =3D=3D nullptr)
    return;

  const uintptr_t payload =3D reinterpret_cast<uintptr_t>(aligned_pointer);
  HWY_ASSERT(payload % kAlignment =3D=3D 0);
  // The header occupies the bytes immediately preceding the payload.
  const AllocationHeader *header =3D
      reinterpret_cast<const AllocationHeader *>(payload) - 1;

  free(header->allocated);
}

// Stateless functor whose call operator hands the pointer to
// FreeAlignedBytes; used as the deleter type of AlignedFreeUniquePtr.
class AlignedFreer {
public:
  template <typename T> void operator()(T *aligned_pointer) const {
    FreeAlignedBytes(aligned_pointer);
  }
};

// std::unique_ptr specialization that releases its pointer through
// AlignedFreer instead of delete.
template <typename T>
using AlignedFreeUniquePtr =3D std::unique_ptr<T, AlignedFreer>;

// Counts how many times `n` can be halved before it reaches 1 or 0, which
// is floor(log2(n)) for nonzero n. Returns 0 when n is 0 or 1.
static inline constexpr size_t ShiftCount(size_t n) {
  return (n <=3D 1) ? 0 : 1 + ShiftCount(n / 2);
}

namespace {
// Returns kAlignment * (counter % kGroups), where the counter is a
// function-local atomic incremented on every call and kGroups is
// kAlias / kAlignment. Successive calls therefore cycle through the
// offsets 0, kAlignment, ... up to kAlias - kAlignment.
static size_t NextAlignedOffset() {
  static std::atomic<uint32_t> next{0};
  constexpr uint32_t kGroups =3D kAlias / kAlignment;
  const uint32_t group =3D next.fetch_add(1, std::memory_order_relaxed) %
kGroups;
  const size_t offset =3D kAlignment * group;
  HWY_ASSERT((offset % kAlignment =3D=3D 0) && offset <=3D kAlias);
  //  std::cerr << "O: " << offset << std::endl;
  return offset;
}
} // namespace

static void *AllocateAlignedBytes(const size_t payload_size) {
  HWY_ASSERT(payload_size !=3D 0); // likely a bug in caller
  if (payload_size >=3D std::numeric_limits<size_t>::max() / 2) {
    HWY_ASSERT(false && "payload_size too large");
    return nullptr;
  }

  size_t offset =3D NextAlignedOffset();

  // What: | misalign | unused | AllocationHeader |payload
  // Size: |<=3D kAlias | offset                    |payload_size
  //       ^allocated.^aligned.^header............^payload
  // The header must immediately precede payload, which must remain aligned.
  // To avoid wasting space, the header resides at the end of `unused`,
  // which therefore cannot be empty (offset =3D=3D 0).
  if (offset =3D=3D 0) {
    offset =3D kAlignment; // =3D RoundUpTo(sizeof(AllocationHeader), kAlig=
nment)
    static_assert(sizeof(AllocationHeader) <=3D kAlignment, "Else: round up=
");
  }

  const size_t allocated_size =3D kAlias + offset + payload_size;
  void *allocated =3D malloc(allocated_size);
  HWY_ASSERT(allocated !=3D nullptr);
  if (allocated =3D=3D nullptr)
    return nullptr;
  // Always round up even if already aligned - we already asked for kAlias
  // extra bytes and there's no way to give them back.
  uintptr_t aligned =3D reinterpret_cast<uintptr_t>(allocated) + kAlias;
  static_assert((kAlias & (kAlias - 1)) =3D=3D 0, "kAlias must be a power o=
f 2");
  static_assert(kAlias >=3D kAlignment, "Cannot align to more than kAlias");
  aligned &=3D ~(kAlias - 1);

  const uintptr_t payload =3D aligned + offset; // still aligned

  // Stash `allocated` and payload_size inside header for FreeAlignedBytes(=
).
  // The allocated_size can be reconstructed from the payload_size.
  AllocationHeader *header =3D reinterpret_cast<AllocationHeader *>(payload=
) - 1;
  header->allocated =3D allocated;
  header->payload_size =3D payload_size;

  //printf("%d-byte aligned addr: %p\n", kAlignment,
reinterpret_cast<void*>(payload));
  return HWY_ASSUME_ALIGNED(reinterpret_cast<void *>(payload), kAlignment);
}

// Allocates room for `items` objects of type T via AllocateAlignedBytes.
// Returns nullptr when items * sizeof(T) does not fit in size_t. When
// sizeof(T) is a power of two, the byte count and the overflow check use
// shifts by ShiftCount(sizeof(T)) instead of multiply and divide.
template <typename T> static T *AllocateAlignedItems(size_t items) {
  constexpr size_t size =3D sizeof(T);

  constexpr bool is_pow2 =3D (size & (size - 1)) =3D=3D 0;
  constexpr size_t bits =3D ShiftCount(size);
  static_assert(!is_pow2 || (1ull << bits) =3D=3D size, "ShiftCount is inco=
rrect");

  const size_t bytes =3D is_pow2 ? items << bits : items * size;
  // Undo the scaling; a mismatch means the computation above wrapped.
  const size_t check =3D is_pow2 ? bytes >> bits : bytes / size;
  if (check !=3D items) {
    return nullptr; // overflowed
  }
  return static_cast<T *>(AllocateAlignedBytes(bytes));
}

// Wraps AllocateAlignedItems<T>(items) in a unique_ptr whose deleter is
// AlignedFreer, so the memory is released through FreeAlignedBytes.
template <typename T>
static AlignedFreeUniquePtr<T[]> AllocateAligned(const size_t items) {
  return AlignedFreeUniquePtr<T[]>(AllocateAlignedItems<T>(items),
                                   AlignedFreer());
}

// Reproducer driver: squares two uint16_t lanes with MulHigh and calls
// abort() if either result lane differs from the expected high 16 bits.
int main() {
  AlignedFreeUniquePtr<uint16_t[]> in_lanes =3D AllocateAligned<uint16_t>(2=
);
  uint16_t expected_lanes[2];
  in_lanes[0] =3D 65535;
  in_lanes[1] =3D 32767;
  // 65535 * 65535 is 0xFFFE0001 and 32767 * 32767 is 0x3FFF0001, so the
  // upper halves are 65534 and 16383.
  expected_lanes[0] =3D 65534;
  expected_lanes[1] =3D 16383;
  hwy::N_EMU128::Vec128<uint16_t, 2> v =3D Load<uint16_t, 2>(in_lanes.get()=
);
  hwy::N_EMU128::Vec128<uint16_t, 2> actual =3D MulHigh(v, v);
  {
    auto actual_lanes =3D AllocateAligned<uint16_t>(2);
    Store(actual, actual_lanes.get());
    const uint8_t *expected_array =3D
        reinterpret_cast<const uint8_t *>(expected_lanes);
    const uint8_t *actual_array =3D
        reinterpret_cast<const uint8_t *>(actual_lanes.get());
    // Compare one lane (two bytes) per iteration.
    for (size_t i =3D 0; i < 2; ++i) {
      const uint8_t *expected_ptr =3D expected_array + i * 2;
      const uint8_t *actual_ptr =3D actual_array + i * 2;
#if 1
      // trigger bug
      if (!BytesEqual(expected_ptr, actual_ptr, 2)) {
#else
      // no bug
      if (std::memcmp(expected_ptr, actual_ptr, 2) !=3D 0) {
#endif
        abort();
      }
    }
  }
}=