From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id C9E5B3847718; Wed, 3 Apr 2024 17:36:43 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org C9E5B3847718 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1712165803; bh=XJ2aQuQQoVzEl0TgXkj2OezoPbAc5PqdK3Z1/X9D2KQ=; h=From:To:Subject:Date:From; b=fp1KVuLyehG0gYhDsFSCvWc1Ur+eV3ePzausEiDOyWcL7LtCJ4aJ032y8NAIye/sW HAByyuhJsscmRESoxpHjfYYgtYyNSs3nw5oI7vQcRANVN7jsuU4uxbvvbe9OjqyQuH 7ohWrnhyAklKoERRLZ5RAbBdSF4AW5uG7NiC+bGM= From: "thiago at kde dot org" To: gcc-bugs@gcc.gnu.org Subject: [Bug target/114576] New: [13 regression][config/i386] GCC 14/trunk emits VEX-prefixed AES instruction without AVX enabled Date: Wed, 03 Apr 2024 17:36:41 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: new X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: target X-Bugzilla-Version: 14.0 X-Bugzilla-Keywords: X-Bugzilla-Severity: normal X-Bugzilla-Who: thiago at kde dot org X-Bugzilla-Status: UNCONFIRMED X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status bug_severity priority component assigned_to reporter target_milestone Message-ID: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 List-Id: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=114576 Bug ID: 114576 Summary: [13 regression][config/i386] GCC 14/trunk emits VEX-prefixed AES instruction without AVX enabled Product: gcc Version: 14.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: target Assignee: unassigned at gcc dot gnu.org Reporter: thiago at kde dot org Target Milestone: --- Re: https://bugreports.qt.io/browse/QTBUG-123965 Re: 
https://bugzilla.redhat.com/show_bug.cgi?id=2262640, https://bugzilla.redhat.com/show_bug.cgi?id=2272758 Godbolt link: https://gcc.godbolt.org/z/6P9fMvoxW Found while compiling Qt 6.6 or 6.7 with GCC 14 (current trunk). This is a regression from GCC 13. This function from qhash.cpp: Q_ALWAYS_INLINE __m128i AESHashSeed::state1() const { { // unlike the Go code, we don't have more per-process seed __m128i state1 =3D _mm_aesenc_si128(state0, mseed2); return state1; } } Is apparently getting assembled to: .L2: leaq (%rdi,%rsi), %rdx vaesenc %xmm1, %xmm0, %xmm1 Though there's no AVX enabled in this code (the original version in Qt has some AVX/VAES and AVX512 code but the reduced example does not). This function: // hash twice 16 bytes, running 2 scramble rounds of AES on itself static void QT_FUNCTION_TARGET(AES) QT_VECTORCALL hash2x16bytes(__m128i &state0, __m128i &state1, const __m128i *src0, co= nst __m128i *src1) { __m128i data0 =3D _mm_loadu_si128(src0); __m128i data1 =3D _mm_loadu_si128(src1); state0 =3D _mm_xor_si128(data0, state0); state1 =3D _mm_xor_si128(data1, state1); state0 =3D _mm_aesenc_si128(state0, state0); state1 =3D _mm_aesenc_si128(state1, state1); state0 =3D _mm_aesenc_si128(state0, state0); state1 =3D _mm_aesenc_si128(state1, state1); } Is even emitting: .L20: movdqu (%rax), %xmm2 pxor %xmm0, %xmm2 movdqu -16(%rdx), %xmm0 pxor %xmm0, %xmm1 vaesenc %xmm2, %xmm2, %xmm0 aesenc %xmm1, %xmm1 aesenc %xmm0, %xmm0 aesenc %xmm1, %xmm1 and that makes no sense to use AVX for one of four instructions alone, called from the same source function. For reference, GCC 13 generates respectively: .L2: movdqa %xmm0, %xmm1 leaq (%rdi,%rsi), %rdx aesenc %xmm2, %xmm1 and .L20: movdqu (%rax), %xmm2 pxor %xmm0, %xmm2 movdqu -16(%rdx), %xmm0 aesenc %xmm2, %xmm2 pxor %xmm0, %xmm1 movdqa %xmm2, %xmm0 aesenc %xmm1, %xmm1 aesenc %xmm2, %xmm0 aesenc %xmm1, %xmm1 You can tell that they are the same source block because the labels are the same. 
Sources: #include #ifdef _MSC_VER # define Q_ALWAYS_INLINE __forceinline # define QT_VECTORCALL __vectorcall # define QT_FUNCTION_TARGET(x) #else # define Q_ALWAYS_INLINE inline __attribute__((always_inline)) # define QT_VECTORCALL # define QT_FUNCTION_TARGET(x) __attribute__((target(QT_FUNCTION_TARGET_##= x))) # define QT_FUNCTION_TARGET_AES "sse4.2,aes" //# define qCpuHasFeature(x) __builtin_cpu_supports(QT_FUNCTION_TARGET_ ##= x) #endif #define QT_COMPILER_SUPPORTS_HERE(x) true # define mm_set1_epz _mm_set1_epi64x # define mm_cvtsz_si128 _mm_cvtsi64_si128 # define mm_cvtsi128_sz _mm_cvtsi128_si64 # define mm256_set1_epz _mm256_set1_epi64x extern bool qCpuHasFeature(const char *) noexcept; #define qCpuHasFeature(x) qCpuHasFeature(#x) using uchar =3D unsigned char; using quintptr =3D unsigned long long; using qint8 =3D signed char; // hash 16 bytes, running 3 scramble rounds of AES on itself (like label "final1") static void Q_ALWAYS_INLINE QT_FUNCTION_TARGET(AES) QT_VECTORCALL hash16bytes(__m128i &state0, __m128i data) { state0 =3D _mm_xor_si128(state0, data); state0 =3D _mm_aesenc_si128(state0, state0); state0 =3D _mm_aesenc_si128(state0, state0); state0 =3D _mm_aesenc_si128(state0, state0); } // hash twice 16 bytes, running 2 scramble rounds of AES on itself static void QT_FUNCTION_TARGET(AES) QT_VECTORCALL hash2x16bytes(__m128i &state0, __m128i &state1, const __m128i *src0, co= nst __m128i *src1) { __m128i data0 =3D _mm_loadu_si128(src0); __m128i data1 =3D _mm_loadu_si128(src1); state0 =3D _mm_xor_si128(data0, state0); state1 =3D _mm_xor_si128(data1, state1); state0 =3D _mm_aesenc_si128(state0, state0); state1 =3D _mm_aesenc_si128(state1, state1); state0 =3D _mm_aesenc_si128(state0, state0); state1 =3D _mm_aesenc_si128(state1, state1); } struct AESHashSeed { __m128i state0; __m128i mseed2; AESHashSeed(size_t seed, size_t seed2) QT_FUNCTION_TARGET(AES); __m128i state1() const QT_FUNCTION_TARGET(AES); }; Q_ALWAYS_INLINE AESHashSeed::AESHashSeed(size_t seed, 
size_t seed2) { __m128i mseed =3D mm_cvtsz_si128(seed); mseed2 =3D mm_set1_epz(seed2); // mseed (epi16) =3D [ seed, seed >> 16, seed >> 32, seed >> 48, len, 0= , 0, 0 ] mseed =3D _mm_insert_epi16(mseed, short(seed), 4); // mseed (epi16) =3D [ seed, seed >> 16, seed >> 32, seed >> 48, len, l= en, len, len ] mseed =3D _mm_shufflehi_epi16(mseed, 0); // merge with the process-global seed __m128i key =3D _mm_xor_si128(mseed, mseed2); // scramble the key __m128i state0 =3D _mm_aesenc_si128(key, key); this->state0 =3D state0; } Q_ALWAYS_INLINE __m128i AESHashSeed::state1() const { { // unlike the Go code, we don't have more per-process seed __m128i state1 =3D _mm_aesenc_si128(state0, mseed2); return state1; } } static size_t QT_FUNCTION_TARGET(AES) QT_VECTORCALL aeshash128_16to32(__m128i state0, __m128i state1, const __m128i *src, const __m128i *srcend) { { if (src + 1 < srcend) { // epilogue: between 16 and 31 bytes hash2x16bytes(state0, state1, src, srcend - 1); } else if (src !=3D srcend) { // epilogue: between 1 and 16 bytes, overlap with the end __m128i data =3D _mm_loadu_si128(srcend - 1); hash16bytes(state0, data); } // combine results: state0 =3D _mm_xor_si128(state0, state1); } return mm_cvtsi128_sz(state0); } static size_t QT_FUNCTION_TARGET(AES) QT_VECTORCALL aeshash128_lt16(__m128i state0, const uchar *p, size_t len) { if (len) { // We're going to load 16 bytes and mask zero the part we don't care // (the hash of a short string is different from the hash of a long= er // including NULLs at the end because the length is in the key) // WARNING: this may produce valgrind warnings, but it's safe constexpr quintptr PageSize =3D 4096; __m128i data; if ((quintptr(p) & (PageSize / 2)) =3D=3D 0) { // lower half of the page: // load all 16 bytes and mask off the bytes past the end of the source static const qint8 maskarray[] =3D { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; __m128i mask =3D 
_mm_loadu_si128(reinterpret_cast(maskarray + 15 - len)); data =3D _mm_loadu_si128(reinterpret_cast(p)); data =3D _mm_and_si128(data, mask); } else { // upper half of the page: // load 16 bytes ending at the data end, then shuffle them to t= he beginning static const qint8 shufflecontrol[] =3D { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; __m128i control =3D _mm_loadu_si128(reinterpret_cast(shufflecontrol + 15 - len)); data =3D _mm_loadu_si128(reinterpret_cast(p + = len) - 1); data =3D _mm_shuffle_epi8(data, control); } hash16bytes(state0, data); } return mm_cvtsi128_sz(state0); } static size_t QT_FUNCTION_TARGET(AES) QT_VECTORCALL aeshash128_ge32(__m128i state0, __m128i state1, const __m128i *src, const __m128i *srcend) { // main loop: scramble two 16-byte blocks for ( ; src + 2 < srcend; src +=3D 2) hash2x16bytes(state0, state1, src, src + 1); return aeshash128_16to32(state0, state1, src, srcend); } static size_t QT_FUNCTION_TARGET(AES) aeshash128(const uchar *p, size_t len, size_t seed, size_t seed2) noexcept { AESHashSeed state(seed, seed2); auto src =3D reinterpret_cast(p); const auto srcend =3D reinterpret_cast(p + len); if (len < sizeof(__m128i)) return aeshash128_lt16(state.state0, p, len); if (len <=3D sizeof(__m256i)) return aeshash128_16to32(state.state0, state.state1(), src, srcend); return aeshash128_ge32(state.state0, state.state1(), src, srcend); } static size_t aeshash(const uchar *p, size_t len, size_t seed, size_t seed2) noexcept { return aeshash128(p, len, seed, seed2); } extern size_t qt_qhash_seed; size_t qHashBits(const void *p, size_t size, size_t seed) noexcept { size_t seed2 =3D size; if (seed) seed2 =3D qt_qhash_seed; return aeshash(reinterpret_cast(p), size, seed, seed2); }=