From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-bugzilla@gcc.gnu.org>
Received: by sourceware.org (Postfix, from userid 48)
	id C9E5B3847718; Wed,  3 Apr 2024 17:36:43 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org C9E5B3847718
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1712165803;
	bh=XJ2aQuQQoVzEl0TgXkj2OezoPbAc5PqdK3Z1/X9D2KQ=;
	h=From:To:Subject:Date:From;
	b=fp1KVuLyehG0gYhDsFSCvWc1Ur+eV3ePzausEiDOyWcL7LtCJ4aJ032y8NAIye/sW
	 HAByyuhJsscmRESoxpHjfYYgtYyNSs3nw5oI7vQcRANVN7jsuU4uxbvvbe9OjqyQuH
	 7ohWrnhyAklKoERRLZ5RAbBdSF4AW5uG7NiC+bGM=
From: "thiago at kde dot org" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug target/114576] New: [13 regression][config/i386] GCC 14/trunk
 emits VEX-prefixed AES instruction without AVX enabled
Date: Wed, 03 Apr 2024 17:36:41 +0000
X-Bugzilla-Reason: CC
X-Bugzilla-Type: new
X-Bugzilla-Watch-Reason: None
X-Bugzilla-Product: gcc
X-Bugzilla-Component: target
X-Bugzilla-Version: 14.0
X-Bugzilla-Keywords: 
X-Bugzilla-Severity: normal
X-Bugzilla-Who: thiago at kde dot org
X-Bugzilla-Status: UNCONFIRMED
X-Bugzilla-Resolution: 
X-Bugzilla-Priority: P3
X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org
X-Bugzilla-Target-Milestone: ---
X-Bugzilla-Flags: 
X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status
 bug_severity priority component assigned_to reporter target_milestone
Message-ID: <bug-114576-4@http.gcc.gnu.org/bugzilla/>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/
Auto-Submitted: auto-generated
MIME-Version: 1.0
List-Id: <gcc-bugs.sourceware.org>

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D114576

            Bug ID: 114576
           Summary: [13 regression][config/i386] GCC 14/trunk emits
                    VEX-prefixed AES instruction without AVX enabled
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: thiago at kde dot org
  Target Milestone: ---

Re: https://bugreports.qt.io/browse/QTBUG-123965
Re: https://bugzilla.redhat.com/show_bug.cgi?id=3D2262640,
https://bugzilla.redhat.com/show_bug.cgi?id=3D2272758
Godbolt link: https://gcc.godbolt.org/z/6P9fMvoxW

Found while compiling Qt 6.6 or 6.7 with GCC 14 (current trunk). This is a
regression from GCC 13.

This function from qhash.cpp
<https://github.com/qt/qtbase/blob/v6.7.0/src/corelib/tools/qhash.cpp#L581-=
L588>:

// Returns one _mm_aesenc_si128 round applied to the member state0, with
// mseed2 as the second operand. Neither member is modified.
Q_ALWAYS_INLINE __m128i AESHashSeed::state1() const
{
    {
        // unlike the Go code, we don't have more per-process seed
        __m128i state1 =3D _mm_aesenc_si128(state0, mseed2);
        return state1;
    }
}

Is apparently getting assembled to:
.L2:
        leaq    (%rdi,%rsi), %rdx
        vaesenc %xmm1, %xmm0, %xmm1

Though there's no AVX enabled in this code (the original version in Qt has =
some
AVX/VAES and AVX512 code but the reduced example does not).

This function:
    // hash twice 16 bytes, running 2 scramble rounds of AES on itself
    // Loads 16 bytes from src0 and from src1 with unaligned loads, XORs
    // them into state0 and state1 respectively, then runs two
    // _mm_aesenc_si128 rounds on each state (each state is both operands).
    // Both states are updated in place through the references.
    static void QT_FUNCTION_TARGET(AES) QT_VECTORCALL
    hash2x16bytes(__m128i &state0, __m128i &state1, const __m128i *src0, co=
nst
__m128i *src1)
    {
        __m128i data0 =3D _mm_loadu_si128(src0);
        __m128i data1 =3D _mm_loadu_si128(src1);
        state0 =3D _mm_xor_si128(data0, state0);
        state1 =3D _mm_xor_si128(data1, state1);
        state0 =3D _mm_aesenc_si128(state0, state0);
        state1 =3D _mm_aesenc_si128(state1, state1);
        state0 =3D _mm_aesenc_si128(state0, state0);
        state1 =3D _mm_aesenc_si128(state1, state1);
    }

Is even emitting:
.L20:
        movdqu  (%rax), %xmm2
        pxor    %xmm0, %xmm2
        movdqu  -16(%rdx), %xmm0
        pxor    %xmm0, %xmm1
        vaesenc %xmm2, %xmm2, %xmm0
        aesenc  %xmm1, %xmm1
        aesenc  %xmm0, %xmm0
        aesenc  %xmm1, %xmm1

which makes no sense: AVX is used for only one of the four AES
instructions, all of which come from the same source function.

For reference, GCC 13 generates respectively:

.L2:
        movdqa  %xmm0, %xmm1
        leaq    (%rdi,%rsi), %rdx
        aesenc  %xmm2, %xmm1
and
.L20:
        movdqu  (%rax), %xmm2
        pxor    %xmm0, %xmm2
        movdqu  -16(%rdx), %xmm0
        aesenc  %xmm2, %xmm2
        pxor    %xmm0, %xmm1
        movdqa  %xmm2, %xmm0
        aesenc  %xmm1, %xmm1
        aesenc  %xmm2, %xmm0
        aesenc  %xmm1, %xmm1

You can tell that they are the same source block because the labels are the
same.

Sources:

#include <immintrin.h>
// Compiler-portability macros. The _MSC_VER branch maps to MSVC keywords
// and leaves QT_FUNCTION_TARGET empty; the other branch uses GNU-style
// __attribute__ syntax.
#ifdef _MSC_VER
#  define Q_ALWAYS_INLINE __forceinline
#  define QT_VECTORCALL __vectorcall
#  define QT_FUNCTION_TARGET(x)
#else
#  define Q_ALWAYS_INLINE inline __attribute__((always_inline))
#  define QT_VECTORCALL
// QT_FUNCTION_TARGET(AES) expands to __attribute__((target("sse4.2,aes")))
// by pasting the argument onto QT_FUNCTION_TARGET_ (string defined below).
#  define QT_FUNCTION_TARGET(x) __attribute__((target(QT_FUNCTION_TARGET_##=
x)))
#  define QT_FUNCTION_TARGET_AES        "sse4.2,aes"
//#  define qCpuHasFeature(x) __builtin_cpu_supports(QT_FUNCTION_TARGET_ ##=
 x)
#endif
#define QT_COMPILER_SUPPORTS_HERE(x)    true
// Aliases for the 64-bit (epi64x / si64) forms of the SSE/AVX intrinsics.
#    define mm_set1_epz     _mm_set1_epi64x
#    define mm_cvtsz_si128  _mm_cvtsi64_si128
#    define mm_cvtsi128_sz  _mm_cvtsi128_si64
#    define mm256_set1_epz  _mm256_set1_epi64x
extern bool qCpuHasFeature(const char *) noexcept;
// Stringizes the feature name and passes it to the function declared
// on the previous line.
#define qCpuHasFeature(x)     qCpuHasFeature(#x)

// Short aliases for the byte and pointer-sized integer types used by the
// hashing functions in this test case.
using uchar =3D unsigned char;
using quintptr =3D unsigned long long;
using qint8 =3D signed char;

    // hash 16 bytes, running 3 scramble rounds of AES on itself (like label
"final1")
    // XORs `data` into `state0`, then runs _mm_aesenc_si128 three times
    // with state0 as both operands. state0 is modified in place through
    // the reference; there is no return value.
    static void Q_ALWAYS_INLINE QT_FUNCTION_TARGET(AES) QT_VECTORCALL
    hash16bytes(__m128i &state0, __m128i data)
    {
        state0 =3D _mm_xor_si128(state0, data);
        state0 =3D _mm_aesenc_si128(state0, state0);
        state0 =3D _mm_aesenc_si128(state0, state0);
        state0 =3D _mm_aesenc_si128(state0, state0);
    }

    // hash twice 16 bytes, running 2 scramble rounds of AES on itself
    // Loads 16 bytes from src0 and from src1 with unaligned loads, XORs
    // them into state0 and state1 respectively, then runs two
    // _mm_aesenc_si128 rounds on each state (each state is both operands).
    // Both states are updated in place through the references.
    // Unlike hash16bytes, this helper is not marked Q_ALWAYS_INLINE.
    static void QT_FUNCTION_TARGET(AES) QT_VECTORCALL
    hash2x16bytes(__m128i &state0, __m128i &state1, const __m128i *src0, co=
nst
__m128i *src1)
    {
        __m128i data0 =3D _mm_loadu_si128(src0);
        __m128i data1 =3D _mm_loadu_si128(src1);
        state0 =3D _mm_xor_si128(data0, state0);
        state1 =3D _mm_xor_si128(data1, state1);
        state0 =3D _mm_aesenc_si128(state0, state0);
        state1 =3D _mm_aesenc_si128(state1, state1);
        state0 =3D _mm_aesenc_si128(state0, state0);
        state1 =3D _mm_aesenc_si128(state1, state1);
    }

    // Holds the two 128-bit values the hash starts from. Both member
    // functions are declared with QT_FUNCTION_TARGET(AES); their
    // definitions are written outside the struct.
    struct AESHashSeed
    {
        // set by the constructor: one AES round over the combined key
        __m128i state0;
        // set by the constructor: seed2 passed through mm_set1_epz
        __m128i mseed2;
        AESHashSeed(size_t seed, size_t seed2) QT_FUNCTION_TARGET(AES);
        __m128i state1() const QT_FUNCTION_TARGET(AES);
    };

// Builds the initial hash state from the two seeds: places `seed` in a
// vector, XORs it with the replicated `seed2` (kept in mseed2), and
// stores one AES round of that key in this->state0.
Q_ALWAYS_INLINE AESHashSeed::AESHashSeed(size_t seed, size_t seed2)
{
    __m128i mseed =3D mm_cvtsz_si128(seed);
    mseed2 =3D mm_set1_epz(seed2);

    // NOTE(review): the lane diagrams below mention `len`, but this
    // reduced version inserts short(seed) into lane 4; the constructor
    // takes no length parameter.
    // mseed (epi16) =3D [ seed, seed >> 16, seed >> 32, seed >> 48, len, 0=
, 0, 0
]
    mseed =3D _mm_insert_epi16(mseed, short(seed), 4);
    // mseed (epi16) =3D [ seed, seed >> 16, seed >> 32, seed >> 48, len, l=
en,
len, len ]
    mseed =3D _mm_shufflehi_epi16(mseed, 0);

    // merge with the process-global seed
    __m128i key =3D _mm_xor_si128(mseed, mseed2);

    // scramble the key
    // (the local below shadows the member of the same name; the member is
    // assigned explicitly through this-> on the following line)
    __m128i state0 =3D _mm_aesenc_si128(key, key);
    this->state0 =3D state0;
}

// Returns one _mm_aesenc_si128 round applied to the member state0, with
// mseed2 as the second operand. Neither member is modified.
Q_ALWAYS_INLINE __m128i AESHashSeed::state1() const
{
    {
        // unlike the Go code, we don't have more per-process seed
        __m128i state1 =3D _mm_aesenc_si128(state0, mseed2);
        return state1;
    }
}

// Mixes whatever is left in [src, srcend) into the two states, XORs the
// states together and returns mm_cvtsi128_sz of the result.
// state0 and state1 are taken by value, so the caller's copies are not
// changed. The callers in this file only reach this function when the
// whole input is at least 16 bytes long.
static size_t QT_FUNCTION_TARGET(AES) QT_VECTORCALL
aeshash128_16to32(__m128i state0, __m128i state1, const __m128i *src, const
__m128i *srcend)
{
    {
        // More than one 16-byte block remains: hash the block at src and
        // the final 16 bytes (srcend - 1); those two loads may overlap.
        if (src + 1 < srcend) {
            // epilogue: between 16 and 31 bytes
            hash2x16bytes(state0, state1, src, srcend - 1);
        } else if (src !=3D srcend) {
            // epilogue: between 1 and 16 bytes, overlap with the end
            // (the load below starts 16 bytes before srcend, which can be
            // earlier than src)
            __m128i data =3D _mm_loadu_si128(srcend - 1);
            hash16bytes(state0, data);
        }

        // combine results:
        state0 =3D _mm_xor_si128(state0, state1);
    }

    return mm_cvtsi128_sz(state0);
}

// Hashes an input shorter than 16 bytes; the only call in this file is
// guarded by len < sizeof(__m128i). Returns mm_cvtsi128_sz(state0).
// When len is 0 nothing is loaded and the incoming state0 is converted
// and returned as is.
static size_t QT_FUNCTION_TARGET(AES) QT_VECTORCALL
aeshash128_lt16(__m128i state0, const uchar *p, size_t len)
{
    if (len) {
        // We're going to load 16 bytes and mask zero the part we don't care
        // (the hash of a short string is different from the hash of a long=
er
        // including NULLs at the end because the length is in the key)
        // WARNING: this may produce valgrind warnings, but it's safe

        constexpr quintptr PageSize =3D 4096;
        __m128i data;

        // Tests the address bit worth PageSize / 2 to choose the direction
        // of the 16-byte load, so that the bytes read outside the input
        // lie in the same page as p (assuming PageSize-byte pages).
        if ((quintptr(p) & (PageSize / 2)) =3D=3D 0) {
            // lower half of the page:
            // load all 16 bytes and mask off the bytes past the end of the
source
            static const qint8 maskarray[] =3D {
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
            };
            // Starting at offset 15 - len gives a mask whose first len
            // bytes are -1 and whose remaining bytes are 0.
            __m128i mask =3D _mm_loadu_si128(reinterpret_cast<const __m128i
*>(maskarray + 15 - len));
            data =3D _mm_loadu_si128(reinterpret_cast<const __m128i *>(p));
            data =3D _mm_and_si128(data, mask);
        } else {
            // upper half of the page:
            // load 16 bytes ending at the data end, then shuffle them to t=
he
beginning
            static const qint8 shufflecontrol[] =3D {
                1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
            };
            // Starting at offset 15 - len, the first control byte is
            // 16 - len, so the last len loaded bytes move to the front;
            // the -1 entries make _mm_shuffle_epi8 write zero bytes.
            __m128i control =3D _mm_loadu_si128(reinterpret_cast<const __m1=
28i
*>(shufflecontrol + 15 - len));
            data =3D _mm_loadu_si128(reinterpret_cast<const __m128i *>(p + =
len) -
1);
            data =3D _mm_shuffle_epi8(data, control);
        }

        // Both branches leave the input in the low len bytes of data and
        // zero in the rest.
        hash16bytes(state0, data);
    }
    return mm_cvtsi128_sz(state0);
}

// Consumes the input two 16-byte blocks at a time while more than two
// blocks remain, then hands the leftover range to aeshash128_16to32 and
// returns its result. The only call in this file is made when len is
// greater than sizeof(__m256i).
static size_t QT_FUNCTION_TARGET(AES) QT_VECTORCALL
aeshash128_ge32(__m128i state0, __m128i state1, const __m128i *src, const
__m128i *srcend)
{
    // main loop: scramble two 16-byte blocks
    for ( ; src + 2 < srcend; src +=3D 2)
        hash2x16bytes(state0, state1, src, src + 1);

    return aeshash128_16to32(state0, state1, src, srcend);
}


// Entry point of the AES-based hash for the buffer [p, p + len).
// Builds the seed state, then dispatches on len:
//   len <  sizeof(__m128i)  -> aeshash128_lt16
//   len <= sizeof(__m256i)  -> aeshash128_16to32
//   otherwise               -> aeshash128_ge32
// state.state1() is only evaluated on the last two paths.
static size_t QT_FUNCTION_TARGET(AES)
aeshash128(const uchar *p, size_t len, size_t seed, size_t seed2) noexcept
{
    AESHashSeed state(seed, seed2);
    auto src =3D reinterpret_cast<const __m128i *>(p);
    // one past the last input byte, viewed as a __m128i pointer
    const auto srcend =3D reinterpret_cast<const __m128i *>(p + len);

    if (len < sizeof(__m128i))
        return aeshash128_lt16(state.state0, p, len);

    if (len <=3D sizeof(__m256i))
        return aeshash128_16to32(state.state0, state.state1(), src, srcend);

    return aeshash128_ge32(state.state0, state.state1(), src, srcend);
}

// Forwards all four arguments unchanged to aeshash128 and returns its
// result. This wrapper carries no QT_FUNCTION_TARGET attribute, unlike
// the function it calls.
static size_t aeshash(const uchar *p, size_t len, size_t seed, size_t seed2)
noexcept
{
    return aeshash128(p, len, seed, seed2);
}

// Defined outside this test case; read below when `seed` is non-zero.
extern size_t qt_qhash_seed;
// Externally visible entry point: hashes `size` bytes starting at `p`.
// The second seed passed to aeshash is `size` when `seed` is zero and
// qt_qhash_seed otherwise.
size_t qHashBits(const void *p, size_t size, size_t seed) noexcept
{
    size_t seed2 =3D size;
    if (seed)
        seed2 =3D qt_qhash_seed;
    return aeshash(reinterpret_cast<const uchar *>(p), size, seed, seed2);
}=