From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id 76CB23858D1E; Sat, 6 Jan 2024 00:03:01 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 76CB23858D1E DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1704499381; bh=KWFMlecwBRODipEUpq9CWi4Ua0aMKpjFRKt24+Y0Ywc=; h=From:To:Subject:Date:From; b=Br5iSY065wfUdTIf6TlUVjMEfAW+mmbkMBPery4gOqOuPpbWidzweUBna3oAjKfgL 8nzmVXPT9AVIBaNpnWOf7ynIVHhvm7t9LlnKHDV/VZy+D9wF8dGnb054OVCfVAOGY9 HaWcXDOWXfHnxMGBvUEnPKSLQxw6/mPgqUIILJ/k= From: "juzhe.zhong at rivai dot ai" To: gcc-bugs@gcc.gnu.org Subject: [Bug c/113247] New: RISC-V: Performance bug in SHA256 Date: Sat, 06 Jan 2024 00:02:56 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: new X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: c X-Bugzilla-Version: 14.0 X-Bugzilla-Keywords: X-Bugzilla-Severity: normal X-Bugzilla-Who: juzhe.zhong at rivai dot ai X-Bugzilla-Status: UNCONFIRMED X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: bug_id short_desc product version bug_status bug_severity priority component assigned_to reporter target_milestone Message-ID: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 List-Id: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=3D113247 Bug ID: 113247 Summary: RISC-V: Performance bug in SHA256 Product: gcc Version: 14.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: c Assignee: unassigned at gcc dot gnu.org Reporter: juzhe.zhong at rivai dot ai Target Milestone: --- We found we have a performance bug while testing with various benchmarks. 
This is the case from coremark-pro SHA256: Tested on spike, vector vs scalar, in terms of dynamic instruction count: 897210195 (vector) vs 418451694 (scalar) Obviously, the vector dynamic instruction count is about twice the scalar count. We tested on our hardware board and on the T-Head C908 (K230); on both, vector performance drops by about 60%+ on real hardware compared with scalar. After investigation, the performance issue occurs in the following case: https://compiler-explorer.com/z/GcsnK7edn #include <stdint.h> #define Ch(x,y,z) (z ^ (x & (y ^ z))) #define Maj(x,y,z) ((x & y) | (z & (x | y))) #define SHR(x, n) (x >> n) #define ROTR(x,n) (SHR(x,n) | (x << (32 - n))) #define S1(x) (ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25)) #define S0(x) (ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22)) #define s1(x) (ROTR(x,17) ^ ROTR(x,19) ^ SHR(x,10)) #define s0(x) (ROTR(x, 7) ^ ROTR(x,18) ^ SHR(x, 3)) #define SHA256_STEP(a,b,c,d,e,f,g,h,x,K) \ { \ tmp1 =3D h + S1(e) + Ch(e,f,g) + K + x; \ tmp2 =3D S0(a) + Maj(a,b,c); \ h =3D tmp1 + tmp2; \ d +=3D tmp1; \ } #define BE_LOAD32(n,b,i) (n) =3D byteswap(*(uint32_t *)(b + i)) static uint32_t byteswap(uint32_t x) { x =3D (x & 0x0000FFFF) << 16 | (x & 0xFFFF0000) >> 16; x =3D (x & 0x00FF00FF) << 8 | (x & 0xFF00FF00) >> 8;=20=20 return x; } void sha256 (const uint8_t *in, uint32_t out[8]) { uint32_t tmp1, tmp2, a, b, c, d, e, f, g, h; uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; tmp1 =3D tmp2 =3D 0; w0 =3D w1 =3D w2 =3D w3 =3D w4 =3D w5 =3D w6 =3D w7 =3D w8 =3D w9 =3D w10 =3D w11 =3D w12 =3D w13 =3D w14 =3D w15 =3D 0; BE_LOAD32 ( w0, in, 0 ); BE_LOAD32 ( w1, in, 4 ); BE_LOAD32 ( w2, in, 8 ); BE_LOAD32 ( w3, in, 12 ); BE_LOAD32 ( w4, in, 16 ); BE_LOAD32 ( w5, in, 20 ); BE_LOAD32 ( w6, in, 24 ); BE_LOAD32 ( w7, in, 28 ); BE_LOAD32 ( w8, in, 32 ); BE_LOAD32 ( w9, in, 36 ); BE_LOAD32 ( w10, in, 40 ); BE_LOAD32 ( w11, in, 44 ); BE_LOAD32 ( w12, in, 48 ); BE_LOAD32 ( w13, in, 52 ); BE_LOAD32 ( w14, in, 56 ); BE_LOAD32 ( w15, in, 60 ); a =3D 
out[0]; b =3D out[1]; c =3D out[2]; d =3D out[3]; e =3D out[4]; f =3D out[5]; g =3D out[6]; h =3D out[7]; SHA256_STEP(a, b, c, d, e, f, g, h, w0, 0x428a2f98); SHA256_STEP(h, a, b, c, d, e, f, g, w1, 0x71374491); SHA256_STEP(g, h, a, b, c, d, e, f, w2, 0xb5c0fbcf); SHA256_STEP(f, g, h, a, b, c, d, e, w3, 0xe9b5dba5); SHA256_STEP(e, f, g, h, a, b, c, d, w4, 0x3956c25b); SHA256_STEP(d, e, f, g, h, a, b, c, w5, 0x59f111f1); SHA256_STEP(c, d, e, f, g, h, a, b, w6, 0x923f82a4); SHA256_STEP(b, c, d, e, f, g, h, a, w7, 0xab1c5ed5); SHA256_STEP(a, b, c, d, e, f, g, h, w8, 0xd807aa98); SHA256_STEP(h, a, b, c, d, e, f, g, w9, 0x12835b01); SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0x243185be); SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0x550c7dc3); SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0x72be5d74); SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0x80deb1fe); SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0x9bdc06a7); SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0xc19bf174); w0 =3D s1(w14) + w9 + s0(w1) + w0; SHA256_STEP(a, b, c, d, e, f, g, h, w0, 0xe49b69c1); w1 =3D s1(w15) + w10 + s0(w2) + w1; SHA256_STEP(h, a, b, c, d, e, f, g, w1, 0xefbe4786); w2 =3D s1(w0) + w11 + s0(w3) + w2; SHA256_STEP(g, h, a, b, c, d, e, f, w2, 0x0fc19dc6); w3 =3D s1(w1) + w12 + s0(w4) + w3; SHA256_STEP(f, g, h, a, b, c, d, e, w3, 0x240ca1cc); w4 =3D s1(w2) + w13 + s0(w5) + w4; SHA256_STEP(e, f, g, h, a, b, c, d, w4, 0x2de92c6f); w5 =3D s1(w3) + w14 + s0(w6) + w5; SHA256_STEP(d, e, f, g, h, a, b, c, w5, 0x4a7484aa); w6 =3D s1(w4) + w15 + s0(w7) + w6; SHA256_STEP(c, d, e, f, g, h, a, b, w6, 0x5cb0a9dc); w7 =3D s1(w5) + w0 + s0(w8) + w7; SHA256_STEP(b, c, d, e, f, g, h, a, w7, 0x76f988da); w8 =3D s1(w6) + w1 + s0(w9) + w8; SHA256_STEP(a, b, c, d, e, f, g, h, w8, 0x983e5152); w9 =3D s1(w7) + w2 + s0(w10) + w9; SHA256_STEP(h, a, b, c, d, e, f, g, w9, 0xa831c66d); w10 =3D s1(w8) + w3 + s0(w11) + w10; SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0xb00327c8); w11 =3D s1(w9) + w4 + s0(w12) + w11; SHA256_STEP(f, g, h, a, b, c, d, 
e, w11, 0xbf597fc7); w12 =3D s1(w10) + w5 + s0(w13) + w12; SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0xc6e00bf3); w13 =3D s1(w11) + w6 + s0(w14) + w13; SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0xd5a79147); w14 =3D s1(w12) + w7 + s0(w15) + w14; SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0x06ca6351); w15 =3D s1(w13) + w8 + s0(w0) + w15; SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0x14292967); w0 =3D s1(w14) + w9 + s0(w1) + w0; SHA256_STEP(a, b, c, d, e, f, g, h, w0, 0x27b70a85); w1 =3D s1(w15) + w10 + s0(w2) + w1; SHA256_STEP(h, a, b, c, d, e, f, g, w1, 0x2e1b2138); w2 =3D s1(w0) + w11 + s0(w3) + w2; SHA256_STEP(g, h, a, b, c, d, e, f, w2, 0x4d2c6dfc); w3 =3D s1(w1) + w12 + s0(w4) + w3; SHA256_STEP(f, g, h, a, b, c, d, e, w3, 0x53380d13); w4 =3D s1(w2) + w13 + s0(w5) + w4; SHA256_STEP(e, f, g, h, a, b, c, d, w4, 0x650a7354); w5 =3D s1(w3) + w14 + s0(w6) + w5; SHA256_STEP(d, e, f, g, h, a, b, c, w5, 0x766a0abb); w6 =3D s1(w4) + w15 + s0(w7) + w6; SHA256_STEP(c, d, e, f, g, h, a, b, w6, 0x81c2c92e); w7 =3D s1(w5) + w0 + s0(w8) + w7; SHA256_STEP(b, c, d, e, f, g, h, a, w7, 0x92722c85); w8 =3D s1(w6) + w1 + s0(w9) + w8; SHA256_STEP(a, b, c, d, e, f, g, h, w8, 0xa2bfe8a1); w9 =3D s1(w7) + w2 + s0(w10) + w9; SHA256_STEP(h, a, b, c, d, e, f, g, w9, 0xa81a664b); w10 =3D s1(w8) + w3 + s0(w11) + w10; SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0xc24b8b70); w11 =3D s1(w9) + w4 + s0(w12) + w11; SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0xc76c51a3); w12 =3D s1(w10) + w5 + s0(w13) + w12; SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0xd192e819); w13 =3D s1(w11) + w6 + s0(w14) + w13; SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0xd6990624); w14 =3D s1(w12) + w7 + s0(w15) + w14; SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0xf40e3585); w15 =3D s1(w13) + w8 + s0(w0) + w15; SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0x106aa070); w0 =3D s1(w14) + w9 + s0(w1) + w0; SHA256_STEP(a, b, c, d, e, f, g, h, w0, 0x19a4c116); w1 =3D s1(w15) + w10 + s0(w2) + w1; SHA256_STEP(h, a, b, c, d, e, f, g, w1, 0x1e376c08); w2 =3D 
s1(w0) + w11 + s0(w3) + w2; SHA256_STEP(g, h, a, b, c, d, e, f, w2, 0x2748774c); w3 =3D s1(w1) + w12 + s0(w4) + w3; SHA256_STEP(f, g, h, a, b, c, d, e, w3, 0x34b0bcb5); w4 =3D s1(w2) + w13 + s0(w5) + w4; SHA256_STEP(e, f, g, h, a, b, c, d, w4, 0x391c0cb3); w5 =3D s1(w3) + w14 + s0(w6) + w5; SHA256_STEP(d, e, f, g, h, a, b, c, w5, 0x4ed8aa4a); w6 =3D s1(w4) + w15 + s0(w7) + w6; SHA256_STEP(c, d, e, f, g, h, a, b, w6, 0x5b9cca4f); w7 =3D s1(w5) + w0 + s0(w8) + w7; SHA256_STEP(b, c, d, e, f, g, h, a, w7, 0x682e6ff3); w8 =3D s1(w6) + w1 + s0(w9) + w8; SHA256_STEP(a, b, c, d, e, f, g, h, w8, 0x748f82ee); w9 =3D s1(w7) + w2 + s0(w10) + w9; SHA256_STEP(h, a, b, c, d, e, f, g, w9, 0x78a5636f); w10 =3D s1(w8) + w3 + s0(w11) + w10; SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0x84c87814); w11 =3D s1(w9) + w4 + s0(w12) + w11; SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0x8cc70208); w12 =3D s1(w10) + w5 + s0(w13) + w12; SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0x90befffa); w13 =3D s1(w11) + w6 + s0(w14) + w13; SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0xa4506ceb); w14 =3D s1(w12) + w7 + s0(w15) + w14; SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0xbef9a3f7); w15 =3D s1(w13) + w8 + s0(w0) + w15; SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0xc67178f2); out[0] +=3D a; out[1] +=3D b; out[2] +=3D c; out[3] +=3D d; out[4] +=3D e; out[5] +=3D f; out[6] +=3D g; out[7] +=3D h; } The assembly is quite straightforward. RVV GCC vectorizes this code, treating it as profitable to vectorize. I confirmed that neither ARM SVE GCC nor RISC-V Clang vectorizes this code, so I believe we should teach the cost model not to vectorize it.=