* [Committed V2] RISC-V: Fix regression (GCC-14 compare with GCC-13.2) of SHA256 from coremark-pro
@ 2024-01-15 12:00 Juzhe-Zhong
2024-01-17 1:37 ` Edwin Lu
0 siblings, 1 reply; 8+ messages in thread
From: Juzhe-Zhong @ 2024-01-15 12:00 UTC (permalink / raw)
To: gcc-patches; +Cc: Juzhe-Zhong
This patch fixes a 70% performance drop from GCC-13.2 to GCC-14 with -march=rv64gcv on real hardware.
The root cause is an incorrect cost model that causes inefficient vectorization, which makes performance drop significantly.
So this patch does:
1. Adjust the vector-to-scalar cost by introducing a vector-to-scalar register move cost.
2. Adjust the vec_construct cost, since we do spend NUNITS instructions to construct the vector.
Tested on both RV32 and RV64 with no regressions. Rebased onto trunk and committed, as it was approved by Robin.
PR target/113247
gcc/ChangeLog:
* config/riscv/riscv-protos.h (struct regmove_vector_cost): Add vector to scalar regmove.
* config/riscv/riscv-vector-costs.cc (adjust_stmt_cost): Ditto.
* config/riscv/riscv.cc (riscv_builtin_vectorization_cost): Adjust vec_construct cost.
gcc/testsuite/ChangeLog:
* gcc.target/riscv/rvv/autovec/vls/reduc-19.c: Adapt test.
* gcc.target/riscv/rvv/autovec/vls/reduc-20.c: Ditto.
* gcc.target/riscv/rvv/autovec/vls/reduc-21.c: Ditto.
* gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c: New test.
* gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c: New test.
* gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c: New test.
* gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c: New test.
---
gcc/config/riscv/riscv-protos.h | 2 +
gcc/config/riscv/riscv-vector-costs.cc | 3 +
gcc/config/riscv/riscv.cc | 4 +-
.../vect/costmodel/riscv/rvv/pr113247-1.c | 195 ++++++++++++++++++
.../vect/costmodel/riscv/rvv/pr113247-2.c | 6 +
.../vect/costmodel/riscv/rvv/pr113247-3.c | 6 +
.../vect/costmodel/riscv/rvv/pr113247-4.c | 6 +
.../riscv/rvv/autovec/vls/reduc-19.c | 2 +-
.../riscv/rvv/autovec/vls/reduc-20.c | 2 +-
.../riscv/rvv/autovec/vls/reduc-21.c | 2 +-
10 files changed, 224 insertions(+), 4 deletions(-)
create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c
create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c
create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c
create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c
diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
index 4f3b677f4f9..21f6dadf113 100644
--- a/gcc/config/riscv/riscv-protos.h
+++ b/gcc/config/riscv/riscv-protos.h
@@ -255,6 +255,8 @@ struct regmove_vector_cost
{
const int GR2VR;
const int FR2VR;
+ const int VR2GR;
+ const int VR2FR;
};
/* Cost for vector insn classes. */
diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
index 90ab93b7506..7c9840df4e9 100644
--- a/gcc/config/riscv/riscv-vector-costs.cc
+++ b/gcc/config/riscv/riscv-vector-costs.cc
@@ -1056,6 +1056,9 @@ adjust_stmt_cost (enum vect_cost_for_stmt kind, tree vectype, int stmt_cost)
case scalar_to_vec:
return stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->FR2VR
: costs->regmove->GR2VR);
+ case vec_to_scalar:
+ return stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->VR2FR
+ : costs->regmove->VR2GR);
default:
break;
}
diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index ee1a57b321d..568db90a27d 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -395,6 +395,8 @@ static const scalable_vector_cost rvv_vla_vector_cost = {
static const regmove_vector_cost rvv_regmove_vector_cost = {
2, /* GR2VR */
2, /* FR2VR */
+ 2, /* VR2GR */
+ 2, /* VR2FR */
};
/* Generic costs for vector insn classes. It is supposed to be the vector cost
@@ -10522,7 +10524,7 @@ riscv_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
return fp ? common_costs->fp_stmt_cost : common_costs->int_stmt_cost;
case vec_construct:
- return estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype)) - 1;
+ return estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
default:
gcc_unreachable ();
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c
new file mode 100644
index 00000000000..0d09a624a00
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c
@@ -0,0 +1,195 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param=riscv-autovec-lmul=dynamic" } */
+
+#include <stdint-gcc.h>
+
+#define Ch(x,y,z) (z ^ (x & (y ^ z)))
+#define Maj(x,y,z) ((x & y) | (z & (x | y)))
+
+#define SHR(x, n) (x >> n)
+#define ROTR(x,n) (SHR(x,n) | (x << (32 - n)))
+#define S1(x) (ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25))
+#define S0(x) (ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22))
+
+#define s1(x) (ROTR(x,17) ^ ROTR(x,19) ^ SHR(x,10))
+#define s0(x) (ROTR(x, 7) ^ ROTR(x,18) ^ SHR(x, 3))
+
+#define SHA256_STEP(a,b,c,d,e,f,g,h,x,K) \
+{ \
+ tmp1 = h + S1(e) + Ch(e,f,g) + K + x; \
+ tmp2 = S0(a) + Maj(a,b,c); \
+ h = tmp1 + tmp2; \
+ d += tmp1; \
+}
+
+#define BE_LOAD32(n,b,i) (n) = byteswap(*(uint32_t *)(b + i))
+
+static uint32_t byteswap(uint32_t x)
+{
+ x = (x & 0x0000FFFF) << 16 | (x & 0xFFFF0000) >> 16;
+ x = (x & 0x00FF00FF) << 8 | (x & 0xFF00FF00) >> 8;
+
+ return x;
+}
+
+void sha256 (const uint8_t *in, uint32_t out[8])
+{
+ uint32_t tmp1, tmp2, a, b, c, d, e, f, g, h;
+ uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+
+ tmp1 = tmp2 = 0;
+ w0 = w1 = w2 = w3 = w4 = w5 = w6 = w7 = w8 = w9 = w10 = w11 = w12 = w13 = w14 = w15 = 0;
+
+ BE_LOAD32 ( w0, in, 0 );
+ BE_LOAD32 ( w1, in, 4 );
+ BE_LOAD32 ( w2, in, 8 );
+ BE_LOAD32 ( w3, in, 12 );
+ BE_LOAD32 ( w4, in, 16 );
+ BE_LOAD32 ( w5, in, 20 );
+ BE_LOAD32 ( w6, in, 24 );
+ BE_LOAD32 ( w7, in, 28 );
+ BE_LOAD32 ( w8, in, 32 );
+ BE_LOAD32 ( w9, in, 36 );
+ BE_LOAD32 ( w10, in, 40 );
+ BE_LOAD32 ( w11, in, 44 );
+ BE_LOAD32 ( w12, in, 48 );
+ BE_LOAD32 ( w13, in, 52 );
+ BE_LOAD32 ( w14, in, 56 );
+ BE_LOAD32 ( w15, in, 60 );
+
+ a = out[0];
+ b = out[1];
+ c = out[2];
+ d = out[3];
+ e = out[4];
+ f = out[5];
+ g = out[6];
+ h = out[7];
+
+ SHA256_STEP(a, b, c, d, e, f, g, h, w0, 0x428a2f98);
+ SHA256_STEP(h, a, b, c, d, e, f, g, w1, 0x71374491);
+ SHA256_STEP(g, h, a, b, c, d, e, f, w2, 0xb5c0fbcf);
+ SHA256_STEP(f, g, h, a, b, c, d, e, w3, 0xe9b5dba5);
+ SHA256_STEP(e, f, g, h, a, b, c, d, w4, 0x3956c25b);
+ SHA256_STEP(d, e, f, g, h, a, b, c, w5, 0x59f111f1);
+ SHA256_STEP(c, d, e, f, g, h, a, b, w6, 0x923f82a4);
+ SHA256_STEP(b, c, d, e, f, g, h, a, w7, 0xab1c5ed5);
+ SHA256_STEP(a, b, c, d, e, f, g, h, w8, 0xd807aa98);
+ SHA256_STEP(h, a, b, c, d, e, f, g, w9, 0x12835b01);
+ SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0x243185be);
+ SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0x550c7dc3);
+ SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0x72be5d74);
+ SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0x80deb1fe);
+ SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0x9bdc06a7);
+ SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0xc19bf174);
+
+ w0 = s1(w14) + w9 + s0(w1) + w0;
+ SHA256_STEP(a, b, c, d, e, f, g, h, w0, 0xe49b69c1);
+ w1 = s1(w15) + w10 + s0(w2) + w1;
+ SHA256_STEP(h, a, b, c, d, e, f, g, w1, 0xefbe4786);
+ w2 = s1(w0) + w11 + s0(w3) + w2;
+ SHA256_STEP(g, h, a, b, c, d, e, f, w2, 0x0fc19dc6);
+ w3 = s1(w1) + w12 + s0(w4) + w3;
+ SHA256_STEP(f, g, h, a, b, c, d, e, w3, 0x240ca1cc);
+ w4 = s1(w2) + w13 + s0(w5) + w4;
+ SHA256_STEP(e, f, g, h, a, b, c, d, w4, 0x2de92c6f);
+ w5 = s1(w3) + w14 + s0(w6) + w5;
+ SHA256_STEP(d, e, f, g, h, a, b, c, w5, 0x4a7484aa);
+ w6 = s1(w4) + w15 + s0(w7) + w6;
+ SHA256_STEP(c, d, e, f, g, h, a, b, w6, 0x5cb0a9dc);
+ w7 = s1(w5) + w0 + s0(w8) + w7;
+ SHA256_STEP(b, c, d, e, f, g, h, a, w7, 0x76f988da);
+ w8 = s1(w6) + w1 + s0(w9) + w8;
+ SHA256_STEP(a, b, c, d, e, f, g, h, w8, 0x983e5152);
+ w9 = s1(w7) + w2 + s0(w10) + w9;
+ SHA256_STEP(h, a, b, c, d, e, f, g, w9, 0xa831c66d);
+ w10 = s1(w8) + w3 + s0(w11) + w10;
+ SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0xb00327c8);
+ w11 = s1(w9) + w4 + s0(w12) + w11;
+ SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0xbf597fc7);
+ w12 = s1(w10) + w5 + s0(w13) + w12;
+ SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0xc6e00bf3);
+ w13 = s1(w11) + w6 + s0(w14) + w13;
+ SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0xd5a79147);
+ w14 = s1(w12) + w7 + s0(w15) + w14;
+ SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0x06ca6351);
+ w15 = s1(w13) + w8 + s0(w0) + w15;
+ SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0x14292967);
+
+ w0 = s1(w14) + w9 + s0(w1) + w0;
+ SHA256_STEP(a, b, c, d, e, f, g, h, w0, 0x27b70a85);
+ w1 = s1(w15) + w10 + s0(w2) + w1;
+ SHA256_STEP(h, a, b, c, d, e, f, g, w1, 0x2e1b2138);
+ w2 = s1(w0) + w11 + s0(w3) + w2;
+ SHA256_STEP(g, h, a, b, c, d, e, f, w2, 0x4d2c6dfc);
+ w3 = s1(w1) + w12 + s0(w4) + w3;
+ SHA256_STEP(f, g, h, a, b, c, d, e, w3, 0x53380d13);
+ w4 = s1(w2) + w13 + s0(w5) + w4;
+ SHA256_STEP(e, f, g, h, a, b, c, d, w4, 0x650a7354);
+ w5 = s1(w3) + w14 + s0(w6) + w5;
+ SHA256_STEP(d, e, f, g, h, a, b, c, w5, 0x766a0abb);
+ w6 = s1(w4) + w15 + s0(w7) + w6;
+ SHA256_STEP(c, d, e, f, g, h, a, b, w6, 0x81c2c92e);
+ w7 = s1(w5) + w0 + s0(w8) + w7;
+ SHA256_STEP(b, c, d, e, f, g, h, a, w7, 0x92722c85);
+ w8 = s1(w6) + w1 + s0(w9) + w8;
+ SHA256_STEP(a, b, c, d, e, f, g, h, w8, 0xa2bfe8a1);
+ w9 = s1(w7) + w2 + s0(w10) + w9;
+ SHA256_STEP(h, a, b, c, d, e, f, g, w9, 0xa81a664b);
+ w10 = s1(w8) + w3 + s0(w11) + w10;
+ SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0xc24b8b70);
+ w11 = s1(w9) + w4 + s0(w12) + w11;
+ SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0xc76c51a3);
+ w12 = s1(w10) + w5 + s0(w13) + w12;
+ SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0xd192e819);
+ w13 = s1(w11) + w6 + s0(w14) + w13;
+ SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0xd6990624);
+ w14 = s1(w12) + w7 + s0(w15) + w14;
+ SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0xf40e3585);
+ w15 = s1(w13) + w8 + s0(w0) + w15;
+ SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0x106aa070);
+
+ w0 = s1(w14) + w9 + s0(w1) + w0;
+ SHA256_STEP(a, b, c, d, e, f, g, h, w0, 0x19a4c116);
+ w1 = s1(w15) + w10 + s0(w2) + w1;
+ SHA256_STEP(h, a, b, c, d, e, f, g, w1, 0x1e376c08);
+ w2 = s1(w0) + w11 + s0(w3) + w2;
+ SHA256_STEP(g, h, a, b, c, d, e, f, w2, 0x2748774c);
+ w3 = s1(w1) + w12 + s0(w4) + w3;
+ SHA256_STEP(f, g, h, a, b, c, d, e, w3, 0x34b0bcb5);
+ w4 = s1(w2) + w13 + s0(w5) + w4;
+ SHA256_STEP(e, f, g, h, a, b, c, d, w4, 0x391c0cb3);
+ w5 = s1(w3) + w14 + s0(w6) + w5;
+ SHA256_STEP(d, e, f, g, h, a, b, c, w5, 0x4ed8aa4a);
+ w6 = s1(w4) + w15 + s0(w7) + w6;
+ SHA256_STEP(c, d, e, f, g, h, a, b, w6, 0x5b9cca4f);
+ w7 = s1(w5) + w0 + s0(w8) + w7;
+ SHA256_STEP(b, c, d, e, f, g, h, a, w7, 0x682e6ff3);
+ w8 = s1(w6) + w1 + s0(w9) + w8;
+ SHA256_STEP(a, b, c, d, e, f, g, h, w8, 0x748f82ee);
+ w9 = s1(w7) + w2 + s0(w10) + w9;
+ SHA256_STEP(h, a, b, c, d, e, f, g, w9, 0x78a5636f);
+ w10 = s1(w8) + w3 + s0(w11) + w10;
+ SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0x84c87814);
+ w11 = s1(w9) + w4 + s0(w12) + w11;
+ SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0x8cc70208);
+ w12 = s1(w10) + w5 + s0(w13) + w12;
+ SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0x90befffa);
+ w13 = s1(w11) + w6 + s0(w14) + w13;
+ SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0xa4506ceb);
+ w14 = s1(w12) + w7 + s0(w15) + w14;
+ SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0xbef9a3f7);
+ w15 = s1(w13) + w8 + s0(w0) + w15;
+ SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0xc67178f2);
+
+ out[0] += a;
+ out[1] += b;
+ out[2] += c;
+ out[3] += d;
+ out[4] += e;
+ out[5] += f;
+ out[6] += g;
+ out[7] += h;
+}
+
+/* { dg-final { scan-assembler-not {vset} } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c
new file mode 100644
index 00000000000..64a53cfca88
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param=riscv-autovec-lmul=dynamic --param=riscv-autovec-preference=fixed-vlmax" } */
+
+#include "pr113247-1.c"
+
+/* { dg-final { scan-assembler-not {vset} } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c
new file mode 100644
index 00000000000..423c90e4154
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize" } */
+
+#include "pr113247-1.c"
+
+/* { dg-final { scan-assembler-not {vset} } } */
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c
new file mode 100644
index 00000000000..c2a46d848e5
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c
@@ -0,0 +1,6 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param=riscv-autovec-preference=fixed-vlmax" } */
+
+#include "pr113247-1.c"
+
+/* { dg-final { scan-assembler-not {vset} } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
index 2048b636910..5130fe5f2e3 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
@@ -14,7 +14,7 @@ DEF_REDUC_PLUS (_Float16, 512)
DEF_REDUC_PLUS (_Float16, 1024)
DEF_REDUC_PLUS (_Float16, 2048)
-/* { dg-final { scan-assembler-times {vfredosum\.vs} 10 } } */
+/* { dg-final { scan-assembler-times {vfredosum\.vs} 9 } } */
/* { dg-final { scan-assembler-not {csrr} } } */
/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c
index bfc328da568..819104a8cdf 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c
@@ -13,7 +13,7 @@ DEF_REDUC_PLUS (float, 256)
DEF_REDUC_PLUS (float, 512)
DEF_REDUC_PLUS (float, 1024)
-/* { dg-final { scan-assembler-times {vfredosum\.vs} 9 } } */
+/* { dg-final { scan-assembler-times {vfredosum\.vs} 8 } } */
/* { dg-final { scan-assembler-not {csrr} } } */
/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c
index 8228590fa3b..2b61e0ac71a 100644
--- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c
+++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c
@@ -12,7 +12,7 @@ DEF_REDUC_PLUS (float, 128)
DEF_REDUC_PLUS (float, 256)
DEF_REDUC_PLUS (float, 512)
-/* { dg-final { scan-assembler-times {vfredosum\.vs} 8 } } */
+/* { dg-final { scan-assembler-times {vfredosum\.vs} 7 } } */
/* { dg-final { scan-assembler-not {csrr} } } */
/* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
/* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
--
2.36.3
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [Committed V2] RISC-V: Fix regression (GCC-14 compare with GCC-13.2) of SHA256 from coremark-pro
2024-01-15 12:00 [Committed V2] RISC-V: Fix regression (GCC-14 compare with GCC-13.2) of SHA256 from coremark-pro Juzhe-Zhong
@ 2024-01-17 1:37 ` Edwin Lu
2024-01-17 1:37 ` Edwin Lu
2024-01-17 1:41 ` juzhe.zhong
0 siblings, 2 replies; 8+ messages in thread
From: Edwin Lu @ 2024-01-17 1:37 UTC (permalink / raw)
To: Juzhe-Zhong, gcc-patches; +Cc: Patrick O'Neill
Hi Juzhe,
I'm seeing that this patch introduces failures with rv32gcv-ilp32d as
seen here https://github.com/ewlu/gcc-precommit-ci/issues/1194. Digging
a little deeper, it appears that there's an illegal instruction in a
shared library which (at least for FAIL:
gcc.c-torture/execute/920501-8.c -O2 execution test) is using vmv.v.i
without a prior vsetvl. I believe the other failures may be similar.
Logs:
spawn -ignore SIGHUP
/scratch/ewlu/ci/triage/compare/build-407-errors/build-gcc-linux-stage2/gcc/xgcc
-B/scratch/ewlu/ci/triage/compare/build-407-errors/build-gcc-linux-stage2/gcc/
/scratch/ewlu/ci/triage/compare/gcc/gcc/testsuite/gcc.c-torture/execute/920501-8.c
-march=rv32gcv -mabi=ilp32d -mtune=rocket -mcmodel=medlow
-fdiagnostics-plain-output -O2 -w -lm -o ./920501-8.exe
PASS: gcc.c-torture/execute/920501-8.c -O2 (test for excess errors)
spawn riscv64-unknown-linux-gnu-run ./920501-8.exe
/scratch/ewlu/ci/triage/compare/build-407-errors/../scripts/wrapper/qemu/riscv64-unknown-linux-gnu-run:
line 15: 584664 Illegal instruction (core dumped)
QEMU_CPU="$(march-to-cpu-opt --get-riscv-tag $1)" qemu-riscv$xlen -r
5.10 "${qemu_args[@]}" -L ${RISC_V_SYSROOT} "$@"
FAIL: gcc.c-torture/execute/920501-8.c -O2 execution test
Execution:
QEMU_CPU="rv32,vlen=128,v=true,vext_spec=v1.0,Zve32f=true,Zve64f=true"
./bin/qemu-riscv32 ./920501-8.exe
GDB output:
Program received signal SIGILL, Illegal instruction.
0x2b3d0f3e in __printf_buffer () from
/scratch/ewlu/ci/triage/compare/build-407-errors/sysroot/lib32/ilp32d/libc.so.6
1: x/i $pc
=> 0x2b3d0f3e <__printf_buffer+410>: vmv.v.i v1,0
I've included the first 150ish lines of the function's objdump below.
Edwin
$ ./bin/riscv64-unknown-linux-gnu-objdump -d
sysroot/lib32/ilp32d/libc.so.6 > dump
00046da4 <__printf_buffer>:
46da4: 000f8797 auipc a5,0xf8
46da8: 1ac7a783 lw a5,428(a5) # 13ef50
<_GLOBAL_OFFSET_TABLE_+0x64>
46dac: b3010113 addi sp,sp,-1232
46db0: 4c812423 sw s0,1224(sp)
46db4: c6be sw a5,76(sp)
46db6: 9792 add a5,a5,tp
46db8: 439c lw a5,0(a5)
46dba: 842a mv s0,a0
46dbc: 4d212023 sw s2,1216(sp)
46dc0: c2ae sw a1,68(sp)
46dc2: 892e mv s2,a1
46dc4: 852e mv a0,a1
46dc6: 02500593 li a1,37
46dca: de3e sw a5,60(sp)
46dcc: 4c112623 sw ra,1228(sp)
46dd0: 4c912223 sw s1,1220(sp)
46dd4: 4b312e23 sw s3,1212(sp)
46dd8: 4b812423 sw s8,1192(sp)
46ddc: 84b2 mv s1,a2
46dde: dcb2 sw a2,120(sp)
46de0: 89b6 mv s3,a3
46de2: c0b6 sw a3,64(sp)
46de4: 60a300ef jal 773ee <strchrnul>
46de8: c4aa sw a0,72(sp)
46dea: 41250633 sub a2,a0,s2
46dee: 8c2a mv s8,a0
46df0: 85ca mv a1,s2
46df2: 8522 mv a0,s0
46df4: 933f90ef jal 40726
<__printf_buffer_write>
46df8: 4c1c lw a5,24(s0)
46dfa: c3dd beqz a5,46ea0
<__printf_buffer+0xfc>
46dfc: 000c4783 lbu a5,0(s8)
46e00: c3c5 beqz a5,46ea0
<__printf_buffer+0xfc>
46e02: 000fa797 auipc a5,0xfa
46e06: 85a7a783 lw a5,-1958(a5) # 14065c
<__printf_function_table>
46e0a: 4b512a23 sw s5,1204(sp)
46e0e: da3e sw a5,52(sp)
46e10: c399 beqz a5,46e16
<__printf_buffer+0x72>
46e12: 2680106f j 4807a
<__printf_buffer+0x12d6>
46e16: 000fa797 auipc a5,0xfa
46e1a: 8367a783 lw a5,-1994(a5) # 14064c
<__printf_modifier_table>
46e1e: 740797e3 bnez a5,47d6c
<__printf_buffer+0xfc8>
46e22: 000f9797 auipc a5,0xf9
46e26: e927a783 lw a5,-366(a5) # 13fcb4
<__printf_va_arg_table>
46e2a: 740791e3 bnez a5,47d6c
<__printf_buffer+0xfc8>
46e2e: 57fd li a5,-1
46e30: d0be sw a5,96(sp)
46e32: 0019f793 andi a5,s3,1
46e36: d4be sw a5,104(sp)
46e38: 111c addi a5,sp,160
46e3a: 4b412c23 sw s4,1208(sp)
46e3e: 4b612823 sw s6,1200(sp)
46e42: 4b712623 sw s7,1196(sp)
46e46: 4b912223 sw s9,1188(sp)
46e4a: 4ba12023 sw s10,1184(sp)
46e4e: 49b12e23 sw s11,1180(sp)
46e52: ce82 sw zero,92(sp)
46e54: 4a81 li s5,0
46e56: 000b9a17 auipc s4,0xb9
46e5a: 2dea0a13 addi s4,s4,734 # 100134
<step4_jumps.0>
46e5e: d6be sw a5,108(sp)
46e60: 001c4d03 lbu s10,1(s8)
46e64: fe0d0793 addi a5,s10,-32
46e68: 0ff7f793 zext.b a5,a5
46e6c: 05a00713 li a4,90
46e70: 04f77763 bgeu a4,a5,46ebe
<__printf_buffer+0x11a>
46e74: 520d17e3 bnez s10,47ba2
<__printf_buffer+0xdfe>
46e78: 47b6 lw a5,76(sp)
46e7a: 9792 add a5,a5,tp
46e7c: 4759 li a4,22
46e7e: 00042c23 sw zero,24(s0)
46e82: c398 sw a4,0(a5)
46e84: 4b812a03 lw s4,1208(sp)
46e88: 4b412a83 lw s5,1204(sp)
46e8c: 4b012b03 lw s6,1200(sp)
46e90: 4ac12b83 lw s7,1196(sp)
46e94: 4a412c83 lw s9,1188(sp)
46e98: 4a012d03 lw s10,1184(sp)
46e9c: 49c12d83 lw s11,1180(sp)
46ea0: 4cc12083 lw ra,1228(sp)
46ea4: 4c812403 lw s0,1224(sp)
46ea8: 4c412483 lw s1,1220(sp)
46eac: 4c012903 lw s2,1216(sp)
46eb0: 4bc12983 lw s3,1212(sp)
46eb4: 4a812c03 lw s8,1192(sp)
46eb8: 4d010113 addi sp,sp,1232
46ebc: 8082 ret
46ebe: 01aa07b3 add a5,s4,s10
46ec2: 05c7c783 lbu a5,92(a5)
46ec6: 078a slli a5,a5,0x2
46ec8: 97d2 add a5,a5,s4
46eca: 0d87a703 lw a4,216(a5)
46ece: 00000797 auipc a5,0x0
46ed2: fa678793 addi a5,a5,-90 # 46e74
<__printf_buffer+0xd0>
46ed6: 973e add a4,a4,a5
46ed8: 02000793 li a5,32
46edc: d63e sw a5,44(sp)
46ede: 0c05 addi s8,s8,1
46ee0: 4b01 li s6,0
46ee2: 59fd li s3,-1
46ee4: 4c81 li s9,0
46ee6: 4381 li t2,0
46ee8: 4781 li a5,0
46eea: 4b81 li s7,0
46eec: 4901 li s2,0
46eee: 4e01 li t3,0
46ef0: ce02 sw zero,28(sp)
46ef2: 4d81 li s11,0
46ef4: d202 sw zero,36(sp)
46ef6: d402 sw zero,40(sp)
46ef8: 8702 jr a4
46efa: 5726 lw a4,104(sp)
46efc: c319 beqz a4,46f02
<__printf_buffer+0x15e>
46efe: 23c0106f j 4813a
<__printf_buffer+0x1396>
46f02: 874a mv a4,s2
46f04: 001b9693 slli a3,s7,0x1
46f08: 8f55 or a4,a4,a3
46f0a: 078a slli a5,a5,0x2
46f0c: 8fd9 or a5,a5,a4
46f0e: 5722 lw a4,40(sp)
46f10: 070e slli a4,a4,0x3
46f12: 8fd9 or a5,a5,a4
46f14: 5712 lw a4,36(sp)
46f16: 0712 slli a4,a4,0x4
46f18: 8fd9 or a5,a5,a4
46f1a: 005d9f13 slli t5,s11,0x5
46f1e: 4772 lw a4,28(sp)
46f20: 071a slli a4,a4,0x6
46f22: 01e7e7b3 or a5,a5,t5
46f26: 8fd9 or a5,a5,a4
46f28: 0e1e slli t3,t3,0x7
46f2a: 01c7e7b3 or a5,a5,t3
46f2e: 0b2e slli s6,s6,0xb
46f30: 6705 lui a4,0x1
46f32: 8ff70713 addi a4,a4,-1793 # 8ff
<current+0x8b7>
46f36: 0167e7b3 or a5,a5,s6
46f3a: 8ff9 and a5,a5,a4
46f3c: 5736 lw a4,108(sp)
46f3e: 5e0030d7 vmv.v.i v1,0
46f42: 020700a7 vse8.v v1,(a4)
46f46: 5732 lw a4,44(sp)
46f48: cb4e sw s3,148(sp)
46f4a: cd66 sw s9,152(sp)
46f4c: cf6a sw s10,156(sp)
On 1/15/2024 4:00 AM, Juzhe-Zhong wrote:
> This patch fixes -70% performance drop from GCC-13.2 to GCC-14 with -march=rv64gcv in real hardware.
>
> The root cause is incorrect cost model cause inefficient vectorization which makes us performance drop significantly.
>
> So this patch does:
>
> 1. Adjust vector to scalar cost by introducing v to scalar reg move.
> 2. Adjust vec_construct cost since we does spend NUNITS instructions to construct the vector.
>
> Tested on both RV32/RV64 no regression, Rebase to the trunk and commit it as it is approved by Robin.
>
> PR target/113247
>
> gcc/ChangeLog:
>
> * config/riscv/riscv-protos.h (struct regmove_vector_cost): Add vector to scalar regmove.
> * config/riscv/riscv-vector-costs.cc (adjust_stmt_cost): Ditto.
> * config/riscv/riscv.cc (riscv_builtin_vectorization_cost): Adjust vec_construct cost.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/autovec/vls/reduc-19.c: Adapt test.
> * gcc.target/riscv/rvv/autovec/vls/reduc-20.c: Ditto.
> * gcc.target/riscv/rvv/autovec/vls/reduc-21.c: Ditto.
> * gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c: New test.
> * gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c: New test.
> * gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c: New test.
> * gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c: New test.
>
> ---
> gcc/config/riscv/riscv-protos.h | 2 +
> gcc/config/riscv/riscv-vector-costs.cc | 3 +
> gcc/config/riscv/riscv.cc | 4 +-
> .../vect/costmodel/riscv/rvv/pr113247-1.c | 195 ++++++++++++++++++
> .../vect/costmodel/riscv/rvv/pr113247-2.c | 6 +
> .../vect/costmodel/riscv/rvv/pr113247-3.c | 6 +
> .../vect/costmodel/riscv/rvv/pr113247-4.c | 6 +
> .../riscv/rvv/autovec/vls/reduc-19.c | 2 +-
> .../riscv/rvv/autovec/vls/reduc-20.c | 2 +-
> .../riscv/rvv/autovec/vls/reduc-21.c | 2 +-
> 10 files changed, 224 insertions(+), 4 deletions(-)
> create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c
> create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c
> create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c
> create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c
>
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index 4f3b677f4f9..21f6dadf113 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -255,6 +255,8 @@ struct regmove_vector_cost
> {
> const int GR2VR;
> const int FR2VR;
> + const int VR2GR;
> + const int VR2FR;
> };
>
> /* Cost for vector insn classes. */
> diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
> index 90ab93b7506..7c9840df4e9 100644
> --- a/gcc/config/riscv/riscv-vector-costs.cc
> +++ b/gcc/config/riscv/riscv-vector-costs.cc
> @@ -1056,6 +1056,9 @@ adjust_stmt_cost (enum vect_cost_for_stmt kind, tree vectype, int stmt_cost)
> case scalar_to_vec:
> return stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->FR2VR
> : costs->regmove->GR2VR);
> + case vec_to_scalar:
> + return stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->VR2FR
> + : costs->regmove->VR2GR);
> default:
> break;
> }
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index ee1a57b321d..568db90a27d 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -395,6 +395,8 @@ static const scalable_vector_cost rvv_vla_vector_cost = {
> static const regmove_vector_cost rvv_regmove_vector_cost = {
> 2, /* GR2VR */
> 2, /* FR2VR */
> + 2, /* VR2GR */
> + 2, /* VR2FR */
> };
>
> /* Generic costs for vector insn classes. It is supposed to be the vector cost
> @@ -10522,7 +10524,7 @@ riscv_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> return fp ? common_costs->fp_stmt_cost : common_costs->int_stmt_cost;
>
> case vec_construct:
> - return estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype)) - 1;
> + return estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
>
> default:
> gcc_unreachable ();
> diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c
> new file mode 100644
> index 00000000000..0d09a624a00
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c
> @@ -0,0 +1,195 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param=riscv-autovec-lmul=dynamic" } */
> +
> +#include <stdint-gcc.h>
> +
> +#define Ch(x,y,z) (z ^ (x & (y ^ z)))
> +#define Maj(x,y,z) ((x & y) | (z & (x | y)))
> +
> +#define SHR(x, n) (x >> n)
> +#define ROTR(x,n) (SHR(x,n) | (x << (32 - n)))
> +#define S1(x) (ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25))
> +#define S0(x) (ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22))
> +
> +#define s1(x) (ROTR(x,17) ^ ROTR(x,19) ^ SHR(x,10))
> +#define s0(x) (ROTR(x, 7) ^ ROTR(x,18) ^ SHR(x, 3))
> +
> +#define SHA256_STEP(a,b,c,d,e,f,g,h,x,K) \
> +{ \
> + tmp1 = h + S1(e) + Ch(e,f,g) + K + x; \
> + tmp2 = S0(a) + Maj(a,b,c); \
> + h = tmp1 + tmp2; \
> + d += tmp1; \
> +}
> +
> +#define BE_LOAD32(n,b,i) (n) = byteswap(*(uint32_t *)(b + i))
> +
> +static uint32_t byteswap(uint32_t x)
> +{
> + x = (x & 0x0000FFFF) << 16 | (x & 0xFFFF0000) >> 16;
> + x = (x & 0x00FF00FF) << 8 | (x & 0xFF00FF00) >> 8;
> +
> + return x;
> +}
> +
> +void sha256 (const uint8_t *in, uint32_t out[8])
> +{
> + uint32_t tmp1, tmp2, a, b, c, d, e, f, g, h;
> + uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
> +
> + tmp1 = tmp2 = 0;
> + w0 = w1 = w2 = w3 = w4 = w5 = w6 = w7 = w8 = w9 = w10 = w11 = w12 = w13 = w14 = w15 = 0;
> +
> + BE_LOAD32 ( w0, in, 0 );
> + BE_LOAD32 ( w1, in, 4 );
> + BE_LOAD32 ( w2, in, 8 );
> + BE_LOAD32 ( w3, in, 12 );
> + BE_LOAD32 ( w4, in, 16 );
> + BE_LOAD32 ( w5, in, 20 );
> + BE_LOAD32 ( w6, in, 24 );
> + BE_LOAD32 ( w7, in, 28 );
> + BE_LOAD32 ( w8, in, 32 );
> + BE_LOAD32 ( w9, in, 36 );
> + BE_LOAD32 ( w10, in, 40 );
> + BE_LOAD32 ( w11, in, 44 );
> + BE_LOAD32 ( w12, in, 48 );
> + BE_LOAD32 ( w13, in, 52 );
> + BE_LOAD32 ( w14, in, 56 );
> + BE_LOAD32 ( w15, in, 60 );
> +
> + a = out[0];
> + b = out[1];
> + c = out[2];
> + d = out[3];
> + e = out[4];
> + f = out[5];
> + g = out[6];
> + h = out[7];
> +
> + SHA256_STEP(a, b, c, d, e, f, g, h, w0, 0x428a2f98);
> + SHA256_STEP(h, a, b, c, d, e, f, g, w1, 0x71374491);
> + SHA256_STEP(g, h, a, b, c, d, e, f, w2, 0xb5c0fbcf);
> + SHA256_STEP(f, g, h, a, b, c, d, e, w3, 0xe9b5dba5);
> + SHA256_STEP(e, f, g, h, a, b, c, d, w4, 0x3956c25b);
> + SHA256_STEP(d, e, f, g, h, a, b, c, w5, 0x59f111f1);
> + SHA256_STEP(c, d, e, f, g, h, a, b, w6, 0x923f82a4);
> + SHA256_STEP(b, c, d, e, f, g, h, a, w7, 0xab1c5ed5);
> + SHA256_STEP(a, b, c, d, e, f, g, h, w8, 0xd807aa98);
> + SHA256_STEP(h, a, b, c, d, e, f, g, w9, 0x12835b01);
> + SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0x243185be);
> + SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0x550c7dc3);
> + SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0x72be5d74);
> + SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0x80deb1fe);
> + SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0x9bdc06a7);
> + SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0xc19bf174);
> +
> + w0 = s1(w14) + w9 + s0(w1) + w0;
> + SHA256_STEP(a, b, c, d, e, f, g, h, w0, 0xe49b69c1);
> + w1 = s1(w15) + w10 + s0(w2) + w1;
> + SHA256_STEP(h, a, b, c, d, e, f, g, w1, 0xefbe4786);
> + w2 = s1(w0) + w11 + s0(w3) + w2;
> + SHA256_STEP(g, h, a, b, c, d, e, f, w2, 0x0fc19dc6);
> + w3 = s1(w1) + w12 + s0(w4) + w3;
> + SHA256_STEP(f, g, h, a, b, c, d, e, w3, 0x240ca1cc);
> + w4 = s1(w2) + w13 + s0(w5) + w4;
> + SHA256_STEP(e, f, g, h, a, b, c, d, w4, 0x2de92c6f);
> + w5 = s1(w3) + w14 + s0(w6) + w5;
> + SHA256_STEP(d, e, f, g, h, a, b, c, w5, 0x4a7484aa);
> + w6 = s1(w4) + w15 + s0(w7) + w6;
> + SHA256_STEP(c, d, e, f, g, h, a, b, w6, 0x5cb0a9dc);
> + w7 = s1(w5) + w0 + s0(w8) + w7;
> + SHA256_STEP(b, c, d, e, f, g, h, a, w7, 0x76f988da);
> + w8 = s1(w6) + w1 + s0(w9) + w8;
> + SHA256_STEP(a, b, c, d, e, f, g, h, w8, 0x983e5152);
> + w9 = s1(w7) + w2 + s0(w10) + w9;
> + SHA256_STEP(h, a, b, c, d, e, f, g, w9, 0xa831c66d);
> + w10 = s1(w8) + w3 + s0(w11) + w10;
> + SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0xb00327c8);
> + w11 = s1(w9) + w4 + s0(w12) + w11;
> + SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0xbf597fc7);
> + w12 = s1(w10) + w5 + s0(w13) + w12;
> + SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0xc6e00bf3);
> + w13 = s1(w11) + w6 + s0(w14) + w13;
> + SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0xd5a79147);
> + w14 = s1(w12) + w7 + s0(w15) + w14;
> + SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0x06ca6351);
> + w15 = s1(w13) + w8 + s0(w0) + w15;
> + SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0x14292967);
> +
> + w0 = s1(w14) + w9 + s0(w1) + w0;
> + SHA256_STEP(a, b, c, d, e, f, g, h, w0, 0x27b70a85);
> + w1 = s1(w15) + w10 + s0(w2) + w1;
> + SHA256_STEP(h, a, b, c, d, e, f, g, w1, 0x2e1b2138);
> + w2 = s1(w0) + w11 + s0(w3) + w2;
> + SHA256_STEP(g, h, a, b, c, d, e, f, w2, 0x4d2c6dfc);
> + w3 = s1(w1) + w12 + s0(w4) + w3;
> + SHA256_STEP(f, g, h, a, b, c, d, e, w3, 0x53380d13);
> + w4 = s1(w2) + w13 + s0(w5) + w4;
> + SHA256_STEP(e, f, g, h, a, b, c, d, w4, 0x650a7354);
> + w5 = s1(w3) + w14 + s0(w6) + w5;
> + SHA256_STEP(d, e, f, g, h, a, b, c, w5, 0x766a0abb);
> + w6 = s1(w4) + w15 + s0(w7) + w6;
> + SHA256_STEP(c, d, e, f, g, h, a, b, w6, 0x81c2c92e);
> + w7 = s1(w5) + w0 + s0(w8) + w7;
> + SHA256_STEP(b, c, d, e, f, g, h, a, w7, 0x92722c85);
> + w8 = s1(w6) + w1 + s0(w9) + w8;
> + SHA256_STEP(a, b, c, d, e, f, g, h, w8, 0xa2bfe8a1);
> + w9 = s1(w7) + w2 + s0(w10) + w9;
> + SHA256_STEP(h, a, b, c, d, e, f, g, w9, 0xa81a664b);
> + w10 = s1(w8) + w3 + s0(w11) + w10;
> + SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0xc24b8b70);
> + w11 = s1(w9) + w4 + s0(w12) + w11;
> + SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0xc76c51a3);
> + w12 = s1(w10) + w5 + s0(w13) + w12;
> + SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0xd192e819);
> + w13 = s1(w11) + w6 + s0(w14) + w13;
> + SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0xd6990624);
> + w14 = s1(w12) + w7 + s0(w15) + w14;
> + SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0xf40e3585);
> + w15 = s1(w13) + w8 + s0(w0) + w15;
> + SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0x106aa070);
> +
> + w0 = s1(w14) + w9 + s0(w1) + w0;
> + SHA256_STEP(a, b, c, d, e, f, g, h, w0, 0x19a4c116);
> + w1 = s1(w15) + w10 + s0(w2) + w1;
> + SHA256_STEP(h, a, b, c, d, e, f, g, w1, 0x1e376c08);
> + w2 = s1(w0) + w11 + s0(w3) + w2;
> + SHA256_STEP(g, h, a, b, c, d, e, f, w2, 0x2748774c);
> + w3 = s1(w1) + w12 + s0(w4) + w3;
> + SHA256_STEP(f, g, h, a, b, c, d, e, w3, 0x34b0bcb5);
> + w4 = s1(w2) + w13 + s0(w5) + w4;
> + SHA256_STEP(e, f, g, h, a, b, c, d, w4, 0x391c0cb3);
> + w5 = s1(w3) + w14 + s0(w6) + w5;
> + SHA256_STEP(d, e, f, g, h, a, b, c, w5, 0x4ed8aa4a);
> + w6 = s1(w4) + w15 + s0(w7) + w6;
> + SHA256_STEP(c, d, e, f, g, h, a, b, w6, 0x5b9cca4f);
> + w7 = s1(w5) + w0 + s0(w8) + w7;
> + SHA256_STEP(b, c, d, e, f, g, h, a, w7, 0x682e6ff3);
> + w8 = s1(w6) + w1 + s0(w9) + w8;
> + SHA256_STEP(a, b, c, d, e, f, g, h, w8, 0x748f82ee);
> + w9 = s1(w7) + w2 + s0(w10) + w9;
> + SHA256_STEP(h, a, b, c, d, e, f, g, w9, 0x78a5636f);
> + w10 = s1(w8) + w3 + s0(w11) + w10;
> + SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0x84c87814);
> + w11 = s1(w9) + w4 + s0(w12) + w11;
> + SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0x8cc70208);
> + w12 = s1(w10) + w5 + s0(w13) + w12;
> + SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0x90befffa);
> + w13 = s1(w11) + w6 + s0(w14) + w13;
> + SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0xa4506ceb);
> + w14 = s1(w12) + w7 + s0(w15) + w14;
> + SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0xbef9a3f7);
> + w15 = s1(w13) + w8 + s0(w0) + w15;
> + SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0xc67178f2);
> +
> + out[0] += a;
> + out[1] += b;
> + out[2] += c;
> + out[3] += d;
> + out[4] += e;
> + out[5] += f;
> + out[6] += g;
> + out[7] += h;
> +}
> +
> +/* { dg-final { scan-assembler-not {vset} } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c
> new file mode 100644
> index 00000000000..64a53cfca88
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param=riscv-autovec-lmul=dynamic --param=riscv-autovec-preference=fixed-vlmax" } */
> +
> +#include "pr113247-1.c"
> +
> +/* { dg-final { scan-assembler-not {vset} } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c
> new file mode 100644
> index 00000000000..423c90e4154
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize" } */
> +
> +#include "pr113247-1.c"
> +
> +/* { dg-final { scan-assembler-not {vset} } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c
> new file mode 100644
> index 00000000000..c2a46d848e5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param=riscv-autovec-preference=fixed-vlmax" } */
> +
> +#include "pr113247-1.c"
> +
> +/* { dg-final { scan-assembler-not {vset} } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
> index 2048b636910..5130fe5f2e3 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
> @@ -14,7 +14,7 @@ DEF_REDUC_PLUS (_Float16, 512)
> DEF_REDUC_PLUS (_Float16, 1024)
> DEF_REDUC_PLUS (_Float16, 2048)
>
> -/* { dg-final { scan-assembler-times {vfredosum\.vs} 10 } } */
> +/* { dg-final { scan-assembler-times {vfredosum\.vs} 9 } } */
> /* { dg-final { scan-assembler-not {csrr} } } */
> /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
> /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c
> index bfc328da568..819104a8cdf 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c
> @@ -13,7 +13,7 @@ DEF_REDUC_PLUS (float, 256)
> DEF_REDUC_PLUS (float, 512)
> DEF_REDUC_PLUS (float, 1024)
>
> -/* { dg-final { scan-assembler-times {vfredosum\.vs} 9 } } */
> +/* { dg-final { scan-assembler-times {vfredosum\.vs} 8 } } */
> /* { dg-final { scan-assembler-not {csrr} } } */
> /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
> /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c
> index 8228590fa3b..2b61e0ac71a 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c
> @@ -12,7 +12,7 @@ DEF_REDUC_PLUS (float, 128)
> DEF_REDUC_PLUS (float, 256)
> DEF_REDUC_PLUS (float, 512)
>
> -/* { dg-final { scan-assembler-times {vfredosum\.vs} 8 } } */
> +/* { dg-final { scan-assembler-times {vfredosum\.vs} 7 } } */
> /* { dg-final { scan-assembler-not {csrr} } } */
> /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
> /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [Committed V2] RISC-V: Fix regression (GCC-14 compare with GCC-13.2) of SHA256 from coremark-pro
2024-01-17 1:37 ` Edwin Lu
@ 2024-01-17 1:37 ` Edwin Lu
2024-01-17 1:41 ` juzhe.zhong
1 sibling, 0 replies; 8+ messages in thread
From: Edwin Lu @ 2024-01-17 1:37 UTC (permalink / raw)
To: gcc-patches; +Cc: Patrick O'Neill
Hi Juzhe,
I'm seeing that this patch introduces failures with rv32gcv-ilp32d as
seen here https://github.com/ewlu/gcc-precommit-ci/issues/1194. Digging
a little deeper, it appears that there's an illegal instruction in a
shared library which (at least for FAIL:
gcc.c-torture/execute/920501-8.c -O2 execution test) is using vmv.v.i
without a prior vsetvl. I believe the other failures may be similar.
Logs:
spawn -ignore SIGHUP
/scratch/ewlu/ci/triage/compare/build-407-errors/build-gcc-linux-stage2/gcc/xgcc
-B/scratch/ewlu/ci/triage/compare/build-407-errors/build-gcc-linux-stage2/gcc/
/scratch/ewlu/ci/triage/compare/gcc/gcc/testsuite/gcc.c-torture/execute/920501-8.c
-march=rv32gcv -mabi=ilp32d -mtune=rocket -mcmodel=medlow
-fdiagnostics-plain-output -O2 -w -lm -o ./920501-8.exe
PASS: gcc.c-torture/execute/920501-8.c -O2 (test for excess errors)
spawn riscv64-unknown-linux-gnu-run ./920501-8.exe
/scratch/ewlu/ci/triage/compare/build-407-errors/../scripts/wrapper/qemu/riscv64-unknown-linux-gnu-run:
line 15: 584664 Illegal instruction (core dumped)
QEMU_CPU="$(march-to-cpu-opt --get-riscv-tag $1)" qemu-riscv$xlen -r
5.10 "${qemu_args[@]}" -L ${RISC_V_SYSROOT} "$@"
FAIL: gcc.c-torture/execute/920501-8.c -O2 execution test
Execution:
QEMU_CPU="rv32,vlen=128,v=true,vext_spec=v1.0,Zve32f=true,Zve64f=true"
./bin/qemu-riscv32 ./920501-8.exe
GDB output:
Program received signal SIGILL, Illegal instruction.
0x2b3d0f3e in __printf_buffer () from
/scratch/ewlu/ci/triage/compare/build-407-errors/sysroot/lib32/ilp32d/libc.so.6
1: x/i $pc
=> 0x2b3d0f3e <__printf_buffer+410>: vmv.v.i v1,0
I've included the first 150ish lines of the function's objdump below.
Edwin
$ ./bin/riscv64-unknown-linux-gnu-objdump -d
sysroot/lib32/ilp32d/libc.so.6 > dump
00046da4 <__printf_buffer>:
46da4: 000f8797 auipc a5,0xf8
46da8: 1ac7a783 lw a5,428(a5) # 13ef50
<_GLOBAL_OFFSET_TABLE_+0x64>
46dac: b3010113 addi sp,sp,-1232
46db0: 4c812423 sw s0,1224(sp)
46db4: c6be sw a5,76(sp)
46db6: 9792 add a5,a5,tp
46db8: 439c lw a5,0(a5)
46dba: 842a mv s0,a0
46dbc: 4d212023 sw s2,1216(sp)
46dc0: c2ae sw a1,68(sp)
46dc2: 892e mv s2,a1
46dc4: 852e mv a0,a1
46dc6: 02500593 li a1,37
46dca: de3e sw a5,60(sp)
46dcc: 4c112623 sw ra,1228(sp)
46dd0: 4c912223 sw s1,1220(sp)
46dd4: 4b312e23 sw s3,1212(sp)
46dd8: 4b812423 sw s8,1192(sp)
46ddc: 84b2 mv s1,a2
46dde: dcb2 sw a2,120(sp)
46de0: 89b6 mv s3,a3
46de2: c0b6 sw a3,64(sp)
46de4: 60a300ef jal 773ee <strchrnul>
46de8: c4aa sw a0,72(sp)
46dea: 41250633 sub a2,a0,s2
46dee: 8c2a mv s8,a0
46df0: 85ca mv a1,s2
46df2: 8522 mv a0,s0
46df4: 933f90ef jal 40726
<__printf_buffer_write>
46df8: 4c1c lw a5,24(s0)
46dfa: c3dd beqz a5,46ea0
<__printf_buffer+0xfc>
46dfc: 000c4783 lbu a5,0(s8)
46e00: c3c5 beqz a5,46ea0
<__printf_buffer+0xfc>
46e02: 000fa797 auipc a5,0xfa
46e06: 85a7a783 lw a5,-1958(a5) # 14065c
<__printf_function_table>
46e0a: 4b512a23 sw s5,1204(sp)
46e0e: da3e sw a5,52(sp)
46e10: c399 beqz a5,46e16
<__printf_buffer+0x72>
46e12: 2680106f j 4807a
<__printf_buffer+0x12d6>
46e16: 000fa797 auipc a5,0xfa
46e1a: 8367a783 lw a5,-1994(a5) # 14064c
<__printf_modifier_table>
46e1e: 740797e3 bnez a5,47d6c
<__printf_buffer+0xfc8>
46e22: 000f9797 auipc a5,0xf9
46e26: e927a783 lw a5,-366(a5) # 13fcb4
<__printf_va_arg_table>
46e2a: 740791e3 bnez a5,47d6c
<__printf_buffer+0xfc8>
46e2e: 57fd li a5,-1
46e30: d0be sw a5,96(sp)
46e32: 0019f793 andi a5,s3,1
46e36: d4be sw a5,104(sp)
46e38: 111c addi a5,sp,160
46e3a: 4b412c23 sw s4,1208(sp)
46e3e: 4b612823 sw s6,1200(sp)
46e42: 4b712623 sw s7,1196(sp)
46e46: 4b912223 sw s9,1188(sp)
46e4a: 4ba12023 sw s10,1184(sp)
46e4e: 49b12e23 sw s11,1180(sp)
46e52: ce82 sw zero,92(sp)
46e54: 4a81 li s5,0
46e56: 000b9a17 auipc s4,0xb9
46e5a: 2dea0a13 addi s4,s4,734 # 100134
<step4_jumps.0>
46e5e: d6be sw a5,108(sp)
46e60: 001c4d03 lbu s10,1(s8)
46e64: fe0d0793 addi a5,s10,-32
46e68: 0ff7f793 zext.b a5,a5
46e6c: 05a00713 li a4,90
46e70: 04f77763 bgeu a4,a5,46ebe
<__printf_buffer+0x11a>
46e74: 520d17e3 bnez s10,47ba2
<__printf_buffer+0xdfe>
46e78: 47b6 lw a5,76(sp)
46e7a: 9792 add a5,a5,tp
46e7c: 4759 li a4,22
46e7e: 00042c23 sw zero,24(s0)
46e82: c398 sw a4,0(a5)
46e84: 4b812a03 lw s4,1208(sp)
46e88: 4b412a83 lw s5,1204(sp)
46e8c: 4b012b03 lw s6,1200(sp)
46e90: 4ac12b83 lw s7,1196(sp)
46e94: 4a412c83 lw s9,1188(sp)
46e98: 4a012d03 lw s10,1184(sp)
46e9c: 49c12d83 lw s11,1180(sp)
46ea0: 4cc12083 lw ra,1228(sp)
46ea4: 4c812403 lw s0,1224(sp)
46ea8: 4c412483 lw s1,1220(sp)
46eac: 4c012903 lw s2,1216(sp)
46eb0: 4bc12983 lw s3,1212(sp)
46eb4: 4a812c03 lw s8,1192(sp)
46eb8: 4d010113 addi sp,sp,1232
46ebc: 8082 ret
46ebe: 01aa07b3 add a5,s4,s10
46ec2: 05c7c783 lbu a5,92(a5)
46ec6: 078a slli a5,a5,0x2
46ec8: 97d2 add a5,a5,s4
46eca: 0d87a703 lw a4,216(a5)
46ece: 00000797 auipc a5,0x0
46ed2: fa678793 addi a5,a5,-90 # 46e74
<__printf_buffer+0xd0>
46ed6: 973e add a4,a4,a5
46ed8: 02000793 li a5,32
46edc: d63e sw a5,44(sp)
46ede: 0c05 addi s8,s8,1
46ee0: 4b01 li s6,0
46ee2: 59fd li s3,-1
46ee4: 4c81 li s9,0
46ee6: 4381 li t2,0
46ee8: 4781 li a5,0
46eea: 4b81 li s7,0
46eec: 4901 li s2,0
46eee: 4e01 li t3,0
46ef0: ce02 sw zero,28(sp)
46ef2: 4d81 li s11,0
46ef4: d202 sw zero,36(sp)
46ef6: d402 sw zero,40(sp)
46ef8: 8702 jr a4
46efa: 5726 lw a4,104(sp)
46efc: c319 beqz a4,46f02
<__printf_buffer+0x15e>
46efe: 23c0106f j 4813a
<__printf_buffer+0x1396>
46f02: 874a mv a4,s2
46f04: 001b9693 slli a3,s7,0x1
46f08: 8f55 or a4,a4,a3
46f0a: 078a slli a5,a5,0x2
46f0c: 8fd9 or a5,a5,a4
46f0e: 5722 lw a4,40(sp)
46f10: 070e slli a4,a4,0x3
46f12: 8fd9 or a5,a5,a4
46f14: 5712 lw a4,36(sp)
46f16: 0712 slli a4,a4,0x4
46f18: 8fd9 or a5,a5,a4
46f1a: 005d9f13 slli t5,s11,0x5
46f1e: 4772 lw a4,28(sp)
46f20: 071a slli a4,a4,0x6
46f22: 01e7e7b3 or a5,a5,t5
46f26: 8fd9 or a5,a5,a4
46f28: 0e1e slli t3,t3,0x7
46f2a: 01c7e7b3 or a5,a5,t3
46f2e: 0b2e slli s6,s6,0xb
46f30: 6705 lui a4,0x1
46f32: 8ff70713 addi a4,a4,-1793 # 8ff
<current+0x8b7>
46f36: 0167e7b3 or a5,a5,s6
46f3a: 8ff9 and a5,a5,a4
46f3c: 5736 lw a4,108(sp)
46f3e: 5e0030d7 vmv.v.i v1,0
46f42: 020700a7 vse8.v v1,(a4)
46f46: 5732 lw a4,44(sp)
46f48: cb4e sw s3,148(sp)
46f4a: cd66 sw s9,152(sp)
46f4c: cf6a sw s10,156(sp)
On 1/15/2024 4:00 AM, Juzhe-Zhong wrote:
> This patch fixes -70% performance drop from GCC-13.2 to GCC-14 with -march=rv64gcv in real hardware.
>
> The root cause is an incorrect cost model that causes inefficient vectorization, which makes performance drop significantly.
>
> So this patch does:
>
> 1. Adjust vector to scalar cost by introducing v to scalar reg move.
> 2. Adjust vec_construct cost since we do spend NUNITS instructions to construct the vector.
>
> Tested on both RV32/RV64 no regression, Rebase to the trunk and commit it as it is approved by Robin.
>
> PR target/113247
>
> gcc/ChangeLog:
>
> * config/riscv/riscv-protos.h (struct regmove_vector_cost): Add vector to scalar regmove.
> * config/riscv/riscv-vector-costs.cc (adjust_stmt_cost): Ditto.
> * config/riscv/riscv.cc (riscv_builtin_vectorization_cost): Adjust vec_construct cost.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/autovec/vls/reduc-19.c: Adapt test.
> * gcc.target/riscv/rvv/autovec/vls/reduc-20.c: Ditto.
> * gcc.target/riscv/rvv/autovec/vls/reduc-21.c: Ditto.
> * gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c: New test.
> * gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c: New test.
> * gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c: New test.
> * gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c: New test.
>
> ---
> gcc/config/riscv/riscv-protos.h | 2 +
> gcc/config/riscv/riscv-vector-costs.cc | 3 +
> gcc/config/riscv/riscv.cc | 4 +-
> .../vect/costmodel/riscv/rvv/pr113247-1.c | 195 ++++++++++++++++++
> .../vect/costmodel/riscv/rvv/pr113247-2.c | 6 +
> .../vect/costmodel/riscv/rvv/pr113247-3.c | 6 +
> .../vect/costmodel/riscv/rvv/pr113247-4.c | 6 +
> .../riscv/rvv/autovec/vls/reduc-19.c | 2 +-
> .../riscv/rvv/autovec/vls/reduc-20.c | 2 +-
> .../riscv/rvv/autovec/vls/reduc-21.c | 2 +-
> 10 files changed, 224 insertions(+), 4 deletions(-)
> create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c
> create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c
> create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c
> create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c
>
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index 4f3b677f4f9..21f6dadf113 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -255,6 +255,8 @@ struct regmove_vector_cost
> {
> const int GR2VR;
> const int FR2VR;
> + const int VR2GR;
> + const int VR2FR;
> };
>
> /* Cost for vector insn classes. */
> diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
> index 90ab93b7506..7c9840df4e9 100644
> --- a/gcc/config/riscv/riscv-vector-costs.cc
> +++ b/gcc/config/riscv/riscv-vector-costs.cc
> @@ -1056,6 +1056,9 @@ adjust_stmt_cost (enum vect_cost_for_stmt kind, tree vectype, int stmt_cost)
> case scalar_to_vec:
> return stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->FR2VR
> : costs->regmove->GR2VR);
> + case vec_to_scalar:
> + return stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->VR2FR
> + : costs->regmove->VR2GR);
> default:
> break;
> }
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index ee1a57b321d..568db90a27d 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -395,6 +395,8 @@ static const scalable_vector_cost rvv_vla_vector_cost = {
> static const regmove_vector_cost rvv_regmove_vector_cost = {
> 2, /* GR2VR */
> 2, /* FR2VR */
> + 2, /* VR2GR */
> + 2, /* VR2FR */
> };
>
> /* Generic costs for vector insn classes. It is supposed to be the vector cost
> @@ -10522,7 +10524,7 @@ riscv_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> return fp ? common_costs->fp_stmt_cost : common_costs->int_stmt_cost;
>
> case vec_construct:
> - return estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype)) - 1;
> + return estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
>
> default:
> gcc_unreachable ();
> diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c
> new file mode 100644
> index 00000000000..0d09a624a00
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c
> @@ -0,0 +1,195 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param=riscv-autovec-lmul=dynamic" } */
> +
> +#include <stdint-gcc.h>
> +
> +#define Ch(x,y,z) (z ^ (x & (y ^ z)))
> +#define Maj(x,y,z) ((x & y) | (z & (x | y)))
> +
> +#define SHR(x, n) (x >> n)
> +#define ROTR(x,n) (SHR(x,n) | (x << (32 - n)))
> +#define S1(x) (ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25))
> +#define S0(x) (ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22))
> +
> +#define s1(x) (ROTR(x,17) ^ ROTR(x,19) ^ SHR(x,10))
> +#define s0(x) (ROTR(x, 7) ^ ROTR(x,18) ^ SHR(x, 3))
> +
> +#define SHA256_STEP(a,b,c,d,e,f,g,h,x,K) \
> +{ \
> + tmp1 = h + S1(e) + Ch(e,f,g) + K + x; \
> + tmp2 = S0(a) + Maj(a,b,c); \
> + h = tmp1 + tmp2; \
> + d += tmp1; \
> +}
> +
> +#define BE_LOAD32(n,b,i) (n) = byteswap(*(uint32_t *)(b + i))
> +
> +static uint32_t byteswap(uint32_t x)
> +{
> + x = (x & 0x0000FFFF) << 16 | (x & 0xFFFF0000) >> 16;
> + x = (x & 0x00FF00FF) << 8 | (x & 0xFF00FF00) >> 8;
> +
> + return x;
> +}
> +
> +void sha256 (const uint8_t *in, uint32_t out[8])
> +{
> + uint32_t tmp1, tmp2, a, b, c, d, e, f, g, h;
> + uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
> +
> + tmp1 = tmp2 = 0;
> + w0 = w1 = w2 = w3 = w4 = w5 = w6 = w7 = w8 = w9 = w10 = w11 = w12 = w13 = w14 = w15 = 0;
> +
> + BE_LOAD32 ( w0, in, 0 );
> + BE_LOAD32 ( w1, in, 4 );
> + BE_LOAD32 ( w2, in, 8 );
> + BE_LOAD32 ( w3, in, 12 );
> + BE_LOAD32 ( w4, in, 16 );
> + BE_LOAD32 ( w5, in, 20 );
> + BE_LOAD32 ( w6, in, 24 );
> + BE_LOAD32 ( w7, in, 28 );
> + BE_LOAD32 ( w8, in, 32 );
> + BE_LOAD32 ( w9, in, 36 );
> + BE_LOAD32 ( w10, in, 40 );
> + BE_LOAD32 ( w11, in, 44 );
> + BE_LOAD32 ( w12, in, 48 );
> + BE_LOAD32 ( w13, in, 52 );
> + BE_LOAD32 ( w14, in, 56 );
> + BE_LOAD32 ( w15, in, 60 );
> +
> + a = out[0];
> + b = out[1];
> + c = out[2];
> + d = out[3];
> + e = out[4];
> + f = out[5];
> + g = out[6];
> + h = out[7];
> +
> + SHA256_STEP(a, b, c, d, e, f, g, h, w0, 0x428a2f98);
> + SHA256_STEP(h, a, b, c, d, e, f, g, w1, 0x71374491);
> + SHA256_STEP(g, h, a, b, c, d, e, f, w2, 0xb5c0fbcf);
> + SHA256_STEP(f, g, h, a, b, c, d, e, w3, 0xe9b5dba5);
> + SHA256_STEP(e, f, g, h, a, b, c, d, w4, 0x3956c25b);
> + SHA256_STEP(d, e, f, g, h, a, b, c, w5, 0x59f111f1);
> + SHA256_STEP(c, d, e, f, g, h, a, b, w6, 0x923f82a4);
> + SHA256_STEP(b, c, d, e, f, g, h, a, w7, 0xab1c5ed5);
> + SHA256_STEP(a, b, c, d, e, f, g, h, w8, 0xd807aa98);
> + SHA256_STEP(h, a, b, c, d, e, f, g, w9, 0x12835b01);
> + SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0x243185be);
> + SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0x550c7dc3);
> + SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0x72be5d74);
> + SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0x80deb1fe);
> + SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0x9bdc06a7);
> + SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0xc19bf174);
> +
> + w0 = s1(w14) + w9 + s0(w1) + w0;
> + SHA256_STEP(a, b, c, d, e, f, g, h, w0, 0xe49b69c1);
> + w1 = s1(w15) + w10 + s0(w2) + w1;
> + SHA256_STEP(h, a, b, c, d, e, f, g, w1, 0xefbe4786);
> + w2 = s1(w0) + w11 + s0(w3) + w2;
> + SHA256_STEP(g, h, a, b, c, d, e, f, w2, 0x0fc19dc6);
> + w3 = s1(w1) + w12 + s0(w4) + w3;
> + SHA256_STEP(f, g, h, a, b, c, d, e, w3, 0x240ca1cc);
> + w4 = s1(w2) + w13 + s0(w5) + w4;
> + SHA256_STEP(e, f, g, h, a, b, c, d, w4, 0x2de92c6f);
> + w5 = s1(w3) + w14 + s0(w6) + w5;
> + SHA256_STEP(d, e, f, g, h, a, b, c, w5, 0x4a7484aa);
> + w6 = s1(w4) + w15 + s0(w7) + w6;
> + SHA256_STEP(c, d, e, f, g, h, a, b, w6, 0x5cb0a9dc);
> + w7 = s1(w5) + w0 + s0(w8) + w7;
> + SHA256_STEP(b, c, d, e, f, g, h, a, w7, 0x76f988da);
> + w8 = s1(w6) + w1 + s0(w9) + w8;
> + SHA256_STEP(a, b, c, d, e, f, g, h, w8, 0x983e5152);
> + w9 = s1(w7) + w2 + s0(w10) + w9;
> + SHA256_STEP(h, a, b, c, d, e, f, g, w9, 0xa831c66d);
> + w10 = s1(w8) + w3 + s0(w11) + w10;
> + SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0xb00327c8);
> + w11 = s1(w9) + w4 + s0(w12) + w11;
> + SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0xbf597fc7);
> + w12 = s1(w10) + w5 + s0(w13) + w12;
> + SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0xc6e00bf3);
> + w13 = s1(w11) + w6 + s0(w14) + w13;
> + SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0xd5a79147);
> + w14 = s1(w12) + w7 + s0(w15) + w14;
> + SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0x06ca6351);
> + w15 = s1(w13) + w8 + s0(w0) + w15;
> + SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0x14292967);
> +
> + w0 = s1(w14) + w9 + s0(w1) + w0;
> + SHA256_STEP(a, b, c, d, e, f, g, h, w0, 0x27b70a85);
> + w1 = s1(w15) + w10 + s0(w2) + w1;
> + SHA256_STEP(h, a, b, c, d, e, f, g, w1, 0x2e1b2138);
> + w2 = s1(w0) + w11 + s0(w3) + w2;
> + SHA256_STEP(g, h, a, b, c, d, e, f, w2, 0x4d2c6dfc);
> + w3 = s1(w1) + w12 + s0(w4) + w3;
> + SHA256_STEP(f, g, h, a, b, c, d, e, w3, 0x53380d13);
> + w4 = s1(w2) + w13 + s0(w5) + w4;
> + SHA256_STEP(e, f, g, h, a, b, c, d, w4, 0x650a7354);
> + w5 = s1(w3) + w14 + s0(w6) + w5;
> + SHA256_STEP(d, e, f, g, h, a, b, c, w5, 0x766a0abb);
> + w6 = s1(w4) + w15 + s0(w7) + w6;
> + SHA256_STEP(c, d, e, f, g, h, a, b, w6, 0x81c2c92e);
> + w7 = s1(w5) + w0 + s0(w8) + w7;
> + SHA256_STEP(b, c, d, e, f, g, h, a, w7, 0x92722c85);
> + w8 = s1(w6) + w1 + s0(w9) + w8;
> + SHA256_STEP(a, b, c, d, e, f, g, h, w8, 0xa2bfe8a1);
> + w9 = s1(w7) + w2 + s0(w10) + w9;
> + SHA256_STEP(h, a, b, c, d, e, f, g, w9, 0xa81a664b);
> + w10 = s1(w8) + w3 + s0(w11) + w10;
> + SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0xc24b8b70);
> + w11 = s1(w9) + w4 + s0(w12) + w11;
> + SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0xc76c51a3);
> + w12 = s1(w10) + w5 + s0(w13) + w12;
> + SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0xd192e819);
> + w13 = s1(w11) + w6 + s0(w14) + w13;
> + SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0xd6990624);
> + w14 = s1(w12) + w7 + s0(w15) + w14;
> + SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0xf40e3585);
> + w15 = s1(w13) + w8 + s0(w0) + w15;
> + SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0x106aa070);
> +
> + w0 = s1(w14) + w9 + s0(w1) + w0;
> + SHA256_STEP(a, b, c, d, e, f, g, h, w0, 0x19a4c116);
> + w1 = s1(w15) + w10 + s0(w2) + w1;
> + SHA256_STEP(h, a, b, c, d, e, f, g, w1, 0x1e376c08);
> + w2 = s1(w0) + w11 + s0(w3) + w2;
> + SHA256_STEP(g, h, a, b, c, d, e, f, w2, 0x2748774c);
> + w3 = s1(w1) + w12 + s0(w4) + w3;
> + SHA256_STEP(f, g, h, a, b, c, d, e, w3, 0x34b0bcb5);
> + w4 = s1(w2) + w13 + s0(w5) + w4;
> + SHA256_STEP(e, f, g, h, a, b, c, d, w4, 0x391c0cb3);
> + w5 = s1(w3) + w14 + s0(w6) + w5;
> + SHA256_STEP(d, e, f, g, h, a, b, c, w5, 0x4ed8aa4a);
> + w6 = s1(w4) + w15 + s0(w7) + w6;
> + SHA256_STEP(c, d, e, f, g, h, a, b, w6, 0x5b9cca4f);
> + w7 = s1(w5) + w0 + s0(w8) + w7;
> + SHA256_STEP(b, c, d, e, f, g, h, a, w7, 0x682e6ff3);
> + w8 = s1(w6) + w1 + s0(w9) + w8;
> + SHA256_STEP(a, b, c, d, e, f, g, h, w8, 0x748f82ee);
> + w9 = s1(w7) + w2 + s0(w10) + w9;
> + SHA256_STEP(h, a, b, c, d, e, f, g, w9, 0x78a5636f);
> + w10 = s1(w8) + w3 + s0(w11) + w10;
> + SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0x84c87814);
> + w11 = s1(w9) + w4 + s0(w12) + w11;
> + SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0x8cc70208);
> + w12 = s1(w10) + w5 + s0(w13) + w12;
> + SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0x90befffa);
> + w13 = s1(w11) + w6 + s0(w14) + w13;
> + SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0xa4506ceb);
> + w14 = s1(w12) + w7 + s0(w15) + w14;
> + SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0xbef9a3f7);
> + w15 = s1(w13) + w8 + s0(w0) + w15;
> + SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0xc67178f2);
> +
> + out[0] += a;
> + out[1] += b;
> + out[2] += c;
> + out[3] += d;
> + out[4] += e;
> + out[5] += f;
> + out[6] += g;
> + out[7] += h;
> +}
> +
> +/* { dg-final { scan-assembler-not {vset} } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c
> new file mode 100644
> index 00000000000..64a53cfca88
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param=riscv-autovec-lmul=dynamic --param=riscv-autovec-preference=fixed-vlmax" } */
> +
> +#include "pr113247-1.c"
> +
> +/* { dg-final { scan-assembler-not {vset} } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c
> new file mode 100644
> index 00000000000..423c90e4154
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize" } */
> +
> +#include "pr113247-1.c"
> +
> +/* { dg-final { scan-assembler-not {vset} } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c
> new file mode 100644
> index 00000000000..c2a46d848e5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param=riscv-autovec-preference=fixed-vlmax" } */
> +
> +#include "pr113247-1.c"
> +
> +/* { dg-final { scan-assembler-not {vset} } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
> index 2048b636910..5130fe5f2e3 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
> @@ -14,7 +14,7 @@ DEF_REDUC_PLUS (_Float16, 512)
> DEF_REDUC_PLUS (_Float16, 1024)
> DEF_REDUC_PLUS (_Float16, 2048)
>
> -/* { dg-final { scan-assembler-times {vfredosum\.vs} 10 } } */
> +/* { dg-final { scan-assembler-times {vfredosum\.vs} 9 } } */
> /* { dg-final { scan-assembler-not {csrr} } } */
> /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
> /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c
> index bfc328da568..819104a8cdf 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c
> @@ -13,7 +13,7 @@ DEF_REDUC_PLUS (float, 256)
> DEF_REDUC_PLUS (float, 512)
> DEF_REDUC_PLUS (float, 1024)
>
> -/* { dg-final { scan-assembler-times {vfredosum\.vs} 9 } } */
> +/* { dg-final { scan-assembler-times {vfredosum\.vs} 8 } } */
> /* { dg-final { scan-assembler-not {csrr} } } */
> /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
> /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c
> index 8228590fa3b..2b61e0ac71a 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c
> @@ -12,7 +12,7 @@ DEF_REDUC_PLUS (float, 128)
> DEF_REDUC_PLUS (float, 256)
> DEF_REDUC_PLUS (float, 512)
>
> -/* { dg-final { scan-assembler-times {vfredosum\.vs} 8 } } */
> +/* { dg-final { scan-assembler-times {vfredosum\.vs} 7 } } */
> /* { dg-final { scan-assembler-not {csrr} } } */
> /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
> /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: Re: [Committed V2] RISC-V: Fix regression (GCC-14 compare with GCC-13.2) of SHA256 from coremark-pro
2024-01-17 1:37 ` Edwin Lu
2024-01-17 1:37 ` Edwin Lu
@ 2024-01-17 1:41 ` juzhe.zhong
2024-01-17 1:45 ` Edwin Lu
1 sibling, 1 reply; 8+ messages in thread
From: juzhe.zhong @ 2024-01-17 1:41 UTC (permalink / raw)
To: Edwin Lu, gcc-patches; +Cc: Patrick O'Neill
[-- Attachment #1: Type: text/plain, Size: 28538 bytes --]
Are you saying you are using glibc? I did the testing with newlib and didn't see anything wrong.
It seems that this patch triggers a latent bug in the VSETVL pass (even though this patch doesn't change anything related to the VSETVL pass).
I will investigate it.
Thanks.
juzhe.zhong@rivai.ai
From: Edwin Lu
Date: 2024-01-17 09:37
To: Juzhe-Zhong; gcc-patches
CC: Patrick O'Neill
Subject: Re: [Committed V2] RISC-V: Fix regression (GCC-14 compare with GCC-13.2) of SHA256 from coremark-pro
Hi Juzhe,
I'm seeing that this patch introduces failures with rv32gcv-ilp32d as
seen here https://github.com/ewlu/gcc-precommit-ci/issues/1194. Digging
a little deeper, it appears that there's an illegal instruction in a
shared library which (at least for FAIL:
gcc.c-torture/execute/920501-8.c -O2 execution test) is using vmv.v.i
without a prior vsetvl. I believe the other failures may be similar.
Logs:
spawn -ignore SIGHUP
/scratch/ewlu/ci/triage/compare/build-407-errors/build-gcc-linux-stage2/gcc/xgcc
-B/scratch/ewlu/ci/triage/compare/build-407-errors/build-gcc-linux-stage2/gcc/
/scratch/ewlu/ci/triage/compare/gcc/gcc/testsuite/gcc.c-torture/execute/920501-8.c
-march=rv32gcv -mabi=ilp32d -mtune=rocket -mcmodel=medlow
-fdiagnostics-plain-output -O2 -w -lm -o ./920501-8.exe
PASS: gcc.c-torture/execute/920501-8.c -O2 (test for excess errors)
spawn riscv64-unknown-linux-gnu-run ./920501-8.exe
/scratch/ewlu/ci/triage/compare/build-407-errors/../scripts/wrapper/qemu/riscv64-unknown-linux-gnu-run:
line 15: 584664 Illegal instruction (core dumped)
QEMU_CPU="$(march-to-cpu-opt --get-riscv-tag $1)" qemu-riscv$xlen -r
5.10 "${qemu_args[@]}" -L ${RISC_V_SYSROOT} "$@"
FAIL: gcc.c-torture/execute/920501-8.c -O2 execution test
Execution:
QEMU_CPU="rv32,vlen=128,v=true,vext_spec=v1.0,Zve32f=true,Zve64f=true"
./bin/qemu-riscv32 ./920501-8.exe
GDB output:
Program received signal SIGILL, Illegal instruction.
0x2b3d0f3e in __printf_buffer () from
/scratch/ewlu/ci/triage/compare/build-407-errors/sysroot/lib32/ilp32d/libc.so.6
1: x/i $pc
=> 0x2b3d0f3e <__printf_buffer+410>: vmv.v.i v1,0
I've included the first 150ish lines of the function's objdump below.
Edwin
$ ./bin/riscv64-unknown-linux-gnu-objdump -d
sysroot/lib32/ilp32d/libc.so.6 > dump
00046da4 <__printf_buffer>:
46da4: 000f8797 auipc a5,0xf8
46da8: 1ac7a783 lw a5,428(a5) # 13ef50
<_GLOBAL_OFFSET_TABLE_+0x64>
46dac: b3010113 addi sp,sp,-1232
46db0: 4c812423 sw s0,1224(sp)
46db4: c6be sw a5,76(sp)
46db6: 9792 add a5,a5,tp
46db8: 439c lw a5,0(a5)
46dba: 842a mv s0,a0
46dbc: 4d212023 sw s2,1216(sp)
46dc0: c2ae sw a1,68(sp)
46dc2: 892e mv s2,a1
46dc4: 852e mv a0,a1
46dc6: 02500593 li a1,37
46dca: de3e sw a5,60(sp)
46dcc: 4c112623 sw ra,1228(sp)
46dd0: 4c912223 sw s1,1220(sp)
46dd4: 4b312e23 sw s3,1212(sp)
46dd8: 4b812423 sw s8,1192(sp)
46ddc: 84b2 mv s1,a2
46dde: dcb2 sw a2,120(sp)
46de0: 89b6 mv s3,a3
46de2: c0b6 sw a3,64(sp)
46de4: 60a300ef jal 773ee <strchrnul>
46de8: c4aa sw a0,72(sp)
46dea: 41250633 sub a2,a0,s2
46dee: 8c2a mv s8,a0
46df0: 85ca mv a1,s2
46df2: 8522 mv a0,s0
46df4: 933f90ef jal 40726
<__printf_buffer_write>
46df8: 4c1c lw a5,24(s0)
46dfa: c3dd beqz a5,46ea0
<__printf_buffer+0xfc>
46dfc: 000c4783 lbu a5,0(s8)
46e00: c3c5 beqz a5,46ea0
<__printf_buffer+0xfc>
46e02: 000fa797 auipc a5,0xfa
46e06: 85a7a783 lw a5,-1958(a5) # 14065c
<__printf_function_table>
46e0a: 4b512a23 sw s5,1204(sp)
46e0e: da3e sw a5,52(sp)
46e10: c399 beqz a5,46e16
<__printf_buffer+0x72>
46e12: 2680106f j 4807a
<__printf_buffer+0x12d6>
46e16: 000fa797 auipc a5,0xfa
46e1a: 8367a783 lw a5,-1994(a5) # 14064c
<__printf_modifier_table>
46e1e: 740797e3 bnez a5,47d6c
<__printf_buffer+0xfc8>
46e22: 000f9797 auipc a5,0xf9
46e26: e927a783 lw a5,-366(a5) # 13fcb4
<__printf_va_arg_table>
46e2a: 740791e3 bnez a5,47d6c
<__printf_buffer+0xfc8>
46e2e: 57fd li a5,-1
46e30: d0be sw a5,96(sp)
46e32: 0019f793 andi a5,s3,1
46e36: d4be sw a5,104(sp)
46e38: 111c addi a5,sp,160
46e3a: 4b412c23 sw s4,1208(sp)
46e3e: 4b612823 sw s6,1200(sp)
46e42: 4b712623 sw s7,1196(sp)
46e46: 4b912223 sw s9,1188(sp)
46e4a: 4ba12023 sw s10,1184(sp)
46e4e: 49b12e23 sw s11,1180(sp)
46e52: ce82 sw zero,92(sp)
46e54: 4a81 li s5,0
46e56: 000b9a17 auipc s4,0xb9
46e5a: 2dea0a13 addi s4,s4,734 # 100134
<step4_jumps.0>
46e5e: d6be sw a5,108(sp)
46e60: 001c4d03 lbu s10,1(s8)
46e64: fe0d0793 addi a5,s10,-32
46e68: 0ff7f793 zext.b a5,a5
46e6c: 05a00713 li a4,90
46e70: 04f77763 bgeu a4,a5,46ebe
<__printf_buffer+0x11a>
46e74: 520d17e3 bnez s10,47ba2
<__printf_buffer+0xdfe>
46e78: 47b6 lw a5,76(sp)
46e7a: 9792 add a5,a5,tp
46e7c: 4759 li a4,22
46e7e: 00042c23 sw zero,24(s0)
46e82: c398 sw a4,0(a5)
46e84: 4b812a03 lw s4,1208(sp)
46e88: 4b412a83 lw s5,1204(sp)
46e8c: 4b012b03 lw s6,1200(sp)
46e90: 4ac12b83 lw s7,1196(sp)
46e94: 4a412c83 lw s9,1188(sp)
46e98: 4a012d03 lw s10,1184(sp)
46e9c: 49c12d83 lw s11,1180(sp)
46ea0: 4cc12083 lw ra,1228(sp)
46ea4: 4c812403 lw s0,1224(sp)
46ea8: 4c412483 lw s1,1220(sp)
46eac: 4c012903 lw s2,1216(sp)
46eb0: 4bc12983 lw s3,1212(sp)
46eb4: 4a812c03 lw s8,1192(sp)
46eb8: 4d010113 addi sp,sp,1232
46ebc: 8082 ret
46ebe: 01aa07b3 add a5,s4,s10
46ec2: 05c7c783 lbu a5,92(a5)
46ec6: 078a slli a5,a5,0x2
46ec8: 97d2 add a5,a5,s4
46eca: 0d87a703 lw a4,216(a5)
46ece: 00000797 auipc a5,0x0
46ed2: fa678793 addi a5,a5,-90 # 46e74
<__printf_buffer+0xd0>
46ed6: 973e add a4,a4,a5
46ed8: 02000793 li a5,32
46edc: d63e sw a5,44(sp)
46ede: 0c05 addi s8,s8,1
46ee0: 4b01 li s6,0
46ee2: 59fd li s3,-1
46ee4: 4c81 li s9,0
46ee6: 4381 li t2,0
46ee8: 4781 li a5,0
46eea: 4b81 li s7,0
46eec: 4901 li s2,0
46eee: 4e01 li t3,0
46ef0: ce02 sw zero,28(sp)
46ef2: 4d81 li s11,0
46ef4: d202 sw zero,36(sp)
46ef6: d402 sw zero,40(sp)
46ef8: 8702 jr a4
46efa: 5726 lw a4,104(sp)
46efc: c319 beqz a4,46f02
<__printf_buffer+0x15e>
46efe: 23c0106f j 4813a
<__printf_buffer+0x1396>
46f02: 874a mv a4,s2
46f04: 001b9693 slli a3,s7,0x1
46f08: 8f55 or a4,a4,a3
46f0a: 078a slli a5,a5,0x2
46f0c: 8fd9 or a5,a5,a4
46f0e: 5722 lw a4,40(sp)
46f10: 070e slli a4,a4,0x3
46f12: 8fd9 or a5,a5,a4
46f14: 5712 lw a4,36(sp)
46f16: 0712 slli a4,a4,0x4
46f18: 8fd9 or a5,a5,a4
46f1a: 005d9f13 slli t5,s11,0x5
46f1e: 4772 lw a4,28(sp)
46f20: 071a slli a4,a4,0x6
46f22: 01e7e7b3 or a5,a5,t5
46f26: 8fd9 or a5,a5,a4
46f28: 0e1e slli t3,t3,0x7
46f2a: 01c7e7b3 or a5,a5,t3
46f2e: 0b2e slli s6,s6,0xb
46f30: 6705 lui a4,0x1
46f32: 8ff70713 addi a4,a4,-1793 # 8ff
<current+0x8b7>
46f36: 0167e7b3 or a5,a5,s6
46f3a: 8ff9 and a5,a5,a4
46f3c: 5736 lw a4,108(sp)
46f3e: 5e0030d7 vmv.v.i v1,0
46f42: 020700a7 vse8.v v1,(a4)
46f46: 5732 lw a4,44(sp)
46f48: cb4e sw s3,148(sp)
46f4a: cd66 sw s9,152(sp)
46f4c: cf6a sw s10,156(sp)
On 1/15/2024 4:00 AM, Juzhe-Zhong wrote:
> This patch fixes a 70% performance drop from GCC-13.2 to GCC-14 with -march=rv64gcv on real hardware.
>
> The root cause is an incorrect cost model that causes inefficient vectorization, which makes performance drop significantly.
>
> So this patch does:
>
> 1. Adjust the vector-to-scalar cost by introducing a vector-to-scalar register move cost.
> 2. Adjust the vec_construct cost, since we do spend NUNITS instructions to construct the vector.
>
> Tested on both RV32 and RV64 with no regressions. Rebased to trunk and committed, as it was approved by Robin.
>
> PR target/113247
>
> gcc/ChangeLog:
>
> * config/riscv/riscv-protos.h (struct regmove_vector_cost): Add vector to scalar regmove.
> * config/riscv/riscv-vector-costs.cc (adjust_stmt_cost): Ditto.
> * config/riscv/riscv.cc (riscv_builtin_vectorization_cost): Adjust vec_construct cost.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/riscv/rvv/autovec/vls/reduc-19.c: Adapt test.
> * gcc.target/riscv/rvv/autovec/vls/reduc-20.c: Ditto.
> * gcc.target/riscv/rvv/autovec/vls/reduc-21.c: Ditto.
> * gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c: New test.
> * gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c: New test.
> * gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c: New test.
> * gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c: New test.
>
> ---
> gcc/config/riscv/riscv-protos.h | 2 +
> gcc/config/riscv/riscv-vector-costs.cc | 3 +
> gcc/config/riscv/riscv.cc | 4 +-
> .../vect/costmodel/riscv/rvv/pr113247-1.c | 195 ++++++++++++++++++
> .../vect/costmodel/riscv/rvv/pr113247-2.c | 6 +
> .../vect/costmodel/riscv/rvv/pr113247-3.c | 6 +
> .../vect/costmodel/riscv/rvv/pr113247-4.c | 6 +
> .../riscv/rvv/autovec/vls/reduc-19.c | 2 +-
> .../riscv/rvv/autovec/vls/reduc-20.c | 2 +-
> .../riscv/rvv/autovec/vls/reduc-21.c | 2 +-
> 10 files changed, 224 insertions(+), 4 deletions(-)
> create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c
> create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c
> create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c
> create mode 100644 gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c
>
> diff --git a/gcc/config/riscv/riscv-protos.h b/gcc/config/riscv/riscv-protos.h
> index 4f3b677f4f9..21f6dadf113 100644
> --- a/gcc/config/riscv/riscv-protos.h
> +++ b/gcc/config/riscv/riscv-protos.h
> @@ -255,6 +255,8 @@ struct regmove_vector_cost
> {
> const int GR2VR;
> const int FR2VR;
> + const int VR2GR;
> + const int VR2FR;
> };
>
> /* Cost for vector insn classes. */
> diff --git a/gcc/config/riscv/riscv-vector-costs.cc b/gcc/config/riscv/riscv-vector-costs.cc
> index 90ab93b7506..7c9840df4e9 100644
> --- a/gcc/config/riscv/riscv-vector-costs.cc
> +++ b/gcc/config/riscv/riscv-vector-costs.cc
> @@ -1056,6 +1056,9 @@ adjust_stmt_cost (enum vect_cost_for_stmt kind, tree vectype, int stmt_cost)
> case scalar_to_vec:
> return stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->FR2VR
> : costs->regmove->GR2VR);
> + case vec_to_scalar:
> + return stmt_cost += (FLOAT_TYPE_P (vectype) ? costs->regmove->VR2FR
> + : costs->regmove->VR2GR);
> default:
> break;
> }
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index ee1a57b321d..568db90a27d 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -395,6 +395,8 @@ static const scalable_vector_cost rvv_vla_vector_cost = {
> static const regmove_vector_cost rvv_regmove_vector_cost = {
> 2, /* GR2VR */
> 2, /* FR2VR */
> + 2, /* VR2GR */
> + 2, /* VR2FR */
> };
>
> /* Generic costs for vector insn classes. It is supposed to be the vector cost
> @@ -10522,7 +10524,7 @@ riscv_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
> return fp ? common_costs->fp_stmt_cost : common_costs->int_stmt_cost;
>
> case vec_construct:
> - return estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype)) - 1;
> + return estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
>
> default:
> gcc_unreachable ();
> diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c
> new file mode 100644
> index 00000000000..0d09a624a00
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-1.c
> @@ -0,0 +1,195 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param=riscv-autovec-lmul=dynamic" } */
> +
> +#include <stdint-gcc.h>
> +
> +#define Ch(x,y,z) (z ^ (x & (y ^ z)))
> +#define Maj(x,y,z) ((x & y) | (z & (x | y)))
> +
> +#define SHR(x, n) (x >> n)
> +#define ROTR(x,n) (SHR(x,n) | (x << (32 - n)))
> +#define S1(x) (ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25))
> +#define S0(x) (ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22))
> +
> +#define s1(x) (ROTR(x,17) ^ ROTR(x,19) ^ SHR(x,10))
> +#define s0(x) (ROTR(x, 7) ^ ROTR(x,18) ^ SHR(x, 3))
> +
> +#define SHA256_STEP(a,b,c,d,e,f,g,h,x,K) \
> +{ \
> + tmp1 = h + S1(e) + Ch(e,f,g) + K + x; \
> + tmp2 = S0(a) + Maj(a,b,c); \
> + h = tmp1 + tmp2; \
> + d += tmp1; \
> +}
> +
> +#define BE_LOAD32(n,b,i) (n) = byteswap(*(uint32_t *)(b + i))
> +
> +static uint32_t byteswap(uint32_t x)
> +{
> + x = (x & 0x0000FFFF) << 16 | (x & 0xFFFF0000) >> 16;
> + x = (x & 0x00FF00FF) << 8 | (x & 0xFF00FF00) >> 8;
> +
> + return x;
> +}
> +
> +void sha256 (const uint8_t *in, uint32_t out[8])
> +{
> + uint32_t tmp1, tmp2, a, b, c, d, e, f, g, h;
> + uint32_t w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
> +
> + tmp1 = tmp2 = 0;
> + w0 = w1 = w2 = w3 = w4 = w5 = w6 = w7 = w8 = w9 = w10 = w11 = w12 = w13 = w14 = w15 = 0;
> +
> + BE_LOAD32 ( w0, in, 0 );
> + BE_LOAD32 ( w1, in, 4 );
> + BE_LOAD32 ( w2, in, 8 );
> + BE_LOAD32 ( w3, in, 12 );
> + BE_LOAD32 ( w4, in, 16 );
> + BE_LOAD32 ( w5, in, 20 );
> + BE_LOAD32 ( w6, in, 24 );
> + BE_LOAD32 ( w7, in, 28 );
> + BE_LOAD32 ( w8, in, 32 );
> + BE_LOAD32 ( w9, in, 36 );
> + BE_LOAD32 ( w10, in, 40 );
> + BE_LOAD32 ( w11, in, 44 );
> + BE_LOAD32 ( w12, in, 48 );
> + BE_LOAD32 ( w13, in, 52 );
> + BE_LOAD32 ( w14, in, 56 );
> + BE_LOAD32 ( w15, in, 60 );
> +
> + a = out[0];
> + b = out[1];
> + c = out[2];
> + d = out[3];
> + e = out[4];
> + f = out[5];
> + g = out[6];
> + h = out[7];
> +
> + SHA256_STEP(a, b, c, d, e, f, g, h, w0, 0x428a2f98);
> + SHA256_STEP(h, a, b, c, d, e, f, g, w1, 0x71374491);
> + SHA256_STEP(g, h, a, b, c, d, e, f, w2, 0xb5c0fbcf);
> + SHA256_STEP(f, g, h, a, b, c, d, e, w3, 0xe9b5dba5);
> + SHA256_STEP(e, f, g, h, a, b, c, d, w4, 0x3956c25b);
> + SHA256_STEP(d, e, f, g, h, a, b, c, w5, 0x59f111f1);
> + SHA256_STEP(c, d, e, f, g, h, a, b, w6, 0x923f82a4);
> + SHA256_STEP(b, c, d, e, f, g, h, a, w7, 0xab1c5ed5);
> + SHA256_STEP(a, b, c, d, e, f, g, h, w8, 0xd807aa98);
> + SHA256_STEP(h, a, b, c, d, e, f, g, w9, 0x12835b01);
> + SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0x243185be);
> + SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0x550c7dc3);
> + SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0x72be5d74);
> + SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0x80deb1fe);
> + SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0x9bdc06a7);
> + SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0xc19bf174);
> +
> + w0 = s1(w14) + w9 + s0(w1) + w0;
> + SHA256_STEP(a, b, c, d, e, f, g, h, w0, 0xe49b69c1);
> + w1 = s1(w15) + w10 + s0(w2) + w1;
> + SHA256_STEP(h, a, b, c, d, e, f, g, w1, 0xefbe4786);
> + w2 = s1(w0) + w11 + s0(w3) + w2;
> + SHA256_STEP(g, h, a, b, c, d, e, f, w2, 0x0fc19dc6);
> + w3 = s1(w1) + w12 + s0(w4) + w3;
> + SHA256_STEP(f, g, h, a, b, c, d, e, w3, 0x240ca1cc);
> + w4 = s1(w2) + w13 + s0(w5) + w4;
> + SHA256_STEP(e, f, g, h, a, b, c, d, w4, 0x2de92c6f);
> + w5 = s1(w3) + w14 + s0(w6) + w5;
> + SHA256_STEP(d, e, f, g, h, a, b, c, w5, 0x4a7484aa);
> + w6 = s1(w4) + w15 + s0(w7) + w6;
> + SHA256_STEP(c, d, e, f, g, h, a, b, w6, 0x5cb0a9dc);
> + w7 = s1(w5) + w0 + s0(w8) + w7;
> + SHA256_STEP(b, c, d, e, f, g, h, a, w7, 0x76f988da);
> + w8 = s1(w6) + w1 + s0(w9) + w8;
> + SHA256_STEP(a, b, c, d, e, f, g, h, w8, 0x983e5152);
> + w9 = s1(w7) + w2 + s0(w10) + w9;
> + SHA256_STEP(h, a, b, c, d, e, f, g, w9, 0xa831c66d);
> + w10 = s1(w8) + w3 + s0(w11) + w10;
> + SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0xb00327c8);
> + w11 = s1(w9) + w4 + s0(w12) + w11;
> + SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0xbf597fc7);
> + w12 = s1(w10) + w5 + s0(w13) + w12;
> + SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0xc6e00bf3);
> + w13 = s1(w11) + w6 + s0(w14) + w13;
> + SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0xd5a79147);
> + w14 = s1(w12) + w7 + s0(w15) + w14;
> + SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0x06ca6351);
> + w15 = s1(w13) + w8 + s0(w0) + w15;
> + SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0x14292967);
> +
> + w0 = s1(w14) + w9 + s0(w1) + w0;
> + SHA256_STEP(a, b, c, d, e, f, g, h, w0, 0x27b70a85);
> + w1 = s1(w15) + w10 + s0(w2) + w1;
> + SHA256_STEP(h, a, b, c, d, e, f, g, w1, 0x2e1b2138);
> + w2 = s1(w0) + w11 + s0(w3) + w2;
> + SHA256_STEP(g, h, a, b, c, d, e, f, w2, 0x4d2c6dfc);
> + w3 = s1(w1) + w12 + s0(w4) + w3;
> + SHA256_STEP(f, g, h, a, b, c, d, e, w3, 0x53380d13);
> + w4 = s1(w2) + w13 + s0(w5) + w4;
> + SHA256_STEP(e, f, g, h, a, b, c, d, w4, 0x650a7354);
> + w5 = s1(w3) + w14 + s0(w6) + w5;
> + SHA256_STEP(d, e, f, g, h, a, b, c, w5, 0x766a0abb);
> + w6 = s1(w4) + w15 + s0(w7) + w6;
> + SHA256_STEP(c, d, e, f, g, h, a, b, w6, 0x81c2c92e);
> + w7 = s1(w5) + w0 + s0(w8) + w7;
> + SHA256_STEP(b, c, d, e, f, g, h, a, w7, 0x92722c85);
> + w8 = s1(w6) + w1 + s0(w9) + w8;
> + SHA256_STEP(a, b, c, d, e, f, g, h, w8, 0xa2bfe8a1);
> + w9 = s1(w7) + w2 + s0(w10) + w9;
> + SHA256_STEP(h, a, b, c, d, e, f, g, w9, 0xa81a664b);
> + w10 = s1(w8) + w3 + s0(w11) + w10;
> + SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0xc24b8b70);
> + w11 = s1(w9) + w4 + s0(w12) + w11;
> + SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0xc76c51a3);
> + w12 = s1(w10) + w5 + s0(w13) + w12;
> + SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0xd192e819);
> + w13 = s1(w11) + w6 + s0(w14) + w13;
> + SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0xd6990624);
> + w14 = s1(w12) + w7 + s0(w15) + w14;
> + SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0xf40e3585);
> + w15 = s1(w13) + w8 + s0(w0) + w15;
> + SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0x106aa070);
> +
> + w0 = s1(w14) + w9 + s0(w1) + w0;
> + SHA256_STEP(a, b, c, d, e, f, g, h, w0, 0x19a4c116);
> + w1 = s1(w15) + w10 + s0(w2) + w1;
> + SHA256_STEP(h, a, b, c, d, e, f, g, w1, 0x1e376c08);
> + w2 = s1(w0) + w11 + s0(w3) + w2;
> + SHA256_STEP(g, h, a, b, c, d, e, f, w2, 0x2748774c);
> + w3 = s1(w1) + w12 + s0(w4) + w3;
> + SHA256_STEP(f, g, h, a, b, c, d, e, w3, 0x34b0bcb5);
> + w4 = s1(w2) + w13 + s0(w5) + w4;
> + SHA256_STEP(e, f, g, h, a, b, c, d, w4, 0x391c0cb3);
> + w5 = s1(w3) + w14 + s0(w6) + w5;
> + SHA256_STEP(d, e, f, g, h, a, b, c, w5, 0x4ed8aa4a);
> + w6 = s1(w4) + w15 + s0(w7) + w6;
> + SHA256_STEP(c, d, e, f, g, h, a, b, w6, 0x5b9cca4f);
> + w7 = s1(w5) + w0 + s0(w8) + w7;
> + SHA256_STEP(b, c, d, e, f, g, h, a, w7, 0x682e6ff3);
> + w8 = s1(w6) + w1 + s0(w9) + w8;
> + SHA256_STEP(a, b, c, d, e, f, g, h, w8, 0x748f82ee);
> + w9 = s1(w7) + w2 + s0(w10) + w9;
> + SHA256_STEP(h, a, b, c, d, e, f, g, w9, 0x78a5636f);
> + w10 = s1(w8) + w3 + s0(w11) + w10;
> + SHA256_STEP(g, h, a, b, c, d, e, f, w10, 0x84c87814);
> + w11 = s1(w9) + w4 + s0(w12) + w11;
> + SHA256_STEP(f, g, h, a, b, c, d, e, w11, 0x8cc70208);
> + w12 = s1(w10) + w5 + s0(w13) + w12;
> + SHA256_STEP(e, f, g, h, a, b, c, d, w12, 0x90befffa);
> + w13 = s1(w11) + w6 + s0(w14) + w13;
> + SHA256_STEP(d, e, f, g, h, a, b, c, w13, 0xa4506ceb);
> + w14 = s1(w12) + w7 + s0(w15) + w14;
> + SHA256_STEP(c, d, e, f, g, h, a, b, w14, 0xbef9a3f7);
> + w15 = s1(w13) + w8 + s0(w0) + w15;
> + SHA256_STEP(b, c, d, e, f, g, h, a, w15, 0xc67178f2);
> +
> + out[0] += a;
> + out[1] += b;
> + out[2] += c;
> + out[3] += d;
> + out[4] += e;
> + out[5] += f;
> + out[6] += g;
> + out[7] += h;
> +}
> +
> +/* { dg-final { scan-assembler-not {vset} } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c
> new file mode 100644
> index 00000000000..64a53cfca88
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-2.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param=riscv-autovec-lmul=dynamic --param=riscv-autovec-preference=fixed-vlmax" } */
> +
> +#include "pr113247-1.c"
> +
> +/* { dg-final { scan-assembler-not {vset} } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c
> new file mode 100644
> index 00000000000..423c90e4154
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-3.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize" } */
> +
> +#include "pr113247-1.c"
> +
> +/* { dg-final { scan-assembler-not {vset} } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c
> new file mode 100644
> index 00000000000..c2a46d848e5
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/costmodel/riscv/rvv/pr113247-4.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64gcv -mabi=lp64d -O3 -ftree-vectorize --param=riscv-autovec-preference=fixed-vlmax" } */
> +
> +#include "pr113247-1.c"
> +
> +/* { dg-final { scan-assembler-not {vset} } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
> index 2048b636910..5130fe5f2e3 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-19.c
> @@ -14,7 +14,7 @@ DEF_REDUC_PLUS (_Float16, 512)
> DEF_REDUC_PLUS (_Float16, 1024)
> DEF_REDUC_PLUS (_Float16, 2048)
>
> -/* { dg-final { scan-assembler-times {vfredosum\.vs} 10 } } */
> +/* { dg-final { scan-assembler-times {vfredosum\.vs} 9 } } */
> /* { dg-final { scan-assembler-not {csrr} } } */
> /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
> /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c
> index bfc328da568..819104a8cdf 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-20.c
> @@ -13,7 +13,7 @@ DEF_REDUC_PLUS (float, 256)
> DEF_REDUC_PLUS (float, 512)
> DEF_REDUC_PLUS (float, 1024)
>
> -/* { dg-final { scan-assembler-times {vfredosum\.vs} 9 } } */
> +/* { dg-final { scan-assembler-times {vfredosum\.vs} 8 } } */
> /* { dg-final { scan-assembler-not {csrr} } } */
> /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
> /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
> diff --git a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c
> index 8228590fa3b..2b61e0ac71a 100644
> --- a/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c
> +++ b/gcc/testsuite/gcc.target/riscv/rvv/autovec/vls/reduc-21.c
> @@ -12,7 +12,7 @@ DEF_REDUC_PLUS (float, 128)
> DEF_REDUC_PLUS (float, 256)
> DEF_REDUC_PLUS (float, 512)
>
> -/* { dg-final { scan-assembler-times {vfredosum\.vs} 8 } } */
> +/* { dg-final { scan-assembler-times {vfredosum\.vs} 7 } } */
> /* { dg-final { scan-assembler-not {csrr} } } */
> /* { dg-final { scan-tree-dump-not "1,1" "optimized" } } */
> /* { dg-final { scan-tree-dump-not "2,2" "optimized" } } */
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [Committed V2] RISC-V: Fix regression (GCC-14 compare with GCC-13.2) of SHA256 from coremark-pro
2024-01-17 1:41 ` juzhe.zhong
@ 2024-01-17 1:45 ` Edwin Lu
2024-01-17 1:45 ` Edwin Lu
` (2 more replies)
0 siblings, 3 replies; 8+ messages in thread
From: Edwin Lu @ 2024-01-17 1:45 UTC (permalink / raw)
To: juzhe.zhong, gcc-patches; +Cc: Patrick O'Neill
On 1/16/2024 5:41 PM, juzhe.zhong@rivai.ai wrote:
> Are you saying you are using glibc? I did the testing with newlib and didn't
> see anything wrong.
>
Yes, I'm seeing the problem using glibc. Looking at our postcommit ci
reports, it appears to only affect linux rv32gcv.
> It seems that this patch triggers latent bug of VSETVL PASS (Even though
> this patch doesn't change anything related to VSETVL PASS).
>
> I will investigate it.
>
> Thanks.
>
Thanks!
Edwin
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [Committed V2] RISC-V: Fix regression (GCC-14 compare with GCC-13.2) of SHA256 from coremark-pro
2024-01-17 1:45 ` Edwin Lu
@ 2024-01-17 1:45 ` Edwin Lu
2024-01-17 1:50 ` Li, Pan2
2024-01-26 7:53 ` juzhe.zhong
2 siblings, 0 replies; 8+ messages in thread
From: Edwin Lu @ 2024-01-17 1:45 UTC (permalink / raw)
To: gcc-patches; +Cc: Patrick O'Neill
On 1/16/2024 5:41 PM, juzhe.zhong@rivai.ai wrote:
> Are you saying you are using glibc? I did the testing with newlib and didn't
> see anything wrong.
>
Yes, I'm seeing the problem using glibc. Looking at our postcommit ci
reports, it appears to only affect linux rv32gcv.
> It seems that this patch triggers latent bug of VSETVL PASS (Even though
> this patch doesn't change anything related to VSETVL PASS).
>
> I will investigate it.
>
> Thanks.
>
Thanks!
Edwin
^ permalink raw reply [flat|nested] 8+ messages in thread
* RE: [Committed V2] RISC-V: Fix regression (GCC-14 compare with GCC-13.2) of SHA256 from coremark-pro
2024-01-17 1:45 ` Edwin Lu
2024-01-17 1:45 ` Edwin Lu
@ 2024-01-17 1:50 ` Li, Pan2
2024-01-26 7:53 ` juzhe.zhong
2 siblings, 0 replies; 8+ messages in thread
From: Li, Pan2 @ 2024-01-17 1:50 UTC (permalink / raw)
To: Edwin Lu, juzhe.zhong, gcc-patches; +Cc: Patrick O'Neill
> Yes, I'm seeing the problem using glibc. Looking at our postcommit ci
> reports, it appears to only affect linux rv32gcv.
Just FYI. Double confirmed rv64gcv with glibc works well with this patch.
Pan
-----Original Message-----
From: Edwin Lu <ewlu@rivosinc.com>
Sent: Wednesday, January 17, 2024 9:45 AM
To: juzhe.zhong@rivai.ai; gcc-patches <gcc-patches@gcc.gnu.org>
Cc: Patrick O'Neill <patrick@rivosinc.com>
Subject: Re: [Committed V2] RISC-V: Fix regression (GCC-14 compare with GCC-13.2) of SHA256 from coremark-pro
On 1/16/2024 5:41 PM, juzhe.zhong@rivai.ai wrote:
> Are you saying you are using glibc? I did the testing with newlib and didn't
> see anything wrong.
>
Yes, I'm seeing the problem using glibc. Looking at our postcommit ci
reports, it appears to only affect linux rv32gcv.
> It seems that this patch triggers latent bug of VSETVL PASS (Even though
> this patch doesn't change anything related to VSETVL PASS).
>
> I will investigate it.
>
> Thanks.
>
Thanks!
Edwin
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: Re: [Committed V2] RISC-V: Fix regression (GCC-14 compare with GCC-13.2) of SHA256 from coremark-pro
2024-01-17 1:45 ` Edwin Lu
2024-01-17 1:45 ` Edwin Lu
2024-01-17 1:50 ` Li, Pan2
@ 2024-01-26 7:53 ` juzhe.zhong
2 siblings, 0 replies; 8+ messages in thread
From: juzhe.zhong @ 2024-01-26 7:53 UTC (permalink / raw)
To: Edwin Lu, gcc-patches; +Cc: Patrick O'Neill
[-- Attachment #1: Type: text/plain, Size: 843 bytes --]
It's fixed by this commit: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=d40b3c1e439db05c835b6bd4fd5bba58fda71dd6
juzhe.zhong@rivai.ai
From: Edwin Lu
Date: 2024-01-17 09:45
To: juzhe.zhong@rivai.ai; gcc-patches
CC: Patrick O'Neill
Subject: Re: [Committed V2] RISC-V: Fix regression (GCC-14 compare with GCC-13.2) of SHA256 from coremark-pro
On 1/16/2024 5:41 PM, juzhe.zhong@rivai.ai wrote:
> Are you saying you are using glibc? I did the testing with newlib and didn't
> see anything wrong.
>
Yes, I'm seeing the problem using glibc. Looking at our postcommit ci
reports, it appears to only affect linux rv32gcv.
> It seems that this patch triggers latent bug of VSETVL PASS (Even though
> this patch doesn't change anything related to VSETVL PASS).
>
> I will investigate it.
>
> Thanks.
>
Thanks!
Edwin
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2024-01-26 7:54 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-01-15 12:00 [Committed V2] RISC-V: Fix regression (GCC-14 compare with GCC-13.2) of SHA256 from coremark-pro Juzhe-Zhong
2024-01-17 1:37 ` Edwin Lu
2024-01-17 1:37 ` Edwin Lu
2024-01-17 1:41 ` juzhe.zhong
2024-01-17 1:45 ` Edwin Lu
2024-01-17 1:45 ` Edwin Lu
2024-01-17 1:50 ` Li, Pan2
2024-01-26 7:53 ` juzhe.zhong
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).