* [PATCH 1/3]rs6000: update num_insns_constant for 2 insns @ 2023-10-25 2:00 Jiufu Guo 2023-10-25 2:00 ` [PATCH 2/3]rs6000: using 'pli' to load 34bit-constant Jiufu Guo ` (2 more replies) 0 siblings, 3 replies; 11+ messages in thread From: Jiufu Guo @ 2023-10-25 2:00 UTC (permalink / raw) To: gcc-patches; +Cc: segher, dje.gcc, linkw, bergner, guojiufu Hi, Trunk gcc supports more constants to be built via two instructions: e.g. "li/lis; xori/xoris/rldicl/rldicr/rldic". And then num_insns_constant should also be updated. Bootstrap & regtest pass ppc64{,le}. Is this ok for trunk? BR, Jeff (Jiufu Guo) gcc/ChangeLog: * config/rs6000/rs6000.cc (can_be_built_by_lilis_and_rldicX): New function. (num_insns_constant_gpr): Update to return 2 for more cases. (rs6000_emit_set_long_const): Update to use can_be_built_by_lilis_and_rldicX. --- gcc/config/rs6000/rs6000.cc | 64 ++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index cc24dd5301e..b23ff3d7917 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -6032,6 +6032,9 @@ direct_return (void) return 0; } +static bool +can_be_built_by_lilis_and_rldicX (HOST_WIDE_INT, int *, HOST_WIDE_INT *); + /* Helper for num_insns_constant. Calculate number of instructions to load VALUE to a single gpr using combinations of addi, addis, ori, oris, sldi and rldimi instructions. */ @@ -6044,35 +6047,41 @@ num_insns_constant_gpr (HOST_WIDE_INT value) return 1; /* constant loadable with addis */ - else if ((value & 0xffff) == 0 - && (value >> 31 == -1 || value >> 31 == 0)) + if ((value & 0xffff) == 0 && (value >> 31 == -1 || value >> 31 == 0)) return 1; /* PADDI can support up to 34 bit signed integers. */ - else if (TARGET_PREFIXED && SIGNED_INTEGER_34BIT_P (value)) + if (TARGET_PREFIXED && SIGNED_INTEGER_34BIT_P (value)) return 1; - else if (TARGET_POWERPC64) - { - HOST_WIDE_INT low = sext_hwi (value, 32); - HOST_WIDE_INT high = value >> 31; + if (!TARGET_POWERPC64) + return 2; - if (high == 0 || high == -1) - return 2; + HOST_WIDE_INT low = sext_hwi (value, 32); + HOST_WIDE_INT high = value >> 31; - high >>= 1; + if (high == 0 || high == -1) + return 2; - if (low == 0 || low == high) - return num_insns_constant_gpr (high) + 1; - else if (high == 0) - return num_insns_constant_gpr (low) + 1; - else - return (num_insns_constant_gpr (high) - + num_insns_constant_gpr (low) + 1); - } + high >>= 1; - else + HOST_WIDE_INT ud2 = (low >> 16) & 0xffff; + HOST_WIDE_INT ud1 = low & 0xffff; + if (high == -1 && ((!(ud2 & 0x8000) && ud1 == 0) || (ud1 & 0x8000))) + return 2; + if (high == 0 && (ud1 == 0 || (!(ud1 & 0x8000)))) return 2; + + int shift; + HOST_WIDE_INT mask; + if (can_be_built_by_lilis_and_rldicX (value, &shift, &mask)) + return 2; + + if (low == 0 || low == high) + return num_insns_constant_gpr (high) + 1; + if (high == 0) + return num_insns_constant_gpr (low) + 1; + return (num_insns_constant_gpr (high) + num_insns_constant_gpr (low) + 1); } /* Helper for num_insns_constant. Allow constants formed by the @@ -10492,6 +10501,18 @@ can_be_built_by_li_and_rldic (HOST_WIDE_INT c, int *shift, HOST_WIDE_INT *mask) return false; } +/* Combine the above checking functions for li/lis;rldicX. */ + +static bool +can_be_built_by_lilis_and_rldicX (HOST_WIDE_INT c, int *shift, + HOST_WIDE_INT *mask) +{ + return (can_be_built_by_li_lis_and_rotldi (c, shift, mask) + || can_be_built_by_li_lis_and_rldicl (c, shift, mask) + || can_be_built_by_li_lis_and_rldicr (c, shift, mask) + || can_be_built_by_li_and_rldic (c, shift, mask)); +} + /* Subroutine of rs6000_emit_set_const, handling PowerPC64 DImode. Output insns to set DEST equal to the constant C as a series of lis, ori and shl instructions. */ @@ -10538,10 +10559,7 @@ rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c) emit_move_insn (dest, gen_rtx_XOR (DImode, temp, GEN_INT ((ud2 ^ 0xffff) << 16))); } - else if (can_be_built_by_li_lis_and_rotldi (c, &shift, &mask) - || can_be_built_by_li_lis_and_rldicl (c, &shift, &mask) - || can_be_built_by_li_lis_and_rldicr (c, &shift, &mask) - || can_be_built_by_li_and_rldic (c, &shift, &mask)) + else if (can_be_built_by_lilis_and_rldicX (c, &shift, &mask)) { temp = !can_create_pseudo_p () ? dest : gen_reg_rtx (DImode); unsigned HOST_WIDE_INT imm = (c | ~mask); -- 2.25.1 ^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 2/3]rs6000: using 'pli' to load 34bit-constant 2023-10-25 2:00 [PATCH 1/3]rs6000: update num_insns_constant for 2 insns Jiufu Guo @ 2023-10-25 2:00 ` Jiufu Guo 2023-10-25 2:33 ` Kewen.Lin 2023-10-25 2:00 ` [PATCH 3/3]rs6000: split complicate constant to constant pool Jiufu Guo 2023-10-25 2:27 ` [PATCH 1/3]rs6000: update num_insns_constant for 2 insns Kewen.Lin 2 siblings, 1 reply; 11+ messages in thread From: Jiufu Guo @ 2023-10-25 2:00 UTC (permalink / raw) To: gcc-patches; +Cc: segher, dje.gcc, linkw, bergner, guojiufu Hi, For constants with 16bit values, 'li or lis' can be used to generate the value. For 34bit constant, 'pli' is ok to generate the value. Bootstrap®test pass on ppc64{,le}. Is this ok for trunk? BR, Jeff (Jiufu Guo) gcc/ChangeLog: * config/rs6000/rs6000.cc (rs6000_emit_set_long_const): Add code to use pli for 34bit constant. --- gcc/config/rs6000/rs6000.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index b23ff3d7917..4690384cdbe 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -10530,7 +10530,11 @@ rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c) ud3 = (c >> 32) & 0xffff; ud4 = (c >> 48) & 0xffff; - if ((ud4 == 0xffff && ud3 == 0xffff && ud2 == 0xffff && (ud1 & 0x8000)) + if (TARGET_PREFIXED && SIGNED_INTEGER_34BIT_P (c)) + { + emit_move_insn (dest, GEN_INT (c)); + } + else if ((ud4 == 0xffff && ud3 == 0xffff && ud2 == 0xffff && (ud1 & 0x8000)) || (ud4 == 0 && ud3 == 0 && ud2 == 0 && ! (ud1 & 0x8000))) emit_move_insn (dest, GEN_INT (sext_hwi (ud1, 16))); -- 2.25.1 ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 2/3]rs6000: using 'pli' to load 34bit-constant 2023-10-25 2:00 ` [PATCH 2/3]rs6000: using 'pli' to load 34bit-constant Jiufu Guo @ 2023-10-25 2:33 ` Kewen.Lin 2023-10-25 7:37 ` Jiufu Guo 0 siblings, 1 reply; 11+ messages in thread From: Kewen.Lin @ 2023-10-25 2:33 UTC (permalink / raw) To: Jiufu Guo; +Cc: segher, dje.gcc, linkw, bergner, gcc-patches on 2023/10/25 10:00, Jiufu Guo wrote: > Hi, > > For constants with 16bit values, 'li or lis' can be used to generate > the value. For 34bit constant, 'pli' is ok to generate the value. > > Bootstrap®test pass on ppc64{,le}. > Is this ok for trunk? > > BR, > Jeff (Jiufu Guo) > > gcc/ChangeLog: > > * config/rs6000/rs6000.cc (rs6000_emit_set_long_const): Add code to use > pli for 34bit constant. > > --- > gcc/config/rs6000/rs6000.cc | 6 +++++- > 1 file changed, 5 insertions(+), 1 deletion(-) > > diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc > index b23ff3d7917..4690384cdbe 100644 > --- a/gcc/config/rs6000/rs6000.cc > +++ b/gcc/config/rs6000/rs6000.cc > @@ -10530,7 +10530,11 @@ rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c) > ud3 = (c >> 32) & 0xffff; > ud4 = (c >> 48) & 0xffff; > > - if ((ud4 == 0xffff && ud3 == 0xffff && ud2 == 0xffff && (ud1 & 0x8000)) > + if (TARGET_PREFIXED && SIGNED_INTEGER_34BIT_P (c)) > + { > + emit_move_insn (dest, GEN_INT (c)); > + } Nit: unexpected formatting, no {} needed. Is there any test case justifying this change? I think only one "li" or "lis" beats "pli" since the latter is a prefixed insn, it puts more burdens on insn decoding. BR, Kewen > + else if ((ud4 == 0xffff && ud3 == 0xffff && ud2 == 0xffff && (ud1 & 0x8000)) > || (ud4 == 0 && ud3 == 0 && ud2 == 0 && ! (ud1 & 0x8000))) > emit_move_insn (dest, GEN_INT (sext_hwi (ud1, 16))); > ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 2/3]rs6000: using 'pli' to load 34bit-constant 2023-10-25 2:33 ` Kewen.Lin @ 2023-10-25 7:37 ` Jiufu Guo 0 siblings, 0 replies; 11+ messages in thread From: Jiufu Guo @ 2023-10-25 7:37 UTC (permalink / raw) To: Kewen.Lin; +Cc: segher, dje.gcc, linkw, bergner, gcc-patches Hi, "Kewen.Lin" <linkw@linux.ibm.com> writes: > on 2023/10/25 10:00, Jiufu Guo wrote: >> Hi, >> >> For constants with 16bit values, 'li or lis' can be used to generate >> the value. For 34bit constant, 'pli' is ok to generate the value. >> >> Bootstrap®test pass on ppc64{,le}. >> Is this ok for trunk? >> >> BR, >> Jeff (Jiufu Guo) >> >> gcc/ChangeLog: >> >> * config/rs6000/rs6000.cc (rs6000_emit_set_long_const): Add code to use >> pli for 34bit constant. >> >> --- >> gcc/config/rs6000/rs6000.cc | 6 +++++- >> 1 file changed, 5 insertions(+), 1 deletion(-) >> >> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc >> index b23ff3d7917..4690384cdbe 100644 >> --- a/gcc/config/rs6000/rs6000.cc >> +++ b/gcc/config/rs6000/rs6000.cc >> @@ -10530,7 +10530,11 @@ rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c) >> ud3 = (c >> 32) & 0xffff; >> ud4 = (c >> 48) & 0xffff; >> >> - if ((ud4 == 0xffff && ud3 == 0xffff && ud2 == 0xffff && (ud1 & 0x8000)) >> + if (TARGET_PREFIXED && SIGNED_INTEGER_34BIT_P (c)) >> + { >> + emit_move_insn (dest, GEN_INT (c)); >> + } > > Nit: unexpected formatting, no {} needed. > > Is there any test case justifying this change? Great catch! pr93012.c could cover this implicitly, but it only be changed in the [PATCH 3/3]. I would add a new case for this in this patch. > I think only one "li" or "lis" > beats "pli" since the latter is a prefixed insn, it puts more burdens on insn > decoding. Yes! Good news is "emit_move_insn (dest, GEN_INT (c));" is able to support "li, lis and pli". The "mov" would match/generate the best one. Thanks for your quick review and very helpful comments! BR, Jeff (Jiufu Guo) > > BR, > Kewen > >> + else if ((ud4 == 0xffff && ud3 == 0xffff && ud2 == 0xffff && (ud1 & 0x8000)) >> || (ud4 == 0 && ud3 == 0 && ud2 == 0 && ! (ud1 & 0x8000))) >> emit_move_insn (dest, GEN_INT (sext_hwi (ud1, 16))); >> ^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 3/3]rs6000: split complicate constant to constant pool 2023-10-25 2:00 [PATCH 1/3]rs6000: update num_insns_constant for 2 insns Jiufu Guo 2023-10-25 2:00 ` [PATCH 2/3]rs6000: using 'pli' to load 34bit-constant Jiufu Guo @ 2023-10-25 2:00 ` Jiufu Guo 2023-10-25 2:43 ` Kewen.Lin 2023-10-25 2:27 ` [PATCH 1/3]rs6000: update num_insns_constant for 2 insns Kewen.Lin 2 siblings, 1 reply; 11+ messages in thread From: Jiufu Guo @ 2023-10-25 2:00 UTC (permalink / raw) To: gcc-patches; +Cc: segher, dje.gcc, linkw, bergner, guojiufu Hi, Sometimes, a complicated constant is built via 3(or more) instructions to build. Generally speaking, it would not be as faster as loading it from the constant pool (as a few discussions in PR63281). For the concern that I raised in: https://gcc.gnu.org/pipermail/gcc-patches/2022-August/599676.html The micro-cases would not be the major concern. Because as Segher explained in: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63281#c18 It would just be about the benchmark method. As tested on spec2017, for visible performance changes, we can find the runtime improvement on 500.perlbench_r about ~1.8% (-O2) when support loading complicates constant from constant pool. And no visible performance recession on other benchmarks. Boostrap & regtest pass on ppc64{,le}. Is this ok for trunk? BR, Jeff (Jiufu Guo) PR target/63281 gcc/ChangeLog: * config/rs6000/rs6000.cc (rs6000_emit_set_const): Update to split complicate constant to memory. gcc/testsuite/ChangeLog: * gcc.target/powerpc/const_anchors.c: Update to test final-rtl. * gcc.target/powerpc/parall_5insn_const.c: Update to keep original test point. * gcc.target/powerpc/pr106550.c: Likewise.. * gcc.target/powerpc/pr106550_1.c: Likewise. * gcc.target/powerpc/pr87870.c: Update according to latest behavior. * gcc.target/powerpc/pr93012.c: Likewise. --- gcc/config/rs6000/rs6000.cc | 16 ++++++++++++++++ .../gcc.target/powerpc/const_anchors.c | 5 ++--- .../gcc.target/powerpc/parall_5insn_const.c | 14 ++++++++++++-- gcc/testsuite/gcc.target/powerpc/pr106550.c | 17 +++++++++++++++-- gcc/testsuite/gcc.target/powerpc/pr106550_1.c | 15 +++++++++++++-- gcc/testsuite/gcc.target/powerpc/pr87870.c | 5 ++++- gcc/testsuite/gcc.target/powerpc/pr93012.c | 4 +++- 7 files changed, 65 insertions(+), 11 deletions(-) diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc index 4690384cdbe..b9562f1ea0f 100644 --- a/gcc/config/rs6000/rs6000.cc +++ b/gcc/config/rs6000/rs6000.cc @@ -10292,6 +10292,22 @@ rs6000_emit_set_const (rtx dest, rtx source) c = sext_hwi (c, 32); emit_move_insn (lo, GEN_INT (c)); } + + /* If it can be stored to the constant pool and profitable. */ + else if (base_reg_operand (dest, mode) + && num_insns_constant (source, mode) > 2) + { + rtx sym = force_const_mem (mode, source); + if (TARGET_TOC && SYMBOL_REF_P (XEXP (sym, 0)) + && use_toc_relative_ref (XEXP (sym, 0), mode)) + { + rtx toc = create_TOC_reference (XEXP (sym, 0), copy_rtx (dest)); + sym = gen_const_mem (mode, toc); + set_mem_alias_set (sym, get_TOC_alias_set ()); + } + + emit_insn (gen_rtx_SET (dest, sym)); + } else rs6000_emit_set_long_const (dest, c); break; diff --git a/gcc/testsuite/gcc.target/powerpc/const_anchors.c b/gcc/testsuite/gcc.target/powerpc/const_anchors.c index 542e2674b12..188744165f2 100644 --- a/gcc/testsuite/gcc.target/powerpc/const_anchors.c +++ b/gcc/testsuite/gcc.target/powerpc/const_anchors.c @@ -1,5 +1,5 @@ /* { dg-do compile { target has_arch_ppc64 } } */ -/* { dg-options "-O2" } */ +/* { dg-options "-O2 -fdump-rtl-final" } */ #define C1 0x2351847027482577ULL #define C2 0x2351847027482578ULL @@ -16,5 +16,4 @@ void __attribute__ ((noinline)) foo1 (long long *a, long long b) if (b) *a++ = C2; } - -/* { dg-final { scan-assembler-times {\maddi\M} 2 } } */ +/* { dg-final { scan-rtl-dump-times {\madddi3\M} 2 "final" } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c index e3a9a7264cf..df0690b90be 100644 --- a/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c +++ b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c @@ -9,8 +9,18 @@ void __attribute__ ((noinline)) foo (unsigned long long *a) { /* 2 lis + 2 ori + 1 rldimi for each constant. */ - *a++ = 0x800aabcdc167fa16ULL; - *a++ = 0x7543a876867f616ULL; + { + register long long d asm("r0") = 0x800aabcdc167fa16ULL; + long long n; + asm("mr %0, %1" : "=r"(n) : "r"(d)); + *a++ = n; + } + { + register long long d asm("r0") = 0x7543a876867f616ULL; + long long n; + asm("mr %0, %1" : "=r"(n) : "r"(d)); + *a++ = n; + } } long long A[] = {0x800aabcdc167fa16ULL, 0x7543a876867f616ULL}; diff --git a/gcc/testsuite/gcc.target/powerpc/pr106550.c b/gcc/testsuite/gcc.target/powerpc/pr106550.c index 74e395331ab..5eca2b2f701 100644 --- a/gcc/testsuite/gcc.target/powerpc/pr106550.c +++ b/gcc/testsuite/gcc.target/powerpc/pr106550.c @@ -1,12 +1,25 @@ /* PR target/106550 */ /* { dg-options "-O2 -mdejagnu-cpu=power10" } */ /* { dg-require-effective-target power10_ok } */ +/* { dg-require-effective-target has_arch_ppc64 } */ void foo (unsigned long long *a) { - *a++ = 0x020805006106003; /* pli+pli+rldimi */ - *a++ = 0x2351847027482577;/* pli+pli+rldimi */ + { + /* pli+pli+rldimi */ + register long long d asm("r0") = 0x020805006106003ULL; + long long n; + asm("mr %0, %1" : "=r"(n) : "r"(d)); + *a++ = n; + } + { + /* pli+pli+rldimi */ + register long long d asm("r0") = 0x2351847027482577ULL; + long long n; + asm("mr %0, %1" : "=r"(n) : "r"(d)); + *a++ = n; + } } /* { dg-final { scan-assembler-times {\mpli\M} 4 } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/pr106550_1.c b/gcc/testsuite/gcc.target/powerpc/pr106550_1.c index 7e709fcf9d8..11878d893a4 100644 --- a/gcc/testsuite/gcc.target/powerpc/pr106550_1.c +++ b/gcc/testsuite/gcc.target/powerpc/pr106550_1.c @@ -12,8 +12,19 @@ foo (unsigned long long *a) asm("cntlzd %0, %1" : "=r"(n) : "r"(d)); *a++ = n; - *a++ = 0x235a8470a7480000ULL; /* pli+sldi+oris */ - *a++ = 0x23a184700000b677ULL; /* pli+sldi+ori */ + { + register long long d asm("r0") = 0x235a8470a7480000ULL; /* pli+sldi+oris */ + long long n; + asm("cntlzd %0, %1" : "=r"(n) : "r"(d)); + *a++ = n; + } + + { + register long long d asm("r0") = 0x23a184700000b677ULL; /* pli+sldi+ori */ + long long n; + asm("cntlzd %0, %1" : "=r"(n) : "r"(d)); + *a++ = n; + } } /* { dg-final { scan-assembler-times {\mpli\M} 3 } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/pr87870.c b/gcc/testsuite/gcc.target/powerpc/pr87870.c index d2108ac3386..5fee06744ae 100644 --- a/gcc/testsuite/gcc.target/powerpc/pr87870.c +++ b/gcc/testsuite/gcc.target/powerpc/pr87870.c @@ -25,4 +25,7 @@ test3 (void) return ((__int128)0xdeadbeefcafebabe << 64) | 0xfacefeedbaaaaaad; } -/* { dg-final { scan-assembler-not {\mld\M} } } */ +/* test3 using "ld" to load the value for r3 and r4. + test0, test1 and test2 are using "li". */ +/* { dg-final { scan-assembler-times {\mp?ld\M} 2 } } */ +/* { dg-final { scan-assembler-times {\mli\M} 6 } } */ diff --git a/gcc/testsuite/gcc.target/powerpc/pr93012.c b/gcc/testsuite/gcc.target/powerpc/pr93012.c index 4f764d0576f..87ce0b49b23 100644 --- a/gcc/testsuite/gcc.target/powerpc/pr93012.c +++ b/gcc/testsuite/gcc.target/powerpc/pr93012.c @@ -10,4 +10,6 @@ unsigned long long mskh1() { return 0xffff9234ffff9234ULL; } unsigned long long mskl1() { return 0x2bcdffff2bcdffffULL; } unsigned long long mskse() { return 0xffff1234ffff1234ULL; } -/* { dg-final { scan-assembler-times {\mrldimi\M} 7 } } */ +/* { dg-final { scan-assembler-times {\mrldimi\M} 7 { target { has_arch_pwr10 } } } } */ +/* { dg-final { scan-assembler-times {\mrldimi\M} 3 { target { ! has_arch_pwr10 } } } } */ +/* { dg-final { scan-assembler-times {\mp?ld\M} 4 { target { ! has_arch_pwr10 } } } } */ -- 2.25.1 ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 3/3]rs6000: split complicate constant to constant pool 2023-10-25 2:00 ` [PATCH 3/3]rs6000: split complicate constant to constant pool Jiufu Guo @ 2023-10-25 2:43 ` Kewen.Lin 2023-10-25 8:14 ` Jiufu Guo 0 siblings, 1 reply; 11+ messages in thread From: Kewen.Lin @ 2023-10-25 2:43 UTC (permalink / raw) To: Jiufu Guo; +Cc: segher, dje.gcc, linkw, bergner, gcc-patches Hi, on 2023/10/25 10:00, Jiufu Guo wrote: > Hi, > > Sometimes, a complicated constant is built via 3(or more) > instructions to build. Generally speaking, it would not be > as faster as loading it from the constant pool (as a few > discussions in PR63281). I may miss some previous discussions, but I'm curious why we chose ">=3" here, as https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63281#c9 which indicates that more than 3 (>3) should be considered with this change. > > For the concern that I raised in: > https://gcc.gnu.org/pipermail/gcc-patches/2022-August/599676.html > The micro-cases would not be the major concern. Because as > Segher explained in: > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63281#c18 > It would just be about the benchmark method. > > As tested on spec2017, for visible performance changes, we > can find the runtime improvement on 500.perlbench_r about > ~1.8% (-O2) when support loading complicates constant from > constant pool. And no visible performance recession on > other benchmarks. The improvement on 500.perlbench_r looks to match what PR63281 mentioned, nice! I'm curious that which options and which kinds of CPUs have you tested with? Since this is a general change, I'd expect we can test with P8/P9/P10 at O2/O3 (or Ofast) at least. BR, Kewen > > Boostrap & regtest pass on ppc64{,le}. > Is this ok for trunk? > > BR, > Jeff (Jiufu Guo) > > PR target/63281 > > gcc/ChangeLog: > > * config/rs6000/rs6000.cc (rs6000_emit_set_const): Update to split > complicate constant to memory. > > gcc/testsuite/ChangeLog: > > * gcc.target/powerpc/const_anchors.c: Update to test final-rtl. > * gcc.target/powerpc/parall_5insn_const.c: Update to keep original test > point. > * gcc.target/powerpc/pr106550.c: Likewise.. > * gcc.target/powerpc/pr106550_1.c: Likewise. > * gcc.target/powerpc/pr87870.c: Update according to latest behavior. > * gcc.target/powerpc/pr93012.c: Likewise. > > --- > gcc/config/rs6000/rs6000.cc | 16 ++++++++++++++++ > .../gcc.target/powerpc/const_anchors.c | 5 ++--- > .../gcc.target/powerpc/parall_5insn_const.c | 14 ++++++++++++-- > gcc/testsuite/gcc.target/powerpc/pr106550.c | 17 +++++++++++++++-- > gcc/testsuite/gcc.target/powerpc/pr106550_1.c | 15 +++++++++++++-- > gcc/testsuite/gcc.target/powerpc/pr87870.c | 5 ++++- > gcc/testsuite/gcc.target/powerpc/pr93012.c | 4 +++- > 7 files changed, 65 insertions(+), 11 deletions(-) > > diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc > index 4690384cdbe..b9562f1ea0f 100644 > --- a/gcc/config/rs6000/rs6000.cc > +++ b/gcc/config/rs6000/rs6000.cc > @@ -10292,6 +10292,22 @@ rs6000_emit_set_const (rtx dest, rtx source) > c = sext_hwi (c, 32); > emit_move_insn (lo, GEN_INT (c)); > } > + > + /* If it can be stored to the constant pool and profitable. */ > + else if (base_reg_operand (dest, mode) > + && num_insns_constant (source, mode) > 2) > + { > + rtx sym = force_const_mem (mode, source); > + if (TARGET_TOC && SYMBOL_REF_P (XEXP (sym, 0)) > + && use_toc_relative_ref (XEXP (sym, 0), mode)) > + { > + rtx toc = create_TOC_reference (XEXP (sym, 0), copy_rtx (dest)); > + sym = gen_const_mem (mode, toc); > + set_mem_alias_set (sym, get_TOC_alias_set ()); > + } > + > + emit_insn (gen_rtx_SET (dest, sym)); > + } > else > rs6000_emit_set_long_const (dest, c); > break; > diff --git a/gcc/testsuite/gcc.target/powerpc/const_anchors.c b/gcc/testsuite/gcc.target/powerpc/const_anchors.c > index 542e2674b12..188744165f2 100644 > --- a/gcc/testsuite/gcc.target/powerpc/const_anchors.c > +++ b/gcc/testsuite/gcc.target/powerpc/const_anchors.c > @@ -1,5 +1,5 @@ > /* { dg-do compile { target has_arch_ppc64 } } */ > -/* { dg-options "-O2" } */ > +/* { dg-options "-O2 -fdump-rtl-final" } */ > > #define C1 0x2351847027482577ULL > #define C2 0x2351847027482578ULL > @@ -16,5 +16,4 @@ void __attribute__ ((noinline)) foo1 (long long *a, long long b) > if (b) > *a++ = C2; > } > - > -/* { dg-final { scan-assembler-times {\maddi\M} 2 } } */ > +/* { dg-final { scan-rtl-dump-times {\madddi3\M} 2 "final" } } */ > diff --git a/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c > index e3a9a7264cf..df0690b90be 100644 > --- a/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c > +++ b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c > @@ -9,8 +9,18 @@ > void __attribute__ ((noinline)) foo (unsigned long long *a) > { > /* 2 lis + 2 ori + 1 rldimi for each constant. */ > - *a++ = 0x800aabcdc167fa16ULL; > - *a++ = 0x7543a876867f616ULL; > + { > + register long long d asm("r0") = 0x800aabcdc167fa16ULL; > + long long n; > + asm("mr %0, %1" : "=r"(n) : "r"(d)); > + *a++ = n; > + } > + { > + register long long d asm("r0") = 0x7543a876867f616ULL; > + long long n; > + asm("mr %0, %1" : "=r"(n) : "r"(d)); > + *a++ = n; > + } > } > > long long A[] = {0x800aabcdc167fa16ULL, 0x7543a876867f616ULL}; > diff --git a/gcc/testsuite/gcc.target/powerpc/pr106550.c b/gcc/testsuite/gcc.target/powerpc/pr106550.c > index 74e395331ab..5eca2b2f701 100644 > --- a/gcc/testsuite/gcc.target/powerpc/pr106550.c > +++ b/gcc/testsuite/gcc.target/powerpc/pr106550.c > @@ -1,12 +1,25 @@ > /* PR target/106550 */ > /* { dg-options "-O2 -mdejagnu-cpu=power10" } */ > /* { dg-require-effective-target power10_ok } */ > +/* { dg-require-effective-target has_arch_ppc64 } */ > > void > foo (unsigned long long *a) > { > - *a++ = 0x020805006106003; /* pli+pli+rldimi */ > - *a++ = 0x2351847027482577;/* pli+pli+rldimi */ > + { > + /* pli+pli+rldimi */ > + register long long d asm("r0") = 0x020805006106003ULL; > + long long n; > + asm("mr %0, %1" : "=r"(n) : "r"(d)); > + *a++ = n; > + } > + { > + /* pli+pli+rldimi */ > + register long long d asm("r0") = 0x2351847027482577ULL; > + long long n; > + asm("mr %0, %1" : "=r"(n) : "r"(d)); > + *a++ = n; > + } > } > > /* { dg-final { scan-assembler-times {\mpli\M} 4 } } */ > diff --git a/gcc/testsuite/gcc.target/powerpc/pr106550_1.c b/gcc/testsuite/gcc.target/powerpc/pr106550_1.c > index 7e709fcf9d8..11878d893a4 100644 > --- a/gcc/testsuite/gcc.target/powerpc/pr106550_1.c > +++ b/gcc/testsuite/gcc.target/powerpc/pr106550_1.c > @@ -12,8 +12,19 @@ foo (unsigned long long *a) > asm("cntlzd %0, %1" : "=r"(n) : "r"(d)); > *a++ = n; > > - *a++ = 0x235a8470a7480000ULL; /* pli+sldi+oris */ > - *a++ = 0x23a184700000b677ULL; /* pli+sldi+ori */ > + { > + register long long d asm("r0") = 0x235a8470a7480000ULL; /* pli+sldi+oris */ > + long long n; > + asm("cntlzd %0, %1" : "=r"(n) : "r"(d)); > + *a++ = n; > + } > + > + { > + register long long d asm("r0") = 0x23a184700000b677ULL; /* pli+sldi+ori */ > + long long n; > + asm("cntlzd %0, %1" : "=r"(n) : "r"(d)); > + *a++ = n; > + } > } > > /* { dg-final { scan-assembler-times {\mpli\M} 3 } } */ > diff --git a/gcc/testsuite/gcc.target/powerpc/pr87870.c b/gcc/testsuite/gcc.target/powerpc/pr87870.c > index d2108ac3386..5fee06744ae 100644 > --- a/gcc/testsuite/gcc.target/powerpc/pr87870.c > +++ b/gcc/testsuite/gcc.target/powerpc/pr87870.c > @@ -25,4 +25,7 @@ test3 (void) > return ((__int128)0xdeadbeefcafebabe << 64) | 0xfacefeedbaaaaaad; > } > > -/* { dg-final { scan-assembler-not {\mld\M} } } */ > +/* test3 using "ld" to load the value for r3 and r4. > + test0, test1 and test2 are using "li". */ > +/* { dg-final { scan-assembler-times {\mp?ld\M} 2 } } */ > +/* { dg-final { scan-assembler-times {\mli\M} 6 } } */ > diff --git a/gcc/testsuite/gcc.target/powerpc/pr93012.c b/gcc/testsuite/gcc.target/powerpc/pr93012.c > index 4f764d0576f..87ce0b49b23 100644 > --- a/gcc/testsuite/gcc.target/powerpc/pr93012.c > +++ b/gcc/testsuite/gcc.target/powerpc/pr93012.c > @@ -10,4 +10,6 @@ unsigned long long mskh1() { return 0xffff9234ffff9234ULL; } > unsigned long long mskl1() { return 0x2bcdffff2bcdffffULL; } > unsigned long long mskse() { return 0xffff1234ffff1234ULL; } > > -/* { dg-final { scan-assembler-times {\mrldimi\M} 7 } } */ > +/* { dg-final { scan-assembler-times {\mrldimi\M} 7 { target { has_arch_pwr10 } } } } */ > +/* { dg-final { scan-assembler-times {\mrldimi\M} 3 { target { ! has_arch_pwr10 } } } } */ > +/* { dg-final { scan-assembler-times {\mp?ld\M} 4 { target { ! has_arch_pwr10 } } } } */ ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 3/3]rs6000: split complicate constant to constant pool 2023-10-25 2:43 ` Kewen.Lin @ 2023-10-25 8:14 ` Jiufu Guo 2023-10-25 8:29 ` Kewen.Lin 0 siblings, 1 reply; 11+ messages in thread From: Jiufu Guo @ 2023-10-25 8:14 UTC (permalink / raw) To: Kewen.Lin; +Cc: segher, dje.gcc, linkw, bergner, gcc-patches Hi, "Kewen.Lin" <linkw@linux.ibm.com> writes: > Hi, > > on 2023/10/25 10:00, Jiufu Guo wrote: >> Hi, >> >> Sometimes, a complicated constant is built via 3(or more) >> instructions to build. Generally speaking, it would not be >> as faster as loading it from the constant pool (as a few >> discussions in PR63281). > > I may miss some previous discussions, but I'm curious why we > chose ">=3" here, as https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63281#c9 > which indicates that more than 3 (>3) should be considered > with this change. Thanks a lot for your great patience for reading the history! Yes, there are some discussions about "> 3" vs. "> 2". - In theory, "ld" is one instruction. If consider "address/toc" adjust, we may count it as 2 instructions. "pld" may need less cycles. - As test, it seems "> 2" could get better/stable runtime result during testing SPEC2017. > >> >> For the concern that I raised in: >> https://gcc.gnu.org/pipermail/gcc-patches/2022-August/599676.html >> The micro-cases would not be the major concern. Because as >> Segher explained in: >> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63281#c18 >> It would just be about the benchmark method. >> >> As tested on spec2017, for visible performance changes, we >> can find the runtime improvement on 500.perlbench_r about >> ~1.8% (-O2) when support loading complicates constant from >> constant pool. And no visible performance recession on >> other benchmarks. > > The improvement on 500.perlbench_r looks to match what PR63281 > mentioned, nice! I'm curious that which options and which kinds > of CPUs have you tested with? Since this is a general change, > I'd expect we can test with P8/P9/P10 at O2/O3 (or Ofast) at > least. Great advice! Thanks for pointing this! A few months ago, P8/P9/P10 are tested. While this time, I rerun SPEC2017 on P10 for -O2 and -O3. More test on latest code would be better. BR, Jeff (Jiufu Guo) > > BR, > Kewen > >> >> Boostrap & regtest pass on ppc64{,le}. >> Is this ok for trunk? >> >> BR, >> Jeff (Jiufu Guo) >> >> PR target/63281 >> >> gcc/ChangeLog: >> >> * config/rs6000/rs6000.cc (rs6000_emit_set_const): Update to split >> complicate constant to memory. >> >> gcc/testsuite/ChangeLog: >> >> * gcc.target/powerpc/const_anchors.c: Update to test final-rtl. >> * gcc.target/powerpc/parall_5insn_const.c: Update to keep original test >> point. >> * gcc.target/powerpc/pr106550.c: Likewise.. >> * gcc.target/powerpc/pr106550_1.c: Likewise. >> * gcc.target/powerpc/pr87870.c: Update according to latest behavior. >> * gcc.target/powerpc/pr93012.c: Likewise. >> >> --- >> gcc/config/rs6000/rs6000.cc | 16 ++++++++++++++++ >> .../gcc.target/powerpc/const_anchors.c | 5 ++--- >> .../gcc.target/powerpc/parall_5insn_const.c | 14 ++++++++++++-- >> gcc/testsuite/gcc.target/powerpc/pr106550.c | 17 +++++++++++++++-- >> gcc/testsuite/gcc.target/powerpc/pr106550_1.c | 15 +++++++++++++-- >> gcc/testsuite/gcc.target/powerpc/pr87870.c | 5 ++++- >> gcc/testsuite/gcc.target/powerpc/pr93012.c | 4 +++- >> 7 files changed, 65 insertions(+), 11 deletions(-) >> >> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc >> index 4690384cdbe..b9562f1ea0f 100644 >> --- a/gcc/config/rs6000/rs6000.cc >> +++ b/gcc/config/rs6000/rs6000.cc >> @@ -10292,6 +10292,22 @@ rs6000_emit_set_const (rtx dest, rtx source) >> c = sext_hwi (c, 32); >> emit_move_insn (lo, GEN_INT (c)); >> } >> + >> + /* If it can be stored to the constant pool and profitable. */ >> + else if (base_reg_operand (dest, mode) >> + && num_insns_constant (source, mode) > 2) >> + { >> + rtx sym = force_const_mem (mode, source); >> + if (TARGET_TOC && SYMBOL_REF_P (XEXP (sym, 0)) >> + && use_toc_relative_ref (XEXP (sym, 0), mode)) >> + { >> + rtx toc = create_TOC_reference (XEXP (sym, 0), copy_rtx (dest)); >> + sym = gen_const_mem (mode, toc); >> + set_mem_alias_set (sym, get_TOC_alias_set ()); >> + } >> + >> + emit_insn (gen_rtx_SET (dest, sym)); >> + } >> else >> rs6000_emit_set_long_const (dest, c); >> break; >> diff --git a/gcc/testsuite/gcc.target/powerpc/const_anchors.c b/gcc/testsuite/gcc.target/powerpc/const_anchors.c >> index 542e2674b12..188744165f2 100644 >> --- a/gcc/testsuite/gcc.target/powerpc/const_anchors.c >> +++ b/gcc/testsuite/gcc.target/powerpc/const_anchors.c >> @@ -1,5 +1,5 @@ >> /* { dg-do compile { target has_arch_ppc64 } } */ >> -/* { dg-options "-O2" } */ >> +/* { dg-options "-O2 -fdump-rtl-final" } */ >> >> #define C1 0x2351847027482577ULL >> #define C2 0x2351847027482578ULL >> @@ -16,5 +16,4 @@ void __attribute__ ((noinline)) foo1 (long long *a, long long b) >> if (b) >> *a++ = C2; >> } >> - >> -/* { dg-final { scan-assembler-times {\maddi\M} 2 } } */ >> +/* { dg-final { scan-rtl-dump-times {\madddi3\M} 2 "final" } } */ >> diff --git a/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c >> index e3a9a7264cf..df0690b90be 100644 >> --- a/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c >> +++ b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c >> @@ -9,8 +9,18 @@ >> void __attribute__ ((noinline)) foo (unsigned long long *a) >> { >> /* 2 lis + 2 ori + 1 rldimi for each constant. */ >> - *a++ = 0x800aabcdc167fa16ULL; >> - *a++ = 0x7543a876867f616ULL; >> + { >> + register long long d asm("r0") = 0x800aabcdc167fa16ULL; >> + long long n; >> + asm("mr %0, %1" : "=r"(n) : "r"(d)); >> + *a++ = n; >> + } >> + { >> + register long long d asm("r0") = 0x7543a876867f616ULL; >> + long long n; >> + asm("mr %0, %1" : "=r"(n) : "r"(d)); >> + *a++ = n; >> + } >> } >> >> long long A[] = {0x800aabcdc167fa16ULL, 0x7543a876867f616ULL}; >> diff --git a/gcc/testsuite/gcc.target/powerpc/pr106550.c b/gcc/testsuite/gcc.target/powerpc/pr106550.c >> index 74e395331ab..5eca2b2f701 100644 >> --- a/gcc/testsuite/gcc.target/powerpc/pr106550.c >> +++ b/gcc/testsuite/gcc.target/powerpc/pr106550.c >> @@ -1,12 +1,25 @@ >> /* PR target/106550 */ >> /* { dg-options "-O2 -mdejagnu-cpu=power10" } */ >> /* { dg-require-effective-target power10_ok } */ >> +/* { dg-require-effective-target has_arch_ppc64 } */ >> >> void >> foo (unsigned long long *a) >> { >> - *a++ = 0x020805006106003; /* pli+pli+rldimi */ >> - *a++ = 0x2351847027482577;/* pli+pli+rldimi */ >> + { >> + /* pli+pli+rldimi */ >> + register long long d asm("r0") = 0x020805006106003ULL; >> + long long n; >> + asm("mr %0, %1" : "=r"(n) : "r"(d)); >> + *a++ = n; >> + } >> + { >> + /* pli+pli+rldimi */ >> + register long long d asm("r0") = 0x2351847027482577ULL; >> + long long n; >> + asm("mr %0, %1" : "=r"(n) : "r"(d)); >> + *a++ = n; >> + } >> } >> >> /* { dg-final { scan-assembler-times {\mpli\M} 4 } } */ >> diff --git a/gcc/testsuite/gcc.target/powerpc/pr106550_1.c b/gcc/testsuite/gcc.target/powerpc/pr106550_1.c >> index 7e709fcf9d8..11878d893a4 100644 >> --- a/gcc/testsuite/gcc.target/powerpc/pr106550_1.c >> +++ b/gcc/testsuite/gcc.target/powerpc/pr106550_1.c >> @@ -12,8 +12,19 @@ foo (unsigned long long *a) >> asm("cntlzd %0, %1" : "=r"(n) : "r"(d)); >> *a++ = n; >> >> - *a++ = 0x235a8470a7480000ULL; /* pli+sldi+oris */ >> - *a++ = 0x23a184700000b677ULL; /* pli+sldi+ori */ >> + { >> + register long long d asm("r0") = 0x235a8470a7480000ULL; /* pli+sldi+oris */ >> + long long n; >> + asm("cntlzd %0, %1" : "=r"(n) : "r"(d)); >> + *a++ = n; >> + } >> + >> + { >> + register long long d asm("r0") = 0x23a184700000b677ULL; /* pli+sldi+ori */ >> + long long n; >> + asm("cntlzd %0, %1" : "=r"(n) : "r"(d)); >> + *a++ = n; >> + } >> } >> >> /* { dg-final { scan-assembler-times {\mpli\M} 3 } } */ >> diff --git a/gcc/testsuite/gcc.target/powerpc/pr87870.c b/gcc/testsuite/gcc.target/powerpc/pr87870.c >> index d2108ac3386..5fee06744ae 100644 >> --- a/gcc/testsuite/gcc.target/powerpc/pr87870.c >> +++ b/gcc/testsuite/gcc.target/powerpc/pr87870.c >> @@ -25,4 +25,7 @@ test3 (void) >> return ((__int128)0xdeadbeefcafebabe << 64) | 0xfacefeedbaaaaaad; >> } >> >> -/* { dg-final { scan-assembler-not {\mld\M} } } */ >> +/* test3 using "ld" to load the value for r3 and r4. >> + test0, test1 and test2 are using "li". */ >> +/* { dg-final { scan-assembler-times {\mp?ld\M} 2 } } */ >> +/* { dg-final { scan-assembler-times {\mli\M} 6 } } */ >> diff --git a/gcc/testsuite/gcc.target/powerpc/pr93012.c b/gcc/testsuite/gcc.target/powerpc/pr93012.c >> index 4f764d0576f..87ce0b49b23 100644 >> --- a/gcc/testsuite/gcc.target/powerpc/pr93012.c >> +++ b/gcc/testsuite/gcc.target/powerpc/pr93012.c >> @@ -10,4 +10,6 @@ unsigned long long mskh1() { return 0xffff9234ffff9234ULL; } >> unsigned long long mskl1() { return 0x2bcdffff2bcdffffULL; } >> unsigned long long mskse() { return 0xffff1234ffff1234ULL; } >> >> -/* { dg-final { scan-assembler-times {\mrldimi\M} 7 } } */ >> +/* { dg-final { scan-assembler-times {\mrldimi\M} 7 { target { has_arch_pwr10 } } } } */ >> +/* { dg-final { scan-assembler-times {\mrldimi\M} 3 { target { ! has_arch_pwr10 } } } } */ >> +/* { dg-final { scan-assembler-times {\mp?ld\M} 4 { target { ! has_arch_pwr10 } } } } */ ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 3/3]rs6000: split complicate constant to constant pool 2023-10-25 8:14 ` Jiufu Guo @ 2023-10-25 8:29 ` Kewen.Lin 2023-10-25 9:40 ` Jiufu Guo 0 siblings, 1 reply; 11+ messages in thread From: Kewen.Lin @ 2023-10-25 8:29 UTC (permalink / raw) To: Jiufu Guo; +Cc: segher, dje.gcc, linkw, bergner, gcc-patches on 2023/10/25 16:14, Jiufu Guo wrote: > > Hi, > > "Kewen.Lin" <linkw@linux.ibm.com> writes: > >> Hi, >> >> on 2023/10/25 10:00, Jiufu Guo wrote: >>> Hi, >>> >>> Sometimes, a complicated constant is built via 3(or more) >>> instructions to build. Generally speaking, it would not be >>> as faster as loading it from the constant pool (as a few >>> discussions in PR63281). >> >> I may miss some previous discussions, but I'm curious why we >> chose ">=3" here, as https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63281#c9 >> which indicates that more than 3 (>3) should be considered >> with this change. > > Thanks a lot for your great patience for reading the history! > Yes, there are some discussions about "> 3" vs. "> 2". > - In theory, "ld" is one instruction. If consider "address/toc" > adjust, we may count it as 2 instructions. "pld" may need less > cycles. OK, even without prefixed insn support, the high part of address computation could be optimized as nop by linker further. It would be good to say something on this in commit log, otherwise people may be confused as the PR comment mentioned above. > - As test, it seems "> 2" could get better/stable runtime result > during testing SPEC2017. Ok, if you posted the conclusion previously, it would be good to mention it here with a link on the result comparisons. > >> >>> >>> For the concern that I raised in: >>> https://gcc.gnu.org/pipermail/gcc-patches/2022-August/599676.html >>> The micro-cases would not be the major concern. Because as >>> Segher explained in: >>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63281#c18 >>> It would just be about the benchmark method. >>> >>> As tested on spec2017, for visible performance changes, we >>> can find the runtime improvement on 500.perlbench_r about >>> ~1.8% (-O2) when support loading complicates constant from >>> constant pool. And no visible performance recession on >>> other benchmarks. >> >> The improvement on 500.perlbench_r looks to match what PR63281 >> mentioned, nice! I'm curious that which options and which kinds >> of CPUs have you tested with? Since this is a general change, >> I'd expect we can test with P8/P9/P10 at O2/O3 (or Ofast) at >> least. > > Great advice! Thanks for pointing this! > A few months ago, P8/P9/P10 are tested. While this time, I rerun > SPEC2017 on P10 for -O2 and -O3. More test on latest code would > be better. Was it tested previously with your recent commits on constant building together? or just with the trunk at that time? Anyway, I was curious how it's tested, thanks for replying, good to see those are covered. :) I'd leave the further review to Segher and David. BR, Kewen > > > BR, > Jeff (Jiufu Guo) > >> >> BR, >> Kewen >> >>> >>> Boostrap & regtest pass on ppc64{,le}. >>> Is this ok for trunk? >>> >>> BR, >>> Jeff (Jiufu Guo) >>> >>> PR target/63281 >>> >>> gcc/ChangeLog: >>> >>> * config/rs6000/rs6000.cc (rs6000_emit_set_const): Update to split >>> complicate constant to memory. >>> >>> gcc/testsuite/ChangeLog: >>> >>> * gcc.target/powerpc/const_anchors.c: Update to test final-rtl. >>> * gcc.target/powerpc/parall_5insn_const.c: Update to keep original test >>> point. >>> * gcc.target/powerpc/pr106550.c: Likewise.. >>> * gcc.target/powerpc/pr106550_1.c: Likewise. >>> * gcc.target/powerpc/pr87870.c: Update according to latest behavior. >>> * gcc.target/powerpc/pr93012.c: Likewise. >>> >>> --- >>> gcc/config/rs6000/rs6000.cc | 16 ++++++++++++++++ >>> .../gcc.target/powerpc/const_anchors.c | 5 ++--- >>> .../gcc.target/powerpc/parall_5insn_const.c | 14 ++++++++++++-- >>> gcc/testsuite/gcc.target/powerpc/pr106550.c | 17 +++++++++++++++-- >>> gcc/testsuite/gcc.target/powerpc/pr106550_1.c | 15 +++++++++++++-- >>> gcc/testsuite/gcc.target/powerpc/pr87870.c | 5 ++++- >>> gcc/testsuite/gcc.target/powerpc/pr93012.c | 4 +++- >>> 7 files changed, 65 insertions(+), 11 deletions(-) >>> >>> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc >>> index 4690384cdbe..b9562f1ea0f 100644 >>> --- a/gcc/config/rs6000/rs6000.cc >>> +++ b/gcc/config/rs6000/rs6000.cc >>> @@ -10292,6 +10292,22 @@ rs6000_emit_set_const (rtx dest, rtx source) >>> c = sext_hwi (c, 32); >>> emit_move_insn (lo, GEN_INT (c)); >>> } >>> + >>> + /* If it can be stored to the constant pool and profitable. */ >>> + else if (base_reg_operand (dest, mode) >>> + && num_insns_constant (source, mode) > 2) >>> + { >>> + rtx sym = force_const_mem (mode, source); >>> + if (TARGET_TOC && SYMBOL_REF_P (XEXP (sym, 0)) >>> + && use_toc_relative_ref (XEXP (sym, 0), mode)) >>> + { >>> + rtx toc = create_TOC_reference (XEXP (sym, 0), copy_rtx (dest)); >>> + sym = gen_const_mem (mode, toc); >>> + set_mem_alias_set (sym, get_TOC_alias_set ()); >>> + } >>> + >>> + emit_insn (gen_rtx_SET (dest, sym)); >>> + } >>> else >>> rs6000_emit_set_long_const (dest, c); >>> break; >>> diff --git a/gcc/testsuite/gcc.target/powerpc/const_anchors.c b/gcc/testsuite/gcc.target/powerpc/const_anchors.c >>> index 542e2674b12..188744165f2 100644 >>> --- a/gcc/testsuite/gcc.target/powerpc/const_anchors.c >>> +++ b/gcc/testsuite/gcc.target/powerpc/const_anchors.c >>> @@ -1,5 +1,5 @@ >>> /* { dg-do compile { target has_arch_ppc64 } } */ >>> -/* { dg-options "-O2" } */ >>> +/* { dg-options "-O2 -fdump-rtl-final" } */ >>> >>> #define C1 0x2351847027482577ULL >>> #define C2 0x2351847027482578ULL >>> @@ -16,5 +16,4 @@ void __attribute__ ((noinline)) foo1 (long long *a, long long b) >>> if (b) >>> *a++ = C2; >>> } >>> - >>> -/* { dg-final { scan-assembler-times {\maddi\M} 2 } } */ >>> +/* { dg-final { scan-rtl-dump-times {\madddi3\M} 2 "final" } } */ >>> diff --git a/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c >>> index e3a9a7264cf..df0690b90be 100644 >>> --- a/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c >>> +++ b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c >>> @@ -9,8 +9,18 @@ >>> void __attribute__ ((noinline)) foo (unsigned long long *a) >>> { >>> /* 2 lis + 2 ori + 1 rldimi for each constant. */ >>> - *a++ = 0x800aabcdc167fa16ULL; >>> - *a++ = 0x7543a876867f616ULL; >>> + { >>> + register long long d asm("r0") = 0x800aabcdc167fa16ULL; >>> + long long n; >>> + asm("mr %0, %1" : "=r"(n) : "r"(d)); >>> + *a++ = n; >>> + } >>> + { >>> + register long long d asm("r0") = 0x7543a876867f616ULL; >>> + long long n; >>> + asm("mr %0, %1" : "=r"(n) : "r"(d)); >>> + *a++ = n; >>> + } >>> } >>> >>> long long A[] = {0x800aabcdc167fa16ULL, 0x7543a876867f616ULL}; >>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr106550.c b/gcc/testsuite/gcc.target/powerpc/pr106550.c >>> index 74e395331ab..5eca2b2f701 100644 >>> --- a/gcc/testsuite/gcc.target/powerpc/pr106550.c >>> +++ b/gcc/testsuite/gcc.target/powerpc/pr106550.c >>> @@ -1,12 +1,25 @@ >>> /* PR target/106550 */ >>> /* { dg-options "-O2 -mdejagnu-cpu=power10" } */ >>> /* { dg-require-effective-target power10_ok } */ >>> +/* { dg-require-effective-target has_arch_ppc64 } */ >>> >>> void >>> foo (unsigned long long *a) >>> { >>> - *a++ = 0x020805006106003; /* pli+pli+rldimi */ >>> - *a++ = 0x2351847027482577;/* pli+pli+rldimi */ >>> + { >>> + /* pli+pli+rldimi */ >>> + register long long d asm("r0") = 0x020805006106003ULL; >>> + long long n; >>> + asm("mr %0, %1" : "=r"(n) : "r"(d)); >>> + *a++ = n; >>> + } >>> + { >>> + /* pli+pli+rldimi */ >>> + register long long d asm("r0") = 0x2351847027482577ULL; >>> + long long n; >>> + asm("mr %0, %1" : "=r"(n) : "r"(d)); >>> + *a++ = n; >>> + } >>> } >>> >>> /* { dg-final { scan-assembler-times {\mpli\M} 4 } } */ >>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr106550_1.c b/gcc/testsuite/gcc.target/powerpc/pr106550_1.c >>> index 7e709fcf9d8..11878d893a4 100644 >>> --- a/gcc/testsuite/gcc.target/powerpc/pr106550_1.c >>> +++ b/gcc/testsuite/gcc.target/powerpc/pr106550_1.c >>> @@ -12,8 +12,19 @@ foo (unsigned long long *a) >>> asm("cntlzd %0, %1" : "=r"(n) : "r"(d)); >>> *a++ = n; >>> >>> - *a++ = 0x235a8470a7480000ULL; /* pli+sldi+oris */ >>> - *a++ = 0x23a184700000b677ULL; /* pli+sldi+ori */ >>> + { >>> + register long long d asm("r0") = 0x235a8470a7480000ULL; /* pli+sldi+oris */ >>> + long long n; >>> + asm("cntlzd %0, %1" : "=r"(n) : "r"(d)); >>> + *a++ = n; >>> + } >>> + >>> + { >>> + register long long d asm("r0") = 0x23a184700000b677ULL; /* pli+sldi+ori */ >>> + long long n; >>> + asm("cntlzd %0, %1" : "=r"(n) : "r"(d)); >>> + *a++ = n; >>> + } >>> } >>> >>> /* { dg-final { scan-assembler-times {\mpli\M} 3 } } */ >>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr87870.c b/gcc/testsuite/gcc.target/powerpc/pr87870.c >>> index d2108ac3386..5fee06744ae 100644 >>> --- a/gcc/testsuite/gcc.target/powerpc/pr87870.c >>> +++ b/gcc/testsuite/gcc.target/powerpc/pr87870.c >>> @@ -25,4 +25,7 @@ test3 (void) >>> return ((__int128)0xdeadbeefcafebabe << 64) | 0xfacefeedbaaaaaad; >>> } >>> >>> -/* { dg-final { scan-assembler-not {\mld\M} } } */ >>> +/* test3 using "ld" to load the value for r3 and r4. >>> + test0, test1 and test2 are using "li". */ >>> +/* { dg-final { scan-assembler-times {\mp?ld\M} 2 } } */ >>> +/* { dg-final { scan-assembler-times {\mli\M} 6 } } */ >>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr93012.c b/gcc/testsuite/gcc.target/powerpc/pr93012.c >>> index 4f764d0576f..87ce0b49b23 100644 >>> --- a/gcc/testsuite/gcc.target/powerpc/pr93012.c >>> +++ b/gcc/testsuite/gcc.target/powerpc/pr93012.c >>> @@ -10,4 +10,6 @@ unsigned long long mskh1() { return 0xffff9234ffff9234ULL; } >>> unsigned long long mskl1() { return 0x2bcdffff2bcdffffULL; } >>> unsigned long long mskse() { return 0xffff1234ffff1234ULL; } >>> >>> -/* { dg-final { scan-assembler-times {\mrldimi\M} 7 } } */ >>> +/* { dg-final { scan-assembler-times {\mrldimi\M} 7 { target { has_arch_pwr10 } } } } */ >>> +/* { dg-final { scan-assembler-times {\mrldimi\M} 3 { target { ! has_arch_pwr10 } } } } */ >>> +/* { dg-final { scan-assembler-times {\mp?ld\M} 4 { target { ! has_arch_pwr10 } } } } */ BR, Kewen ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 3/3]rs6000: split complicate constant to constant pool 2023-10-25 8:29 ` Kewen.Lin @ 2023-10-25 9:40 ` Jiufu Guo 0 siblings, 0 replies; 11+ messages in thread From: Jiufu Guo @ 2023-10-25 9:40 UTC (permalink / raw) To: Kewen.Lin; +Cc: segher, dje.gcc, linkw, bergner, gcc-patches Hi, "Kewen.Lin" <linkw@linux.ibm.com> writes: > on 2023/10/25 16:14, Jiufu Guo wrote: >> >> Hi, >> >> "Kewen.Lin" <linkw@linux.ibm.com> writes: >> >>> Hi, >>> >>> on 2023/10/25 10:00, Jiufu Guo wrote: >>>> Hi, >>>> >>>> Sometimes, a complicated constant is built via 3(or more) >>>> instructions to build. Generally speaking, it would not be >>>> as faster as loading it from the constant pool (as a few >>>> discussions in PR63281). >>> >>> I may miss some previous discussions, but I'm curious why we >>> chose ">=3" here, as https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63281#c9 >>> which indicates that more than 3 (>3) should be considered >>> with this change. >> >> Thanks a lot for your great patience for reading the history! >> Yes, there are some discussions about "> 3" vs. "> 2". >> - In theory, "ld" is one instruction. If consider "address/toc" >> adjust, we may count it as 2 instructions. "pld" may need less >> cycles. > > OK, even without prefixed insn support, the high part of address > computation could be optimized as nop by linker further. It would > be good to say something on this in commit log, otherwise people > may be confused as the PR comment mentioned above. > >> - As test, it seems "> 2" could get better/stable runtime result >> during testing SPEC2017. > > Ok, if you posted the conclusion previously, it would be good to > mention it here with a link on the result comparisons. Thanks a lot for your sugguestions. > >> >>> >>>> >>>> For the concern that I raised in: >>>> https://gcc.gnu.org/pipermail/gcc-patches/2022-August/599676.html >>>> The micro-cases would not be the major concern. Because as >>>> Segher explained in: >>>> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63281#c18 >>>> It would just be about the benchmark method. >>>> >>>> As tested on spec2017, for visible performance changes, we >>>> can find the runtime improvement on 500.perlbench_r about >>>> ~1.8% (-O2) when support loading complicates constant from >>>> constant pool. And no visible performance recession on >>>> other benchmarks. >>> >>> The improvement on 500.perlbench_r looks to match what PR63281 >>> mentioned, nice! I'm curious that which options and which kinds >>> of CPUs have you tested with? Since this is a general change, >>> I'd expect we can test with P8/P9/P10 at O2/O3 (or Ofast) at >>> least. >> >> Great advice! Thanks for pointing this! >> A few months ago, P8/P9/P10 are tested. While this time, I rerun >> SPEC2017 on P10 for -O2 and -O3. More test on latest code would >> be better. > > Was it tested previously with your recent commits on constant > building together? or just with the trunk at that time? Anyway, Just with the trunk at that time. > I was curious how it's tested, thanks for replying, good to see > those are covered. :) I'd leave the further review to Segher and > David. Thanks again. BR, Jeff (Jiufu Guo) > > BR, > Kewen > >> >> >> BR, >> Jeff (Jiufu Guo) >> >>> >>> BR, >>> Kewen >>> >>>> >>>> Boostrap & regtest pass on ppc64{,le}. >>>> Is this ok for trunk? >>>> >>>> BR, >>>> Jeff (Jiufu Guo) >>>> >>>> PR target/63281 >>>> >>>> gcc/ChangeLog: >>>> >>>> * config/rs6000/rs6000.cc (rs6000_emit_set_const): Update to split >>>> complicate constant to memory. >>>> >>>> gcc/testsuite/ChangeLog: >>>> >>>> * gcc.target/powerpc/const_anchors.c: Update to test final-rtl. >>>> * gcc.target/powerpc/parall_5insn_const.c: Update to keep original test >>>> point. >>>> * gcc.target/powerpc/pr106550.c: Likewise.. >>>> * gcc.target/powerpc/pr106550_1.c: Likewise. >>>> * gcc.target/powerpc/pr87870.c: Update according to latest behavior. >>>> * gcc.target/powerpc/pr93012.c: Likewise. >>>> >>>> --- >>>> gcc/config/rs6000/rs6000.cc | 16 ++++++++++++++++ >>>> .../gcc.target/powerpc/const_anchors.c | 5 ++--- >>>> .../gcc.target/powerpc/parall_5insn_const.c | 14 ++++++++++++-- >>>> gcc/testsuite/gcc.target/powerpc/pr106550.c | 17 +++++++++++++++-- >>>> gcc/testsuite/gcc.target/powerpc/pr106550_1.c | 15 +++++++++++++-- >>>> gcc/testsuite/gcc.target/powerpc/pr87870.c | 5 ++++- >>>> gcc/testsuite/gcc.target/powerpc/pr93012.c | 4 +++- >>>> 7 files changed, 65 insertions(+), 11 deletions(-) >>>> >>>> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc >>>> index 4690384cdbe..b9562f1ea0f 100644 >>>> --- a/gcc/config/rs6000/rs6000.cc >>>> +++ b/gcc/config/rs6000/rs6000.cc >>>> @@ -10292,6 +10292,22 @@ rs6000_emit_set_const (rtx dest, rtx source) >>>> c = sext_hwi (c, 32); >>>> emit_move_insn (lo, GEN_INT (c)); >>>> } >>>> + >>>> + /* If it can be stored to the constant pool and profitable. */ >>>> + else if (base_reg_operand (dest, mode) >>>> + && num_insns_constant (source, mode) > 2) >>>> + { >>>> + rtx sym = force_const_mem (mode, source); >>>> + if (TARGET_TOC && SYMBOL_REF_P (XEXP (sym, 0)) >>>> + && use_toc_relative_ref (XEXP (sym, 0), mode)) >>>> + { >>>> + rtx toc = create_TOC_reference (XEXP (sym, 0), copy_rtx (dest)); >>>> + sym = gen_const_mem (mode, toc); >>>> + set_mem_alias_set (sym, get_TOC_alias_set ()); >>>> + } >>>> + >>>> + emit_insn (gen_rtx_SET (dest, sym)); >>>> + } >>>> else >>>> rs6000_emit_set_long_const (dest, c); >>>> break; >>>> diff --git a/gcc/testsuite/gcc.target/powerpc/const_anchors.c b/gcc/testsuite/gcc.target/powerpc/const_anchors.c >>>> index 542e2674b12..188744165f2 100644 >>>> --- a/gcc/testsuite/gcc.target/powerpc/const_anchors.c >>>> +++ b/gcc/testsuite/gcc.target/powerpc/const_anchors.c >>>> @@ -1,5 +1,5 @@ >>>> /* { dg-do compile { target has_arch_ppc64 } } */ >>>> -/* { dg-options "-O2" } */ >>>> +/* { dg-options "-O2 -fdump-rtl-final" } */ >>>> >>>> #define C1 0x2351847027482577ULL >>>> #define C2 0x2351847027482578ULL >>>> @@ -16,5 +16,4 @@ void __attribute__ ((noinline)) foo1 (long long *a, long long b) >>>> if (b) >>>> *a++ = C2; >>>> } >>>> - >>>> -/* { dg-final { scan-assembler-times {\maddi\M} 2 } } */ >>>> +/* { dg-final { scan-rtl-dump-times {\madddi3\M} 2 "final" } } */ >>>> diff --git a/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c >>>> index e3a9a7264cf..df0690b90be 100644 >>>> --- a/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c >>>> +++ b/gcc/testsuite/gcc.target/powerpc/parall_5insn_const.c >>>> @@ -9,8 +9,18 @@ >>>> void __attribute__ ((noinline)) foo (unsigned long long *a) >>>> { >>>> /* 2 lis + 2 ori + 1 rldimi for each constant. */ >>>> - *a++ = 0x800aabcdc167fa16ULL; >>>> - *a++ = 0x7543a876867f616ULL; >>>> + { >>>> + register long long d asm("r0") = 0x800aabcdc167fa16ULL; >>>> + long long n; >>>> + asm("mr %0, %1" : "=r"(n) : "r"(d)); >>>> + *a++ = n; >>>> + } >>>> + { >>>> + register long long d asm("r0") = 0x7543a876867f616ULL; >>>> + long long n; >>>> + asm("mr %0, %1" : "=r"(n) : "r"(d)); >>>> + *a++ = n; >>>> + } >>>> } >>>> >>>> long long A[] = {0x800aabcdc167fa16ULL, 0x7543a876867f616ULL}; >>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr106550.c b/gcc/testsuite/gcc.target/powerpc/pr106550.c >>>> index 74e395331ab..5eca2b2f701 100644 >>>> --- a/gcc/testsuite/gcc.target/powerpc/pr106550.c >>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr106550.c >>>> @@ -1,12 +1,25 @@ >>>> /* PR target/106550 */ >>>> /* { dg-options "-O2 -mdejagnu-cpu=power10" } */ >>>> /* { dg-require-effective-target power10_ok } */ >>>> +/* { dg-require-effective-target has_arch_ppc64 } */ >>>> >>>> void >>>> foo (unsigned long long *a) >>>> { >>>> - *a++ = 0x020805006106003; /* pli+pli+rldimi */ >>>> - *a++ = 0x2351847027482577;/* pli+pli+rldimi */ >>>> + { >>>> + /* pli+pli+rldimi */ >>>> + register long long d asm("r0") = 0x020805006106003ULL; >>>> + long long n; >>>> + asm("mr %0, %1" : "=r"(n) : "r"(d)); >>>> + *a++ = n; >>>> + } >>>> + { >>>> + /* pli+pli+rldimi */ >>>> + register long long d asm("r0") = 0x2351847027482577ULL; >>>> + long long n; >>>> + asm("mr %0, %1" : "=r"(n) : "r"(d)); >>>> + *a++ = n; >>>> + } >>>> } >>>> >>>> /* { dg-final { scan-assembler-times {\mpli\M} 4 } } */ >>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr106550_1.c b/gcc/testsuite/gcc.target/powerpc/pr106550_1.c >>>> index 7e709fcf9d8..11878d893a4 100644 >>>> --- a/gcc/testsuite/gcc.target/powerpc/pr106550_1.c >>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr106550_1.c >>>> @@ -12,8 +12,19 @@ foo (unsigned long long *a) >>>> asm("cntlzd %0, %1" : "=r"(n) : "r"(d)); >>>> *a++ = n; >>>> >>>> - *a++ = 0x235a8470a7480000ULL; /* pli+sldi+oris */ >>>> - *a++ = 0x23a184700000b677ULL; /* pli+sldi+ori */ >>>> + { >>>> + register long long d asm("r0") = 0x235a8470a7480000ULL; /* pli+sldi+oris */ >>>> + long long n; >>>> + asm("cntlzd %0, %1" : "=r"(n) : "r"(d)); >>>> + *a++ = n; >>>> + } >>>> + >>>> + { >>>> + register long long d asm("r0") = 0x23a184700000b677ULL; /* pli+sldi+ori */ >>>> + long long n; >>>> + asm("cntlzd %0, %1" : "=r"(n) : "r"(d)); >>>> + *a++ = n; >>>> + } >>>> } >>>> >>>> /* { dg-final { scan-assembler-times {\mpli\M} 3 } } */ >>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr87870.c b/gcc/testsuite/gcc.target/powerpc/pr87870.c >>>> index d2108ac3386..5fee06744ae 100644 >>>> --- a/gcc/testsuite/gcc.target/powerpc/pr87870.c >>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr87870.c >>>> @@ -25,4 +25,7 @@ test3 (void) >>>> return ((__int128)0xdeadbeefcafebabe << 64) | 0xfacefeedbaaaaaad; >>>> } >>>> >>>> -/* { dg-final { scan-assembler-not {\mld\M} } } */ >>>> +/* test3 using "ld" to load the value for r3 and r4. >>>> + test0, test1 and test2 are using "li". */ >>>> +/* { dg-final { scan-assembler-times {\mp?ld\M} 2 } } */ >>>> +/* { dg-final { scan-assembler-times {\mli\M} 6 } } */ >>>> diff --git a/gcc/testsuite/gcc.target/powerpc/pr93012.c b/gcc/testsuite/gcc.target/powerpc/pr93012.c >>>> index 4f764d0576f..87ce0b49b23 100644 >>>> --- a/gcc/testsuite/gcc.target/powerpc/pr93012.c >>>> +++ b/gcc/testsuite/gcc.target/powerpc/pr93012.c >>>> @@ -10,4 +10,6 @@ unsigned long long mskh1() { return 0xffff9234ffff9234ULL; } >>>> unsigned long long mskl1() { return 0x2bcdffff2bcdffffULL; } >>>> unsigned long long mskse() { return 0xffff1234ffff1234ULL; } >>>> >>>> -/* { dg-final { scan-assembler-times {\mrldimi\M} 7 } } */ >>>> +/* { dg-final { scan-assembler-times {\mrldimi\M} 7 { target { has_arch_pwr10 } } } } */ >>>> +/* { dg-final { scan-assembler-times {\mrldimi\M} 3 { target { ! has_arch_pwr10 } } } } */ >>>> +/* { dg-final { scan-assembler-times {\mp?ld\M} 4 { target { ! has_arch_pwr10 } } } } */ > > > BR, > Kewen ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 1/3]rs6000: update num_insns_constant for 2 insns 2023-10-25 2:00 [PATCH 1/3]rs6000: update num_insns_constant for 2 insns Jiufu Guo 2023-10-25 2:00 ` [PATCH 2/3]rs6000: using 'pli' to load 34bit-constant Jiufu Guo 2023-10-25 2:00 ` [PATCH 3/3]rs6000: split complicate constant to constant pool Jiufu Guo @ 2023-10-25 2:27 ` Kewen.Lin 2023-10-25 7:27 ` Jiufu Guo 2 siblings, 1 reply; 11+ messages in thread From: Kewen.Lin @ 2023-10-25 2:27 UTC (permalink / raw) To: Jiufu Guo; +Cc: segher, dje.gcc, linkw, bergner, gcc-patches Hi, on 2023/10/25 10:00, Jiufu Guo wrote: > Hi, > > Trunk gcc supports more constants to be built via two instructions: e.g. > "li/lis; xori/xoris/rldicl/rldicr/rldic". > And then num_insns_constant should also be updated. > Thanks for updating this. > Bootstrap & regtest pass ppc64{,le}. > Is this ok for trunk? > > BR, > Jeff (Jiufu Guo) > > gcc/ChangeLog: > > * config/rs6000/rs6000.cc (can_be_built_by_lilis_and_rldicX): New > function. > (num_insns_constant_gpr): Update to return 2 for more cases. > (rs6000_emit_set_long_const): Update to use > can_be_built_by_lilis_and_rldicX. > > --- > gcc/config/rs6000/rs6000.cc | 64 ++++++++++++++++++++++++------------- > 1 file changed, 41 insertions(+), 23 deletions(-) > > diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc > index cc24dd5301e..b23ff3d7917 100644 > --- a/gcc/config/rs6000/rs6000.cc > +++ b/gcc/config/rs6000/rs6000.cc > @@ -6032,6 +6032,9 @@ direct_return (void) > return 0; > } > > +static bool > +can_be_built_by_lilis_and_rldicX (HOST_WIDE_INT, int *, HOST_WIDE_INT *); > + > /* Helper for num_insns_constant. Calculate number of instructions to > load VALUE to a single gpr using combinations of addi, addis, ori, > oris, sldi and rldimi instructions. */ > @@ -6044,35 +6047,41 @@ num_insns_constant_gpr (HOST_WIDE_INT value) > return 1; > > /* constant loadable with addis */ > - else if ((value & 0xffff) == 0 > - && (value >> 31 == -1 || value >> 31 == 0)) > + if ((value & 0xffff) == 0 && (value >> 31 == -1 || value >> 31 == 0)) > return 1; > > /* PADDI can support up to 34 bit signed integers. */ > - else if (TARGET_PREFIXED && SIGNED_INTEGER_34BIT_P (value)) > + if (TARGET_PREFIXED && SIGNED_INTEGER_34BIT_P (value)) > return 1; > > - else if (TARGET_POWERPC64) > - { > - HOST_WIDE_INT low = sext_hwi (value, 32); > - HOST_WIDE_INT high = value >> 31; > + if (!TARGET_POWERPC64) > + return 2; > > - if (high == 0 || high == -1) > - return 2; > + HOST_WIDE_INT low = sext_hwi (value, 32); > + HOST_WIDE_INT high = value >> 31; > > - high >>= 1; > + if (high == 0 || high == -1) > + return 2; > > - if (low == 0 || low == high) > - return num_insns_constant_gpr (high) + 1; > - else if (high == 0) > - return num_insns_constant_gpr (low) + 1; > - else > - return (num_insns_constant_gpr (high) > - + num_insns_constant_gpr (low) + 1); > - } > + high >>= 1; > > - else > + HOST_WIDE_INT ud2 = (low >> 16) & 0xffff; > + HOST_WIDE_INT ud1 = low & 0xffff; > + if (high == -1 && ((!(ud2 & 0x8000) && ud1 == 0) || (ud1 & 0x8000))) > + return 2; > + if (high == 0 && (ud1 == 0 || (!(ud1 & 0x8000)))) > return 2; I was thinking that instead of enumerating all the cases in function rs6000_emit_set_long_const, if we can add one optional argument like "int* num_insns=nullptr" to function rs6000_emit_set_long_const, and when it's not nullptr, not emitting anything but update the count in rs6000_emit_set_long_const. It helps people remember to update num_insns when updating rs6000_emit_set_long_const in future, it's also more clear on how the number comes from. Does it sound good to you? BR, Kewen > + > + int shift; > + HOST_WIDE_INT mask; > + if (can_be_built_by_lilis_and_rldicX (value, &shift, &mask)) > + return 2; > + > + if (low == 0 || low == high) > + return num_insns_constant_gpr (high) + 1; > + if (high == 0) > + return num_insns_constant_gpr (low) + 1; > + return (num_insns_constant_gpr (high) + num_insns_constant_gpr (low) + 1); > } > > /* Helper for num_insns_constant. Allow constants formed by the > @@ -10492,6 +10501,18 @@ can_be_built_by_li_and_rldic (HOST_WIDE_INT c, int *shift, HOST_WIDE_INT *mask) > return false; > } > > +/* Combine the above checking functions for li/lis;rldicX. */ > + > +static bool > +can_be_built_by_lilis_and_rldicX (HOST_WIDE_INT c, int *shift, > + HOST_WIDE_INT *mask) > +{ > + return (can_be_built_by_li_lis_and_rotldi (c, shift, mask) > + || can_be_built_by_li_lis_and_rldicl (c, shift, mask) > + || can_be_built_by_li_lis_and_rldicr (c, shift, mask) > + || can_be_built_by_li_and_rldic (c, shift, mask)); > +} > + > /* Subroutine of rs6000_emit_set_const, handling PowerPC64 DImode. > Output insns to set DEST equal to the constant C as a series of > lis, ori and shl instructions. */ > @@ -10538,10 +10559,7 @@ rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c) > emit_move_insn (dest, gen_rtx_XOR (DImode, temp, > GEN_INT ((ud2 ^ 0xffff) << 16))); > } > - else if (can_be_built_by_li_lis_and_rotldi (c, &shift, &mask) > - || can_be_built_by_li_lis_and_rldicl (c, &shift, &mask) > - || can_be_built_by_li_lis_and_rldicr (c, &shift, &mask) > - || can_be_built_by_li_and_rldic (c, &shift, &mask)) > + else if (can_be_built_by_lilis_and_rldicX (c, &shift, &mask)) > { > temp = !can_create_pseudo_p () ? dest : gen_reg_rtx (DImode); > unsigned HOST_WIDE_INT imm = (c | ~mask); ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 1/3]rs6000: update num_insns_constant for 2 insns 2023-10-25 2:27 ` [PATCH 1/3]rs6000: update num_insns_constant for 2 insns Kewen.Lin @ 2023-10-25 7:27 ` Jiufu Guo 0 siblings, 0 replies; 11+ messages in thread From: Jiufu Guo @ 2023-10-25 7:27 UTC (permalink / raw) To: Kewen.Lin; +Cc: segher, dje.gcc, linkw, bergner, gcc-patches Hi, "Kewen.Lin" <linkw@linux.ibm.com> writes: > Hi, > > on 2023/10/25 10:00, Jiufu Guo wrote: >> Hi, >> >> Trunk gcc supports more constants to be built via two instructions: e.g. >> "li/lis; xori/xoris/rldicl/rldicr/rldic". >> And then num_insns_constant should also be updated. >> > > Thanks for updating this. > >> Bootstrap & regtest pass ppc64{,le}. >> Is this ok for trunk? >> >> BR, >> Jeff (Jiufu Guo) >> >> gcc/ChangeLog: >> >> * config/rs6000/rs6000.cc (can_be_built_by_lilis_and_rldicX): New >> function. >> (num_insns_constant_gpr): Update to return 2 for more cases. >> (rs6000_emit_set_long_const): Update to use >> can_be_built_by_lilis_and_rldicX. >> >> --- >> gcc/config/rs6000/rs6000.cc | 64 ++++++++++++++++++++++++------------- >> 1 file changed, 41 insertions(+), 23 deletions(-) >> >> diff --git a/gcc/config/rs6000/rs6000.cc b/gcc/config/rs6000/rs6000.cc >> index cc24dd5301e..b23ff3d7917 100644 >> --- a/gcc/config/rs6000/rs6000.cc >> +++ b/gcc/config/rs6000/rs6000.cc >> @@ -6032,6 +6032,9 @@ direct_return (void) >> return 0; >> } >> >> +static bool >> +can_be_built_by_lilis_and_rldicX (HOST_WIDE_INT, int *, HOST_WIDE_INT *); >> + >> /* Helper for num_insns_constant. Calculate number of instructions to >> load VALUE to a single gpr using combinations of addi, addis, ori, >> oris, sldi and rldimi instructions. */ >> @@ -6044,35 +6047,41 @@ num_insns_constant_gpr (HOST_WIDE_INT value) >> return 1; >> >> /* constant loadable with addis */ >> - else if ((value & 0xffff) == 0 >> - && (value >> 31 == -1 || value >> 31 == 0)) >> + if ((value & 0xffff) == 0 && (value >> 31 == -1 || value >> 31 == 0)) >> return 1; >> >> /* PADDI can support up to 34 bit signed integers. */ >> - else if (TARGET_PREFIXED && SIGNED_INTEGER_34BIT_P (value)) >> + if (TARGET_PREFIXED && SIGNED_INTEGER_34BIT_P (value)) >> return 1; >> >> - else if (TARGET_POWERPC64) >> - { >> - HOST_WIDE_INT low = sext_hwi (value, 32); >> - HOST_WIDE_INT high = value >> 31; >> + if (!TARGET_POWERPC64) >> + return 2; >> >> - if (high == 0 || high == -1) >> - return 2; >> + HOST_WIDE_INT low = sext_hwi (value, 32); >> + HOST_WIDE_INT high = value >> 31; >> >> - high >>= 1; >> + if (high == 0 || high == -1) >> + return 2; >> >> - if (low == 0 || low == high) >> - return num_insns_constant_gpr (high) + 1; >> - else if (high == 0) >> - return num_insns_constant_gpr (low) + 1; >> - else >> - return (num_insns_constant_gpr (high) >> - + num_insns_constant_gpr (low) + 1); >> - } >> + high >>= 1; >> >> - else >> + HOST_WIDE_INT ud2 = (low >> 16) & 0xffff; >> + HOST_WIDE_INT ud1 = low & 0xffff; >> + if (high == -1 && ((!(ud2 & 0x8000) && ud1 == 0) || (ud1 & 0x8000))) >> + return 2; >> + if (high == 0 && (ud1 == 0 || (!(ud1 & 0x8000)))) >> return 2; > > I was thinking that instead of enumerating all the cases in function > rs6000_emit_set_long_const, if we can add one optional argument like > "int* num_insns=nullptr" to function rs6000_emit_set_long_const, and > when it's not nullptr, not emitting anything but update the count in > rs6000_emit_set_long_const. It helps people remember to update > num_insns when updating rs6000_emit_set_long_const in future, it's > also more clear on how the number comes from. > > Does it sound good to you? Thanks for your advice! Yes, "rs6000_emit_set_long_const" and "num_insns_constant_gpr" should be aligned. Using a same logic (same code place) would make sense. BR, Jeff (Jiufu Guo) > > BR, > Kewen > >> + >> + int shift; >> + HOST_WIDE_INT mask; >> + if (can_be_built_by_lilis_and_rldicX (value, &shift, &mask)) >> + return 2; >> + >> + if (low == 0 || low == high) >> + return num_insns_constant_gpr (high) + 1; >> + if (high == 0) >> + return num_insns_constant_gpr (low) + 1; >> + return (num_insns_constant_gpr (high) + num_insns_constant_gpr (low) + 1); >> } >> >> /* Helper for num_insns_constant. Allow constants formed by the >> @@ -10492,6 +10501,18 @@ can_be_built_by_li_and_rldic (HOST_WIDE_INT c, int *shift, HOST_WIDE_INT *mask) >> return false; >> } >> >> +/* Combine the above checking functions for li/lis;rldicX. */ >> + >> +static bool >> +can_be_built_by_lilis_and_rldicX (HOST_WIDE_INT c, int *shift, >> + HOST_WIDE_INT *mask) >> +{ >> + return (can_be_built_by_li_lis_and_rotldi (c, shift, mask) >> + || can_be_built_by_li_lis_and_rldicl (c, shift, mask) >> + || can_be_built_by_li_lis_and_rldicr (c, shift, mask) >> + || can_be_built_by_li_and_rldic (c, shift, mask)); >> +} >> + >> /* Subroutine of rs6000_emit_set_const, handling PowerPC64 DImode. >> Output insns to set DEST equal to the constant C as a series of >> lis, ori and shl instructions. */ >> @@ -10538,10 +10559,7 @@ rs6000_emit_set_long_const (rtx dest, HOST_WIDE_INT c) >> emit_move_insn (dest, gen_rtx_XOR (DImode, temp, >> GEN_INT ((ud2 ^ 0xffff) << 16))); >> } >> - else if (can_be_built_by_li_lis_and_rotldi (c, &shift, &mask) >> - || can_be_built_by_li_lis_and_rldicl (c, &shift, &mask) >> - || can_be_built_by_li_lis_and_rldicr (c, &shift, &mask) >> - || can_be_built_by_li_and_rldic (c, &shift, &mask)) >> + else if (can_be_built_by_lilis_and_rldicX (c, &shift, &mask)) >> { >> temp = !can_create_pseudo_p () ? dest : gen_reg_rtx (DImode); >> unsigned HOST_WIDE_INT imm = (c | ~mask); ^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2023-10-25 9:40 UTC | newest] Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2023-10-25 2:00 [PATCH 1/3]rs6000: update num_insns_constant for 2 insns Jiufu Guo 2023-10-25 2:00 ` [PATCH 2/3]rs6000: using 'pli' to load 34bit-constant Jiufu Guo 2023-10-25 2:33 ` Kewen.Lin 2023-10-25 7:37 ` Jiufu Guo 2023-10-25 2:00 ` [PATCH 3/3]rs6000: split complicate constant to constant pool Jiufu Guo 2023-10-25 2:43 ` Kewen.Lin 2023-10-25 8:14 ` Jiufu Guo 2023-10-25 8:29 ` Kewen.Lin 2023-10-25 9:40 ` Jiufu Guo 2023-10-25 2:27 ` [PATCH 1/3]rs6000: update num_insns_constant for 2 insns Kewen.Lin 2023-10-25 7:27 ` Jiufu Guo
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).