[Bug rtl-optimization/115186] New: Suboptimal codes generated by rtl-expand for divmod

public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed

* [Bug rtl-optimization/115186] New: Suboptimal codes generated by rtl-expand for divmod
@ 2024-05-22 10:20 dizhao at os dot amperecomputing.com
  2024-05-22 10:28 ` [Bug rtl-optimization/115186] " dizhao at os dot amperecomputing.com
  0 siblings, 1 reply; 2+ messages in thread
From: dizhao at os dot amperecomputing.com @ 2024-05-22 10:20 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115186

            Bug ID: 115186
           Summary: Suboptimal codes generated by rtl-expand for divmod
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: dizhao at os dot amperecomputing.com
  Target Milestone: ---

For the code below:

        typedef unsigned short int uint16_t;
        typedef unsigned char uint8_t;
        typedef long unsigned int size_t;

        /* Reproducer: compute a Fletcher-16 style checksum over the LEN bytes
           starting at DATA.  Two running sums are maintained, each reduced
           modulo 255 on every iteration, and the result packs sum2 into the
           upper byte and sum1 into the lower byte of the returned value.  */
        uint16_t fletcher16(const uint8_t *data, const size_t len)
        {
                register uint16_t sum1 = 0, sum2 = 0;
                register size_t i;

                for (i = 0; i < len; i++) {
                        /* Each "% 255" is a remainder by a constant; how that
                           operation is expanded is the subject of this report.  */
                        sum1 = (sum1 + data[i]) % 255;
                        sum2 = (sum2 + sum1) % 255;
                }
                /* The shift result is truncated to 16 bits before sum1 is
                   OR-ed into the low bits.  */
                return ((uint16_t)(sum2 << 8)) | sum1;
        }

, when compiled with "-O3 -mcpu=ampere1a", shift+add is used instead of div.
The dump file at 266r.expand shows:

   20: NOTE_INSN_BASIC_BLOCK 4
   21: debug i => r114:DI-r116:DI
   22: debug sum2 => r111:SI#0
   23: debug sum1 => r110:SI#0
   24: debug begin stmt marker
   25: r118:SI=zero_extend([r114:DI])
   26: r119:SI=r118:SI+r110:SI
   27: r120:DI=zero_extend(r119:SI)
   28: r121:DI=r120:DI
   29: r122:DI=r121:DI<<0x8
   30: r123:DI=r122:DI+r120:DI
      REG_EQUAL r120:DI*0x101
   31: r124:DI=r123:DI<<0x10
   32: r125:DI=r123:DI+r124:DI
      REG_EQUAL r120:DI*0x1010101
   33: r126:DI=r125:DI<<0x7
   34: r127:DI=r126:DI+r120:DI
      REG_EQUAL r120:DI*0x80808081
   35: r128:DI=r127:DI 0>>0x20
   36: r104:SI=r128:DI#0 0>>0x7
      REG_EQUAL udiv(r119:SI,0xff)
   ...

However, using a mult instruction is better, as in the following (this result
can be produced with "-mtune=neoverse-n1"):

   20: NOTE_INSN_BASIC_BLOCK 4
   21: debug i => r114:DI-r116:DI
   22: debug sum2 => r111:SI#0
   23: debug sum1 => r110:SI#0
   24: debug begin stmt marker
   25: r118:SI=zero_extend([r114:DI])
   26: r119:SI=r118:SI+r110:SI
   27: r121:SI=0xffffffff80808081
   28: r120:DI=zero_extend(r119:SI)*zero_extend(r121:SI)
   29: r122:DI=r120:DI 0>>0x20
   30: r104:SI=r122:DI#0 0>>0x7
      REG_EQUAL udiv(r119:SI,0xff)
   ...

The problem is, in expmed.cc:expmed_mult_highpart, when the result here is 0:
      /* See whether the specialized multiplication optabs are
         cheaper than the shift/add version.  */
      tem = expmed_mult_highpart_optab (mode, op0, narrow_op1, target,
                                        unsignedp,

, the "tem" produced by the code that follows (shift/add version) can be more
expensive than the result of:
      return expmed_mult_highpart_optab (mode, op0, op1, target, unsignedp,
                                     max_cost);

For -mcpu=ampere1a, the estimated cost of the former is 36, the cost of the
latter is 28.

^ permalink raw reply	[flat|nested] 2+ messages in thread

* [Bug rtl-optimization/115186] Suboptimal codes generated by rtl-expand for divmod
  2024-05-22 10:20 [Bug rtl-optimization/115186] New: Suboptimal codes generated by rtl-expand for divmod dizhao at os dot amperecomputing.com
@ 2024-05-22 10:28 ` dizhao at os dot amperecomputing.com
  0 siblings, 0 replies; 2+ messages in thread
From: dizhao at os dot amperecomputing.com @ 2024-05-22 10:28 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115186

--- Comment #1 from Di Zhao <dizhao at os dot amperecomputing.com> ---
The rough fix below handles this case (not sure if this is the right way):

diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index 50d22762cae..bf42a0ff5ca 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -3976,7 +3976,10 @@ expmed_mult_highpart (scalar_int_mode mode, rtx op0, rtx
op1,
   bool sign_adjust = false;
   enum mult_variant variant;
   struct algorithm alg;
-  rtx narrow_op1, tem;
+  rtx narrow_op1, tem, tem2;
+  rtx_insn *shift_add_insns;
+  rtx_insn *mult_insns;
+  unsigned shift_add_cost, mult_cost;

   /* We can't support modes wider than HOST_BITS_PER_INT.  */
   gcc_assert (HWI_COMPUTABLE_MODE_P (mode));
@@ -4014,6 +4017,7 @@ expmed_mult_highpart (scalar_int_mode mode, rtx op0, rtx
op1,
       if (tem)
        return tem;

+      start_sequence ();
       tem = convert_to_mode (wider_mode, op0, unsignedp);
       tem = expand_mult_const (wider_mode, tem, cnst1, 0, &alg, variant);
       tem = extract_high_half (mode, tem);
@@ -4021,8 +4025,29 @@ expmed_mult_highpart (scalar_int_mode mode, rtx op0, rtx
op1,
       /* Adjust result for signedness.  */
       if (sign_adjust)
        tem = force_operand (gen_rtx_MINUS (mode, tem, op0), tem);
+      shift_add_insns = get_insns ();
+      end_sequence ();
+      shift_add_cost = seq_cost (shift_add_insns, speed);
+      
+      start_sequence ();
+      tem2 = expmed_mult_highpart_optab (mode, op0, op1, target, unsignedp,
+                                            max_cost);
+      mult_insns = get_insns ();
+      end_sequence ();
+      mult_cost = seq_cost (mult_insns, speed);

-      return tem;
+      if (tem2 && mult_cost < shift_add_cost)
+       {
+         emit_insn (mult_insns);
+         return tem2;
+       }
+      else if (shift_add_cost < max_cost)
+       {
+         emit_insn (shift_add_insns);
+         return tem;
+       }
+      else
+       return 0;
     }
   return expmed_mult_highpart_optab (mode, op0, narrow_op1, target,
                                     unsignedp, max_cost);

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2024-05-22 10:28 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-22 10:20 [Bug rtl-optimization/115186] New: Suboptimal codes generated by rtl-expand for divmod dizhao at os dot amperecomputing.com
2024-05-22 10:28 ` [Bug rtl-optimization/115186] " dizhao at os dot amperecomputing.com

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).