public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/115534] New: intermediate stack use not eliminated
@ 2024-06-18  7:48 tnfchris at gcc dot gnu.org
  2024-06-18 12:36 ` [Bug tree-optimization/115534] " pinskia at gcc dot gnu.org
                   ` (5 more replies)
  0 siblings, 6 replies; 7+ messages in thread
From: tnfchris at gcc dot gnu.org @ 2024-06-18  7:48 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115534

            Bug ID: 115534
           Summary: intermediate stack use not eliminated
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: tnfchris at gcc dot gnu.org
  Target Milestone: ---

Consider the following example:

#include <stdint.h>

typedef struct _pixel_t
{
  double red, green, blue, opacity;
} pixel_t;

typedef struct _PixelPacket
{
  unsigned short blue, green, red, opacity;
} PixelPacket;

pixel_t f (unsigned height, unsigned width, unsigned virt_width,
           uint8_t *restrict k, const PixelPacket *restrict k_pixels)
{
    pixel_t result = {};
    for (unsigned u=0; u < (width & -4); u++, k--) {
        result.red     += (*k)*k_pixels[u].red;
        result.green   += (*k)*k_pixels[u].green;
        result.blue    += (*k)*k_pixels[u].blue;
        result.opacity += (*k)*k_pixels[u].opacity;
        k_pixels += virt_width;
    }
    return result;
}

---

When compiled with -O3 this vectorizes well, but the epilogue code is very
inefficient:

        fadd    v29.2d, v29.2d, v30.2d
        fadd    v28.2d, v28.2d, v31.2d
        cmp     w5, w1
        bhi     .L3
        mov     v31.16b, v28.16b
        ins     v31.d[1], v29.d[1]
        ins     v29.d[1], v28.d[1]
        stp     q31, q29, [sp, 32]
        ldp     d0, d1, [sp, 32]
        ldp     d2, d3, [sp, 48]
        add     sp, sp, 64
        ret
.L4:
        movi    v29.2d, 0
        mov     v31.16b, v29.16b
        stp     q31, q29, [sp, 32]
        ldp     d0, d1, [sp, 32]
        ldp     d2, d3, [sp, 48]
        add     sp, sp, 64
        ret

That is, it goes through the stack to create the return registers.  It looks
like at the GIMPLE level we still have the store:

  <bb 5> [local count: 105119324]:
  _33 = VEC_PERM_EXPR <vect__10.16_41, vect__10.16_42, { 0, 3 }>;
  _31 = VEC_PERM_EXPR <vect__10.16_42, vect__10.16_41, { 0, 3 }>;

  <bb 6> [local count: 118111600]:
  # vect_result_red_64.18_28 = PHI <_33(5), { 0.0, 0.0 }(2)>
  # vect_result_red_64.18_105 = PHI <_31(5), { 0.0, 0.0 }(2)>
  MEM <vector(2) double> [(double *)&D.4535] = vect_result_red_64.18_28;
  MEM <vector(2) double> [(double *)&D.4535 + 16B] = vect_result_red_64.18_105;
  return D.4535;

clang is able to generate much better code here:

        fadd    v0.2d, v0.2d, v1.2d
        fadd    v2.2d, v2.2d, v3.2d
        b.ne    .LBB0_2
.LBB0_3:
        mov     d1, v2.d[1]
        mov     d3, v0.d[1]
        ret

The vectorized code gets reg-alloc'ed so that d0 and d2 are already in the right
registers at the end of the vector loop, and the epilogue only has to split the
registers up to get d1 and d3.

I think GCC would generate the same code if it were to elide the intermediate
stack store.

See https://godbolt.org/z/ocqchWWs5

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2024-06-18 18:11 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-06-18  7:48 [Bug tree-optimization/115534] New: intermediate stack use not eliminated tnfchris at gcc dot gnu.org
2024-06-18 12:36 ` [Bug tree-optimization/115534] " pinskia at gcc dot gnu.org
2024-06-18 12:36 ` [Bug middle-end/115534] " pinskia at gcc dot gnu.org
2024-06-18 12:49 ` tnfchris at gcc dot gnu.org
2024-06-18 17:26 ` pinskia at gcc dot gnu.org
2024-06-18 17:29 ` pinskia at gcc dot gnu.org
2024-06-18 18:11 ` tnfchris at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).