public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug rtl-optimization/113597] New: [14 Regression] aarch64: Significant code quality regression since r14-8346-ga98d5130a6dcff
@ 2024-01-25 10:53 acoplan at gcc dot gnu.org
  2024-01-25 11:01 ` [Bug rtl-optimization/113597] " rguenth at gcc dot gnu.org
                   ` (15 more replies)
  0 siblings, 16 replies; 17+ messages in thread
From: acoplan at gcc dot gnu.org @ 2024-01-25 10:53 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113597

            Bug ID: 113597
           Summary: [14 Regression] aarch64: Significant code quality
                    regression since r14-8346-ga98d5130a6dcff
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: acoplan at gcc dot gnu.org
  Target Milestone: ---

The following testcase shows a significant regression in code quality
since r14-8346-ga98d5130a6dcff2ed4db371e500550134777b8cf on aarch64:

$ cat t.cc
#include <arm_neon.h>
typedef struct {
  float b;
  float c;
} d;
template <uint16_t e> void f(uint16_t g, d *u, d *v) {
  uint16_t j, l = j = e * e;
  float32_t b[j];
  float32_t c[l];
  float32x4_t m[j];
  for (int i = 0; i < j; i++)
    m[i] = vdupq_n_f32(0.F);
  float32x4_t n[l];
  for (int i = 0; i < l; i++)
    n[i] = vdupq_n_f32(0.F);
  for (uint16_t k = 0; k < g; k += 2) {
    float32x4_t o[e];
    for (int i = 0; i < e; i++)
      o[i] = vld1q_f32((float32_t *)&u[k]);
    int idx = 0;
    for (int a = 0; a < e; a++)
      for (int ah = a; ah < e; ah++)
        m[idx] = vfmaq_f32(m[idx], o[a], o[ah]);
    float32x4_t p[e];
    for (int i; i; i++)
      for (int a; a;)
        for (int ah;;)
          vfmsq_f32(n[idx], o[a], p[ah]);
  }
  for (int i = 0; i < j; i++)
    b[i] = vaddvq_f32(m[i]);
  for (int i = 0; i < l; i++)
    c[i] = vaddvq_f32(n[i]);
  constexpr uint16_t q(e * e);
  float32x4_t r[q];
  float32x2_t s;
  r[4] = float32x4_t{b[5] - c[3]};
  for (int i = 0; i < q; i++)
    vst1q_f32((float32_t *)&v[2 * i], r[i]);
  if (e % 2)
    vst1_f32((float32_t *)v, s);
}
void t() {
  d v, u;
  f<4>(0, &u, &v);
}

$ cat cmp.sh
#!/bin/bash
set -e

BEFORE=/work/builds/r14-8345/gcc
AFTER=/work/builds/r14-8346/gcc
SRC=t.cc

$BEFORE/xgcc -B $BEFORE -c -S -o before.s $SRC -Wall -Werror -Ofast \
  -mcpu=neoverse-v2
$AFTER/xgcc -B $AFTER -c -S -o after.s $SRC -Wall -Werror -Ofast \
  -mcpu=neoverse-v2

diff -u before.s after.s

$ ./cmp.sh
--- before.s    2024-01-25 10:35:56.977090552 +0000
+++ after.s     2024-01-25 10:35:57.385086341 +0000
@@ -9,16 +9,47 @@
 _Z1fILt4EEvtP1dS1_:
 .LFB3918:
        .cfi_startproc
-       ands    w0, w0, 65535
+       movi    v31.4s, 0
        sub     sp, sp, #768
        .cfi_def_cfa_offset 768
+       ands    w0, w0, 65535
        mov     w3, 0
+       stp     q31, q31, [sp, 256]
+       stp     q31, q31, [sp, 288]
+       stp     q31, q31, [sp, 320]
+       stp     q31, q31, [sp, 352]
+       stp     q31, q31, [sp, 384]
+       stp     q31, q31, [sp, 416]
+       stp     q31, q31, [sp, 448]
+       stp     q31, q31, [sp, 480]
+       stp     q31, q31, [sp, 512]
+       stp     q31, q31, [sp, 544]
+       stp     q31, q31, [sp, 576]
+       stp     q31, q31, [sp, 608]
+       stp     q31, q31, [sp, 640]
+       stp     q31, q31, [sp, 672]
+       stp     q31, q31, [sp, 704]
+       stp     q31, q31, [sp, 736]
+       movi    v31.4s, 0
        beq     .L3
        .p2align 5,,15
 .L2:
-       add     w1, w3, 2
-       and     w3, w1, 65535
-       cmp     w0, w1, uxth
+       ubfiz   x5, x3, 3, 16
+       add     w4, w3, 2
+       and     w3, w4, 65535
+       ldr     q30, [x1, x5]
+       fmla    v31.4s, v30.4s, v30.4s
+       fmla    v31.4s, v30.4s, v30.4s
+       fmla    v31.4s, v30.4s, v30.4s
+       fmla    v31.4s, v30.4s, v30.4s
+       fmla    v31.4s, v30.4s, v30.4s
+       fmla    v31.4s, v30.4s, v30.4s
+       fmla    v31.4s, v30.4s, v30.4s
+       fmla    v31.4s, v30.4s, v30.4s
+       fmla    v31.4s, v30.4s, v30.4s
+       fmla    v31.4s, v30.4s, v30.4s
+       str     q31, [sp, 256]
+       cmp     w0, w4, uxth
        bhi     .L2
 .L3:
        ldp     q30, q31, [sp]

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2024-05-07  7:44 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-01-25 10:53 [Bug rtl-optimization/113597] New: [14 Regression] aarch64: Significant code quality regression since r14-8346-ga98d5130a6dcff acoplan at gcc dot gnu.org
2024-01-25 11:01 ` [Bug rtl-optimization/113597] " rguenth at gcc dot gnu.org
2024-01-25 11:01 ` rguenth at gcc dot gnu.org
2024-01-25 11:05 ` acoplan at gcc dot gnu.org
2024-01-25 11:10 ` acoplan at gcc dot gnu.org
2024-01-25 11:10 ` acoplan at gcc dot gnu.org
2024-01-25 11:16 ` pinskia at gcc dot gnu.org
2024-01-25 11:27 ` acoplan at gcc dot gnu.org
2024-01-25 11:32 ` acoplan at gcc dot gnu.org
2024-01-25 11:38 ` pinskia at gcc dot gnu.org
2024-01-25 11:40 ` acoplan at gcc dot gnu.org
2024-01-25 11:56 ` rguenth at gcc dot gnu.org
2024-01-25 13:41 ` rguenth at gcc dot gnu.org
2024-01-25 14:03 ` rguenth at gcc dot gnu.org
2024-01-29 13:56 ` rguenth at gcc dot gnu.org
2024-03-07 20:45 ` law at gcc dot gnu.org
2024-05-07  7:44 ` [Bug rtl-optimization/113597] [14/15 " rguenth at gcc dot gnu.org

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).