public inbox for gcc-bugs@sourceware.org
* [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop
@ 2024-01-17 12:38 juzhe.zhong at rivai dot ai
  2024-01-17 12:45 ` [Bug tree-optimization/113441] " juzhe.zhong at rivai dot ai
                   ` (46 more replies)
  0 siblings, 47 replies; 48+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2024-01-17 12:38 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

            Bug ID: 113441
           Summary: [14 Regression] Fail to fold the last element with
                    multiple loop
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: juzhe.zhong at rivai dot ai
  Target Milestone: ---

Hi, we found a regression between GCC-12 and GCC-14 while evaluating our
downstream RVV GCC against upstream RVV GCC.

The regression happens not only with our RVV GCC but also with ARM SVE GCC.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int16_t array1[4][1 * 273 * 12 * 2];

int16_t array2[4][1 * 273 * 12 * 2];
int16_t array3[4][4 * 2];

void
foo (uint8_t a, uint16_t b)
{
  int32_t sum[2];
  int32_t result[4][2];
  uint16_t j = 0;
  uint8_t i = 0;
  uint16_t l = 0;
  uint16_t k = 0;
  uint32_t m = 0;

  for (i = 0; i < 4; i++)
    {
      m = 0;
      for (j = 0; j < a; j++)
        {
          for (k = 0; k < b; k++)
            {
              for (l = 0; l < 12; l++)
                {
                  result[0][0] = array1[0][2 * m] * array3[i][0]
                                 - array1[0][2 * m + 1] * array3[i][1];
                  result[0][1] = array1[0][2 * m + 1] * array3[i][0]
                                 + array1[0][2 * m] * array3[i][1];

                  result[1][0] = array1[1][2 * m] * array3[i][2]
                                 - array1[1][2 * m + 1] * array3[i][3];
                  result[1][1] = array1[1][2 * m + 1] * array3[i][2]
                                 + array1[1][2 * m] * array3[i][3];

                  result[2][0] = array1[2][2 * m] * array3[i][4]
                                 - array1[2][2 * m + 1] * array3[i][5];
                  result[2][1] = array1[2][2 * m + 1] * array3[i][4]
                                 + array1[2][2 * m] * array3[i][5];

                  result[3][0] = array1[3][2 * m] * array3[i][6]
                                 - array1[3][2 * m + 1] * array3[i][7];
                  result[3][1] = array1[3][2 * m + 1] * array3[i][6]
                                 + array1[3][2 * m] * array3[i][7];
                  sum[0] = result[0][0] + result[1][0] + result[2][0]
                           + result[3][0];
                  sum[1] = result[0][1] + result[1][1] + result[2][1]
                           + result[3][1];
                  array2[i][2 * m] = (int16_t) (sum[0] >> 15);
                  array2[i][2 * m + 1] = (int16_t) (sum[1] >> 15);
                  m++;
                }
            }
        }
    }
}

Here is the reference:

https://godbolt.org/z/hfqWvdf8e

Here is the analysis:

First, note that the inner loop has 12 iterations (for (l = 0; l < 12; l++)).

GCC 14 processes 11 elements in vector code and leaves the last element to
scalar code:

```
        mov     x1, 11           ---> process 11 elements
        whilelo p5.s, xzr, x1
        ...
        vector codes
        ...
        scalar codes of the last element:
        ldrsh   w8, [x0, x5, lsl 1]
        add     x6, x5, x10
        ldrsh   w14, [x0, x7, lsl 1]
        add     x1, x4, x10
        ldrsh   w7, [x0, x4, lsl 1]
        add     x12, x5, x27
        ldrsh   w2, [x0, x2, lsl 1]
        add     x5, x28, x5
        mul     w11, w24, w8
        ldrsh   w13, [x0, x6, lsl 1]
        ldrsh   w1, [x0, x1, lsl 1]
        add     x6, x4, x27
        msub    w11, w21, w7, w11
        ldrsh   w12, [x0, x12, lsl 1]
        mul     w7, w24, w7
        add     x4, x28, x4
        madd    w8, w21, w8, w7
        ldrsh   w6, [x0, x6, lsl 1]
        mul     w7, w20, w14
        add     w3, w3, 24
        msub    w7, w19, w2, w7
        mul     w2, w20, w2
        add     w7, w7, w11
        mul     w11, w18, w13
        msub    w11, w17, w1, w11
        madd    w2, w19, w14, w2
        add     w11, w7, w11
        mul     w1, w18, w1
        mul     w7, w16, w12
        add     w2, w2, w8
        msub    w7, w15, w6, w7
        madd    w1, w17, w13, w1
        mul     w6, w16, w6
        add     w11, w11, w7
        madd    w6, w15, w12, w6
        add     w1, w2, w1
        asr     w11, w11, 15
        strh    w11, [x9, x5, lsl 1]
        add     w1, w1, w6
        asr     w1, w1, 15
        strh    w1, [x9, x4, lsl 1]
        cmp     w30, w3
        bne     .L4
        ldp     w2, w7, [sp, 108]
        ldr     w3, [sp, 116]
        add     w1, w2, 1
        add     w30, w30, w7
        ldr     x8, [sp, 96]
        and     w2, w1, 65535
        cmp     w3, w1, uxth
        bne     .L6
        ldr     x3, [sp, 120]
        add     x23, x23, x22
        ldr     w5, [sp, 116]
        add     x8, x8, 16
        add     x3, x3, 1
        cmp     x3, 4
        bne     .L3

```

GCC-12 has much better codegen (it folds all 12 elements into the vector code):

```
        mov     x1, 12   ----> process 12 elements in vector.
        ptrue   p0.b, vl64
        whilelo p1.s, xzr, x1
        ...
        vector codes
        ...
```

No scalar epilogue.

This benchmark shows an over-70% performance drop from GCC-12 to GCC-14 for
both RVV and ARM SVE.
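
To make the difference concrete, here is a hand-written C sketch of the two
tail-handling strategies (this is my illustration, not compiler output; VL = 16
assumes 16 lanes of 32-bit elements for a 512-bit SVE vector, and a plain copy
stands in for the real loop body):

```
#include <stdint.h>

#define N  12   /* inner l-loop trip count in the testcase */
#define VL 16   /* assumed number of lanes */

/* GCC-12 style: the whole loop is predicated (whilelo with bound 12), so
   all 12 elements are handled in masked vector code and no scalar
   epilogue is needed.  */
void
predicated_tail (int32_t *dst, const int32_t *src)
{
  for (int l = 0; l < N; l += VL)
    for (int lane = 0; lane < VL; lane++)
      if (l + lane < N)                    /* the loop mask */
        dst[l + lane] = src[l + lane];
}

/* GCC-14 style: the gapped access group requires a scalar epilogue, so
   the predicated vector code only covers 11 elements (whilelo with bound
   11) and the last element is handled by scalar code.  */
void
scalar_epilogue_tail (int32_t *dst, const int32_t *src)
{
  int covered = N - 1;
  for (int l = 0; l < covered; l += VL)
    for (int lane = 0; lane < VL; lane++)
      if (l + lane < covered)              /* mask covers only 11 elements */
        dst[l + lane] = src[l + lane];
  for (int l = covered; l < N; l++)        /* scalar epilogue */
    dst[l] = src[l];
}
```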

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
@ 2024-01-17 12:45 ` juzhe.zhong at rivai dot ai
  2024-01-17 13:22 ` [Bug tree-optimization/113441] [13/14 " rguenth at gcc dot gnu.org
                   ` (45 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2024-01-17 12:45 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #1 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
GCC trunk for RVV also processes only 11 elements in vector code:

https://godbolt.org/z/q9bb8Gj4G

```
        vsetivli        zero,11,e32,m1,ta,ma
        ...
        vector codes
        ...
        scalar codes of the last element:
        lh      s8,0(t4)
        lh      t4,0(t1)
        ld      t1,16(sp)
        add     a3,a4,t1
        lh      a7,0(a7)
        add     t3,a4,s9
        slli    t3,t3,1
        slli    a3,a3,1
        add     t3,a2,t3
        add     a3,a2,a3
        add     a1,a5,t5
        mulw    t0,s6,s8
        lh      t5,0(t3)
        lh      t3,0(a3)
        add     a3,a5,t1
        add     a0,a5,s9
        slli    a0,a0,1
        add     a0,a2,a0
        lh      a0,0(a0)
        slli    a1,a1,1
        add     a1,a2,a1
        mulw    t1,s5,a7
        lh      a1,0(a1)
        slli    a3,a3,1
        add     a3,a2,a3
        lh      a3,0(a3)
        add     a4,s7,a4
        slli    a4,a4,1
        add     a4,t6,a4
        add     a5,s7,a5
        slli    a5,a5,1
        mulw    s8,s5,s8
        subw    t0,t0,t1
        add     a5,t6,a5
        addiw   a6,a6,24
        mulw    a7,s6,a7
        mulw    t1,s4,t5
        addw    a7,a7,s8
        mulw    t5,s3,t5
        mulw    s8,s3,a0
        mulw    a0,s4,a0
        subw    t1,t1,s8
        addw    t1,t0,t1
        addw    a0,a0,t5
        addw    a0,a7,a0
        mulw    t0,s2,t4
        mulw    a7,s1,a1
        mulw    t4,s1,t4
        subw    t0,t0,a7
        addw    t0,t0,t1
        mulw    a1,s2,a1
        addw    a1,a1,t4
        mulw    a7,s0,t3
        addw    a1,a1,a0
        mulw    a0,t2,a3
        subw    a7,a7,a0
        addw    a7,a7,t0
        sraiw   a7,a7,15
        sh      a7,0(a4)
        mulw    t3,t2,t3
        mulw    a4,s0,a3
        addw    a4,a4,t3
        addw    a4,a4,a1
        sraiw   a4,a4,15
        sh      a4,0(a5)
        bne     a6,s11,.L4
        ld      a0,56(sp)
        addiw   a5,a0,1
        slli    a0,a5,48
        ld      t4,72(sp)
        ld      t1,64(sp)
        srli    a0,a0,48
        ld      a5,80(sp)
        ld      a7,48(sp)
        addw    t1,t4,t1
        addw    s11,t4,a6
        bne     a0,a5,.L6

```

This is the same issue as on ARM SVE; I think the tail scalar operations can
be folded into the vector operations.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [13/14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
  2024-01-17 12:45 ` [Bug tree-optimization/113441] " juzhe.zhong at rivai dot ai
@ 2024-01-17 13:22 ` rguenth at gcc dot gnu.org
  2024-01-17 14:07 ` juzhe.zhong at rivai dot ai
                   ` (44 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-01-17 13:22 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
           Keywords|                            |missed-optimization
   Target Milestone|---                         |13.3
            Summary|[14 Regression] Fail to     |[13/14 Regression] Fail to
                   |fold the last element with  |fold the last element with
                   |multiple loop               |multiple loop

--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
It might be

t.c:29:22: missed:   Data access with gaps requires scalar epilogue loop

required when vectorizing the load groups.  We end up with

t.c:29:22: note:   === vect_analyze_data_ref_accesses ===
t.c:29:22: note:   Detected single element interleaving array1[0][_8] step 4
t.c:29:22: note:   Detected single element interleaving array1[1][_8] step 4
t.c:29:22: note:   Detected single element interleaving array1[2][_8] step 4
t.c:29:22: note:   Detected single element interleaving array1[3][_8] step 4
t.c:29:22: note:   Detected single element interleaving array1[0][_1] step 4
t.c:29:22: note:   Detected single element interleaving array1[1][_1] step 4
t.c:29:22: note:   Detected single element interleaving array1[2][_1] step 4
t.c:29:22: note:   Detected single element interleaving array1[3][_1] step 4
t.c:29:22: missed:   not consecutive access array2[_4][_8] = _69;
t.c:29:22: note:   using strided accesses
t.c:29:22: missed:   not consecutive access array2[_4][_1] = _67;
t.c:29:22: note:   using strided accesses

it's better to use signed 'm' (or uint64_t I guess), then we get

t.c:29:22: note:   === vect_analyze_data_ref_accesses ===
t.c:29:22: note:   Detected interleaving load array1[0][_1] and array1[0][_8]
t.c:29:22: note:   Detected interleaving load array1[1][_1] and array1[1][_8]
t.c:29:22: note:   Detected interleaving load array1[2][_1] and array1[2][_8]
t.c:29:22: note:   Detected interleaving load array1[3][_1] and array1[3][_8]
t.c:29:22: note:   Detected interleaving store array2[_4][_1] and
array2[_4][_8]
t.c:29:22: note:   Detected interleaving load of size 2 
t.c:29:22: note:        _2 = array1[0][_1];
t.c:29:22: note:        _9 = array1[0][_8];
t.c:29:22: note:   Detected interleaving load of size 2
t.c:29:22: note:        _18 = array1[1][_1];
t.c:29:22: note:        _23 = array1[1][_8];
t.c:29:22: note:   Detected interleaving load of size 2
t.c:29:22: note:        _32 = array1[2][_1];
t.c:29:22: note:        _37 = array1[2][_8];
t.c:29:22: note:   Detected interleaving load of size 2
t.c:29:22: note:        _46 = array1[3][_1];
t.c:29:22: note:        _51 = array1[3][_8];
t.c:29:22: note:   Detected interleaving store of size 2
t.c:29:22: note:        array2[_4][_1] = _67;
t.c:29:22: note:        array2[_4][_8] = _69;

and no gap peeling required.
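
For concreteness, the suggested change to the testcase is just the declaration
of 'm' (my reading of the suggestion above, not a posted patch; <stdint.h> is
already included by the testcase):

```
  /* In foo(): use a signed (or 64-bit) induction variable so the 2*m and
     2*m + 1 offsets cannot wrap in the index type, letting the loads and
     stores form interleaving groups instead of strided accesses.  */
  int64_t m = 0;   /* was: uint32_t m = 0; */
```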

I guess you're saying GCC 13 is bad as well?

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [13/14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
  2024-01-17 12:45 ` [Bug tree-optimization/113441] " juzhe.zhong at rivai dot ai
  2024-01-17 13:22 ` [Bug tree-optimization/113441] [13/14 " rguenth at gcc dot gnu.org
@ 2024-01-17 14:07 ` juzhe.zhong at rivai dot ai
  2024-01-17 14:35 ` rguenth at gcc dot gnu.org
                   ` (43 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2024-01-17 14:07 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #3 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
(In reply to Richard Biener from comment #2)
> It might be
> 
> t.c:29:22: missed:   Data access with gaps requires scalar epilogue loop
> 
> required when vectorizing the load groups.  We end up with
> 
> t.c:29:22: note:   === vect_analyze_data_ref_accesses ===
> t.c:29:22: note:   Detected single element interleaving array1[0][_8] step 4
> t.c:29:22: note:   Detected single element interleaving array1[1][_8] step 4
> t.c:29:22: note:   Detected single element interleaving array1[2][_8] step 4
> t.c:29:22: note:   Detected single element interleaving array1[3][_8] step 4
> t.c:29:22: note:   Detected single element interleaving array1[0][_1] step 4
> t.c:29:22: note:   Detected single element interleaving array1[1][_1] step 4
> t.c:29:22: note:   Detected single element interleaving array1[2][_1] step 4
> t.c:29:22: note:   Detected single element interleaving array1[3][_1] step 4
> t.c:29:22: missed:   not consecutive access array2[_4][_8] = _69;
> t.c:29:22: note:   using strided accesses
> t.c:29:22: missed:   not consecutive access array2[_4][_1] = _67;
> t.c:29:22: note:   using strided accesses
> 
> it's better to use signed 'm' (or uint64_t I guess), then we get
> 
> t.c:29:22: note:   === vect_analyze_data_ref_accesses ===
> t.c:29:22: note:   Detected interleaving load array1[0][_1] and array1[0][_8]
> t.c:29:22: note:   Detected interleaving load array1[1][_1] and array1[1][_8]
> t.c:29:22: note:   Detected interleaving load array1[2][_1] and array1[2][_8]
> t.c:29:22: note:   Detected interleaving load array1[3][_1] and array1[3][_8]
> t.c:29:22: note:   Detected interleaving store array2[_4][_1] and
> array2[_4][_8]
> t.c:29:22: note:   Detected interleaving load of size 2 
> t.c:29:22: note:        _2 = array1[0][_1];
> t.c:29:22: note:        _9 = array1[0][_8];
> t.c:29:22: note:   Detected interleaving load of size 2
> t.c:29:22: note:        _18 = array1[1][_1];
> t.c:29:22: note:        _23 = array1[1][_8];
> t.c:29:22: note:   Detected interleaving load of size 2
> t.c:29:22: note:        _32 = array1[2][_1];
> t.c:29:22: note:        _37 = array1[2][_8];
> t.c:29:22: note:   Detected interleaving load of size 2
> t.c:29:22: note:        _46 = array1[3][_1];
> t.c:29:22: note:        _51 = array1[3][_8];
> t.c:29:22: note:   Detected interleaving store of size 2
> t.c:29:22: note:        array2[_4][_1] = _67;
> t.c:29:22: note:        array2[_4][_8] = _69;
> 
> and no gap peeling required.
> 
> I guess you say GCC 13 is bad as well?

Sorry, I hadn't checked GCC 13 before, but I have now investigated.

I can confirm GCC 13.2.0 doesn't have the regression:

https://godbolt.org/z/ndaWToaxP

In GCC 13.2.0 there is no occurrence of "requires scalar epilogue loop",
whereas GCC-14 reports "requires scalar epilogue loop" 72 times.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [13/14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (2 preceding siblings ...)
  2024-01-17 14:07 ` juzhe.zhong at rivai dot ai
@ 2024-01-17 14:35 ` rguenth at gcc dot gnu.org
  2024-01-22 12:38 ` juzhe.zhong at rivai dot ai
                   ` (42 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-01-17 14:35 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
      Known to work|                            |13.2.0
      Known to fail|                            |14.0

--- Comment #4 from Richard Biener <rguenth at gcc dot gnu.org> ---
let's wait for bisection

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [13/14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (3 preceding siblings ...)
  2024-01-17 14:35 ` rguenth at gcc dot gnu.org
@ 2024-01-22 12:38 ` juzhe.zhong at rivai dot ai
  2024-01-22 12:41 ` tnfchris at gcc dot gnu.org
                   ` (41 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2024-01-22 12:38 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #5 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
Confirmed: at the Nov 1 commit the regression is not present.

https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=eac0917bd3d2ead4829d56c8f2769176087c7b3d

This commit is OK and shows no regression.

Still bisecting manually...

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [13/14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (4 preceding siblings ...)
  2024-01-22 12:38 ` juzhe.zhong at rivai dot ai
@ 2024-01-22 12:41 ` tnfchris at gcc dot gnu.org
  2024-01-22 12:42 ` juzhe.zhong at rivai dot ai
                   ` (40 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: tnfchris at gcc dot gnu.org @ 2024-01-22 12:41 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #6 from Tamar Christina <tnfchris at gcc dot gnu.org> ---
Hello,

I can bisect it if you want.  It should only take a few seconds.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [13/14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (5 preceding siblings ...)
  2024-01-22 12:41 ` tnfchris at gcc dot gnu.org
@ 2024-01-22 12:42 ` juzhe.zhong at rivai dot ai
  2024-01-22 13:19 ` juzhe.zhong at rivai dot ai
                   ` (39 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2024-01-22 12:42 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #7 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
(In reply to Tamar Christina from comment #6)
> Hello,
> 
> I can bisect it if you want. it should only take a few seconds.

OK. Thanks a lot ...

I spent 2 hours bisecting it manually but still couldn't locate the exact
commit that causes the regression...

It's great that you can bisect it easily.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [13/14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (6 preceding siblings ...)
  2024-01-22 12:42 ` juzhe.zhong at rivai dot ai
@ 2024-01-22 13:19 ` juzhe.zhong at rivai dot ai
  2024-01-22 13:52 ` [Bug tree-optimization/113441] [14 " rguenth at gcc dot gnu.org
                   ` (38 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2024-01-22 13:19 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #8 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
I believe a change between Nov and Dec causes the regression,
but I didn't continue the bisection.

Hope this information helps with your bisection.

Thanks.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (7 preceding siblings ...)
  2024-01-22 13:19 ` juzhe.zhong at rivai dot ai
@ 2024-01-22 13:52 ` rguenth at gcc dot gnu.org
  2024-01-22 16:16 ` tnfchris at gcc dot gnu.org
                   ` (37 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-01-22 13:52 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
   Target Milestone|13.3                        |14.0
            Summary|[13/14 Regression] Fail to  |[14 Regression] Fail to
                   |fold the last element with  |fold the last element with
                   |multiple loop               |multiple loop

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (8 preceding siblings ...)
  2024-01-22 13:52 ` [Bug tree-optimization/113441] [14 " rguenth at gcc dot gnu.org
@ 2024-01-22 16:16 ` tnfchris at gcc dot gnu.org
  2024-01-22 22:16 ` juzhe.zhong at rivai dot ai
                   ` (36 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: tnfchris at gcc dot gnu.org @ 2024-01-22 16:16 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #9 from Tamar Christina <tnfchris at gcc dot gnu.org> ---
So on SVE the change is in cost modelling.

The bisect landed on g:33c2b70dbabc02788caabcbc66b7baeafeb95bcf, which changed
the compiler's defaults to the new throughput-matched cost modelling used by
newer cores.

It looks like this changes which mode the compiler picks when using a fixed
register size.

This is because the new cost model (correctly) models the costs for FMAs and
promotions.

Before:

array1[0][_1] 1 times scalar_load costs 1 in prologue
(int) _2 1 times scalar_stmt costs 1 in prologue

after:

array1[0][_1] 1 times scalar_load costs 1 in prologue 
(int) _2 1 times scalar_stmt costs 0 in prologue 

and the cost goes from:

Vector inside of loop cost: 125

to

Vector inside of loop cost: 83 

so far, nothing sticks out, and in fact the profitability for VNx4QI drops from

Calculated minimum iters for profitability: 5

to

Calculated minimum iters for profitability: 3

This causes a clash, as this is now exactly the same cost as VNx2QI, which is
what it preferred before.

That then leads it to pick the higher VF.

In the end the smaller VF shows:

;; Guessed iterations of loop 4 is 0.500488. New upper bound 1.

and now we get:

Vectorization factor 16 seems too large for profile prevoiusly believed to be
consistent; reducing.  
;; Guessed iterations of loop 4 is 0.500488. New upper bound 0.
;; Scaling loop 4 with scale 66.6% (guessed) to reach upper bound 0

which I guess is the big difference.

There is a weird costing going on in the PHI nodes though:

m_108 = PHI <m_92(16), m_111(5)> 1 times vector_stmt costs 0 in body 
m_108 = PHI <m_92(16), m_111(5)> 2 times scalar_to_vec costs 0 in prologue

they have collapsed to 0, which can't be right.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (9 preceding siblings ...)
  2024-01-22 16:16 ` tnfchris at gcc dot gnu.org
@ 2024-01-22 22:16 ` juzhe.zhong at rivai dot ai
  2024-01-23  6:42 ` rguenth at gcc dot gnu.org
                   ` (35 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2024-01-22 22:16 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #10 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
(In reply to Tamar Christina from comment #9)
> So on SVE the change is cost modelling.
> 
> Bisect landed on g:33c2b70dbabc02788caabcbc66b7baeafeb95bcf which changed
> the compiler's defaults to using the new throughput matched cost modelling
> used be newer cores.
> 
> It looks like this changes which mode the compiler picks for when using a
> fixed register size.
> 
> This is because the new cost model (correctly) models the costs for FMAs and
> promotions.
> 
> Before:
> 
> array1[0][_1] 1 times scalar_load costs 1 in prologue
> int) _2 1 times scalar_stmt costs 1 in prologue
> 
> after:
> 
> array1[0][_1] 1 times scalar_load costs 1 in prologue 
> (int) _2 1 times scalar_stmt costs 0 in prologue 
> 
> and the cost goes from:
> 
> Vector inside of loop cost: 125
> 
> to
> 
> Vector inside of loop cost: 83 
> 
> so far, nothing sticks out, and in fact the profitability for VNx4QI drops
> from
> 
> Calculated minimum iters for profitability: 5
> 
> to
> 
> Calculated minimum iters for profitability: 3
> 
> This causes a clash, as this is now exactly the same cost as VNx2QI which
> used to be what it preferred before.
> 
> Which then leads it to pick the higher VF.
> 
> In the end smaller VF shows:
> 
> ;; Guessed iterations of loop 4 is 0.500488. New upper bound 1.
> 
> and now we get:
> 
> Vectorization factor 16 seems too large for profile prevoiusly believed to
> be consistent; reducing.  
> ;; Guessed iterations of loop 4 is 0.500488. New upper bound 0.
> ;; Scaling loop 4 with scale 66.6% (guessed) to reach upper bound 0
> 
> which I guess is the big difference.
> 
> There is a weird costing going on in the PHI nodes though:
> 
> m_108 = PHI <m_92(16), m_111(5)> 1 times vector_stmt costs 0 in body 
> m_108 = PHI <m_92(16), m_111(5)> 2 times scalar_to_vec costs 0 in prologue
> 
> they have collapsed to 0. which can't be right..

I don't think this change causes the regression, since the regression happens
not only on ARM SVE but also on RVV.
It should be a middle-end issue.

I believe you'd better use -fno-vect-cost-model.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (10 preceding siblings ...)
  2024-01-22 22:16 ` juzhe.zhong at rivai dot ai
@ 2024-01-23  6:42 ` rguenth at gcc dot gnu.org
  2024-01-23  8:15 ` juzhe.zhong at rivai dot ai
                   ` (34 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-01-23  6:42 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #11 from Richard Biener <rguenth at gcc dot gnu.org> ---
(In reply to Tamar Christina from comment #9)
> There is a weird costing going on in the PHI nodes though:
> 
> m_108 = PHI <m_92(16), m_111(5)> 1 times vector_stmt costs 0 in body 
> m_108 = PHI <m_92(16), m_111(5)> 2 times scalar_to_vec costs 0 in prologue
> 
> they have collapsed to 0. which can't be right..

Note this is likely because of the backend going wrong.

bool
vectorizable_phi (vec_info *,
                  stmt_vec_info stmt_info, gimple **vec_stmt,
                  slp_tree slp_node, stmt_vector_for_cost *cost_vec)
{
..

      /* For single-argument PHIs assume coalescing which means zero cost
         for the scalar and the vector PHIs.  This avoids artificially
         favoring the vector path (but may pessimize it in some cases).  */
      if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
        record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
                          vector_stmt, stmt_info, vectype, 0, vect_body);

You could check if we call this with sane values.
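
One quick way to check that (a debugging sketch for illustration, using the
usual vectorizer dump machinery; not part of any posted patch) is to print
the value right before the record_stmt_cost call quoted above and look at the
-fdump-tree-vect-details output:

```
  /* Hypothetical debugging aid in vectorizable_phi: dump the number of
     vector stmts we are about to cost so a zero or otherwise bogus value
     stands out in the vect dump.  */
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "costing PHI with %u vector stmts\n",
                     SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
```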

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (11 preceding siblings ...)
  2024-01-23  6:42 ` rguenth at gcc dot gnu.org
@ 2024-01-23  8:15 ` juzhe.zhong at rivai dot ai
  2024-01-23  8:17 ` rguenther at suse dot de
                   ` (33 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2024-01-23  8:15 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #12 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
(In reply to Richard Biener from comment #11)
> (In reply to Tamar Christina from comment #9)
> > There is a weird costing going on in the PHI nodes though:
> > 
> > m_108 = PHI <m_92(16), m_111(5)> 1 times vector_stmt costs 0 in body 
> > m_108 = PHI <m_92(16), m_111(5)> 2 times scalar_to_vec costs 0 in prologue
> > 
> > they have collapsed to 0. which can't be right..
> 
> Note this is likely because of the backend going wrong.
> 
> bool
> vectorizable_phi (vec_info *,
>                   stmt_vec_info stmt_info, gimple **vec_stmt,
>                   slp_tree slp_node, stmt_vector_for_cost *cost_vec)
> {
> ..
> 
>       /* For single-argument PHIs assume coalescing which means zero cost
>          for the scalar and the vector PHIs.  This avoids artificially
>          favoring the vector path (but may pessimize it in some cases).  */
>       if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
>         record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
>                           vector_stmt, stmt_info, vectype, 0, vect_body);
> 
> You could check if we call this with sane values.

Do you mean it's a RISC-V backend cost model issue?

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (12 preceding siblings ...)
  2024-01-23  8:15 ` juzhe.zhong at rivai dot ai
@ 2024-01-23  8:17 ` rguenther at suse dot de
  2024-01-23  8:25 ` juzhe.zhong at rivai dot ai
                   ` (32 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rguenther at suse dot de @ 2024-01-23  8:17 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #13 from rguenther at suse dot de <rguenther at suse dot de> ---
On Tue, 23 Jan 2024, juzhe.zhong at rivai dot ai wrote:

> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441
> 
> --- Comment #12 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
> (In reply to Richard Biener from comment #11)
> > (In reply to Tamar Christina from comment #9)
> > > There is a weird costing going on in the PHI nodes though:
> > > 
> > > m_108 = PHI <m_92(16), m_111(5)> 1 times vector_stmt costs 0 in body 
> > > m_108 = PHI <m_92(16), m_111(5)> 2 times scalar_to_vec costs 0 in prologue
> > > 
> > > they have collapsed to 0. which can't be right..
> > 
> > Note this is likely because of the backend going wrong.
> > 
> > bool
> > vectorizable_phi (vec_info *,
> >                   stmt_vec_info stmt_info, gimple **vec_stmt,
> >                   slp_tree slp_node, stmt_vector_for_cost *cost_vec)
> > {
> > ..
> > 
> >       /* For single-argument PHIs assume coalescing which means zero cost
> >          for the scalar and the vector PHIs.  This avoids artificially
> >          favoring the vector path (but may pessimize it in some cases).  */
> >       if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
> >         record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
> >                           vector_stmt, stmt_info, vectype, 0, vect_body);
> > 
> > You could check if we call this with sane values.
> 
> Do you mean it's RISC-V backend cost model issue ?

I responded to Tamar, which means an aarch64 cost model issue - specifically
that the PHIs appear to have no cost.  I didn't look at any of the rest.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (13 preceding siblings ...)
  2024-01-23  8:17 ` rguenther at suse dot de
@ 2024-01-23  8:25 ` juzhe.zhong at rivai dot ai
  2024-01-23 10:29 ` rguenther at suse dot de
                   ` (31 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2024-01-23  8:25 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #14 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
I just tried both GCC-13.2 and GCC-14 again with -fno-vect-cost-model.

https://godbolt.org/z/enEG3qf5K

GCC-14 requires a scalar epilogue loop, whereas GCC-13.2 doesn't.

I believe it's not a cost model issue.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (14 preceding siblings ...)
  2024-01-23  8:25 ` juzhe.zhong at rivai dot ai
@ 2024-01-23 10:29 ` rguenther at suse dot de
  2024-01-23 10:30 ` tnfchris at gcc dot gnu.org
                   ` (30 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rguenther at suse dot de @ 2024-01-23 10:29 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #15 from rguenther at suse dot de <rguenther at suse dot de> ---
On Tue, 23 Jan 2024, juzhe.zhong at rivai dot ai wrote:

> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441
> 
> --- Comment #14 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
> I just tried again both GCC-13.2 and GCC-14 with -fno-vect-cost-model.
> 
> https://godbolt.org/z/enEG3qf5K
> 
> GCC-14 requires scalar epilogue loop, whereas GCC-13.2 doesn't.
> 
> I believe it's not cost model issue.

As said, please try to bisect to the point where we started to require
the epilogue.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (15 preceding siblings ...)
  2024-01-23 10:29 ` rguenther at suse dot de
@ 2024-01-23 10:30 ` tnfchris at gcc dot gnu.org
  2024-01-23 12:32 ` tnfchris at gcc dot gnu.org
                   ` (29 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: tnfchris at gcc dot gnu.org @ 2024-01-23 10:30 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #16 from Tamar Christina <tnfchris at gcc dot gnu.org> ---
(In reply to rguenther@suse.de from comment #13)
> > > You could check if we call this with sane values.
> > 
> > Do you mean it's RISC-V backend cost model issue ?
> 
> I responded to Tamar which means a aarch64 cost model issue - the
> specific issue that the PHIs appear to have no cost.  I didn't look
> at any of the rest.

Yeah, I'll check this separately and open a different issue if need be.

(In reply to JuzheZhong from comment #14)
> I just tried again both GCC-13.2 and GCC-14 with -fno-vect-cost-model.
> 
> https://godbolt.org/z/enEG3qf5K
> 
> GCC-14 requires scalar epilogue loop, whereas GCC-13.2 doesn't.
> 
> I believe it's not cost model issue.

Yes, my bisect originally stopped because of the costing change.  I've started
a new one with -fno-vect-cost-model but I'm having trouble with the condition
to check for.  I'll be back in a bit.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (16 preceding siblings ...)
  2024-01-23 10:30 ` tnfchris at gcc dot gnu.org
@ 2024-01-23 12:32 ` tnfchris at gcc dot gnu.org
  2024-01-23 12:50 ` rguenth at gcc dot gnu.org
                   ` (28 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: tnfchris at gcc dot gnu.org @ 2024-01-23 12:32 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #17 from Tamar Christina <tnfchris at gcc dot gnu.org> ---
Ok, bisected to

g:2efe3a7de0107618397264017fb045f237764cc7 is the first bad commit
commit 2efe3a7de0107618397264017fb045f237764cc7
Author: Hao Liu <hliu@os.amperecomputing.com>
Date:   Wed Dec 6 14:52:19 2023 +0800

    tree-optimization/112774: extend the SCEV CHREC tree with a nonwrapping
flag

Before this commit we were unable to analyse the stride of the access.
After this commit, niters seems to estimate the loop trip count at 4, and
after that the logs diverge enormously.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (17 preceding siblings ...)
  2024-01-23 12:32 ` tnfchris at gcc dot gnu.org
@ 2024-01-23 12:50 ` rguenth at gcc dot gnu.org
  2024-01-23 12:52 ` rguenth at gcc dot gnu.org
                   ` (27 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-01-23 12:50 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #18 from Richard Biener <rguenth at gcc dot gnu.org> ---
(In reply to Tamar Christina from comment #17)
> Ok, bisected to
> 
> g:2efe3a7de0107618397264017fb045f237764cc7 is the first bad commit
> commit 2efe3a7de0107618397264017fb045f237764cc7
> Author: Hao Liu <hliu@os.amperecomputing.com>
> Date:   Wed Dec 6 14:52:19 2023 +0800
> 
>     tree-optimization/112774: extend the SCEV CHREC tree with a nonwrapping
> flag
> 
> Before this commit we were unable to analyse the stride of the access.
> After this niters seems to estimate the loop trip count at 4 and after that
> the logs diverge enormously.

Hum, but that's backwards and would match what I said in comment #2 - we
should get better code with that.

Juzhe - when you revert the above on top of trunk, does the generated code
look better for RISC-V?

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (18 preceding siblings ...)
  2024-01-23 12:50 ` rguenth at gcc dot gnu.org
@ 2024-01-23 12:52 ` rguenth at gcc dot gnu.org
  2024-01-23 12:56 ` rguenth at gcc dot gnu.org
                   ` (26 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-01-23 12:52 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #19 from Richard Biener <rguenth at gcc dot gnu.org> ---
(In reply to Richard Biener from comment #18)
> (In reply to Tamar Christina from comment #17)
> > Ok, bisected to
> > 
> > g:2efe3a7de0107618397264017fb045f237764cc7 is the first bad commit
> > commit 2efe3a7de0107618397264017fb045f237764cc7
> > Author: Hao Liu <hliu@os.amperecomputing.com>
> > Date:   Wed Dec 6 14:52:19 2023 +0800
> > 
> >     tree-optimization/112774: extend the SCEV CHREC tree with a nonwrapping
> > flag
> > 
> > Before this commit we were unable to analyse the stride of the access.
> > After this niters seems to estimate the loop trip count at 4 and after that
> > the logs diverge enormously.
> 
> Hum, but that's backward and would match to what I said in comment#2 - we
> should get better code with that.
> 
> Juzhe - when you revert the above ontop of trunk does the generated code
> look better for Risc-V?

It doesn't revert cleanly, but you can do:

diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
index 25e3130e2f1..7870c8d76fb 100644
--- a/gcc/tree-scalar-evolution.cc
+++ b/gcc/tree-scalar-evolution.cc
@@ -2054,7 +2054,7 @@ analyze_scalar_evolution (class loop *loop, tree var)

 void record_nonwrapping_chrec (tree chrec)
 {
-  CHREC_NOWRAP(chrec) = 1;
+  CHREC_NOWRAP(chrec) = 0;

   if (dump_file && (dump_flags & TDF_SCEV))
     {

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (19 preceding siblings ...)
  2024-01-23 12:52 ` rguenth at gcc dot gnu.org
@ 2024-01-23 12:56 ` rguenth at gcc dot gnu.org
  2024-01-23 13:02 ` rguenth at gcc dot gnu.org
                   ` (25 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-01-23 12:56 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #20 from Richard Biener <rguenth at gcc dot gnu.org> ---
(In reply to Richard Biener from comment #19)
> (In reply to Richard Biener from comment #18)
> > (In reply to Tamar Christina from comment #17)
> > > Ok, bisected to
> > > 
> > > g:2efe3a7de0107618397264017fb045f237764cc7 is the first bad commit
> > > commit 2efe3a7de0107618397264017fb045f237764cc7
> > > Author: Hao Liu <hliu@os.amperecomputing.com>
> > > Date:   Wed Dec 6 14:52:19 2023 +0800
> > > 
> > >     tree-optimization/112774: extend the SCEV CHREC tree with a nonwrapping
> > > flag
> > > 
> > > Before this commit we were unable to analyse the stride of the access.
> > > After this niters seems to estimate the loop trip count at 4 and after that
> > > the logs diverge enormously.
> > 
> > Hum, but that's backward and would match to what I said in comment#2 - we
> > should get better code with that.
> > 
> > Juzhe - when you revert the above ontop of trunk does the generated code
> > look better for Risc-V?
> 
> It doesn't revert but you can do
> 
> diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
> index 25e3130e2f1..7870c8d76fb 100644
> --- a/gcc/tree-scalar-evolution.cc
> +++ b/gcc/tree-scalar-evolution.cc
> @@ -2054,7 +2054,7 @@ analyze_scalar_evolution (class loop *loop, tree var)
>  
>  void record_nonwrapping_chrec (tree chrec)
>  {
> -  CHREC_NOWRAP(chrec) = 1;
> +  CHREC_NOWRAP(chrec) = 0;
>  
>    if (dump_file && (dump_flags & TDF_SCEV))
>      {

For me with this, on x86-64 we do not vectorize the loop at all.  With
-fno-vect-cost-model we vectorize some of the stores as part of BB
vectorization.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (20 preceding siblings ...)
  2024-01-23 12:56 ` rguenth at gcc dot gnu.org
@ 2024-01-23 13:02 ` rguenth at gcc dot gnu.org
  2024-01-23 13:05 ` tnfchris at gcc dot gnu.org
                   ` (24 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-01-23 13:02 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #21 from Richard Biener <rguenth at gcc dot gnu.org> ---
On aarch64 I can already see GCC 13.2 looking very different from 12.3, but I
can't decipher the code to decide whether 12.3 vectorizes the loop or not.
Trunk looks similar to 13.2 here, so the bisected change can't really be
responsible for this.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (21 preceding siblings ...)
  2024-01-23 13:02 ` rguenth at gcc dot gnu.org
@ 2024-01-23 13:05 ` tnfchris at gcc dot gnu.org
  2024-01-23 13:12 ` tnfchris at gcc dot gnu.org
                   ` (23 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: tnfchris at gcc dot gnu.org @ 2024-01-23 13:05 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #22 from Tamar Christina <tnfchris at gcc dot gnu.org> ---
For me, with `-fno-vect-cost-model` and without this commit, we generate
https://gist.github.com/Mistuke/d9252bfcb2aa766327c5f377e162f5b7 for the loop.
With the commit, well.. it doesn't fit on the screen, but the codegen is
pretty horrible, with

        smlal2  v24.4s, v13.8h, v5.8h
        smull   v31.4s, v30.4h, v17.4h
        add     v20.4s, v20.4s, v11.4s
        smlal2  v29.4s, v3.8h, v6.8h
        smull2  v25.4s, v25.8h, v15.8h
        add     v22.4s, v28.4s, v22.4s
        shrn    v21.4h, v21.4s, 15
        add     v20.4s, v20.4s, v26.4s
        add     v29.4s, v29.4s, v24.4s
        smlal2  v25.4s, v16.8h, v7.8h
        smlal   v31.4s, v18.4h, v8.4h
        smull2  v27.4s, v27.8h, v17.8h
        shrn2   v21.8h, v22.4s, 15
        add     v29.4s, v29.4s, v25.4s
        add     v31.4s, v31.4s, v20.4s
        smlal2  v27.4s, v18.8h, v8.8h
        str     h21, [x5, x9]
        add     x9, x9, 32
        add     x9, x5, x9
        shrn    v31.4h, v31.4s, 15
        st1     {v21.h}[1], [x10]
        add     v27.4s, v27.4s, v29.4s
        st1     {v21.h}[2], [x6]
        add     x6, x7, 20
        add     x10, x1, x21
        st1     {v21.h}[3], [x2]
        add     x2, x7, 24
        add     x7, x7, 28
        st1     {v21.h}[4], [x8]
        shrn2   v31.8h, v27.4s, 15
        st1     {v21.h}[5], [x6]
        lsl     x6, x10, 1
        add     x10, x5, x10, lsl 1
        st1     {v21.h}[6], [x2]
        add     x2, x10, 4
        st1     {v21.h}[7], [x7]
        add     x7, x10, 8
        str     h31, [x5, x6]
        add     x8, x10, 12
        lsl     x1, x1, 1
        add     x6, x6, 32
        st1     {v31.h}[1], [x2]
        add     x2, x10, 16
        st1     {v31.h}[2], [x7]
        add     x7, x10, 20
        st1     {v31.h}[3], [x8]
        add     x8, x10, 24
        add     x10, x10, 28
        st1     {v31.h}[4], [x2]
        st1     {v31.h}[5], [x7]
        add     x11, x1, 32
        st1     {v31.h}[6], [x8]
        add     x11, x0, x11
        st1     {v31.h}[7], [x10]
        add     x10, x1, x25
        ld1h    z31.s, p5/z, [x11]

going on for a while, i.e. single-element lane stores.  So with the cost model
disabled, it definitely does get worse with that commit; with the cost model
on there's no difference.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (22 preceding siblings ...)
  2024-01-23 13:05 ` tnfchris at gcc dot gnu.org
@ 2024-01-23 13:12 ` tnfchris at gcc dot gnu.org
  2024-01-23 13:21 ` juzhe.zhong at rivai dot ai
                   ` (22 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: tnfchris at gcc dot gnu.org @ 2024-01-23 13:12 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #23 from Tamar Christina <tnfchris at gcc dot gnu.org> ---
tamar:~/gcc-dsg/test$ extract-toolchain gcc 2efe3a7de01
A       1514 files
D       0 files
M       0 files
Extracted 'origin/manygcc-basepoints-gcc-14-6292-g2f512f6fcdd:2efe3a7de01'

> ./bin/gcc -S -o ../wlo-bad.s -march=armv8-a+sve -O3 -msve-vector-bits=512 -fno-vect-cost-model -g0 ../wlo.c -fdump-tree-vect-all

tamar:~/gcc-dsg/test$ extract-toolchain gcc 9f7ad5eff3b
A       1514 files
D       0 files
M       0 files
Extracted 'origin/manygcc-basepoints-gcc-14-6292-g2f512f6fcdd:9f7ad5eff3b'

> ./bin/gcc -S -o ../wlo-good.s -march=armv8-a+sve -O3 -msve-vector-bits=512 -fno-vect-cost-model -g0 ../wlo.c -fdump-tree-vect-all

> diff ../wlo-bad.s ../wlo-good.s  | wc -l
537

And for the record, the bisect was scanning for "requires scalar epilogue
loop", and that's the first commit it appears in.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (23 preceding siblings ...)
  2024-01-23 13:12 ` tnfchris at gcc dot gnu.org
@ 2024-01-23 13:21 ` juzhe.zhong at rivai dot ai
  2024-01-23 13:28 ` tnfchris at gcc dot gnu.org
                   ` (21 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2024-01-23 13:21 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #24 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
(In reply to Richard Biener from comment #19)
> (In reply to Richard Biener from comment #18)
> > (In reply to Tamar Christina from comment #17)
> > > Ok, bisected to
> > > 
> > > g:2efe3a7de0107618397264017fb045f237764cc7 is the first bad commit
> > > commit 2efe3a7de0107618397264017fb045f237764cc7
> > > Author: Hao Liu <hliu@os.amperecomputing.com>
> > > Date:   Wed Dec 6 14:52:19 2023 +0800
> > > 
> > >     tree-optimization/112774: extend the SCEV CHREC tree with a nonwrapping
> > > flag
> > > 
> > > Before this commit we were unable to analyse the stride of the access.
> > > After this niters seems to estimate the loop trip count at 4 and after that
> > > the logs diverge enormously.
> > 
> > Hum, but that's backward and would match to what I said in comment#2 - we
> > should get better code with that.
> > 
> > Juzhe - when you revert the above ontop of trunk does the generated code
> > look better for Risc-V?
> 
> It doesn't revert but you can do
> 
> diff --git a/gcc/tree-scalar-evolution.cc b/gcc/tree-scalar-evolution.cc
> index 25e3130e2f1..7870c8d76fb 100644
> --- a/gcc/tree-scalar-evolution.cc
> +++ b/gcc/tree-scalar-evolution.cc
> @@ -2054,7 +2054,7 @@ analyze_scalar_evolution (class loop *loop, tree var)
>  
>  void record_nonwrapping_chrec (tree chrec)
>  {
> -  CHREC_NOWRAP(chrec) = 1;
> +  CHREC_NOWRAP(chrec) = 0;
>  
>    if (dump_file && (dump_flags & TDF_SCEV))
>      {

Hmmm. In my experiments the codegen looks slightly better, but it still
doesn't recover back to the GCC-12 level.

Btw, I compared the ARM SVE codegen, even with the cost model enabled:

https://godbolt.org/z/cKc1PG3dv

I think the GCC 13.2 codegen is better than GCC trunk's with the cost model.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (24 preceding siblings ...)
  2024-01-23 13:21 ` juzhe.zhong at rivai dot ai
@ 2024-01-23 13:28 ` tnfchris at gcc dot gnu.org
  2024-02-22 16:18 ` [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7 tnfchris at gcc dot gnu.org
                   ` (20 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: tnfchris at gcc dot gnu.org @ 2024-01-23 13:28 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #25 from Tamar Christina <tnfchris at gcc dot gnu.org> ---
> >  void record_nonwrapping_chrec (tree chrec)
> >  {
> > -  CHREC_NOWRAP(chrec) = 1;
> > +  CHREC_NOWRAP(chrec) = 0;
> >  
> >    if (dump_file && (dump_flags & TDF_SCEV))
> >      {
> 
> Hmmm. With experiments. The codegen looks slightly better but still didn't
> recover back to GCC-12.
> 
> 
> Btw, I compare ARM SVE codegen, even with cost model:
> 
> https://godbolt.org/z/cKc1PG3dv
> 
> I think GCC 13.2 codegen is better than GCC trunk with cost model.

If you have the cost model enabled you hit
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441#c9 which is just a target
bug I need to look into separately.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (25 preceding siblings ...)
  2024-01-23 13:28 ` tnfchris at gcc dot gnu.org
@ 2024-02-22 16:18 ` tnfchris at gcc dot gnu.org
  2024-02-26  8:10 ` tnfchris at gcc dot gnu.org
                   ` (19 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: tnfchris at gcc dot gnu.org @ 2024-02-22 16:18 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

Tamar Christina <tnfchris at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
     Ever confirmed|0                           |1
            Summary|[14 Regression] Fail to     |[14 Regression] Fail to
                   |fold the last element with  |fold the last element with
                   |multiple loop               |multiple loop since
                   |                            |g:2efe3a7de0107618397264017
                   |                            |fb045f237764cc7
   Last reconfirmed|                            |2024-02-22
             Status|UNCONFIRMED                 |NEW
           Keywords|needs-bisection             |

--- Comment #26 from Tamar Christina <tnfchris at gcc dot gnu.org> ---
(In reply to Richard Biener from comment #18)
> (In reply to Tamar Christina from comment #17)
> > Ok, bisected to
> > 
> > g:2efe3a7de0107618397264017fb045f237764cc7 is the first bad commit
> > commit 2efe3a7de0107618397264017fb045f237764cc7
> > Author: Hao Liu <hliu@os.amperecomputing.com>
> > Date:   Wed Dec 6 14:52:19 2023 +0800
> > 
> >     tree-optimization/112774: extend the SCEV CHREC tree with a nonwrapping
> > flag
> > 
> > Before this commit we were unable to analyse the stride of the access.
> > After this niters seems to estimate the loop trip count at 4 and after that
> > the logs diverge enormously.
> 
> Hum, but that's backward and would match to what I said in comment#2 - we
> should get better code with that.
> 

Ok, so I've dug more into this today.  It's definitely this commit that's
causing it.  The reason is we no longer consider masked gather/scatters.

Before this commit the gather pattern would trigger:

tresg.i:3:275: note:   gather/scatter pattern: detected: a[_2] = b.3_3;
tresg.i:3:275: note:   gather_scatter pattern recognized: .SCATTER_STORE ((sizetype) &a, _2, 4, b.3_3);

and the use of the masked scatter is what's causing the epilogue to not be
required and why it generates better code.  It's not the loads.

The issue is that vect_analyze_data_refs only considers gather/scatters IF DR
analysis fails, which it did before:

tresg.c:31:29: missed:  failed: evolution of offset is not affine.
        base_address:
        offset from base address:
        constant offset from base address:
        step:
        base alignment: 0
        base misalignment: 0
        offset alignment: 0
        step alignment: 0
        base_object: array1
        Access function 0: {{m_112 * 2, +, 24}_3, +, 2}_4
        Access function 1: 0
Creating dr for array1[0][_8]

This now succeeds after the quoted commit:

success.
        base_address: &array1
        offset from base address: (ssizetype) ((sizetype) (m_111 * 2) * 2)
        constant offset from base address: 0
        step: 4
        base alignment: 8
        base misalignment: 0
        offset alignment: 4
        step alignment: 4
        base_object: array1
        Access function 0: {{m_112 * 2, +, 24}_3, +, 2}_4
        Access function 1: 0
Creating dr for array1[0][_8]

so we never enter

      /* Check that analysis of the data-ref succeeded.  */
      if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
          || !DR_STEP (dr))
        {

and without the IFN scatters it tries deinterleaving scalar stores to scatters:

tresg.c:29:22: note:   Detected single element interleaving array1[0][_8] step 4
tresg.c:29:22: note:   Detected single element interleaving array1[1][_8] step 4
tresg.c:29:22: note:   Detected single element interleaving array1[2][_8] step 4
tresg.c:29:22: note:   Detected single element interleaving array1[3][_8] step 4
tresg.c:29:22: note:   Detected single element interleaving array1[0][_1] step 4
tresg.c:29:22: note:   Detected single element interleaving array1[1][_1] step 4
tresg.c:29:22: note:   Detected single element interleaving array1[2][_1] step 4
tresg.c:29:22: note:   Detected single element interleaving array1[3][_1] step 4
tresg.c:29:22: missed:   not consecutive access array2[_4][_8] = _70;
tresg.c:29:22: note:   using strided accesses
tresg.c:29:22: missed:   not consecutive access array2[_4][_1] = _68;
tresg.c:29:22: note:   using strided accesses

...

tresg.c:29:22: note:   using gather/scatter for strided/grouped access, scale = 2

but without the SCATTER_STORE IFN it never tries masking the scatter, so we
lose MASK_SCATTER_STORE and hence we generate worse code because the whole loop
can no longer be predicated.

However, trying to force it generates an ICE, so I guess it's not that simple.
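
As a rough plain-C model of what the masked scatter buys (illustrative code
only, not GCC internals; all names are made up): when the store is predicated,
the mask covers the final partial vector, so no scalar epilogue is needed.

#include <stdint.h>

/* Rough model of a predicated (masked) truncating scatter store: the
   "mask" check covers the leftover lanes of the last iteration, so the
   loop needs no scalar epilogue.  Illustrative names only.  */
void
masked_scatter_model (int16_t *restrict dst, const int32_t *restrict src,
                      const int *restrict idx, int n, int vf)
{
  for (int i = 0; i < n; i += vf)          /* one "vector" iteration      */
    for (int lane = 0; lane < vf; lane++)  /* lanes of that vector        */
      if (i + lane < n)                    /* the predicate: active lanes */
        dst[idx[i + lane]] = (int16_t) src[i + lane];  /* truncating scatter */
}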

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (26 preceding siblings ...)
  2024-02-22 16:18 ` [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7 tnfchris at gcc dot gnu.org
@ 2024-02-26  8:10 ` tnfchris at gcc dot gnu.org
  2024-02-26  8:17 ` rguenther at suse dot de
                   ` (18 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: tnfchris at gcc dot gnu.org @ 2024-02-26  8:10 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #27 from Tamar Christina <tnfchris at gcc dot gnu.org> ---
Created attachment 57538
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=57538&action=edit
proposed1.patch

Proposed patch; this gets the gathers and scatters back.  Doing a regression run.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (27 preceding siblings ...)
  2024-02-26  8:10 ` tnfchris at gcc dot gnu.org
@ 2024-02-26  8:17 ` rguenther at suse dot de
  2024-02-27  8:01 ` tnfchris at gcc dot gnu.org
                   ` (17 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rguenther at suse dot de @ 2024-02-26  8:17 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #28 from rguenther at suse dot de <rguenther at suse dot de> ---
On Mon, 26 Feb 2024, tnfchris at gcc dot gnu.org wrote:

> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441
> 
> --- Comment #27 from Tamar Christina <tnfchris at gcc dot gnu.org> ---
> Created attachment 57538
>   --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=57538&action=edit
> proposed1.patch
> 
> proposed patch, this gets the gathers and scatters back. doing regression run.

I don't think this will fly.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (28 preceding siblings ...)
  2024-02-26  8:17 ` rguenther at suse dot de
@ 2024-02-27  8:01 ` tnfchris at gcc dot gnu.org
  2024-02-27  8:08 ` rguenth at gcc dot gnu.org
                   ` (16 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: tnfchris at gcc dot gnu.org @ 2024-02-27  8:01 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

Tamar Christina <tnfchris at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |rsandifo at gcc dot gnu.org

--- Comment #29 from Tamar Christina <tnfchris at gcc dot gnu.org> ---
(In reply to rguenther@suse.de from comment #28)
> On Mon, 26 Feb 2024, tnfchris at gcc dot gnu.org wrote:
> 
> > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441
> > 
> > --- Comment #27 from Tamar Christina <tnfchris at gcc dot gnu.org> ---
> > Created attachment 57538 [details]
> >   --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=57538&action=edit
> > proposed1.patch
> > 
> > proposed patch, this gets the gathers and scatters back. doing regression run.
> 
> I don't think this will fly.

Well... I don't really know what to do here, I guess.

Per the discussion on IRC, we only used to try gather/scatters when SCEV fails.

Now that it succeeds we no longer use the pattern and instead try to handle it
during vectorizable_load/vectorizable_store by recognizing the gather/scatters
inline through VMAT_GATHER_SCATTER.

This works fine for normal gathers and scatters but doesn't work for widening
gathers and narrowing scatters, which only the pattern seems to handle.

I don't know how to get this detected through get_load_store_type since, well,
that's very late.  Among other things, we've already determined the VF and the
unpacks have already been marked relevant, so
vectorizable_load/vectorizable_store would have to actively change the IL.

So I don't know how widening and narrowing operations are supposed to work
here.  Given that, I will leave it up to the maintainers, I guess.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (29 preceding siblings ...)
  2024-02-27  8:01 ` tnfchris at gcc dot gnu.org
@ 2024-02-27  8:08 ` rguenth at gcc dot gnu.org
  2024-02-29 22:18 ` rsandifo at gcc dot gnu.org
                   ` (15 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-02-27  8:08 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #30 from Richard Biener <rguenth at gcc dot gnu.org> ---
The x86 and "emulation" paths handle narrowing/widening during code generation
(but yes, the IFN path doesn't).  A fix would be to do similar as for the
gs_info.decl case in vectorizable_load/store and handle select cases of
widening/narrowing (2x) and adjust vect_check_gather_scatter accordingly.
That might be against the spirit of how the IFN support was laid out
(possibly to be "cleaner"), but I don't see a good way to avoid the very
premature (during pattern selection) load/store vectorization choosing for
the cases there are multiple possibilities as seen here.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (30 preceding siblings ...)
  2024-02-27  8:08 ` rguenth at gcc dot gnu.org
@ 2024-02-29 22:18 ` rsandifo at gcc dot gnu.org
  2024-03-01  9:44 ` rguenth at gcc dot gnu.org
                   ` (14 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rsandifo at gcc dot gnu.org @ 2024-02-29 22:18 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #31 from Richard Sandiford <rsandifo at gcc dot gnu.org> ---
(In reply to Tamar Christina from comment #29)
> This works fine for normal gather and scatters but doesn't work for widening
> gathers and narrowing scatters which only the pattern seems to handle.
I'm supposedly on holiday, so didn't see the IRC discussion, but: as I remember
it, there is no narrowing or widening for IFN gathers or scatters as such, even
for patterns.  One vector's worth of offsets corresponds to one vector's worth
of data.  But the widths of the data elements and the offset elements can be
different.  Any sign or zero extension of a loaded vector, or any operation to
double or halve the number of vectors, is done separately.

I think it does make sense to stick to that, rather than (say) have IFNs that
load two offset vectors into one data vector, or use one offset vector to load
two data vectors.  Supporting those combinations would mean that we have two
different ways in which the offset elements and data elements have different
widths.  And it isn't really a natural fusion on SVE.
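
As a scalar sketch of that contract (plain C, illustrative names only, not an
actual IFN): one offset element maps to one data element, and the widening of
the loaded data happens as a separate step after the gather.

#include <stdint.h>

/* Scalar sketch of the gather contract described above: one vector's worth
   of offsets corresponds to one vector's worth of data; sign-extending the
   loaded int16_t elements is a separate operation, not part of the gather.
   Illustrative names only.  */
void
gather_then_extend (int32_t *restrict dst, const int16_t *restrict base,
                    const uint32_t *restrict offs, int n)
{
  for (int i = 0; i < n; i++)
    {
      int16_t elt = base[offs[i]];  /* gather: one offset, one element */
      dst[i] = (int32_t) elt;       /* widening done separately        */
    }
}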

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (31 preceding siblings ...)
  2024-02-29 22:18 ` rsandifo at gcc dot gnu.org
@ 2024-03-01  9:44 ` rguenth at gcc dot gnu.org
  2024-03-01  9:53 ` rsandifo at gcc dot gnu.org
                   ` (13 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-03-01  9:44 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #32 from Richard Biener <rguenth at gcc dot gnu.org> ---
(In reply to Richard Sandiford from comment #31)
> (In reply to Tamar Christina from comment #29)
> > This works fine for normal gather and scatters but doesn't work for widening
> > gathers and narrowing scatters which only the pattern seems to handle.
> I'm supposedly on holiday, so didn't see the IRC discussion, but: as I
> remember it, there is no narrowing or widening for IFN gathers or scatters
> as such, even for patterns.  One vector's worth of offsets corresponds to
> one vector's worth of data.  But the widths of the data elements and the
> offset elements can be different.  Any sign or zero extension of a loaded
> vector, or any operation to double or halve the number of vectors, is done
> separately.

Yep.  The emulated gather/scatter and builtin paths do this widening/shortening
of the offset operand to what we expect on the fly.  This support is missing
from the IFN path, which relies on patterns doing this.

Having widening/shortening explicitly represented is of course better but
using patterns for this has the unfortunate all-or-nothing effect (right now).

I do hope with SLP only, where it's easier to insert/remove "stmts", we can
delay "pattern recognition" in these cases eventually even up to
vectorizable_* which would "simply" insert a widening/shortening operation
into the SLP graph to make itself happy.

In the meantime I think making the IFN path work the same way as the
emulated/builtin ones would make sense.  It's already half-way there.
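
A minimal scalar sketch of that on-the-fly offset conversion (plain C,
illustrative names only, not the actual emulated path): the offset vector is
converted to the expected width first, then the gather itself is done.

#include <stdint.h>

/* Sketch of widening the offset operand before the gather proper, as the
   emulated/builtin paths do on the fly.  Illustrative names only.  */
void
gather_widened_offsets (int32_t *restrict dst, const int32_t *restrict base,
                        const uint16_t *restrict offs16, int n)
{
  for (int i = 0; i < n; i++)
    {
      uint32_t off = (uint32_t) offs16[i];  /* offset widened on the fly */
      dst[i] = base[off];                   /* then the gather itself    */
    }
}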

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (32 preceding siblings ...)
  2024-03-01  9:44 ` rguenth at gcc dot gnu.org
@ 2024-03-01  9:53 ` rsandifo at gcc dot gnu.org
  2024-03-01 10:44 ` rguenther at suse dot de
                   ` (12 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rsandifo at gcc dot gnu.org @ 2024-03-01  9:53 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #33 from Richard Sandiford <rsandifo at gcc dot gnu.org> ---
Can you give me a chance to look at it a bit when I'm back?  This doesn't feel
like the way to go to me.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (33 preceding siblings ...)
  2024-03-01  9:53 ` rsandifo at gcc dot gnu.org
@ 2024-03-01 10:44 ` rguenther at suse dot de
  2024-03-04 12:07 ` rsandifo at gcc dot gnu.org
                   ` (11 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rguenther at suse dot de @ 2024-03-01 10:44 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #34 from rguenther at suse dot de <rguenther at suse dot de> ---
On Fri, 1 Mar 2024, rsandifo at gcc dot gnu.org wrote:

> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441
> 
> --- Comment #33 from Richard Sandiford <rsandifo at gcc dot gnu.org> ---
> Can you give me a chance to look at it a bit when I back?  This doesn't feel
> like the way to go to me.

Sure.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (34 preceding siblings ...)
  2024-03-01 10:44 ` rguenther at suse dot de
@ 2024-03-04 12:07 ` rsandifo at gcc dot gnu.org
  2024-03-04 13:26 ` rsandifo at gcc dot gnu.org
                   ` (10 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rsandifo at gcc dot gnu.org @ 2024-03-04 12:07 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #35 from Richard Sandiford <rsandifo at gcc dot gnu.org> ---
Maybe I've misunderstood the flow of the ticket, but it looks to me like we do
still correctly recognise the truncating scatter stores.  And, on their own, we
would be able to convert them into masked scatters.

The reason for the epilogue is instead on the load side.  There we have a
non-strided grouped load, and currently we hard-code the assumption that it is
better to use contiguous loads and permutes rather than gather loads where
possible.  So we have:

      /* As a last resort, trying using a gather load or scatter store.

         ??? Although the code can handle all group sizes correctly,
         it probably isn't a win to use separate strided accesses based
         on nearby locations.  Or, even if it's a win over scalar code,
         it might not be a win over vectorizing at a lower VF, if that
         allows us to use contiguous accesses.  */
      if (*memory_access_type == VMAT_ELEMENTWISE
          && single_element_p
          && loop_vinfo
          && vect_use_strided_gather_scatters_p (stmt_info, loop_vinfo,
                                                 masked_p, gs_info))
        *memory_access_type = VMAT_GATHER_SCATTER;

only after we've tried and failed to use load lanes or load+permute.  If
instead I change the order so that the code above is tried first, then we do
use extending gather loads and truncating scatter stores as before, with no
epilogue loop.

So I suppose the question is: if we do prefer to use gathers over load+permute
for some cases, how do we decide which to use?  And can it be done on a
per-load basis, or should it instead be a per-loop decision?  E.g., if we end up with a
loop that needs peeling for gaps, perhaps we should try again and forbid
peeling for gaps.  Then, if that succeeds, see which loop gives the better
overall cost.

Of course, trying more things means more compile time…

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (35 preceding siblings ...)
  2024-03-04 12:07 ` rsandifo at gcc dot gnu.org
@ 2024-03-04 13:26 ` rsandifo at gcc dot gnu.org
  2024-03-04 14:28 ` rguenth at gcc dot gnu.org
                   ` (9 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rsandifo at gcc dot gnu.org @ 2024-03-04 13:26 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #36 from Richard Sandiford <rsandifo at gcc dot gnu.org> ---
Created attachment 57602
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=57602&action=edit
proof-of-concept patch to suppress peeling for gaps

This patch does what I suggested in the previous comment: if the loop needs
peeling for gaps, try again without that, and pick the better loop.  It seems
to restore the original style of code for SVE.

A more polished version would be a bit smarter about when to retry.  E.g. it's
pointless if the main loop already operates on full vectors (i.e. if peeling 1
iteration is natural in any case).  Perhaps the condition should be that either
(a) the number of epilogue iterations is known to be equal to the VF of the
main loop or (b) the target is known to support partial vectors for the loop's
vector_mode.

Any thoughts?

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (36 preceding siblings ...)
  2024-03-04 13:26 ` rsandifo at gcc dot gnu.org
@ 2024-03-04 14:28 ` rguenth at gcc dot gnu.org
  2024-03-04 14:48 ` rsandifo at gcc dot gnu.org
                   ` (8 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-03-04 14:28 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #37 from Richard Biener <rguenth at gcc dot gnu.org> ---
(In reply to Richard Sandiford from comment #36)
> Created attachment 57602 [details]
> proof-of-concept patch to suppress peeling for gaps
> 
> This patch does what I suggested in the previous comment: if the loop needs
> peeling for gaps, try again without that, and pick the better loop.  It
> seems to restore the original style of code for SVE.
> 
> A more polished version would be a bit smarter about when to retry.  E.g.
> it's pointless if the main loop already operates on full vectors (i.e. if
> peeling 1 iteration is natural in any case).  Perhaps the condition should
> be that either (a) the number of epilogue iterations is known to be equal to
> the VF of the main loop or (b) the target is known to support partial
> vectors for the loop's vector_mode.
> 
> Any thoughts?

Even more iteration looks bad.  I do wonder why, when gather can avoid
peeling for gaps, using load-lanes cannot.  Also for the stores we
seem to use elementwise stores rather than store-lanes.

To me the most obvious thing to try optimizing in this testcase is DR
analysis.  With -march=armv8.3-a I still see

t.c:26:22: note:   === vect_analyze_data_ref_accesses ===
t.c:26:22: note:   Detected single element interleaving array1[0][_8] step 4
t.c:26:22: note:   Detected single element interleaving array1[1][_8] step 4
t.c:26:22: note:   Detected single element interleaving array1[2][_8] step 4
t.c:26:22: note:   Detected single element interleaving array1[3][_8] step 4
t.c:26:22: note:   Detected single element interleaving array1[0][_1] step 4
t.c:26:22: note:   Detected single element interleaving array1[1][_1] step 4
t.c:26:22: note:   Detected single element interleaving array1[2][_1] step 4
t.c:26:22: note:   Detected single element interleaving array1[3][_1] step 4
t.c:26:22: missed:   not consecutive access array2[_4][_8] = _69;
t.c:26:22: note:   using strided accesses
t.c:26:22: missed:   not consecutive access array2[_4][_1] = _67;
t.c:26:22: note:   using strided accesses

so we don't figure out that the two DRs

Creating dr for array1[0][_1]
        base_address: &array1
        offset from base address: (ssizetype) ((sizetype) (m_111 * 2) * 2)
        constant offset from base address: 0
        step: 4
        base alignment: 16
        base misalignment: 0
        offset alignment: 4
        step alignment: 4
        base_object: array1
        Access function 0: {m_111 * 2, +, 2}<nw>_4
        Access function 1: 0
Creating dr for array1[0][_8]
analyze_innermost: success.
        base_address: &array1
        offset from base address: (ssizetype) ((sizetype) (m_111 * 2 + 1) * 2)
        constant offset from base address: 0
        step: 4
        base alignment: 16
        base misalignment: 0
        offset alignment: 2
        step alignment: 4
        base_object: array1
        Access function 0: {m_111 * 2 + 1, +, 2}<nw>_4
        Access function 1: 0

belong to the same group (but the access functions tell us it worked out).
Above we fail to split the + 1 into the constant offset.

See my hint to use int32_t m instead of uint32_t, which yields

t.c:26:22: note:   Detected interleaving load of size 2
t.c:26:22: note:        _2 = array1[0][_1];
t.c:26:22: note:        _9 = array1[0][_8];
t.c:26:22: note:   Detected interleaving load of size 2
t.c:26:22: note:        _18 = array1[1][_1];
t.c:26:22: note:        _23 = array1[1][_8];
t.c:26:22: note:   Detected interleaving load of size 2
t.c:26:22: note:        _32 = array1[2][_1];
t.c:26:22: note:        _37 = array1[2][_8];
t.c:26:22: note:   Detected interleaving load of size 2
t.c:26:22: note:        _46 = array1[3][_1];
t.c:26:22: note:        _51 = array1[3][_8];
t.c:26:22: note:   Detected interleaving store of size 2
t.c:26:22: note:        array2[_4][_1] = _67;
t.c:26:22: note:        array2[_4][_8] = _69;

(and SLP being thrown away because we can use load/store lanes)
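
To make the access shape concrete, here is a minimal sketch (illustrative code,
not the testcase from this PR): the loads come in a[2*m] / a[2*m + 1] pairs,
and as noted above whether DR analysis splits out the + 1 and recognises one
interleaving group can depend on the counter being signed rather than unsigned.

#include <stdint.h>

int16_t a[1024];
int16_t b[512];

/* Illustrative sketch only: paired loads a[2*m] and a[2*m + 1] with a
   signed counter, per the int32_t hint above.  Not the original testcase.  */
void
sum_pairs (int32_t n)
{
  for (int32_t m = 0; m < n; m++)
    b[m] = a[2 * m] + a[2 * m + 1];
}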

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (37 preceding siblings ...)
  2024-03-04 14:28 ` rguenth at gcc dot gnu.org
@ 2024-03-04 14:48 ` rsandifo at gcc dot gnu.org
  2024-03-04 15:01 ` rsandifo at gcc dot gnu.org
                   ` (7 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rsandifo at gcc dot gnu.org @ 2024-03-04 14:48 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #38 from Richard Sandiford <rsandifo at gcc dot gnu.org> ---
(In reply to Richard Biener from comment #37)
> Even more iteration looks bad.  I do wonder why when gather can avoid
> peeling for GAPs using load-lanes cannot?
Like you say, we don't realise that all the loads from array3[i] form a single
group.

Note that we're not using load-lanes in either case, since the group size (8)
is too big for that.  But load-lanes and load-and-permute have the same
restriction about when peeling for gaps is required.

In contrast, gather loads only ever load data that they actually need.

> Also for the stores we seem to use elementwise stores rather than store-lanes.
What configuration are you trying?  The original report was about SVE, so I was
trying that.  There we use a scatter store.

> To me the most obvious thing to try optimizing in this testcase is DR
> analysis.  With -march=armv8.3-a I still see
> 
> t.c:26:22: note:   === vect_analyze_data_ref_accesses ===
> t.c:26:22: note:   Detected single element interleaving array1[0][_8] step 4
> t.c:26:22: note:   Detected single element interleaving array1[1][_8] step 4
> t.c:26:22: note:   Detected single element interleaving array1[2][_8] step 4
> t.c:26:22: note:   Detected single element interleaving array1[3][_8] step 4
> t.c:26:22: note:   Detected single element interleaving array1[0][_1] step 4
> t.c:26:22: note:   Detected single element interleaving array1[1][_1] step 4
> t.c:26:22: note:   Detected single element interleaving array1[2][_1] step 4
> t.c:26:22: note:   Detected single element interleaving array1[3][_1] step 4
> t.c:26:22: missed:   not consecutive access array2[_4][_8] = _69;
> t.c:26:22: note:   using strided accesses
> t.c:26:22: missed:   not consecutive access array2[_4][_1] = _67;
> t.c:26:22: note:   using strided accesses
> 
> so we don't figure
> 
> Creating dr for array1[0][_1]
>         base_address: &array1
>         offset from base address: (ssizetype) ((sizetype) (m_111 * 2) * 2)
>         constant offset from base address: 0
>         step: 4
>         base alignment: 16
>         base misalignment: 0
>         offset alignment: 4
>         step alignment: 4
>         base_object: array1
>         Access function 0: {m_111 * 2, +, 2}<nw>_4
>         Access function 1: 0
> Creating dr for array1[0][_8]
> analyze_innermost: success.
>         base_address: &array1
>         offset from base address: (ssizetype) ((sizetype) (m_111 * 2 + 1) *
> 2)
>         constant offset from base address: 0
>         step: 4
>         base alignment: 16
>         base misalignment: 0
>         offset alignment: 2
>         step alignment: 4
>         base_object: array1
>         Access function 0: {m_111 * 2 + 1, +, 2}<nw>_4
>         Access function 1: 0
> 
> belong to the same group (but the access functions tell us it worked out).
> Above we fail to split the + 1 to the constant offset.
OK, but this is moving the question on to how we should optimise the testcase
for Advanced SIMD rather than SVE, and how we should optimise the testcase in
general, rather than simply recovering what we could do before.  (SVE is only
enabled for -march=armv9-a and above, in case armv8.3-a was intended to enable
SVE too.)

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (38 preceding siblings ...)
  2024-03-04 14:48 ` rsandifo at gcc dot gnu.org
@ 2024-03-04 15:01 ` rsandifo at gcc dot gnu.org
  2024-03-04 15:10 ` rguenth at gcc dot gnu.org
                   ` (6 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rsandifo at gcc dot gnu.org @ 2024-03-04 15:01 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #39 from Richard Sandiford <rsandifo at gcc dot gnu.org> ---
(In reply to Richard Sandiford from comment #38)
> (In reply to Richard Biener from comment #37)
> > Even more iteration looks bad.  I do wonder why when gather can avoid
> > peeling for GAPs using load-lanes cannot?
> Like you say, we don't realise that all the loads from array3[i] form a
> single group.
Oops, sorry, I shouldn't have gone off memory.  So yeah, it's array1[] where
that happens, not array3[].  The reason we don't use load-lanes is that we
don't have load-lane instructions for smaller elements in larger containers, so
we're forced to use load-and-permute instead.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (39 preceding siblings ...)
  2024-03-04 15:01 ` rsandifo at gcc dot gnu.org
@ 2024-03-04 15:10 ` rguenth at gcc dot gnu.org
  2024-03-04 16:16 ` rsandifo at gcc dot gnu.org
                   ` (5 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-03-04 15:10 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #40 from Richard Biener <rguenth at gcc dot gnu.org> ---
So I wonder if we can use "local costing" to decide a gather is always OK
compared to the alternative with peeling for gaps.  On x86 gather tends
to be slow compared to open-coding it.

In the future we might want to explore whether we can re-do costing for
alternatives without re-running all of the analysis at least for decisions
we know have only "local" effect.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (40 preceding siblings ...)
  2024-03-04 15:10 ` rguenth at gcc dot gnu.org
@ 2024-03-04 16:16 ` rsandifo at gcc dot gnu.org
  2024-03-04 22:52 ` rsandifo at gcc dot gnu.org
                   ` (4 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rsandifo at gcc dot gnu.org @ 2024-03-04 16:16 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #41 from Richard Sandiford <rsandifo at gcc dot gnu.org> ---
(In reply to Richard Biener from comment #40)
> So I wonder if we can use "local costing" to decide a gather is always OK
> compared to the alternative with peeling for gaps.  On x86 gather tends
> to be slow compared to open-coding it.
Yeah, on SVE gathers are generally “enabling” instructions rather than
something to use for their own sake.

I suppose one problem is that we currently only try to use gathers for
single-element groups.  If we make a local decision to use gathers while
keeping that restriction, we could end up using gathers “unnecessarily” while
still needing to peel for gaps for (say) a two-element group.

That is, it's only better to use gathers than contiguous loads if by doing that
we avoid all need to peel for gaps (and if the cost of peeling for gaps was
high enough to justify the cost of using gathers over consecutive loads).

One of the things on the list to do (once everything is SLP!) is to support
loads with gaps directly via predication, so that we never load elements that
aren't needed.  E.g. on SVE, a 64-bit predicate (PTRUE .D) can be used with a
32-bit load (LD1W .S) to load only even-indexed elements.  So a single-element
group with a group size of 2 could be done cheaply with just consecutive loads,
without peeling for gaps.
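
For reference, a "single-element group with a group size of 2" is simply a
stride-2 access where only every other element is used (illustrative C,
made-up names):

/* Only the even-indexed elements of 'in' are used, so a contiguous vector
   load reads one wanted and one unwanted element per pair.  Predication
   (or a gather) avoids touching the unwanted elements; otherwise the
   vectorizer may need to peel for gaps.  Sketch only.  */
void
copy_even (int *restrict out, const int *restrict in, int n)
{
  for (int i = 0; i < n; i++)
    out[i] = in[2 * i];   /* element used, next element skipped */
}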

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (41 preceding siblings ...)
  2024-03-04 16:16 ` rsandifo at gcc dot gnu.org
@ 2024-03-04 22:52 ` rsandifo at gcc dot gnu.org
  2024-03-05  8:21 ` rguenther at suse dot de
                   ` (3 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rsandifo at gcc dot gnu.org @ 2024-03-04 22:52 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

Richard Sandiford <rsandifo at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
  Attachment #57602|0                           |1
        is obsolete|                            |

--- Comment #42 from Richard Sandiford <rsandifo at gcc dot gnu.org> ---
Created attachment 57605
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=57605&action=edit
proof-of-concept patch to suppress peeling for gaps

How about the attached?  It records whether all accesses that require peeling
for gaps could instead have used gathers, and only retries when that's true. 
It means that we retry for only 0.034% of calls to vect_analyze_loop_1 in a
build of SPEC2017 with -mcpu=neoverse-v1 -Ofast -fomit-frame-pointer.

The figures exclude wrf, which failed for me with:

module_mp_gsfcgce.fppized.f90:852:23:

  852 |    REAL FUNCTION ggamma(X)
      |                       ^
Error: definition in block 18 does not dominate use in block 13
for SSA_NAME: stmp_pf_6.5657_140 in statement:
pf_81 = PHI <stmp_pf_6.5657_140(13), stmp_pf_6.5657_140(18)>
PHI argument
stmp_pf_6.5657_140
for PHI node
pf_81 = PHI <stmp_pf_6.5657_140(13), stmp_pf_6.5657_140(18)>
during GIMPLE pass: vect
module_mp_gsfcgce.fppized.f90:852:23: internal compiler error: verify_ssa
failed

Will look at that tomorrow.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (42 preceding siblings ...)
  2024-03-04 22:52 ` rsandifo at gcc dot gnu.org
@ 2024-03-05  8:21 ` rguenther at suse dot de
  2024-03-05 10:44 ` rguenth at gcc dot gnu.org
                   ` (2 subsequent siblings)
  46 siblings, 0 replies; 48+ messages in thread
From: rguenther at suse dot de @ 2024-03-05  8:21 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #43 from rguenther at suse dot de <rguenther at suse dot de> ---
On Mon, 4 Mar 2024, rsandifo at gcc dot gnu.org wrote:

> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441
> 
> --- Comment #41 from Richard Sandiford <rsandifo at gcc dot gnu.org> ---
> (In reply to Richard Biener from comment #40)
> > So I wonder if we can use "local costing" to decide a gather is always OK
> > compared to the alternative with peeling for gaps.  On x86 gather tends
> > to be slow compared to open-coding it.
> Yeah, on SVE gathers are generally “enabling” instructions rather than
> something to use for their own sake.
> 
> I suppose one problem is that we currently only try to use gathers for
> single-element groups.  If we make a local decision to use gathers while
> keeping that restriction, we could end up using gathers “unnecessarily” while
> still needing to peel for gaps for (say) a two-element group.
> 
> That is, it's only better to use gathers than contiguous loads if by doing that
> we avoid all need to peel for gaps (and if the cost of peeling for gaps was
> high enough to justify the cost of using gathers over consecutive loads).

Yep.  I do want to experiment with a way to have vectorizable_* register
multiple variants of vectorization and have ways to stitch together and
cost the overall vectorization, as a cheaper (and more flexible) alternative
to "iteration".  It will to some extent blow up the number of combinations to
try, but there might be a way to use greedy relaxation techniques to converge
to a lowest-cost variant.

> One of the things on the list to do (once everything is SLP!) is to support
> loads with gaps directly via predication, so that we never load elements that
> aren't needed.  E.g. on SVE, a 64-bit predicate (PTRUE .D) can be used with a
> 32-bit load (LD1W .S) to load only even-indexed elements.  So a single-element
> group with a group size of 2 could be done cheaply with just consecutive loads,
> without peeling for gaps.

Yep.  Gap handling leaves something to be desired (also when no predication is
available); I also plan to address some shortcomings in that area early in
stage 1.

Note that generally the idea is that gap peeling is very cheap - unless
it is the only reason to have an epilogue at all.  The exception might
be small round-trip loops, but those are best handled with predication,
where there's no good reason to do peeling for gaps at all.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (43 preceding siblings ...)
  2024-03-05  8:21 ` rguenther at suse dot de
@ 2024-03-05 10:44 ` rguenth at gcc dot gnu.org
  2024-03-07 20:50 ` law at gcc dot gnu.org
  2024-05-07  7:43 ` [Bug tree-optimization/113441] [14/15 " rguenth at gcc dot gnu.org
  46 siblings, 0 replies; 48+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-03-05 10:44 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

--- Comment #44 from Richard Biener <rguenth at gcc dot gnu.org> ---
(In reply to Richard Sandiford from comment #42)
> Created attachment 57605 [details]
> proof-of-concept patch to suppress peeling for gaps
> 
> How about the attached?  It records whether all accesses that require
> peeling for gaps could instead have used gathers, and only retries when
> that's true.  It means that we retry for only 0.034% of calls to
> vect_analyze_loop_1 in a build of SPEC2017 with -mcpu=neoverse-v1 -Ofast
> -fomit-frame-pointer.

I guess this idea would work, but as said, full re-analysis shouldn't be
required; instead "just" the updated cost of the affected loads/stores
needs to be recomputed?  Of course this would require quite some
implementation work.  If we want to just fix this regression the approach
looks sensible, but it would also be applied to x86, which doesn't want to
compare costs, right?  I'm not sure the gather vs. permute costing there
makes this a good idea for stage 4?

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (44 preceding siblings ...)
  2024-03-05 10:44 ` rguenth at gcc dot gnu.org
@ 2024-03-07 20:50 ` law at gcc dot gnu.org
  2024-05-07  7:43 ` [Bug tree-optimization/113441] [14/15 " rguenth at gcc dot gnu.org
  46 siblings, 0 replies; 48+ messages in thread
From: law at gcc dot gnu.org @ 2024-03-07 20:50 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

Jeffrey A. Law <law at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |law at gcc dot gnu.org
           Priority|P3                          |P2

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [Bug tree-optimization/113441] [14/15 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7
  2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
                   ` (45 preceding siblings ...)
  2024-03-07 20:50 ` law at gcc dot gnu.org
@ 2024-05-07  7:43 ` rguenth at gcc dot gnu.org
  46 siblings, 0 replies; 48+ messages in thread
From: rguenth at gcc dot gnu.org @ 2024-05-07  7:43 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
   Target Milestone|14.0                        |14.2

--- Comment #45 from Richard Biener <rguenth at gcc dot gnu.org> ---
GCC 14.1 is being released, retargeting bugs to GCC 14.2.

^ permalink raw reply	[flat|nested] 48+ messages in thread

end of thread, other threads:[~2024-05-07  7:43 UTC | newest]

Thread overview: 48+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
2024-01-17 12:45 ` [Bug tree-optimization/113441] " juzhe.zhong at rivai dot ai
2024-01-17 13:22 ` [Bug tree-optimization/113441] [13/14 " rguenth at gcc dot gnu.org
2024-01-17 14:07 ` juzhe.zhong at rivai dot ai
2024-01-17 14:35 ` rguenth at gcc dot gnu.org
2024-01-22 12:38 ` juzhe.zhong at rivai dot ai
2024-01-22 12:41 ` tnfchris at gcc dot gnu.org
2024-01-22 12:42 ` juzhe.zhong at rivai dot ai
2024-01-22 13:19 ` juzhe.zhong at rivai dot ai
2024-01-22 13:52 ` [Bug tree-optimization/113441] [14 " rguenth at gcc dot gnu.org
2024-01-22 16:16 ` tnfchris at gcc dot gnu.org
2024-01-22 22:16 ` juzhe.zhong at rivai dot ai
2024-01-23  6:42 ` rguenth at gcc dot gnu.org
2024-01-23  8:15 ` juzhe.zhong at rivai dot ai
2024-01-23  8:17 ` rguenther at suse dot de
2024-01-23  8:25 ` juzhe.zhong at rivai dot ai
2024-01-23 10:29 ` rguenther at suse dot de
2024-01-23 10:30 ` tnfchris at gcc dot gnu.org
2024-01-23 12:32 ` tnfchris at gcc dot gnu.org
2024-01-23 12:50 ` rguenth at gcc dot gnu.org
2024-01-23 12:52 ` rguenth at gcc dot gnu.org
2024-01-23 12:56 ` rguenth at gcc dot gnu.org
2024-01-23 13:02 ` rguenth at gcc dot gnu.org
2024-01-23 13:05 ` tnfchris at gcc dot gnu.org
2024-01-23 13:12 ` tnfchris at gcc dot gnu.org
2024-01-23 13:21 ` juzhe.zhong at rivai dot ai
2024-01-23 13:28 ` tnfchris at gcc dot gnu.org
2024-02-22 16:18 ` [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7 tnfchris at gcc dot gnu.org
2024-02-26  8:10 ` tnfchris at gcc dot gnu.org
2024-02-26  8:17 ` rguenther at suse dot de
2024-02-27  8:01 ` tnfchris at gcc dot gnu.org
2024-02-27  8:08 ` rguenth at gcc dot gnu.org
2024-02-29 22:18 ` rsandifo at gcc dot gnu.org
2024-03-01  9:44 ` rguenth at gcc dot gnu.org
2024-03-01  9:53 ` rsandifo at gcc dot gnu.org
2024-03-01 10:44 ` rguenther at suse dot de
2024-03-04 12:07 ` rsandifo at gcc dot gnu.org
2024-03-04 13:26 ` rsandifo at gcc dot gnu.org
2024-03-04 14:28 ` rguenth at gcc dot gnu.org
2024-03-04 14:48 ` rsandifo at gcc dot gnu.org
2024-03-04 15:01 ` rsandifo at gcc dot gnu.org
2024-03-04 15:10 ` rguenth at gcc dot gnu.org
2024-03-04 16:16 ` rsandifo at gcc dot gnu.org
2024-03-04 22:52 ` rsandifo at gcc dot gnu.org
2024-03-05  8:21 ` rguenther at suse dot de
2024-03-05 10:44 ` rguenth at gcc dot gnu.org
2024-03-07 20:50 ` law at gcc dot gnu.org
2024-05-07  7:43 ` [Bug tree-optimization/113441] [14/15 " rguenth at gcc dot gnu.org
