public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop
@ 2024-01-17 12:38 juzhe.zhong at rivai dot ai
  2024-01-17 12:45 ` [Bug tree-optimization/113441] " juzhe.zhong at rivai dot ai
                   ` (46 more replies)
  0 siblings, 47 replies; 48+ messages in thread
From: juzhe.zhong at rivai dot ai @ 2024-01-17 12:38 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113441

            Bug ID: 113441
           Summary: [14 Regression] Fail to fold the last element with
                    multiple loop
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: c
          Assignee: unassigned at gcc dot gnu.org
          Reporter: juzhe.zhong at rivai dot ai
  Target Milestone: ---

Hi, we found a regression between GCC-12 and GCC-14 while comparing our
downstream RVV GCC against upstream RVV GCC.

The regression occurs not only with our RVV GCC but also with ARM SVE GCC.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Input samples read by foo: 4 rows of 1 * 273 * 12 * 2 (= 6552) int16_t
   values, accessed as adjacent pairs at indices 2 * m and 2 * m + 1.  */
int16_t array1[4][1 * 273 * 12 * 2];

/* Output written by foo: row i receives one pair of results per value of m,
   stored at indices 2 * m and 2 * m + 1.  */
int16_t array2[4][1 * 273 * 12 * 2];
/* Coefficients read by foo: row i holds four pairs, one pair for each of the
   four rows of array1.  */
int16_t array3[4][4 * 2];

/* Reproducer for the vectorization regression described in this report.

   For each output row i (0..3), walk m from 0 to a * b * 12 - 1.  At every
   step, combine the pair (array1[r][2 * m], array1[r][2 * m + 1]) of each
   input row r with the coefficient pair (array3[i][2 * r],
   array3[i][2 * r + 1]), add up the four results, shift the two 32-bit sums
   right by 15 and store them as int16_t in array2[i][2 * m] and
   array2[i][2 * m + 1].

   a: trip count of the j loop.
   b: trip count of the k loop.

   No bounds checking is done: the highest index used is
   2 * (a * b * 12 - 1) + 1, while the rows of array1 and array2 hold
   1 * 273 * 12 * 2 elements, so callers must keep a * b <= 273.  */
void
foo (uint8_t a, uint16_t b)
{
  int32_t sum[2];
  int32_t result[4][2];
  uint16_t j = 0;
  uint8_t i = 0;
  uint16_t l = 0;
  uint16_t k = 0;
  uint32_t m = 0;

  for (i = 0; i < 4; i++)
    {
      /* m is the running pair index; it restarts for every output row and
         is incremented once per innermost iteration.  */
      m = 0;
      for (j = 0; j < a; j++)
        {
          for (k = 0; k < b; k++)
            {
              /* Innermost loop with a constant trip count of 12; this is
                 the loop whose vectorization the report is about.  */
              for (l = 0; l < 12; l++)
                {
                  /* result[r][0] = x * c0 - y * c1 and
                     result[r][1] = y * c0 + x * c1, where (x, y) is the
                     array1[r] pair at m and (c0, c1) is the array3[i] pair
                     for row r (the form of a complex multiplication).  */
                  result[0][0] = array1[0][2 * m] * array3[i][0]
                                 - array1[0][2 * m + 1] * array3[i][1];
                  result[0][1] = array1[0][2 * m + 1] * array3[i][0]
                                 + array1[0][2 * m] * array3[i][1];

                  result[1][0] = array1[1][2 * m] * array3[i][2]
                                 - array1[1][2 * m + 1] * array3[i][3];
                  result[1][1] = array1[1][2 * m + 1] * array3[i][2]
                                 + array1[1][2 * m] * array3[i][3];

                  result[2][0] = array1[2][2 * m] * array3[i][4]
                                 - array1[2][2 * m + 1] * array3[i][5];
                  result[2][1] = array1[2][2 * m + 1] * array3[i][4]
                                 + array1[2][2 * m] * array3[i][5];

                  result[3][0] = array1[3][2 * m] * array3[i][6]
                                 - array1[3][2 * m + 1] * array3[i][7];
                  result[3][1] = array1[3][2 * m + 1] * array3[i][6]
                                 + array1[3][2 * m] * array3[i][7];
                  /* Accumulate the four per-row results component-wise.  */
                  sum[0]
                    = result[0][0] + result[1][0] + result[2][0] +
result[3][0];
                  sum[1]
                    = result[0][1] + result[1][1] + result[2][1] +
result[3][1];
                  /* Scale the 32-bit sums down by 15 bits and narrow them
                     to int16_t before storing.  */
                  array2[i][2 * m] = (int16_t) (sum[0] >> 15);
                  array2[i][2 * m + 1] = (int16_t) (sum[1] >> 15);
                  m++;
                }
            }
        }
    }
}

Here is a reference:

https://godbolt.org/z/hfqWvdf8e

Here is the analysis:

First, note that the innermost loop has 12 iterations (for (l = 0; l < 12; l++)).

GCC-14 processes 11 elements in vector code and leaves the last element to scalar code:

```
        mov     x1, 11           ---> process 11 elements
        whilelo p5.s, xzr, x1
        ...
        vector codes
        ...
        scalar codes of the last element:
        ldrsh   w8, [x0, x5, lsl 1]
        add     x6, x5, x10
        ldrsh   w14, [x0, x7, lsl 1]
        add     x1, x4, x10
        ldrsh   w7, [x0, x4, lsl 1]
        add     x12, x5, x27
        ldrsh   w2, [x0, x2, lsl 1]
        add     x5, x28, x5
        mul     w11, w24, w8
        ldrsh   w13, [x0, x6, lsl 1]
        ldrsh   w1, [x0, x1, lsl 1]
        add     x6, x4, x27
        msub    w11, w21, w7, w11
        ldrsh   w12, [x0, x12, lsl 1]
        mul     w7, w24, w7
        add     x4, x28, x4
        madd    w8, w21, w8, w7
        ldrsh   w6, [x0, x6, lsl 1]
        mul     w7, w20, w14
        add     w3, w3, 24
        msub    w7, w19, w2, w7
        mul     w2, w20, w2
        add     w7, w7, w11
        mul     w11, w18, w13
        msub    w11, w17, w1, w11
        madd    w2, w19, w14, w2
        add     w11, w7, w11
        mul     w1, w18, w1
        mul     w7, w16, w12
        add     w2, w2, w8
        msub    w7, w15, w6, w7
        madd    w1, w17, w13, w1
        mul     w6, w16, w6
        add     w11, w11, w7
        madd    w6, w15, w12, w6
        add     w1, w2, w1
        asr     w11, w11, 15
        strh    w11, [x9, x5, lsl 1]
        add     w1, w1, w6
        asr     w1, w1, 15
        strh    w1, [x9, x4, lsl 1]
        cmp     w30, w3
        bne     .L4
        ldp     w2, w7, [sp, 108]
        ldr     w3, [sp, 116]
        add     w1, w2, 1
        add     w30, w30, w7
        ldr     x8, [sp, 96]
        and     w2, w1, 65535
        cmp     w3, w1, uxth
        bne     .L6
        ldr     x3, [sp, 120]
        add     x23, x23, x22
        ldr     w5, [sp, 116]
        add     x8, x8, 16
        add     x3, x3, 1
        cmp     x3, 4
        bne     .L3

```

GCC-12 has much better codegen (all 12 elements are handled in vector code):

```
        mov     x1, 12   ----> process 12 elements in vector.
        ptrue   p0.b, vl64
        whilelo p1.s, xzr, x1

       vector codes:
       No scalar epilogue.
```

This benchmark shows a performance drop of over 70% between GCC-12 and GCC-14
for both RVV and ARM SVE.

^ permalink raw reply	[flat|nested] 48+ messages in thread

end of thread, other threads:[~2024-05-07  7:43 UTC | newest]

Thread overview: 48+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-01-17 12:38 [Bug c/113441] New: [14 Regression] Fail to fold the last element with multiple loop juzhe.zhong at rivai dot ai
2024-01-17 12:45 ` [Bug tree-optimization/113441] " juzhe.zhong at rivai dot ai
2024-01-17 13:22 ` [Bug tree-optimization/113441] [13/14 " rguenth at gcc dot gnu.org
2024-01-17 14:07 ` juzhe.zhong at rivai dot ai
2024-01-17 14:35 ` rguenth at gcc dot gnu.org
2024-01-22 12:38 ` juzhe.zhong at rivai dot ai
2024-01-22 12:41 ` tnfchris at gcc dot gnu.org
2024-01-22 12:42 ` juzhe.zhong at rivai dot ai
2024-01-22 13:19 ` juzhe.zhong at rivai dot ai
2024-01-22 13:52 ` [Bug tree-optimization/113441] [14 " rguenth at gcc dot gnu.org
2024-01-22 16:16 ` tnfchris at gcc dot gnu.org
2024-01-22 22:16 ` juzhe.zhong at rivai dot ai
2024-01-23  6:42 ` rguenth at gcc dot gnu.org
2024-01-23  8:15 ` juzhe.zhong at rivai dot ai
2024-01-23  8:17 ` rguenther at suse dot de
2024-01-23  8:25 ` juzhe.zhong at rivai dot ai
2024-01-23 10:29 ` rguenther at suse dot de
2024-01-23 10:30 ` tnfchris at gcc dot gnu.org
2024-01-23 12:32 ` tnfchris at gcc dot gnu.org
2024-01-23 12:50 ` rguenth at gcc dot gnu.org
2024-01-23 12:52 ` rguenth at gcc dot gnu.org
2024-01-23 12:56 ` rguenth at gcc dot gnu.org
2024-01-23 13:02 ` rguenth at gcc dot gnu.org
2024-01-23 13:05 ` tnfchris at gcc dot gnu.org
2024-01-23 13:12 ` tnfchris at gcc dot gnu.org
2024-01-23 13:21 ` juzhe.zhong at rivai dot ai
2024-01-23 13:28 ` tnfchris at gcc dot gnu.org
2024-02-22 16:18 ` [Bug tree-optimization/113441] [14 Regression] Fail to fold the last element with multiple loop since g:2efe3a7de0107618397264017fb045f237764cc7 tnfchris at gcc dot gnu.org
2024-02-26  8:10 ` tnfchris at gcc dot gnu.org
2024-02-26  8:17 ` rguenther at suse dot de
2024-02-27  8:01 ` tnfchris at gcc dot gnu.org
2024-02-27  8:08 ` rguenth at gcc dot gnu.org
2024-02-29 22:18 ` rsandifo at gcc dot gnu.org
2024-03-01  9:44 ` rguenth at gcc dot gnu.org
2024-03-01  9:53 ` rsandifo at gcc dot gnu.org
2024-03-01 10:44 ` rguenther at suse dot de
2024-03-04 12:07 ` rsandifo at gcc dot gnu.org
2024-03-04 13:26 ` rsandifo at gcc dot gnu.org
2024-03-04 14:28 ` rguenth at gcc dot gnu.org
2024-03-04 14:48 ` rsandifo at gcc dot gnu.org
2024-03-04 15:01 ` rsandifo at gcc dot gnu.org
2024-03-04 15:10 ` rguenth at gcc dot gnu.org
2024-03-04 16:16 ` rsandifo at gcc dot gnu.org
2024-03-04 22:52 ` rsandifo at gcc dot gnu.org
2024-03-05  8:21 ` rguenther at suse dot de
2024-03-05 10:44 ` rguenth at gcc dot gnu.org
2024-03-07 20:50 ` law at gcc dot gnu.org
2024-05-07  7:43 ` [Bug tree-optimization/113441] [14/15 " rguenth at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).