[Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le

public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed

* [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
@ 2022-06-23 23:13 mpolacek at gcc dot gnu.org
  2022-06-23 23:15 ` [Bug target/106069] [12/13 Regression] " mpolacek at gcc dot gnu.org
                   ` (37 more replies)
  0 siblings, 38 replies; 39+ messages in thread
From: mpolacek at gcc dot gnu.org @ 2022-06-23 23:13 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

            Bug ID: 106069
           Summary: wrong code with -O -fno-tree-forwprop -maltivec on
                    ppc64le
           Product: gcc
           Version: 12.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: mpolacek at gcc dot gnu.org
  Target Milestone: ---

The following test crashes at runtime since r12-4496-g0910c516a3d72a.

$ ./cc1plus.r12-4495 -quiet -O -fno-tree-forwprop -maltivec q.C; g++ q.s;
./a.out
$ ./cc1plus.r12-4496 -quiet -O -fno-tree-forwprop -maltivec q.C; g++ q.s;
./a.out 
Aborted

Target: powerpc64le-unknown-linux-gnu
Configured with: /home/polacek/src/gcc/configure --enable-languages=c,c++
--enable-checking=yes -with-system-zlib --disable-bootstrap --disable-libvtv
--disable-libitm --disable-libsanitizer
gcc version 13.0.0 20220622 (experimental) (GCC)


extern "C" void *memcpy(void *, const void *, unsigned long);
typedef __attribute__((altivec(vector__))) unsigned native_simd_type;

union {
  native_simd_type V;
  int R[4];
} store_le_vec;

struct S {
  S() = default;
  S(unsigned B0) {
    native_simd_type val{B0};
    m_simd = val;
  }
  void store_le(unsigned char out[]) {
    store_le_vec.V = m_simd;
    unsigned int x0 = store_le_vec.R[0];
    memcpy(out, &x0, 1);
  }
  S rotl(unsigned int r) {
    native_simd_type rot{r};
    return __builtin_vec_rl(m_simd, rot);
  }
  void operator+=(S other) {
    m_simd = __builtin_vec_add(m_simd, other.m_simd);
  }
  void operator^=(S other) {
    m_simd = __builtin_vec_xor(m_simd, other.m_simd);
  }
  static void transpose(S &B0, S B1, S B2, S B3) {
    native_simd_type T0 = __builtin_vec_mergeh(B0.m_simd, B2.m_simd);
    native_simd_type T1 = __builtin_vec_mergeh(B1.m_simd, B3.m_simd);
    native_simd_type T2 = __builtin_vec_mergel(B0.m_simd, B2.m_simd);
    native_simd_type T3 = __builtin_vec_mergel(B1.m_simd, B3.m_simd);
    B0 = __builtin_vec_mergeh(T0, T1);
    B3 = __builtin_vec_mergel(T2, T3);
  }
  S(native_simd_type x) : m_simd(x) {}
  native_simd_type m_simd;
};

void
foo (unsigned char output[], unsigned state[])
{
  S R00 = state[0];
  S R01 = state[0];
  S R02 = state[2];
  S R03 = state[0];
  S R05 = state[5];
  S R06 = state[6];
  S R07 = state[7];
  S R08 = state[8];
  S R09 = state[9];
  S R10 = state[10];
  S R11 = state[11];
  S R12 = state[12];
  S R13 = state[13];
  S R14 = state[4];
  S R15 = state[15];
  for (int r = 0; r != 10; ++r) {
    R09 += R13;
    R11 += R15;
    R05 ^= R09;
    R06 ^= R10;
    R07 ^= R11;
    R07 = R07.rotl(7);
    R00 += R05;
    R01 += R06;
    R02 += R07;
    R15 ^= R00;
    R12 ^= R01;
    R13 ^= R02;
    R00 += R05;
    R01 += R06;
    R02 += R07;
    R15 ^= R00;
    R12 = R12.rotl(8);
    R13 = R13.rotl(8);
    R10 += R15;
    R11 += R12;
    R08 += R13;
    R09 += R14;
    R05 ^= R10;
    R06 ^= R11;
    R07 ^= R08;
    R05 = R05.rotl(7);
    R06 = R06.rotl(7);
    R07 = R07.rotl(7);
  }
  R00 += state[0];
  S::transpose(R00, R01, R02, R03);
  R00.store_le(output);
}

unsigned char res[1];
unsigned main_state[]{1634760805, 60878,      2036477234, 6,
                      0,          825562964,  1471091955, 1346092787,
                      506976774,  4197066702, 518848283,  118491664,
                      0,          0,          0,          0};
int
main ()
{
  foo (res, main_state);
  if (res[0] != 152)
    __builtin_abort();
}

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
@ 2022-06-23 23:15 ` mpolacek at gcc dot gnu.org
  2022-06-23 23:18 ` mpolacek at gcc dot gnu.org
                   ` (36 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: mpolacek at gcc dot gnu.org @ 2022-06-23 23:15 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

Marek Polacek <mpolacek at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
   Target Milestone|---                         |12.2
           Keywords|                            |wrong-code
             Target|                            |powerpc64le-unknown-linux-g
                   |                            |nu
               Host|                            |powerpc64le-unknown-linux-g
                   |                            |nu
            Summary|wrong code with -O          |[12/13 Regression] wrong
                   |-fno-tree-forwprop          |code with -O
                   |-maltivec on ppc64le        |-fno-tree-forwprop
                   |                            |-maltivec on ppc64le
                 CC|                            |luoxhu at gcc dot gnu.org
              Build|                            |powerpc64le-unknown-linux-g
                   |                            |nu

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
  2022-06-23 23:15 ` [Bug target/106069] [12/13 Regression] " mpolacek at gcc dot gnu.org
@ 2022-06-23 23:18 ` mpolacek at gcc dot gnu.org
  2022-06-24  3:25 ` luoxhu at gcc dot gnu.org
                   ` (35 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: mpolacek at gcc dot gnu.org @ 2022-06-23 23:18 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #1 from Marek Polacek <mpolacek at gcc dot gnu.org> ---
The difference between r12-4495 and r12-4496:

$ diff -up b/q.C.252r.expand a/q.C.252r.expand
--- b/q.C.252r.expand   2022-06-23 23:16:44.753507476 +0000
+++ a/q.C.252r.expand   2022-06-23 23:16:16.232784087 +0000
@@ -831,10 +831,14 @@ try_optimize_cfg iteration 2
         (subreg:V16QI (reg/v:V4SI 168 [ R02$m_simd ]) 0)) "q.C":31:47 -1
      (nil))
 (insn 188 187 189 5 (set (reg:V4SI 339)
-        (unspec:V4SI [
-                (subreg:V4SI (reg:V16QI 338) 0)
-                (subreg:V4SI (reg:V16QI 337) 0)
-            ] UNSPEC_VMRGL_DIRECT)) "q.C":31:47 -1
+        (vec_select:V4SI (vec_concat:V8SI (subreg:V4SI (reg:V16QI 338) 0)
+                (subreg:V4SI (reg:V16QI 337) 0))
+            (parallel [
+                    (const_int 2 [0x2])
+                    (const_int 6 [0x6])
+                    (const_int 3 [0x3])
+                    (const_int 7 [0x7])
+                ]))) "q.C":31:47 -1
      (nil))
 (insn 189 188 190 5 (set (reg:V16QI 336)
         (subreg:V16QI (reg:V4SI 339) 0)) "q.C":31:47 -1
@@ -846,10 +850,14 @@ try_optimize_cfg iteration 2
         (subreg:V16QI (reg/v:V4SI 143 [ val ]) 0)) "q.C":32:47 -1
      (nil))
 (insn 192 191 193 5 (set (reg:V4SI 344)
-        (unspec:V4SI [
-                (subreg:V4SI (reg:V16QI 343) 0)
-                (subreg:V4SI (reg:V16QI 342) 0)
-            ] UNSPEC_VMRGL_DIRECT)) "q.C":32:47 -1
+        (vec_select:V4SI (vec_concat:V8SI (subreg:V4SI (reg:V16QI 343) 0)
+                (subreg:V4SI (reg:V16QI 342) 0))
+            (parallel [
+                    (const_int 2 [0x2])
+                    (const_int 6 [0x6])
+                    (const_int 3 [0x3])
+                    (const_int 7 [0x7])
+                ]))) "q.C":32:47 -1
      (nil))
 (insn 193 192 194 5 (set (reg:V16QI 341)
         (subreg:V16QI (reg:V4SI 344) 0)) "q.C":32:47 -1
@@ -861,10 +869,14 @@ try_optimize_cfg iteration 2
         (subreg:V4SI (reg:V16QI 341) 0)) "q.C":35:8 -1
      (nil))
 (insn 196 195 197 5 (set (reg:V4SI 349)
-        (unspec:V4SI [
-                (subreg:V4SI (reg:V16QI 341) 0)
-                (subreg:V4SI (reg:V16QI 336) 0)
-            ] UNSPEC_VMRGL_DIRECT)) "q.C":35:8 -1
+        (vec_select:V4SI (vec_concat:V8SI (subreg:V4SI (reg:V16QI 341) 0)
+                (subreg:V4SI (reg:V16QI 336) 0))
+            (parallel [
+                    (const_int 2 [0x2])
+                    (const_int 6 [0x6])
+                    (const_int 3 [0x3])
+                    (const_int 7 [0x7])
+                ]))) "q.C":35:8 -1
      (nil))
 (insn 197 196 198 5 (set (reg:V16QI 348)
         (subreg:V16QI (reg:V4SI 349) 0)) "q.C":35:8 -1
@@ -934,7 +946,7 @@ try_optimize_cfg iteration 2
         (reg/f:DI 120)) "q.C":103:7 -1
      (nil))
 (call_insn 9 8 10 2 (parallel [
-            (call (mem:SI (symbol_ref:DI ("_Z3fooPhPj") [flags 0x3] 
<function_decl 0x3fff7f473a00 foo>) [0 foo S4 A8])
+            (call (mem:SI (symbol_ref:DI ("_Z3fooPhPj") [flags 0x3] 
<function_decl 0x3fffa9823a00 foo>) [0 foo S4 A8])
                 (const_int 0 [0]))
             (use (const_int 0 [0]))
             (clobber (reg:DI 96 lr))
@@ -968,7 +980,7 @@ try_optimize_cfg iteration 2
  -> 17)
 (note 14 13 15 4 [bb 4] NOTE_INSN_BASIC_BLOCK)
 (call_insn 15 14 16 4 (parallel [
-            (call (mem:SI (symbol_ref:DI ("abort") [flags 0x41] 
<function_decl 0x3fff7f2b5300 __builtin_abort>) [0 __builtin_abort S4 A8])
+            (call (mem:SI (symbol_ref:DI ("abort") [flags 0x41] 
<function_decl 0x3fffa9665300 __builtin_abort>) [0 __builtin_abort S4 A8])
                 (const_int 0 [0]))
             (use (const_int 0 [0]))
             (clobber (reg:DI 96 lr))

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
  2022-06-23 23:15 ` [Bug target/106069] [12/13 Regression] " mpolacek at gcc dot gnu.org
  2022-06-23 23:18 ` mpolacek at gcc dot gnu.org
@ 2022-06-24  3:25 ` luoxhu at gcc dot gnu.org
  2022-06-24 13:03 ` mpolacek at gcc dot gnu.org
                   ` (34 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: luoxhu at gcc dot gnu.org @ 2022-06-24  3:25 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #2 from luoxhu at gcc dot gnu.org ---
Could you also paste the ASM difference please? (I don't have an environment at
hand so far.)

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (2 preceding siblings ...)
  2022-06-24  3:25 ` luoxhu at gcc dot gnu.org
@ 2022-06-24 13:03 ` mpolacek at gcc dot gnu.org
  2022-06-30  8:13 ` luoxhu at gcc dot gnu.org
                   ` (33 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: mpolacek at gcc dot gnu.org @ 2022-06-24 13:03 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #3 from Marek Polacek <mpolacek at gcc dot gnu.org> ---
Sure.  (If you're looking for a ppc64le machine, the compile farm has a few.)

$ diff -up q95.s q96.s
--- q95.s       2022-06-23 23:08:22.870777519 +0000
+++ q96.s       2022-06-23 23:08:10.990476157 +0000
@@ -12,12 +12,12 @@ _Z3fooPhPj:
 0:     addis 2,12,.TOC.-.LCF0@ha
        addi 2,2,.TOC.-.LCF0@l
        .localentry     _Z3fooPhPj,.-_Z3fooPhPj
-       lxsiwzx 50,0,4
+       lxsiwzx 49,0,4
        xxlxor 0,0,0
-       xxpermdi 50,0,50,0
+       xxpermdi 49,0,49,0
        addi 9,4,8
-       lxsiwzx 51,0,9
-       xxpermdi 51,0,51,0
+       lxsiwzx 50,0,9
+       xxpermdi 50,0,50,0
        addi 9,4,20
        lxsiwzx 44,0,9
        xxpermdi 44,0,44,0
@@ -28,8 +28,8 @@ _Z3fooPhPj:
        lxsiwzx 32,0,9
        xxpermdi 32,0,32,0
        addi 9,4,32
-       lxsiwzx 49,0,9
-       xxpermdi 49,0,49,0
+       lxsiwzx 34,0,9
+       xxpermdi 34,0,34,0
        addi 9,4,36
        lxsiwzx 43,0,9
        xxpermdi 43,0,43,0
@@ -40,8 +40,8 @@ _Z3fooPhPj:
        lxsiwzx 33,0,9
        xxpermdi 33,0,33,0
        addi 9,4,48
-       lxsiwzx 34,0,9
-       xxpermdi 34,0,34,0
+       lxsiwzx 35,0,9
+       xxpermdi 35,0,35,0
        addi 9,4,52
        lxsiwzx 38,0,9
        xxpermdi 38,0,38,0
@@ -51,14 +51,14 @@ _Z3fooPhPj:
        addi 9,4,60
        lxsiwzx 39,0,9
        xxpermdi 39,0,39,0
-       xxlor 48,50,50
-       xxlor 35,50,50
+       xxlor 47,49,49
+       xxlor 51,49,49
        addis 9,2,.LC0@toc@ha
        addi 9,9,.LC0@toc@l
        lvx 4,0,9
        addis 9,2,.LC1@toc@ha
        addi 9,9,.LC1@toc@l
-       lvx 15,0,9
+       lvx 16,0,9
        li 9,10
        mtctr 9
 .L2:
@@ -68,34 +68,30 @@ _Z3fooPhPj:
        xxlxor 45,45,37
        xxlxor 32,32,33
        vrlw 0,0,4
-       vadduwm 8,3,12
-       vadduwm 9,16,13
-       vadduwm 10,19,0
-       vadduwm 3,8,12
-       vadduwm 16,9,13
-       vadduwm 19,10,0
-       xxlxor 40,40,35
+       vadduwm 8,19,12
+       vadduwm 9,15,13
+       vadduwm 10,18,0
+       vadduwm 19,8,12
+       vadduwm 15,9,13
+       vadduwm 18,10,0
+       xxlxor 40,40,51
        xxlxor 39,39,40
-       xxlxor 41,34,41
-       vrlw 2,9,15
+       xxlxor 41,35,41
+       vrlw 3,9,16
        xxlxor 42,38,42
-       vrlw 6,10,15
+       vrlw 6,10,16
        vadduwm 5,5,7
-       vadduwm 1,1,2
-       vadduwm 17,17,6
+       vadduwm 1,1,3
+       vadduwm 2,2,6
        vadduwm 11,11,14
        xxlxor 44,37,44
        vrlw 12,12,4
        xxlxor 45,33,45
        vrlw 13,13,4
-       xxlxor 32,49,32
+       xxlxor 32,34,32
        vrlw 0,0,4
        bdnz .L2
-       vadduwm 3,3,18
-       xxmrglw 51,51,35
-       xxmrglw 50,50,48
-       xxmrglw 50,50,51
-       vspltw 0,18,3
+       vspltw 0,17,0
        mfvsrwz 9,32
        stb 9,0(3)
        blr

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (3 preceding siblings ...)
  2022-06-24 13:03 ` mpolacek at gcc dot gnu.org
@ 2022-06-30  8:13 ` luoxhu at gcc dot gnu.org
  2022-06-30  8:15 ` luoxhu at gcc dot gnu.org
                   ` (32 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: luoxhu at gcc dot gnu.org @ 2022-06-30  8:13 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #4 from luoxhu at gcc dot gnu.org ---
Reduced to:

#include <stdio.h>
extern "C" void *memcpy(void *, const void *, unsigned long);
typedef __attribute__((altivec(vector__))) unsigned native_simd_type;

union {
        native_simd_type V;
        int R[4];
} store_le_vec;

struct S {
        S() = default;
        S(unsigned B0) {
                native_simd_type val{B0};
                m_simd = val;
        }
        void store_le(unsigned char out[]) {
                store_le_vec.V = m_simd;
                unsigned int x0 = store_le_vec.R[0];
                memcpy(out, &x0, 1);
        }
        static void transpose(S &B0, S B1, S B2, S B3) {
                native_simd_type T0 = __builtin_vec_mergeh(B0.m_simd,
B2.m_simd);
                native_simd_type T1 = __builtin_vec_mergeh(B1.m_simd,
B3.m_simd);
                native_simd_type T2 = __builtin_vec_mergel(B0.m_simd,
B2.m_simd);
                native_simd_type T3 = __builtin_vec_mergel(B1.m_simd,
B3.m_simd);
                B0 = __builtin_vec_mergeh(T0, T1);
                B3 = __builtin_vec_mergel(T2, T3);
                printf ("B0: %x, %x,%x,%x\n", B0.m_simd[0], B0.m_simd[1],
B0.m_simd[2], B0.m_simd[3]);
        }
        S(native_simd_type x) : m_simd(x) {}
        native_simd_type m_simd;
};

        void
foo (unsigned char output[], unsigned state[], native_simd_type R0,
native_simd_type R1, native_simd_type R2, native_simd_type R3)
{
        S R00; R00.m_simd = R0;
        S R01; R01.m_simd = R1;
        S R02; R02.m_simd = R2;
        S R03; R03.m_simd = R3;
        S::transpose(R00, R01, R02, R03);
        R00.store_le(output);
}

unsigned char res[1];
unsigned main_state[]{1634760805, 60878,      2036477234, 6,
        0,          825562964,  1471091955, 1346092787,
        506976774,  4197066702, 518848283,  118491664,
        0,          0,          0,          0};
int
main ()
{
        native_simd_type R0 = native_simd_type {0x41fcef98, 0,0,0};
        native_simd_type R1 =  native_simd_type {0x91648e8b, 0,0,0};
        native_simd_type R2 = native_simd_type  {0x7dca18c6, 0,0,0};
        native_simd_type R3 = native_simd_type  {0x61707865, 0,0,0};
        foo (res, main_state, R0, R1, R2, R3);
        if (res[0] != 152)
                __builtin_abort();
}

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (4 preceding siblings ...)
  2022-06-30  8:13 ` luoxhu at gcc dot gnu.org
@ 2022-06-30  8:15 ` luoxhu at gcc dot gnu.org
  2022-06-30 17:32 ` segher at gcc dot gnu.org
                   ` (31 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: luoxhu at gcc dot gnu.org @ 2022-06-30  8:15 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #5 from luoxhu at gcc dot gnu.org ---
Seems combine wrongly merged two vec_select instructions:

Trying 188 -> 199:
  188: r343:V4SI=vec_select(vec_concat(r168:V4SI,r338:V4SI),parallel)
      REG_DEAD r338:V4SI
      REG_DEAD r168:V4SI
  199: {r353:SI=vec_select(r343:V4SI,parallel);clobber scratch;}
Failed to match this instruction:
(parallel [
        (set (reg:SI 353)
            (vec_select:SI (reg:V4SI 338)
                (parallel [
                        (const_int 3 [0x3])
                    ])))
        (clobber (scratch:V4SI))
        (set (reg:V4SI 343)
            (vec_select:V4SI (vec_concat:V8SI (reg/v:V4SI 168 [ R02$m_simd ])
                    (reg:V4SI 338))
                (parallel [
                        (const_int 2 [0x2])
                        (const_int 6 [0x6])
                        (const_int 3 [0x3])
                        (const_int 7 [0x7])
                    ])))
    ])
Failed to match this instruction:
(parallel [
        (set (reg:SI 353)
            (vec_select:SI (reg:V4SI 338)
                (parallel [
                        (const_int 3 [0x3])
                    ])))
        (set (reg:V4SI 343)
            (vec_select:V4SI (vec_concat:V8SI (reg/v:V4SI 168 [ R02$m_simd ])
                    (reg:V4SI 338))
                (parallel [
                        (const_int 2 [0x2])
                        (const_int 6 [0x6])
                        (const_int 3 [0x3])
                        (const_int 7 [0x7])
                    ])))
    ])
Successfully matched this instruction:
(set (reg:V4SI 343)
    (vec_select:V4SI (vec_concat:V8SI (reg/v:V4SI 168 [ R02$m_simd ])
            (reg:V4SI 338))
        (parallel [
                (const_int 2 [0x2])
                (const_int 6 [0x6])
                (const_int 3 [0x3])
                (const_int 7 [0x7])
            ])))
Successfully matched this instruction:
(set (reg:SI 353)
    (vec_select:SI (reg:V4SI 338)
        (parallel [
                (const_int 3 [0x3])
            ])))
allowing combination of insns 188 and 199
original costs 4 + 8 = 12
replacement costs 4 + 8 = 12
modifying insn i2   188:
r343:V4SI=vec_select(vec_concat(r168:V4SI,r338:V4SI),parallel)
      REG_DEAD r168:V4SI
deferring rescan insn with uid = 188.
modifying insn i3   199: {r353:SI=vec_select(r338:V4SI,parallel);clobber
scratch;}
      REG_DEAD r338:V4SI
deferring rescan insn with uid = 199.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (5 preceding siblings ...)
  2022-06-30  8:15 ` luoxhu at gcc dot gnu.org
@ 2022-06-30 17:32 ` segher at gcc dot gnu.org
  2022-06-30 17:34 ` segher at gcc dot gnu.org
                   ` (30 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: segher at gcc dot gnu.org @ 2022-06-30 17:32 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #6 from Segher Boessenkool <segher at gcc dot gnu.org> ---
What is wrong there?  It isn't obvious.  You may need to show insns 188 and 199
in non-slim form, "slim" is very lossy.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (6 preceding siblings ...)
  2022-06-30 17:32 ` segher at gcc dot gnu.org
@ 2022-06-30 17:34 ` segher at gcc dot gnu.org
  2022-07-01  1:52 ` luoxhu at gcc dot gnu.org
                   ` (29 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: segher at gcc dot gnu.org @ 2022-06-30 17:34 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #7 from Segher Boessenkool <segher at gcc dot gnu.org> ---
(The original insns, before this combination.)

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (7 preceding siblings ...)
  2022-06-30 17:34 ` segher at gcc dot gnu.org
@ 2022-07-01  1:52 ` luoxhu at gcc dot gnu.org
  2022-07-25 15:54 ` rguenth at gcc dot gnu.org
                   ` (28 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: luoxhu at gcc dot gnu.org @ 2022-07-01  1:52 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #8 from luoxhu at gcc dot gnu.org ---
init-regs:

(insn 13 8 17 2 (set (reg:V4SI 141)
        (vec_select:V4SI (vec_concat:V8SI (reg/v:V4SI 135 [ R2 ])
                (reg/v:V4SI 133 [ R0 ]))
            (parallel [
                    (const_int 2 [0x2])
                    (const_int 6 [0x6])
                    (const_int 3 [0x3])
                    (const_int 7 [0x7])
                ]))) "q.C":22:45 1785 {altivec_vmrglw_direct_v4si}
     (expr_list:REG_DEAD (reg/v:V4SI 135 [ R2 ])
        (expr_list:REG_DEAD (reg/v:V4SI 133 [ R0 ])
            (nil))))
(insn 17 13 21 2 (set (reg:V4SI 146)
        (vec_select:V4SI (vec_concat:V8SI (reg/v:V4SI 136 [ R3 ])
                (reg/v:V4SI 134 [ R1 ]))
            (parallel [
                    (const_int 2 [0x2])
                    (const_int 6 [0x6])
                    (const_int 3 [0x3])
                    (const_int 7 [0x7])
                ]))) "q.C":23:45 1785 {altivec_vmrglw_direct_v4si}
     (expr_list:REG_DEAD (reg/v:V4SI 136 [ R3 ])
        (expr_list:REG_DEAD (reg/v:V4SI 134 [ R1 ])
            (nil))))
(insn 21 17 24 2 (set (reg:V4SI 150)
        (vec_select:V4SI (vec_concat:V8SI (reg:V4SI 146)
                (reg:V4SI 141))
            (parallel [
                    (const_int 2 [0x2])
                    (const_int 6 [0x6])
                    (const_int 3 [0x3])
                    (const_int 7 [0x7])
                ]))) "q.C":26:6 1785 {altivec_vmrglw_direct_v4si}
     (expr_list:REG_DEAD (reg:V4SI 146)
        (expr_list:REG_DEAD (reg:V4SI 141)
            (nil))))
(insn 24 21 25 2 (parallel [
            (set (reg:SI 151)
                (vec_select:SI (reg:V4SI 150)
                    (parallel [
                            (const_int 3 [0x3])
                        ])))
            (clobber (scratch:V4SI))
        ]) "q.C":28:10 1400 {*vsx_extract_si}
     (nil))
(insn 25 24 26 2 (set (reg:DI 152)
        (zero_extend:DI (reg:SI 151))) "q.C":28:10 16 {zero_extendsidi2}
     (expr_list:REG_DEAD (reg:SI 151)
        (nil)))
(insn 26 25 27 2 (parallel [
            (set (reg:SI 153)
                (vec_select:SI (reg:V4SI 150)
                    (parallel [
                            (const_int 2 [0x2])
                        ])))
            (clobber (scratch:V4SI))
        ]) "q.C":28:10 1400 {*vsx_extract_si}
     (nil))
(insn 27 26 28 2 (set (reg:DI 154)
        (zero_extend:DI (reg:SI 153))) "q.C":28:10 16 {zero_extendsidi2}
     (expr_list:REG_DEAD (reg:SI 153)
        (nil)))
(insn 28 27 29 2 (parallel [
            (set (reg:SI 155)
                (vec_select:SI (reg:V4SI 150)
                    (parallel [
                            (const_int 1 [0x1])
                        ])))
            (clobber (scratch:V4SI))
        ]) "q.C":28:10 1400 {*vsx_extract_si}
     (nil))
(insn 29 28 30 2 (set (reg:DI 156)
        (zero_extend:DI (reg:SI 155))) "q.C":28:10 16 {zero_extendsidi2}
     (expr_list:REG_DEAD (reg:SI 155)
        (nil)))
(insn 30 29 31 2 (parallel [
            (set (reg:SI 157)
                (vec_select:SI (reg:V4SI 150)
                    (parallel [
                            (const_int 0 [0])
                        ])))
            (clobber (scratch:V4SI))
        ]) "q.C":28:10 1400 {*vsx_extract_si}
     (expr_list:REG_DEAD (reg:V4SI 150)
        (nil)))


combine:

Trying 13 -> 28:
   13: r141:V4SI=vec_select(vec_concat(r164:V4SI,r162:V4SI),parallel)
      REG_DEAD r164:V4SI
   28: {r155:SI=vec_select(r141:V4SI,parallel);clobber scratch;}
      REG_DEAD r141:V4SI
Successfully matched this instruction:
(parallel [
        (set (reg:SI 155)
            (vec_select:SI (reg:V4SI 164)
                (parallel [
                        (const_int 3 [0x3])
                    ])))
        (clobber (scratch:V4SI))
    ])
allowing combination of insns 13 and 28
original costs 4 + 8 = 12
replacement cost 8
deferring deletion of insn with uid = 13.
modifying insn i3    28: {r155:SI=vec_select(r164:V4SI,parallel);clobber
scratch;}
      REG_DEAD r164:V4SI
deferring rescan insn with uid = 28.



(note 7 47 8 2 NOTE_INSN_DELETED)
(note 8 7 13 2 NOTE_INSN_FUNCTION_BEG)
(note 13 8 17 2 NOTE_INSN_DELETED)
(note 17 13 21 2 NOTE_INSN_DELETED)
(note 21 17 24 2 NOTE_INSN_DELETED)
(insn 24 21 25 2 (parallel [
            (set (reg:SI 151)
                (vec_select:SI (reg:V4SI 162)
                    (parallel [
                            (const_int 3 [0x3])
                        ])))
            (clobber (scratch:V4SI))
        ]) "q.C":28:10 1400 {*vsx_extract_si}
     (expr_list:REG_DEAD (reg:V4SI 162)
        (nil)))
(note 25 24 26 2 NOTE_INSN_DELETED)
(insn 26 25 27 2 (parallel [
            (set (reg:SI 153)
                (vec_select:SI (reg:V4SI 163)
                    (parallel [
                            (const_int 3 [0x3])
                        ])))
            (clobber (scratch:V4SI))
        ]) "q.C":28:10 1400 {*vsx_extract_si}
     (expr_list:REG_DEAD (reg:V4SI 163)
        (nil)))
(note 27 26 28 2 NOTE_INSN_DELETED)
(insn 28 27 29 2 (parallel [
            (set (reg:SI 155)
                (vec_select:SI (reg:V4SI 164)
                    (parallel [
                            (const_int 3 [0x3])
                        ])))
            (clobber (scratch:V4SI))
        ]) "q.C":28:10 1400 {*vsx_extract_si}
     (expr_list:REG_DEAD (reg:V4SI 164)
        (nil)))
(note 29 28 30 2 NOTE_INSN_DELETED)
(insn 33 32 34 2 (set (reg:DI 7 7)
        (zero_extend:DI (reg:SI 151))) "q.C":28:10 16 {zero_extendsidi2}
     (expr_list:REG_DEAD (reg:SI 151)
        (nil)))
(insn 34 33 35 2 (set (reg:DI 6 6)
        (zero_extend:DI (reg:SI 153))) "q.C":28:10 16 {zero_extendsidi2}
     (expr_list:REG_DEAD (reg:SI 153)
        (nil)))
(insn 35 34 36 2 (set (reg:DI 5 5)
        (zero_extend:DI (reg:SI 155))) "q.C":28:10 16 {zero_extendsidi2}
     (expr_list:REG_DEAD (reg:SI 155)
        (nil)))
(insn 36 35 37 2 (set (reg:DI 4 4)
        (zero_extend:DI (reg:SI 157))) "q.C":28:10 16 {zero_extendsidi2}
     (nil))

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (8 preceding siblings ...)
  2022-07-01  1:52 ` luoxhu at gcc dot gnu.org
@ 2022-07-25 15:54 ` rguenth at gcc dot gnu.org
  2022-07-25 20:16 ` segher at gcc dot gnu.org
                   ` (27 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: rguenth at gcc dot gnu.org @ 2022-07-25 15:54 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |NEW
   Last reconfirmed|                            |2022-07-25
           Priority|P3                          |P2
     Ever confirmed|0                           |1

--- Comment #9 from Richard Biener <rguenth at gcc dot gnu.org> ---
If the issue is in combine it's probably latent on older branches.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (9 preceding siblings ...)
  2022-07-25 15:54 ` rguenth at gcc dot gnu.org
@ 2022-07-25 20:16 ` segher at gcc dot gnu.org
  2022-07-25 20:18 ` segher at gcc dot gnu.org
                   ` (26 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: segher at gcc dot gnu.org @ 2022-07-25 20:16 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #10 from Segher Boessenkool <segher at gcc dot gnu.org> ---
This happened after
  commit 0910c516a3d72af048af27308349167f25c406c2
  Author: Xionghu Luo <luoxhu@linux.ibm.com>
  Date:   Tue Oct 19 04:02:04 2021 -0500
which probably caused it.  That means it would be GCC 12 and later.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (10 preceding siblings ...)
  2022-07-25 20:16 ` segher at gcc dot gnu.org
@ 2022-07-25 20:18 ` segher at gcc dot gnu.org
  2022-07-26  3:34 ` luoxhu at gcc dot gnu.org
                   ` (25 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: segher at gcc dot gnu.org @ 2022-07-25 20:18 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #11 from Segher Boessenkool <segher at gcc dot gnu.org> ---
I mean, if that patch is actually flawed, this is GCC 12 and later; if the
problem is more generic (combine, probably simplify-rtx to be exact) it is
more widespread.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (11 preceding siblings ...)
  2022-07-25 20:18 ` segher at gcc dot gnu.org
@ 2022-07-26  3:34 ` luoxhu at gcc dot gnu.org
  2022-07-26  3:34 ` luoxhu at gcc dot gnu.org
                   ` (24 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: luoxhu at gcc dot gnu.org @ 2022-07-26  3:34 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #12 from luoxhu at gcc dot gnu.org ---
Created attachment 53352
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=53352&action=edit
combine

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (12 preceding siblings ...)
  2022-07-26  3:34 ` luoxhu at gcc dot gnu.org
@ 2022-07-26  3:34 ` luoxhu at gcc dot gnu.org
  2022-07-26  3:35 ` luoxhu at gcc dot gnu.org
                   ` (23 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: luoxhu at gcc dot gnu.org @ 2022-07-26  3:34 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #13 from luoxhu at gcc dot gnu.org ---
Created attachment 53353
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=53353&action=edit
after combine

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (13 preceding siblings ...)
  2022-07-26  3:34 ` luoxhu at gcc dot gnu.org
@ 2022-07-26  3:35 ` luoxhu at gcc dot gnu.org
  2022-07-26  3:53 ` luoxhu at gcc dot gnu.org
                   ` (22 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: luoxhu at gcc dot gnu.org @ 2022-07-26  3:35 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #14 from luoxhu at gcc dot gnu.org ---
Created attachment 53354
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=53354&action=edit
split2

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (14 preceding siblings ...)
  2022-07-26  3:35 ` luoxhu at gcc dot gnu.org
@ 2022-07-26  3:53 ` luoxhu at gcc dot gnu.org
  2022-07-26  6:28 ` luoxhu at gcc dot gnu.org
                   ` (21 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: luoxhu at gcc dot gnu.org @ 2022-07-26  3:53 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #15 from luoxhu at gcc dot gnu.org ---
In combine, the vec_select(vec_concat(...)) and the following vec_select are
combined into a single extract instruction, which seems reasonable for both LE
and BE?

R146:   0 1 2 3
R141:   4 5 6 7
R150:   2 6 3 7    // vec_select(vec_concat(r146:V4SI,r141:V4SI),[2 6 3 7])
R151:   R150[3]    // vec_select(r150:V4SI,3)

=> 

R151:   R141[3]   //  vec_select(r141:V4SI,3)



Trying 21 -> 24:
   21: r150:V4SI=vec_select(vec_concat(r146:V4SI,r141:V4SI),parallel)
      REG_DEAD r146:V4SI
      REG_DEAD r141:V4SI
   24: {r151:SI=vec_select(r150:V4SI,parallel);clobber scratch;}
Failed to match this instruction:
(parallel [
        (set (reg:SI 151)
            (vec_select:SI (reg:V4SI 141)
                (parallel [
                        (const_int 3 [0x3])
                    ])))
        (clobber (scratch:SI))
        (set (reg:V4SI 150)
            (vec_select:V4SI (vec_concat:V8SI (reg:V4SI 146)
                    (reg:V4SI 141))
                (parallel [
                        (const_int 2 [0x2])
                        (const_int 6 [0x6])
                        (const_int 3 [0x3])
                        (const_int 7 [0x7])
                    ])))
    ])
Failed to match this instruction:
(parallel [
        (set (reg:SI 151)
            (vec_select:SI (reg:V4SI 141)
                (parallel [
                        (const_int 3 [0x3])
                    ])))
        (set (reg:V4SI 150)
            (vec_select:V4SI (vec_concat:V8SI (reg:V4SI 146)
                    (reg:V4SI 141))
                (parallel [
                        (const_int 2 [0x2])
                        (const_int 6 [0x6])
                        (const_int 3 [0x3])
                        (const_int 7 [0x7])
                    ])))
    ])
Successfully matched this instruction:
(set (reg:V4SI 150)
    (vec_select:V4SI (vec_concat:V8SI (reg:V4SI 146)
            (reg:V4SI 141))
        (parallel [
                (const_int 2 [0x2])
                (const_int 6 [0x6])
                (const_int 3 [0x3])
                (const_int 7 [0x7])
            ])))
Successfully matched this instruction:
(set (reg:SI 151)
    (vec_select:SI (reg:V4SI 141)
        (parallel [
                (const_int 3 [0x3])
            ])))
allowing combination of insns 21 and 24
original costs 4 + 4 = 8
replacement costs 4 + 4 = 8
modifying insn i2    21:
r150:V4SI=vec_select(vec_concat(r146:V4SI,r141:V4SI),parallel)
      REG_DEAD r146:V4SI
deferring rescan insn with uid = 21.
modifying insn i3    24: {r151:SI=vec_select(r141:V4SI,parallel);clobber
scratch;}
      REG_DEAD r141:V4SI
deferring rescan insn with uid = 24.


I guess the previous unspec implementation bypassed the LE + LE swap check, so
now in split2, we should generate vextuwlx instead of vextuwrx on little
endian?

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (15 preceding siblings ...)
  2022-07-26  3:53 ` luoxhu at gcc dot gnu.org
@ 2022-07-26  6:28 ` luoxhu at gcc dot gnu.org
  2022-07-29 11:10 ` rguenth at gcc dot gnu.org
                   ` (20 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: luoxhu at gcc dot gnu.org @ 2022-07-26  6:28 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #16 from luoxhu at gcc dot gnu.org ---
The attached files are all built with -mcpu=power8 and the case also fails on
P8LE.
Also, I verified the code produces the expected output on P8BE. ('Aborted' is
caused by BE returning 0x41 instead of the 0x98 expected on LE.)

P8LE &P9LE:

luoxhu@gcc135 build $ ./q.bad
B0: 0, 0,0,0
Aborted

P8BE:
luoxhu@gcc203:~/workspace/build$ ./q.bad
B0: 41fcef98, 91648e8b,7dca18c6,61707865
Aborted


P8BE seems to generate better code with the patch:

luoxhu@gcc203:~/workspace/build$ diff q.good.S q.bad.S -U5
--- q.good.S    2022-07-26 09:19:32.487216946 +0300
+++ q.bad.S     2022-07-26 09:15:58.006770996 +0300
@@ -1,6 +1,7 @@
        .file   "q.C"
+       .machine power8
        .section        ".text"
        .section        .rodata.str1.8,"aMS",@progbits,1
        .align 3
 .LC0:
        .string "B0: %x, %x,%x,%x\n"
@@ -24,19 +25,17 @@
        .cfi_def_cfa_offset 128
        .cfi_offset 65, 16
        .cfi_offset 30, -16
        .cfi_offset 31, -8
        mr %r30,%r3
-       vmrghw %v2,%v2,%v4
-       vmrghw %v5,%v3,%v5
-       vmrghw %v5,%v2,%v5
-       vspltw %v0,%v5,3
+       vspltw %v0,%v5,0
        mfvsrwz %r7,%vs32
-       vspltw %v0,%v5,2
+       vspltw %v0,%v4,0
        mfvsrwz %r6,%vs32
-       mfvsrwz %r5,%vs37
-       vspltw %v0,%v5,0
+       vspltw %v0,%v3,0
+       mfvsrwz %r5,%vs32
+       vspltw %v0,%v2,0
        mfvsrwz %r31,%vs32
        rldicl %r7,%r7,0,32
        rldicl %r6,%r6,0,32
        rldicl %r5,%r5,0,32
        rldicl %r4,%r31,0,32
@@ -169,6 +168,6 @@
        .set    .LANCHOR1,. + 0
        .type   res, @object
        .size   res, 1
 res:
        .zero   1
-       .ident  "GCC: (Debian 9.5.0-1) 9.5.0"
+       .ident  "GCC: (GNU) 13.0.0 20220726 (experimental)"

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (16 preceding siblings ...)
  2022-07-26  6:28 ` luoxhu at gcc dot gnu.org
@ 2022-07-29 11:10 ` rguenth at gcc dot gnu.org
  2022-07-29 11:21 ` rguenth at gcc dot gnu.org
                   ` (19 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: rguenth at gcc dot gnu.org @ 2022-07-29 11:10 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #17 from Richard Biener <rguenth at gcc dot gnu.org> ---
Seeing

Trying 21 -> 24:
   21: r150:V4SI=vec_select(vec_concat(r146:V4SI,r141:V4SI),parallel)
      REG_DEAD r146:V4SI
      REG_DEAD r141:V4SI
   24: {r151:SI=vec_select(r150:V4SI,parallel);clobber scratch;}
...
Successfully matched this instruction:
(set (reg:SI 151)
    (vec_select:SI (reg:V4SI 141)
        (parallel [
                (const_int 3 [0x3])
            ])))

shouldn't that be (vec_select:SI (reg:V4SI 146) ...)?  Or does
(vec_concat:V8SI (xx:V4SI xx:V4SI)) magically swap the two V4SI vectors?

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (17 preceding siblings ...)
  2022-07-29 11:10 ` rguenth at gcc dot gnu.org
@ 2022-07-29 11:21 ` rguenth at gcc dot gnu.org
  2022-08-03  6:10 ` yinyuefengyi at gmail dot com
                   ` (18 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: rguenth at gcc dot gnu.org @ 2022-07-29 11:21 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #18 from Richard Biener <rguenth at gcc dot gnu.org> ---
(In reply to Richard Biener from comment #17)
> Seeing
> 
> Trying 21 -> 24:
>    21: r150:V4SI=vec_select(vec_concat(r146:V4SI,r141:V4SI),parallel)
>       REG_DEAD r146:V4SI
>       REG_DEAD r141:V4SI
>    24: {r151:SI=vec_select(r150:V4SI,parallel);clobber scratch;}
> ...
> Successfully matched this instruction:
> (set (reg:SI 151)
>     (vec_select:SI (reg:V4SI 141)
>         (parallel [
>                 (const_int 3 [0x3])
>             ])))
> 
> shouldn't that be (vec_select:SI (reg:V4SI 146) ...)?  Or does
> (vec_concat:V8SI (xx:V4SI xx:V4SI)) magically swap the two V4SI vectors?

Ah, no, the combination looks correct to me.  After all the select in 21
interleaves the two inputs.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (18 preceding siblings ...)
  2022-07-29 11:21 ` rguenth at gcc dot gnu.org
@ 2022-08-03  6:10 ` yinyuefengyi at gmail dot com
  2022-08-03  6:38 ` yinyuefengyi at gmail dot com
                   ` (17 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: yinyuefengyi at gmail dot com @ 2022-08-03  6:10 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #19 from Xionghu Luo (luoxhu at gcc dot gnu.org) <yinyuefengyi at gmail dot com> ---
(In reply to Xionghu Luo (luoxhu@gcc.gnu.org) from comment #15)
> In combine: vec_select(vec_concat and the followed vec_select are combined
> to a single extract instruction, which seems reasonable for both LE and BE?
> 
> R146:   0 1 2 3
> R141:   4 5 6 7
> R150:   2 6 3 7    // vec_select(vec_concat(r146:V4SI,r141:V4SI),[2 6 3 7])
> R151:   R150[3]    // vec_select(r150:V4SI,3)
> 
> => 
> 
> R151:   R141[3]   //  vec_select(r141:V4SI,3)
> 
>   
> 
> Trying 21 -> 24:
>    21: r150:V4SI=vec_select(vec_concat(r146:V4SI,r141:V4SI),parallel)
>       REG_DEAD r146:V4SI
>       REG_DEAD r141:V4SI
>    24: {r151:SI=vec_select(r150:V4SI,parallel);clobber scratch;}
> Failed to match this instruction:
> (parallel [
>         (set (reg:SI 151)
>             (vec_select:SI (reg:V4SI 141)
>                 (parallel [
>                         (const_int 3 [0x3])
>                     ])))
>         (clobber (scratch:SI))
>         (set (reg:V4SI 150)
>             (vec_select:V4SI (vec_concat:V8SI (reg:V4SI 146)
>                     (reg:V4SI 141))
>                 (parallel [
>                         (const_int 2 [0x2])
>                         (const_int 6 [0x6])
>                         (const_int 3 [0x3])
>                         (const_int 7 [0x7])
>                     ])))
>     ])
> Failed to match this instruction:
> (parallel [
>         (set (reg:SI 151)
>             (vec_select:SI (reg:V4SI 141)
>                 (parallel [
>                         (const_int 3 [0x3])
>                     ])))
>         (set (reg:V4SI 150)
>             (vec_select:V4SI (vec_concat:V8SI (reg:V4SI 146)
>                     (reg:V4SI 141))
>                 (parallel [
>                         (const_int 2 [0x2])
>                         (const_int 6 [0x6])
>                         (const_int 3 [0x3])
>                         (const_int 7 [0x7])
>                     ])))
>     ])
> Successfully matched this instruction:
> (set (reg:V4SI 150)
>     (vec_select:V4SI (vec_concat:V8SI (reg:V4SI 146)
>             (reg:V4SI 141))
>         (parallel [
>                 (const_int 2 [0x2])
>                 (const_int 6 [0x6])
>                 (const_int 3 [0x3])
>                 (const_int 7 [0x7])
>             ])))
> Successfully matched this instruction:
> (set (reg:SI 151)
>     (vec_select:SI (reg:V4SI 141)
>         (parallel [
>                 (const_int 3 [0x3])
>             ])))
> allowing combination of insns 21 and 24
> original costs 4 + 4 = 8
> replacement costs 4 + 4 = 8
> modifying insn i2    21:
> r150:V4SI=vec_select(vec_concat(r146:V4SI,r141:V4SI),parallel)
>       REG_DEAD r146:V4SI
> deferring rescan insn with uid = 21.
> modifying insn i3    24: {r151:SI=vec_select(r141:V4SI,parallel);clobber
> scratch;}
>       REG_DEAD r141:V4SI
> deferring rescan insn with uid = 24.
> 
> 
> I guess the previous unspec implementation bypassed the LE + LE swap check,
> so now in split2, we should generate vextuwlx instead of vextuwrx on little
> endian?


This nested vec_select+vec_select+vec_concat optimization was introduced by Uros
in simplify-rtx.c for PR32661. Unfortunately, it only works on Power BE
platforms; disabling that piece of code could work, because the nested
vec_selects would then not be combined...

For Power LE, firstly:

Trying 21 -> 24:

 R146:   3 2 1 0
 R141:   7 6 5 4
 R150:   7 3 6 2    // vec_select(vec_concat(r146:V4SI,r141:V4SI),[2 6 3 7])
 R151:   R150[3]    // vec_select(r150:V4SI,3)

 => 

currently:
 R151:   R141[3]   //  vec_select(r141:V4SI,3)

But it should be:
 R151:   R146[3]   //  vec_select(r146:V4SI,3)

Which means current:

R151: R150[3] R141[3]
R153: R150[2] R146[3]
R155: R150[1] R141[2]
R157: R150[0] R146[2]

Should be optimized to after the first nested vec_select optimization:

R151: R150[3] R146[3]
R153: R150[2] R141[3]
R155: R150[1] R146[2]
R157: R150[0] R141[2]

A little-endian check and swap (swapping op00 and op01) could achieve this
result.
But secondly, there is another "nested vec_select" optimisation which produces
R151=R165[3]:

Trying 21 -> 26:
...

R146 R165 R163 [7 3 6 2]
R151: R146[3]   =>  R165[3]  (this is wrong!)

Here R162, R163, R164, R165 hold the input values R0, R1, R2, R3.  The
vsx_extract_v4si_di_p9 index should be "0" instead of "3".

The correct result should be:

R151: R165[0]
R153: R164[0]
R155: R163[0]
R157: R162[0]


(insn 44 2 4 2 (set (reg:V4SI 162)
        (reg:V4SI 66 2 [ R0 ])) "q.C":36:1 1157 {vsx_movv4si_64bit}
     (expr_list:REG_DEAD (reg:V4SI 66 2 [ R0 ])
        (nil)))
(note 4 44 45 2 NOTE_INSN_DELETED)
(insn 45 4 5 2 (set (reg:V4SI 163)
        (reg:V4SI 67 3 [ R1 ])) "q.C":36:1 1157 {vsx_movv4si_64bit}
     (expr_list:REG_DEAD (reg:V4SI 67 3 [ R1 ])
        (nil)))
(note 5 45 46 2 NOTE_INSN_DELETED)
(insn 46 5 6 2 (set (reg:V4SI 164)
        (reg:V4SI 68 4 [ R2 ])) "q.C":36:1 1157 {vsx_movv4si_64bit}
     (expr_list:REG_DEAD (reg:V4SI 68 4 [ R2 ])
        (nil)))
(note 6 46 47 2 NOTE_INSN_DELETED)
(insn 47 6 7 2 (set (reg:V4SI 165)
        (reg:V4SI 69 5 [ R3 ])) "q.C":36:1 1157 {vsx_movv4si_64bit}
     (expr_list:REG_DEAD (reg:V4SI 69 5 [ R3 ])
        (nil)))
...
(insn 33 32 34 2 (parallel [
            (set (reg:DI 7 7)
                (zero_extend:DI (vec_select:SI (reg:V4SI 162)
                        (parallel [
                                (const_int 3 [0x3])
                            ]))))
            (clobber (scratch:SI))
        ]) "q.C":28:10 1396 {*vsx_extract_v4si_di_p9}
     (expr_list:REG_DEAD (reg:V4SI 162)
        (nil)))
(insn 34 33 35 2 (parallel [
            (set (reg:DI 6 6)
                (zero_extend:DI (vec_select:SI (reg:V4SI 163)
                        (parallel [
                                (const_int 3 [0x3])
                            ]))))
            (clobber (scratch:SI))
        ]) "q.C":28:10 1396 {*vsx_extract_v4si_di_p9}
     (expr_list:REG_DEAD (reg:V4SI 163)
        (nil)))
(insn 35 34 36 2 (parallel [
            (set (reg:DI 5 5)
                (zero_extend:DI (vec_select:SI (reg:V4SI 164)
                        (parallel [
                                (const_int 3 [0x3])
                            ]))))
            (clobber (scratch:SI))
        ]) "q.C":28:10 1396 {*vsx_extract_v4si_di_p9}
     (expr_list:REG_DEAD (reg:V4SI 164)
        (nil)))
(insn 36 35 37 2 (parallel [
            (set (reg:DI 4 4)
                (zero_extend:DI (vec_select:SI (reg:V4SI 165)
                        (parallel [
                                (const_int 3 [0x3])
                            ]))))
            (clobber (scratch:SI))
        ]) "q.C":28:10 1396 {*vsx_extract_v4si_di_p9}
     (expr_list:REG_DEAD (reg:V4SI 165)
        (nil)))



But it is not easy to change the index again... Is the analysis reasonable?
@Segher.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (19 preceding siblings ...)
  2022-08-03  6:10 ` yinyuefengyi at gmail dot com
@ 2022-08-03  6:38 ` yinyuefengyi at gmail dot com
  2022-08-03  8:06 ` linkw at gcc dot gnu.org
                   ` (16 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: yinyuefengyi at gmail dot com @ 2022-08-03  6:38 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #20 from Xionghu Luo (luoxhu at gcc dot gnu.org) <yinyuefengyi at gmail dot com> ---
As another reference, manually changing the source register and index of the
vspltw instructions in the generated assembly verifies this:

luoxhu@gcc135 build $ diff q.bad.s q.good.s -U12
--- q.bad.s     2022-08-03 06:30:08.298451116 +0000
+++ q.good.s    2022-08-03 06:30:52.887250451 +0000
@@ -18,31 +18,31 @@
        addi 2,2,.TOC.-.LCF0@l
        .localentry     _Z3fooPhPjDv4_jS1_S1_S1_,.-_Z3fooPhPjDv4_jS1_S1_S1_
        mflr %r0
        std %r0,16(%r1)
        std %r30,-16(%r1)
        std %r31,-8(%r1)
        stdu %r1,-112(%r1)
        .cfi_def_cfa_offset 112
        .cfi_offset 65, 16
        .cfi_offset 30, -16
        .cfi_offset 31, -8
        mr %r30,%r3
-       vspltw %v0,%v2,0
+       vspltw %v0,%v5,3
        mfvsrwz %r7,%vs32
-       vspltw %v0,%v3,0
+       vspltw %v0,%v4,3
        mfvsrwz %r6,%vs32
-       vspltw %v0,%v4,0
+       vspltw %v0,%v3,3
        mfvsrwz %r5,%vs32
-       vspltw %v0,%v5,0
+       vspltw %v0,%v2,3
        mfvsrwz %r31,%vs32
        rldicl %r7,%r7,0,32
        rldicl %r6,%r6,0,32
        rldicl %r5,%r5,0,32
        rldicl %r4,%r31,0,32
        addis %r3,%r2,.LC0@toc@ha
        addi %r3,%r3,.LC0@toc@l
        bl printf
        nop
        stb %r31,0(%r30)
        addi %r1,%r1,112
        .cfi_def_cfa_offset 0

luoxhu@gcc135 build $ gcc q.good.s -o q.good
luoxhu@gcc135 build $ ./q.good
B0: 41fcef98, 91648e8b,7dca18c6,61707865

Which means both register and index are incorrectly used in LE nested
vec_select optimization.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (20 preceding siblings ...)
  2022-08-03  6:38 ` yinyuefengyi at gmail dot com
@ 2022-08-03  8:06 ` linkw at gcc dot gnu.org
  2022-08-03  8:24 ` rguenther at suse dot de
                   ` (15 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: linkw at gcc dot gnu.org @ 2022-08-03  8:06 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #21 from Kewen Lin <linkw at gcc dot gnu.org> ---
I didn't look into this in detail, but something in the culprit commit caught
my eye; take altivec_vmrghh as an example:

Before the patch, the pattern

   [(set (match_operand:V8HI 0 "register_operand" "=v")
         (vec_select:V8HI
           (vec_concat:V16HI
             (match_operand:V8HI 1 "register_operand" "v")
             (match_operand:V8HI 2 "register_operand" "v"))
           (parallel [(const_int 0) (const_int 8)
                      (const_int 1) (const_int 9)
                      (const_int 2) (const_int 10)
                      (const_int 3) (const_int 11)])))]

can match vmrghh on BE while vmrglh on LE. It indicates this pattern has
different semantics from the perspective of the underlying instructions.

After the patch, this pattern only matches vmrghh.

IMHO, the semantics of this part changed with the patch. The code before
the patch looks more reasonable to me, since the pattern can have different
meanings on BE and LE (underlying behavior).

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (21 preceding siblings ...)
  2022-08-03  8:06 ` linkw at gcc dot gnu.org
@ 2022-08-03  8:24 ` rguenther at suse dot de
  2022-08-03  8:50 ` linkw at gcc dot gnu.org
                   ` (14 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: rguenther at suse dot de @ 2022-08-03  8:24 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #22 from rguenther at suse dot de <rguenther at suse dot de> ---
On Wed, 3 Aug 2022, linkw at gcc dot gnu.org wrote:

> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069
> 
> --- Comment #21 from Kewen Lin <linkw at gcc dot gnu.org> ---
> I didn't look into this in details, but something in the culprit commit caught
> my eyes, take altivec_vmrghh as example:
> 
> Before the patch, the pattern
> 
>    [(set (match_operand:V8HI 0 "register_operand" "=v")
>          (vec_select:V8HI
>            (vec_concat:V16HI
>              (match_operand:V8HI 1 "register_operand" "v")
>              (match_operand:V8HI 2 "register_operand" "v"))
>            (parallel [(const_int 0) (const_int 8)
>                       (const_int 1) (const_int 9)
>                       (const_int 2) (const_int 10)
>                       (const_int 3) (const_int 11)])))]
> 
> can match vmrghh on BE while vmrglh on LE. It indicates this pattern has
> different semantic from underlying instruction perspectives.
> 
> After the patch, this pattern only matches vmrghh.
> 
> IMHO, this part has semantic change before and after the patch. The code before
> the patch looks more reasonable to me, since the pattern can have different
> meanings on BE and LE (underlying behavior).

Ideally we would avoid semantic difference of RTL depending on the target.
If that's not avoidable there should be target macros/hooks that specify
the desired semantics.  I assume the semantic difference is in
vec_concat behavior but that's just documented as

@findex vec_concat
@item (vec_concat:@var{m} @var{x1} @var{x2})
Describes a vector concat operation.  The result is a concatenation of the
vectors or scalars @var{x1} and @var{x2}; its length is the sum of the
lengths of the two inputs.

which is a bit unspecific.  To me it implies that
vec_select of a single lane N of the concat result can be distributed
to the operands of the vec_concat in the obvious way (if N >=
GET_MODE_NUNITS (x1) subtract GET_MODE_NUNITS and use x2)

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (22 preceding siblings ...)
  2022-08-03  8:24 ` rguenther at suse dot de
@ 2022-08-03  8:50 ` linkw at gcc dot gnu.org
  2022-08-03  8:59 ` rguenth at gcc dot gnu.org
                   ` (13 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: linkw at gcc dot gnu.org @ 2022-08-03  8:50 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #23 from Kewen Lin <linkw at gcc dot gnu.org> ---
> Ideally we would avoid semantic difference of RTL depending on the target.
> If that's not avoidable there should be target macros/hooks that specify
> the desired semantics.  

Not sure, IMHO it seems it doesn't depend on the target but on endianness
(BYTES_BIG_ENDIAN)? Segher and Mike may have more insights on this.

> I assume the semantic difference is in
> vec_concat behavior but that's just documented as
> 
> @findex vec_concat
> @item (vec_concat:@var{m} @var{x1} @var{x2})
> Describes a vector concat operation.  The result is a concatenation of the
> vectors or scalars @var{x1} and @var{x2}; its length is the sum of the
> lengths of the two inputs.
> 
> which is a bit unspecific.  To me it implies that
> vec_select of a single lane N of the concat result can be distributed
> to the operands of the vec_concat in the obvious way (if N >=
> GET_MODE_NUNITS (x1) subtract GET_MODE_NUNITS and use x2)

Yeah, the documentation isn't clear, and not for vec_select either. I guess
vec_select also matters here: would the indexes for vec_select have the LE
ordering, like the subreg byte offset on LE?

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (23 preceding siblings ...)
  2022-08-03  8:50 ` linkw at gcc dot gnu.org
@ 2022-08-03  8:59 ` rguenth at gcc dot gnu.org
  2022-08-03  9:20 ` rsandifo at gcc dot gnu.org
                   ` (12 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: rguenth at gcc dot gnu.org @ 2022-08-03  8:59 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |rearnsha at gcc dot gnu.org,
                   |                            |rsandifo at gcc dot gnu.org

--- Comment #24 from Richard Biener <rguenth at gcc dot gnu.org> ---
Richards, how is this handled on arm BE vs LE?  We don't have a specific
VECTOR_LANES_BIG_ENDIAN, but we are using BYTES_BIG_ENDIAN already for some
of the VEC_*_{LO,HI}_EXPR tree codes (but IIRC not for anything regarding
VEC_PERM_EXPR, for example, which looks most related to select/concat on RTL).

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (24 preceding siblings ...)
  2022-08-03  8:59 ` rguenth at gcc dot gnu.org
@ 2022-08-03  9:20 ` rsandifo at gcc dot gnu.org
  2022-08-03  9:25 ` rsandifo at gcc dot gnu.org
                   ` (11 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: rsandifo at gcc dot gnu.org @ 2022-08-03  9:20 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #25 from rsandifo at gcc dot gnu.org <rsandifo at gcc dot gnu.org> ---
AIUI the rules are:

- GCC vector lane numbers always correspond to memory array indices.
  For example, lane 0 always comes first in memory.

- On big-endian targets, vector loads and stores are assumed to put the
  first memory element at the most significant end of the vector register.

So lane 0 refers to the most-significant register element on big-endian
targets and to the least-significant register element on little-endian
targets.  So:

  (vec_select:V4SI (reg:V4SI R)
    [(const_int 2) (const_int 6) (const_int 3) (const_int 7)])

describes a different option on big-endian and little-endian but:

  (vec_select:V4SI (mem:V4SI M)
    [(const_int 2) (const_int 6) (const_int 3) (const_int 7)])

is endian-independent.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (25 preceding siblings ...)
  2022-08-03  9:20 ` rsandifo at gcc dot gnu.org
@ 2022-08-03  9:25 ` rsandifo at gcc dot gnu.org
  2022-08-03 18:01 ` segher at gcc dot gnu.org
                   ` (10 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: rsandifo at gcc dot gnu.org @ 2022-08-03  9:25 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #26 from rsandifo at gcc dot gnu.org <rsandifo at gcc dot gnu.org> ---
> describes a different option on big-endian and little-endian

should have said: describes a different instruction.  In other words,
the mapping of gimple to RTL operations is fixed, but the mapping of
those RTL operations to machine instructions varies by endianness
(if registers are involved).

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (26 preceding siblings ...)
  2022-08-03  9:25 ` rsandifo at gcc dot gnu.org
@ 2022-08-03 18:01 ` segher at gcc dot gnu.org
  2022-08-03 18:06 ` segher at gcc dot gnu.org
                   ` (9 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: segher at gcc dot gnu.org @ 2022-08-03 18:01 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #27 from Segher Boessenkool <segher at gcc dot gnu.org> ---
IMO what vec_select calls element 0 is always in the first argument of the
vec_concat it works on, in BE as well as LE.  But yes, this is quite
underdefined in our documentation, and who knows what is actually implemented,
in targets as well as in generic code :-(

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (27 preceding siblings ...)
  2022-08-03 18:01 ` segher at gcc dot gnu.org
@ 2022-08-03 18:06 ` segher at gcc dot gnu.org
  2022-08-04  9:17 ` rsandifo at gcc dot gnu.org
                   ` (8 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: segher at gcc dot gnu.org @ 2022-08-03 18:06 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #28 from Segher Boessenkool <segher at gcc dot gnu.org> ---
(In reply to rsandifo@gcc.gnu.org from comment #25)
> - On big-endian targets, vector loads and stores are assumed to put the
>   first memory element at the most significant end of the vector register.

I agree with everything here, except calling this "most significant".  That
just makes no sense for vectors.  It is element 0, but that is not more
significant than any other element :-)  Vectors aren't integers.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (28 preceding siblings ...)
  2022-08-03 18:06 ` segher at gcc dot gnu.org
@ 2022-08-04  9:17 ` rsandifo at gcc dot gnu.org
  2022-08-04  9:21 ` rearnsha at gcc dot gnu.org
                   ` (7 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: rsandifo at gcc dot gnu.org @ 2022-08-04  9:17 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #29 from rsandifo at gcc dot gnu.org <rsandifo at gcc dot gnu.org> ---
(In reply to Segher Boessenkool from comment #28)
> (In reply to rsandifo@gcc.gnu.org from comment #25)
> > - On big-endian targets, vector loads and stores are assumed to put the
> >   first memory element at the most significant end of the vector register.
> 
> I agree with everything here, except calling this "most significant".  That
> just makes no sense for vectors.  It is element 0, but that is not more
> significant than any other element :-)  Vectors aren't integers.
Ah, yeah, I should have said “most significant end of the vector register
if the vector register is viewed as a single integer” (which is an important
difference).  The point here was that:

(a) by default, TI subregs of V4SI registers are assumed to be nops
    (and vice versa)

(b) consequently, TImode loads are assumed to perform the same operation
    as V4SI loads

So endianness assumptions for single integers carry over to like-sized
vector modes.  On big-endian, the first element of the V4SI is assumed
to line up with the top 32 bits of a 128-bit integer.

Although it's possible to force subregs not to be nops (via
targetm.can_change_mode_class), that only really affects changes
to specific hard register classes.  The generic rules still apply
to the layout of pseudos.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (29 preceding siblings ...)
  2022-08-04  9:17 ` rsandifo at gcc dot gnu.org
@ 2022-08-04  9:21 ` rearnsha at gcc dot gnu.org
  2022-08-04  9:59 ` yinyuefengyi at gmail dot com
                   ` (6 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: rearnsha at gcc dot gnu.org @ 2022-08-04  9:21 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #30 from Richard Earnshaw <rearnsha at gcc dot gnu.org> ---
(In reply to rsandifo@gcc.gnu.org from comment #29)
> (In reply to Segher Boessenkool from comment #28)
> > (In reply to rsandifo@gcc.gnu.org from comment #25)
> > > - On big-endian targets, vector loads and stores are assumed to put the
> > >   first memory element at the most significant end of the vector register.
> > 
> > I agree with everything here, except calling this "most significant".  That
> > just makes no sense for vectors.  It is element 0, but that is not more
> > significant than any other element :-)  Vectors aren't integers.
> Ah, yeah, I should have said “most significant end of the vector register
> if the vector register is viewed as a single integer” (which is an important
> difference).  The point here was that:
> 
> (a) by default, TI subregs of V4SI registers are assumed to be nops
>     (and vice versa)
> 
> (b) consequently, TImode loads are assumed to perform the same operation
>     as V4SI loads
> 
> So endianness assumptions for single integers carry over to like-sized
> vector modes.  On big-endian, the first element of the V4SI is assumed
> to line up with the top 32 bits of a 128-bit integer.
> 
> Although it's possible to force subregs not to be nops (via
> targetm.can_change_mode_class), that only really affects changes
> to specific hard register classes.  The generic rules still apply
> to the layout of pseudos.

It's not just about when the vector is viewed as an integer.  There's also the
case when a vector of NxM is viewed as N/2xM*2 or vice versa, or even for other
powers of two.  We've tended to call the lane ordering 'little-endian' if the
lower numbered lane is in the least significant bits of a wider element size,
and 'big-endian' if it's in the most significant bits.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (30 preceding siblings ...)
  2022-08-04  9:21 ` rearnsha at gcc dot gnu.org
@ 2022-08-04  9:59 ` yinyuefengyi at gmail dot com
  2022-08-04 10:01 ` yinyuefengyi at gmail dot com
                   ` (5 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: yinyuefengyi at gmail dot com @ 2022-08-04  9:59 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #31 from Xionghu Luo (luoxhu at gcc dot gnu.org) <yinyuefengyi at gmail dot com> ---
Created attachment 53408
  --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=53408&action=edit
0001-rs6000-Fix-incorrect-RTL-for-Power-LE-when-removing-

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (31 preceding siblings ...)
  2022-08-04  9:59 ` yinyuefengyi at gmail dot com
@ 2022-08-04 10:01 ` yinyuefengyi at gmail dot com
  2023-01-16 18:00 ` jakub at gcc dot gnu.org
                   ` (4 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: yinyuefengyi at gmail dot com @ 2022-08-04 10:01 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #32 from Xionghu Luo (luoxhu at gcc dot gnu.org) <yinyuefengyi at gmail dot com> ---
Thanks for all the information! It made me realize that "native RTL should be
endian-independent". So both big-endian and little-endian platforms should
generate the same (vec_select (vec_concat (R0 R1) [0 4 1 5])) for altivec_vmrghw,
so that the combine pass can do the correct "nested vec_select" optimization;
the endianness check is left to ASM generation at the end, which is the benefit
of removing the UNSPECs.  My culprit patch did change the LE representation,
sorry for the stupid mistake...

I have attached the fix patch.  If it is reasonable, I will continue to refine
it and send it to the mailing list.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (32 preceding siblings ...)
  2022-08-04 10:01 ` yinyuefengyi at gmail dot com
@ 2023-01-16 18:00 ` jakub at gcc dot gnu.org
  2023-03-24 16:16 ` jakub at gcc dot gnu.org
                   ` (3 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: jakub at gcc dot gnu.org @ 2023-01-16 18:00 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

Jakub Jelinek <jakub at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |jakub at gcc dot gnu.org

--- Comment #34 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
What is the state of this PR?  I see patches posted in August, but don't see
anything committed...

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (33 preceding siblings ...)
  2023-01-16 18:00 ` jakub at gcc dot gnu.org
@ 2023-03-24 16:16 ` jakub at gcc dot gnu.org
  2023-03-24 17:01 ` bergner at gcc dot gnu.org
                   ` (2 subsequent siblings)
  37 siblings, 0 replies; 39+ messages in thread
From: jakub at gcc dot gnu.org @ 2023-03-24 16:16 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #35 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
Ping again.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (34 preceding siblings ...)
  2023-03-24 16:16 ` jakub at gcc dot gnu.org
@ 2023-03-24 17:01 ` bergner at gcc dot gnu.org
  2023-03-31  2:57 ` yinyuefengyi at gmail dot com
  2023-05-08 12:24 ` [Bug target/106069] [12/13/14 " rguenth at gcc dot gnu.org
  37 siblings, 0 replies; 39+ messages in thread
From: bergner at gcc dot gnu.org @ 2023-03-24 17:01 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #36 from Peter Bergner <bergner at gcc dot gnu.org> ---
(In reply to Jakub Jelinek from comment #34)
> What is the state of this PR?  I see patches posted in August, but don't see
> anything committed...

I've seen some patch submissions and pings in February and it looks like it
just needs a review.  I'll work with the team to get this reviewed.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (35 preceding siblings ...)
  2023-03-24 17:01 ` bergner at gcc dot gnu.org
@ 2023-03-31  2:57 ` yinyuefengyi at gmail dot com
  2023-05-08 12:24 ` [Bug target/106069] [12/13/14 " rguenth at gcc dot gnu.org
  37 siblings, 0 replies; 39+ messages in thread
From: yinyuefengyi at gmail dot com @ 2023-03-31  2:57 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

--- Comment #37 from Xionghu Luo (luoxhu at gcc dot gnu.org) <yinyuefengyi at gmail dot com> ---
https://gcc.gnu.org/pipermail/gcc-patches/2023-March/614932.html

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [Bug target/106069] [12/13/14 Regression] wrong code with -O -fno-tree-forwprop -maltivec on ppc64le
  2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
                   ` (36 preceding siblings ...)
  2023-03-31  2:57 ` yinyuefengyi at gmail dot com
@ 2023-05-08 12:24 ` rguenth at gcc dot gnu.org
  37 siblings, 0 replies; 39+ messages in thread
From: rguenth at gcc dot gnu.org @ 2023-05-08 12:24 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=106069

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
   Target Milestone|12.3                        |12.4

--- Comment #38 from Richard Biener <rguenth at gcc dot gnu.org> ---
GCC 12.3 is being released, retargeting bugs to GCC 12.4.

^ permalink raw reply	[flat|nested] 39+ messages in thread

end of thread, other threads:[~2023-05-08 12:24 UTC | newest]

Thread overview: 39+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-06-23 23:13 [Bug target/106069] New: wrong code with -O -fno-tree-forwprop -maltivec on ppc64le mpolacek at gcc dot gnu.org
2022-06-23 23:15 ` [Bug target/106069] [12/13 Regression] " mpolacek at gcc dot gnu.org
2022-06-23 23:18 ` mpolacek at gcc dot gnu.org
2022-06-24  3:25 ` luoxhu at gcc dot gnu.org
2022-06-24 13:03 ` mpolacek at gcc dot gnu.org
2022-06-30  8:13 ` luoxhu at gcc dot gnu.org
2022-06-30  8:15 ` luoxhu at gcc dot gnu.org
2022-06-30 17:32 ` segher at gcc dot gnu.org
2022-06-30 17:34 ` segher at gcc dot gnu.org
2022-07-01  1:52 ` luoxhu at gcc dot gnu.org
2022-07-25 15:54 ` rguenth at gcc dot gnu.org
2022-07-25 20:16 ` segher at gcc dot gnu.org
2022-07-25 20:18 ` segher at gcc dot gnu.org
2022-07-26  3:34 ` luoxhu at gcc dot gnu.org
2022-07-26  3:34 ` luoxhu at gcc dot gnu.org
2022-07-26  3:35 ` luoxhu at gcc dot gnu.org
2022-07-26  3:53 ` luoxhu at gcc dot gnu.org
2022-07-26  6:28 ` luoxhu at gcc dot gnu.org
2022-07-29 11:10 ` rguenth at gcc dot gnu.org
2022-07-29 11:21 ` rguenth at gcc dot gnu.org
2022-08-03  6:10 ` yinyuefengyi at gmail dot com
2022-08-03  6:38 ` yinyuefengyi at gmail dot com
2022-08-03  8:06 ` linkw at gcc dot gnu.org
2022-08-03  8:24 ` rguenther at suse dot de
2022-08-03  8:50 ` linkw at gcc dot gnu.org
2022-08-03  8:59 ` rguenth at gcc dot gnu.org
2022-08-03  9:20 ` rsandifo at gcc dot gnu.org
2022-08-03  9:25 ` rsandifo at gcc dot gnu.org
2022-08-03 18:01 ` segher at gcc dot gnu.org
2022-08-03 18:06 ` segher at gcc dot gnu.org
2022-08-04  9:17 ` rsandifo at gcc dot gnu.org
2022-08-04  9:21 ` rearnsha at gcc dot gnu.org
2022-08-04  9:59 ` yinyuefengyi at gmail dot com
2022-08-04 10:01 ` yinyuefengyi at gmail dot com
2023-01-16 18:00 ` jakub at gcc dot gnu.org
2023-03-24 16:16 ` jakub at gcc dot gnu.org
2023-03-24 17:01 ` bergner at gcc dot gnu.org
2023-03-31  2:57 ` yinyuefengyi at gmail dot com
2023-05-08 12:24 ` [Bug target/106069] [12/13/14 " rguenth at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).