public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug tree-optimization/61338] New: too many permutation in a vectorized "reverse loop"
@ 2014-05-28  9:12 vincenzo.innocente at cern dot ch
  2014-05-28  9:19 ` [Bug tree-optimization/61338] " vincenzo.innocente at cern dot ch
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: vincenzo.innocente at cern dot ch @ 2014-05-28  9:12 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61338

            Bug ID: 61338
           Summary: too many permutation in a vectorized "reverse loop"
           Product: gcc
           Version: 4.9.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: vincenzo.innocente at cern dot ch

In this example gcc generates 4 permutations for foo (while none is required).
On the positive side, the code for bar (which is a more realistic use case)
seems optimal.

float x[1024];
float y[1024];
float z[1024];

void foo() {
  for (int i=0; i<512; ++i)
    x[1023-i] += y[1023-i]*z[512-i];
}


void bar() {
  for (int i=0; i<512; ++i)
    x[1023-i] += y[i]*z[i+512];
}

c++ -Ofast -march=haswell -S revloop.cc; cat revloop.s

__Z3foov:
LFB0:
    vmovdqa    LC0(%rip), %ymm2
    xorl    %eax, %eax
    leaq    4064+_x(%rip), %rdx
    leaq    4064+_y(%rip), %rsi
    leaq    2020+_z(%rip), %rcx
    .align 4,0x90
L2:
    vpermd    (%rdx,%rax), %ymm2, %ymm0
    vpermd    (%rcx,%rax), %ymm2, %ymm1
    vpermd    (%rsi,%rax), %ymm2, %ymm3
    vfmadd231ps    %ymm1, %ymm3, %ymm0
    vpermd    %ymm0, %ymm2, %ymm0
    vmovaps    %ymm0, (%rdx,%rax)
    subq    $32, %rax
    cmpq    $-2048, %rax
    jne    L2
    vzeroupper
    ret
LFE0:
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDE1:
    .text
LHOTE1:
    .section __TEXT,__text_cold,regular,pure_instructions
LCOLDB2:
    .text
LHOTB2:
    .align 4,0x90
    .globl __Z3barv
__Z3barv:
LFB1:
    vmovdqa    LC0(%rip), %ymm1
    leaq    2048+_z(%rip), %rdx
    leaq    _y(%rip), %rcx
    leaq    4064+_x(%rip), %rax
    leaq    4096+_z(%rip), %rsi
    .align 4,0x90
L6:
    vmovaps    (%rdx), %ymm2
    addq    $32, %rdx
    vpermd    (%rax), %ymm1, %ymm0
    addq    $32, %rcx
    vfmadd231ps    -32(%rcx), %ymm2, %ymm0
    subq    $32, %rax
    vpermd    %ymm0, %ymm1, %ymm0
    vmovaps    %ymm0, 32(%rax)
    cmpq    %rsi, %rdx
    jne    L6
    vzeroupper
    ret
LFE1:


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug tree-optimization/61338] too many permutation in a vectorized "reverse loop"
  2014-05-28  9:12 [Bug tree-optimization/61338] New: too many permutation in a vectorized "reverse loop" vincenzo.innocente at cern dot ch
@ 2014-05-28  9:19 ` vincenzo.innocente at cern dot ch
  2014-05-28 10:33 ` rguenth at gcc dot gnu.org
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: vincenzo.innocente at cern dot ch @ 2014-05-28  9:19 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61338

--- Comment #1 from vincenzo Innocente <vincenzo.innocente at cern dot ch> ---
If I write the loop in "reverse":
void foo2() {
  for (int i=511; i>=0; --i)
    x[1023-i] += y[1023-i]*z[512-i];
}

it's OK:
__Z4foo2v:
LFB1:
    leaq    2048+_x(%rip), %rdx
    xorl    %eax, %eax
    leaq    4+_z(%rip), %rsi
    leaq    2048+_y(%rip), %rcx
    .align 4,0x90
L6:
    vmovaps    (%rdx,%rax), %ymm1
    vmovups    (%rsi,%rax), %ymm0
    vfmadd132ps    (%rcx,%rax), %ymm1, %ymm0
    vmovaps    %ymm0, (%rdx,%rax)
    addq    $32, %rax
    cmpq    $2048, %rax
    jne    L6
    vzeroupper
    ret


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug tree-optimization/61338] too many permutation in a vectorized "reverse loop"
  2014-05-28  9:12 [Bug tree-optimization/61338] New: too many permutation in a vectorized "reverse loop" vincenzo.innocente at cern dot ch
  2014-05-28  9:19 ` [Bug tree-optimization/61338] " vincenzo.innocente at cern dot ch
@ 2014-05-28 10:33 ` rguenth at gcc dot gnu.org
  2020-03-16  8:49 ` glisse at gcc dot gnu.org
  2023-12-07  7:29 ` pinskia at gcc dot gnu.org
  3 siblings, 0 replies; 5+ messages in thread
From: rguenth at gcc dot gnu.org @ 2014-05-28 10:33 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61338

Richard Biener <rguenth at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
           Keywords|                            |missed-optimization
             Status|UNCONFIRMED                 |NEW
   Last reconfirmed|                            |2014-05-28
             Blocks|                            |53947
     Ever confirmed|0                           |1
           Severity|normal                      |enhancement

--- Comment #2 from Richard Biener <rguenth at gcc dot gnu.org> ---
Confirmed.  We fail to detect that all DRs (data references) are accessed
"reverse", which is the case where we can drop the permutes.  We also fail to
reverse the positive vectors if they happen to be lower in number:

float x[1024];
float y[1024];
float z[1024];

void foo() {
    for (int i=0; i<512; ++i)
      x[i] += y[1023-i]*z[512-i];
}

produces

.L2:
        vpermd  (%rdx), %ymm1, %ymm0
        subq    $32, %rdx
        vpermd  (%rcx), %ymm1, %ymm2
        addq    $32, %rax
        vfmadd213ps     -32(%rax), %ymm2, %ymm0
        subq    $32, %rcx
        vmovaps %ymm0, -32(%rax)
        cmpq    $z-28, %rdx
        jne     .L2

instead of permuting the result before storing it.


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug tree-optimization/61338] too many permutation in a vectorized "reverse loop"
  2014-05-28  9:12 [Bug tree-optimization/61338] New: too many permutation in a vectorized "reverse loop" vincenzo.innocente at cern dot ch
  2014-05-28  9:19 ` [Bug tree-optimization/61338] " vincenzo.innocente at cern dot ch
  2014-05-28 10:33 ` rguenth at gcc dot gnu.org
@ 2020-03-16  8:49 ` glisse at gcc dot gnu.org
  2023-12-07  7:29 ` pinskia at gcc dot gnu.org
  3 siblings, 0 replies; 5+ messages in thread
From: glisse at gcc dot gnu.org @ 2020-03-16  8:49 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61338

--- Comment #3 from Marc Glisse <glisse at gcc dot gnu.org> ---
Possibly easier is the case of a reduction, where permutations are clearly
irrelevant.

int f(int*arr,int size){
  int sum=0;
  for(int i = 0; i < size; i++){
    sum += arr[size-1-i];
  }
  return sum;
}

We still have a VEC_PERM_EXPR in the hot loop before accumulating.

(by the way, we accumulate in a variable of type "vector(4) int", while I would
expect "vector(4) unsigned int" for overflow reasons)

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug tree-optimization/61338] too many permutation in a vectorized "reverse loop"
  2014-05-28  9:12 [Bug tree-optimization/61338] New: too many permutation in a vectorized "reverse loop" vincenzo.innocente at cern dot ch
                   ` (2 preceding siblings ...)
  2020-03-16  8:49 ` glisse at gcc dot gnu.org
@ 2023-12-07  7:29 ` pinskia at gcc dot gnu.org
  3 siblings, 0 replies; 5+ messages in thread
From: pinskia at gcc dot gnu.org @ 2023-12-07  7:29 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61338

Andrew Pinski <pinskia at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |pinskia at gcc dot gnu.org

--- Comment #4 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
*** Bug 112892 has been marked as a duplicate of this bug. ***

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2023-12-07  7:29 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-05-28  9:12 [Bug tree-optimization/61338] New: too many permutation in a vectorized "reverse loop" vincenzo.innocente at cern dot ch
2014-05-28  9:19 ` [Bug tree-optimization/61338] " vincenzo.innocente at cern dot ch
2014-05-28 10:33 ` rguenth at gcc dot gnu.org
2020-03-16  8:49 ` glisse at gcc dot gnu.org
2023-12-07  7:29 ` pinskia at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).