[Bug rtl-optimization/61722] New: [ 4.9 ] gcc sometimes does not optimise movaps with movups

public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed

* [Bug rtl-optimization/61722] New: [ 4.9 ] gcc sometimes does not optimise movaps with movups
@ 2014-07-05 15:57 pietrek.j at gmail dot com
  2021-08-06  4:22 ` [Bug tree-optimization/61722] " pinskia at gcc dot gnu.org
  0 siblings, 1 reply; 2+ messages in thread
From: pietrek.j at gmail dot com @ 2014-07-05 15:57 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61722

            Bug ID: 61722
           Summary: [ 4.9 ] gcc sometimes does not optimise movaps with
                    movups
           Product: gcc
           Version: 4.9.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: pietrek.j at gmail dot com

I have two functions that store an __m128 value with unaligned moves (the
movups instruction).
The first one is optimized well, but in the second function gcc does not
eliminate the redundant movaps loads inside the while loop.
Code:
/* 16-byte vector of float (GCC vector extension); __may_alias__ allows
   accesses through this type to alias objects of other types. */
typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));

/* Baseline case: v is received directly as a vector parameter.
   Each iteration stores v to two consecutive __m128 slots with
   __builtin_ia32_storeups, advancing dst after every store, so the loop
   writes 2 * count slots in total.  Nothing is returned. */
void __test_fill_1( __m128 *dst, __m128 v, int count )
{
    while ( count-- )
       {
           __builtin_ia32_storeups((float*)(dst++),v);
           __builtin_ia32_storeups((float*)(dst++),v);
       }
}

/* Problem case: same store loop as __test_fill_1, but v is a local vector
   built from the scalar _v instead of being passed in as a vector. */
void __test_fill_2( __m128 *dst, long long _v, int count )
{
    __m128 v;
    /* Fill v by viewing it as two long long elements and writing _v to both.
       This pointer-cast initialisation is the construct under test; the
       listing in the report shows v kept in memory and reloaded with movaps. */
    ((long long*)&v)[0]=((long long*)&v)[1]=_v;
    while ( count-- )
       {
           /* Two stores of the same v per iteration, dst advancing each time. */
           __builtin_ia32_storeups((float*)(dst++),v);
           __builtin_ia32_storeups((float*)(dst++),v);
       }
}

Compilation:
$ gcc -O3 test_fill.c -o test_fill -S            
$ cat test_fill
        .file   "test_fill.c"
        .section        .text.unlikely,"ax",@progbits
.LCOLDB0:
        .text
.LHOTB0:
        .p2align 4,,15
        .globl  __test_fill_1
        .type   __test_fill_1, @function
__test_fill_1: <------------ first function, optimisation works well here
.LFB0:
        .cfi_startproc
        testl   %esi, %esi
        je      .L1
        subl    $1, %esi
        leaq    16(%rdi), %rax
        salq    $5, %rsi
        leaq    48(%rdi,%rsi), %rdx
        .p2align 4,,10
        .p2align 3
.L3:              v----------------------- well-optimized while loop
        movups  %xmm0, -16(%rax)
        addq    $32, %rax
        movups  %xmm0, -32(%rax)
        cmpq    %rdx, %rax
        jne     .L3
.L1:
        rep ret
        .cfi_endproc
.LFE0:
        .size   __test_fill_1, .-__test_fill_1
        .section        .text.unlikely
.LCOLDE0:
        .text
.LHOTE0:
        .section        .text.unlikely
.LCOLDB1:
        .text
.LHOTB1:
        .p2align 4,,15
        .globl  __test_fill_2
        .type   __test_fill_2, @function
__test_fill_2: <------------ second function, problem with optimizing while loop
.LFB1:
        .cfi_startproc
        testl   %edx, %edx
        movq    %rsi, -16(%rsp)
        movq    %rsi, -24(%rsp)
        je      .L7
        subl    $1, %edx
        leaq    16(%rdi), %rax
        salq    $5, %rdx
        leaq    48(%rdi,%rdx), %rdx
        .p2align 4,,10
        .p2align 3
.L9:
        movaps  -24(%rsp), %xmm0 <-------- why movaps here?
        addq    $32, %rax
        movups  %xmm0, -48(%rax)
        movaps  -24(%rsp), %xmm1 <-------- why movaps here?
        movups  %xmm1, -32(%rax)
        cmpq    %rdx, %rax
        jne     .L9
.L7:
        rep ret
        .cfi_endproc
.LFE1:
        .size   __test_fill_2, .-__test_fill_2
        .section        .text.unlikely
.LCOLDE1:
        .text
.LHOTE1:
        .ident  "GCC: (GNU) 4.9.0 20140604 (prerelease)"
        .section        .note.GNU-stack,"",@progbits


^ permalink raw reply	[flat|nested] 2+ messages in thread

* [Bug tree-optimization/61722] [ 4.9 ] gcc sometimes does not optimise movaps with movups
  2014-07-05 15:57 [Bug rtl-optimization/61722] New: [ 4.9 ] gcc sometimes does not optimise movaps with movups pietrek.j at gmail dot com
@ 2021-08-06  4:22 ` pinskia at gcc dot gnu.org
  0 siblings, 0 replies; 2+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-08-06  4:22 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61722

Andrew Pinski <pinskia at gcc dot gnu.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |NEW
           Keywords|                            |missed-optimization
   Last reconfirmed|                            |2021-08-06
     Ever confirmed|0                           |1
                 CC|                            |pinskia at gcc dot gnu.org
          Component|rtl-optimization            |tree-optimization
           Severity|normal                      |enhancement

--- Comment #1 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
This comes from doing this:

/* Reduced test case: 16-byte float vector type.
   NOTE(review): unlike the typedef in the original report, this one has no
   __may_alias__, so the long long accesses below may not be strict-aliasing
   clean -- confirm before reusing this snippet elsewhere. */
typedef float m128 __attribute__ ((__vector_size__ (16)));
/* Builds a vector by writing _v into both long long elements of a local
   vector through a pointer cast, then returns that vector by value. */
m128 a(long long _v )
{
    /* The trailing comment on the declaration is a disabled initialiser;
       v is left uninitialised until the assignment below. */
    m128 v;//={0,0};
    ((long long*)&v)[0]=((long long*)&v)[1]=_v;
  return v;
}

v is still forced on the stack.

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2021-08-06  4:22 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-07-05 15:57 [Bug rtl-optimization/61722] New: [ 4.9 ] gcc sometimes does not optimise movaps with movups pietrek.j at gmail dot com
2021-08-06  4:22 ` [Bug tree-optimization/61722] " pinskia at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).