[Bug target/33717] slow code generated for 64-bit arithmetic

public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed

* [Bug target/33717] slow code generated for 64-bit arithmetic
       [not found] <bug-33717-4@http.gcc.gnu.org/bugzilla/>
@ 2021-12-26 21:44 ` pinskia at gcc dot gnu.org
  2021-12-26 21:58 ` pinskia at gcc dot gnu.org
  1 sibling, 0 replies; 5+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-12-26 21:44 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=33717

--- Comment #5 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
#include <stdio.h>
#include <assert.h>

/* Execute RDTSC and store the value it leaves in EAX (the "=a" output)
   into `low`, which must be an lvalue.  RDTSC also writes EDX; that half
   is not captured, so EDX is listed as a clobber.  Only the EAX half of
   the counter is read, so differences are meaningful for short intervals
   only. */
#define rdtscl(low) \
     __asm__ __volatile__ ("rdtsc" : "=a" (low) : : "edx")

/*
 * Micro-benchmark: add two 100-word little-endian multi-precision numbers,
 * once with a plain C loop that carries through a 64-bit accumulator and
 * once with a hand-written add/adc chain, printing the RDTSC delta for each.
 *
 * x = 1, y = 2^3200 - 1 (all words 0xffffffff), so every result word must
 * be 0 and the final carry must be 1.
 *
 * The inline asm names %esi/%edi/%ebx/%ecx directly as address registers,
 * so this program is only meaningful when built for 32-bit x86.
 *
 * Returns 0 on success; aborts via assert() if a result is wrong.
 */
int main(void) {
  unsigned int x[100];
  unsigned int y[100];
  unsigned int z[100];

  long a,b;
  size_t i;
  int idx;

  unsigned long long l;

  for (i=0; i<100; ++i) {
    x[i]=0; y[i]=-1;
    /* Poison the output: the expected result is all zeros, so a non-zero
       start value makes the check below able to fail. */
    z[i]=1;
  }
  x[0]=1;

  /* --- C version (the loop whose code generation is being measured) --- */
  rdtscl(a);
  l=0;
  for (i=0; i<100; ++i) {
    l += (unsigned long long)x[i] + y[i];
    z[i]=l;
    l>>=32;
  }
  rdtscl(b);
  printf("C: %ld cycles\n",b-a);

  for (i=0; i<100; ++i)
    assert(z[i]==0);
  assert(l==1);

  /* Re-poison the output (outside the timed region) so the asm pass is
     verified on its own stores rather than on what the C pass left. */
  for (i=0; i<100; ++i)
    z[i]=1;

  /* --- hand-written version --- */
  /* ECX runs from -400 up to 0 as a byte offset from the END of each array;
     the loop terminates when jecxz sees it reach zero.  lea and jecxz are
     used for the increment and the exit test because neither modifies the
     carry flag that adc consumes on the next iteration. */
  idx=-400;
  rdtscl(a);
  __asm__ __volatile__(
    "mov (%%esi,%%ecx),%%eax\n"
    "add (%%edi,%%ecx),%%eax\n"
    "mov %%eax,(%%ebx,%%ecx)\n"
    "1:\n"
    "lea 4(%%ecx),%%ecx\n"
    "jecxz 1f\n"
    "mov (%%esi,%%ecx),%%eax\n"
    "adc (%%edi,%%ecx),%%eax\n"
    "mov %%eax,(%%ebx,%%ecx)\n"
    "jmp 1b\n"
    "1:\n"
    /* ECX is modified by the template, so it must be a read-write operand
       rather than an input.  "memory" tells the compiler that x/y are read
       and z is written behind its back; "cc" records the flag updates. */
    : "+c" (idx)
    : "S" (x+100), "D" (y+100), "b" (z+100)
    : "eax", "memory", "cc" );

  rdtscl(b);
  printf("asm: %ld cycles\n",b-a);

  for (i=0; i<100; ++i)
    assert(z[i]==0);
  /* The asm does not export its final carry, so this only re-checks the
     value left in l by the C pass. */
  assert(l==1);

  return 0;
}

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug target/33717] slow code generated for 64-bit arithmetic
       [not found] <bug-33717-4@http.gcc.gnu.org/bugzilla/>
  2021-12-26 21:44 ` [Bug target/33717] slow code generated for 64-bit arithmetic pinskia at gcc dot gnu.org
@ 2021-12-26 21:58 ` pinskia at gcc dot gnu.org
  1 sibling, 0 replies; 5+ messages in thread
From: pinskia at gcc dot gnu.org @ 2021-12-26 21:58 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=33717

--- Comment #6 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
GCC does better now since GCC 10:
.L2:
        movl    (%ebx,%ecx,4), %eax
        xorl    %edx, %edx
        addl    $-1, %eax
        adcl    $0, %edx
        addl    %eax, %esi
        adcl    %edx, %edi
        movl    %esi, -424(%ebp,%ecx,4)
        addl    $1, %ecx
        movl    %edi, %esi
        xorl    %edi, %edi
        cmpl    $100, %ecx
        jne     .L2

But still not as good as ICC:
..B1.4:                         # Preds ..B1.4 ..B1.3
                                # Execution count [5.00e+01]
        addl      32(%esp,%eax,8), %ebx                         #25.5
        movl      %esi, %edx                                    #25.5
        adcl      $0, %edx                                      #25.5
        addl      432(%esp,%eax,8), %ebx                        #25.37
        movl      %ebx, 832(%esp,%eax,8)                        #26.5
        movl      %esi, %ebx                                    #25.5
        adcl      $0, %edx                                      #25.37
        addl      36(%esp,%eax,8), %edx                         #25.5
        adcl      $0, %ebx                                      #25.5
        addl      436(%esp,%eax,8), %edx                        #25.37
        movl      %edx, 836(%esp,%eax,8)                        #26.5
        adcl      $0, %ebx                                      #25.37
        incl      %eax                                          #24.3
        cmpl      $50, %eax                                     #24.3
        jb        ..B1.4        # Prob 98%


LLVM just falls over:
.LBB0_1:                                # =>This Inner Loop Header: Depth=1
        xorl    %ebx, %ebx
        addl    808(%esp,%edx,4), %eax
        setb    %bl
        addl    408(%esp,%edx,4), %eax
        adcl    $0, %ebx
        movl    %eax, 8(%esp,%edx,4)
        cmpl    $100, %edx
        je      .LBB0_3
# %bb.2:                                #   in Loop: Header=BB0_1 Depth=1
        xorl    %eax, %eax
        addl    812(%esp,%edx,4), %ebx
        setb    %al
        addl    412(%esp,%edx,4), %ebx
        adcl    $0, %eax
        movl    %ebx, 12(%esp,%edx,4)
        addl    $2, %edx
        jmp     .LBB0_1
.LBB0_3:

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug target/33717] slow code generated for 64-bit arithmetic
  2007-10-09 16:53 [Bug rtl-optimization/33717] New: " felix-gcc at fefe dot de
  2008-12-31 18:38 ` [Bug target/33717] " pinskia at gcc dot gnu dot org
  2008-12-31 18:41 ` pinskia at gcc dot gnu dot org
@ 2009-01-01 17:37 ` ubizjak at gmail dot com
  2 siblings, 0 replies; 5+ messages in thread
From: ubizjak at gmail dot com @ 2009-01-01 17:37 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #4 from ubizjak at gmail dot com  2009-01-01 17:35 -------
(In reply to comment #3)

> Most likely addsi3_carry should accept 0 as one of the operands.

It does:

(define_insn "addsi3_carry"
  [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r")
          (plus:SI (plus:SI (match_operand:SI 3 "ix86_carry_flag_operator" "")
                            (match_operand:SI 1 "nonimmediate_operand" "%0,0"))
                   (match_operand:SI 2 "general_operand" "ri,rm")))
   (clobber (reg:CC FLAGS_REG))]


It looks to me like cprop_hardreg is the pass that should handle this case; at
least this sequence should be handled (to propagate cx):

(insn 74 50 52 3 pr33717.c:12 (parallel [
            (set (reg:SI 2 cx [+4 ])
                (const_int 0 [0x0]))
            (clobber (reg:CC 17 flags))
        ]) 45 {*movsi_xor} (nil))


(insn 53 52 54 3 pr33717.c:12 (parallel [
            (set (reg:SI 4 si [+4 ])
                (plus:SI (plus:SI (ltu:SI (reg:CC 17 flags)
                            (const_int 0 [0x0]))
                        (reg:SI 4 si [+4 ]))
                    (reg:SI 2 cx [+4 ])))
            (clobber (reg:CC 17 flags))
        ]) 266 {addsi3_carry} (expr_list:REG_DEAD (reg:CC 17 flags)
        (expr_list:REG_DEAD (reg:SI 2 cx [+4 ])
            (expr_list:REG_UNUSED (reg:CC 17 flags)
                (nil)))))


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33717


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug target/33717] slow code generated for 64-bit arithmetic
  2007-10-09 16:53 [Bug rtl-optimization/33717] New: " felix-gcc at fefe dot de
  2008-12-31 18:38 ` [Bug target/33717] " pinskia at gcc dot gnu dot org
@ 2008-12-31 18:41 ` pinskia at gcc dot gnu dot org
  2009-01-01 17:37 ` ubizjak at gmail dot com
  2 siblings, 0 replies; 5+ messages in thread
From: pinskia at gcc dot gnu dot org @ 2008-12-31 18:41 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #3 from pinskia at gcc dot gnu dot org  2008-12-31 18:39 -------
GCC does not produce "adcl      $0" which is where the extra xors come from.


Most likely addsi3_carry should accept 0 as one of the operands.


-- 

pinskia at gcc dot gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |pinskia at gcc dot gnu dot
                   |                            |org
  GCC build triplet|i386-pc-linux-gnu           |
   GCC host triplet|i386-pc-linux-gnu           |
 GCC target triplet|i386-pc-linux-gnu           |i?86-*-*


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33717


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Bug target/33717] slow code generated for 64-bit arithmetic
  2007-10-09 16:53 [Bug rtl-optimization/33717] New: " felix-gcc at fefe dot de
@ 2008-12-31 18:38 ` pinskia at gcc dot gnu dot org
  2008-12-31 18:41 ` pinskia at gcc dot gnu dot org
  2009-01-01 17:37 ` ubizjak at gmail dot com
  2 siblings, 0 replies; 5+ messages in thread
From: pinskia at gcc dot gnu dot org @ 2008-12-31 18:38 UTC (permalink / raw)
  To: gcc-bugs



------- Comment #2 from pinskia at gcc dot gnu dot org  2008-12-31 18:37 -------
4.4 with the new register allocator (which is turned on by default):
C: 522 cycles
asm: 342 cycles

4.4 with the old one:
C: 749 cycles
asm: 344 cycles


So 4.4 is much better, but it still emits extra instructions; that is a target
issue now.


-- 

pinskia at gcc dot gnu dot org changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|UNCONFIRMED                 |NEW
          Component|rtl-optimization            |target
     Ever Confirmed|0                           |1
           Keywords|ra                          |
   Last reconfirmed|0000-00-00 00:00:00         |2008-12-31 18:37:00
               date|                            |


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33717


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2021-12-26 21:58 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <bug-33717-4@http.gcc.gnu.org/bugzilla/>
2021-12-26 21:44 ` [Bug target/33717] slow code generated for 64-bit arithmetic pinskia at gcc dot gnu.org
2021-12-26 21:58 ` pinskia at gcc dot gnu.org
2007-10-09 16:53 [Bug rtl-optimization/33717] New: " felix-gcc at fefe dot de
2008-12-31 18:38 ` [Bug target/33717] " pinskia at gcc dot gnu dot org
2008-12-31 18:41 ` pinskia at gcc dot gnu dot org
2009-01-01 17:37 ` ubizjak at gmail dot com

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).