public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
* [Bug middle-end/99634] New: s2102 benchmarks of TSVC is vectorized better by icc than gcc
@ 2021-03-17 18:49 hubicka at gcc dot gnu.org
  2021-03-18  9:03 ` [Bug middle-end/99634] s2102 benchmarks of TSVC is vectorized better by icc than gcc, interchange is missing rguenth at gcc dot gnu.org
  2023-01-11 22:28 ` hubicka at gcc dot gnu.org
  0 siblings, 2 replies; 3+ messages in thread
From: hubicka at gcc dot gnu.org @ 2021-03-17 18:49 UTC (permalink / raw)
  To: gcc-bugs

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99634

            Bug ID: 99634
           Summary: s2102 benchmarks of TSVC is vectorized better by icc
                    than gcc
           Product: gcc
           Version: 11.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: hubicka at gcc dot gnu.org
  Target Milestone: ---

// Element type used by every array declared below.
typedef float real_t;

// iterations and LEN_2D together determine the repeat count of the outer
// loop in main() (100*(iterations/LEN_2D)).
#define iterations 100000
// LEN_1D is not referenced by the code shown in this report.
#define LEN_1D 32000
// Extent of the 1-D arrays and of each dimension of the square 2-D arrays.
#define LEN_2D 256
// array definitions
// Only aa is accessed by main() as shown; a, d, bb, cc and tt are declared
// but unused in this excerpt.

real_t
a[LEN_2D],d[LEN_2D],aa[LEN_2D][LEN_2D],bb[LEN_2D][LEN_2D],cc[LEN_2D][LEN_2D],tt[LEN_2D][LEN_2D];


// Reproducer for the s2102 kernel: repeatedly rebuilds aa as an identity
// matrix by zeroing one column at a time and then setting its diagonal entry.
// func_args is not used by the body.
// NOTE(review): struct args_t and dummy() are not declared in this excerpt;
// presumably they come from the TSVC harness -- confirm before compiling.
int main(struct args_t * func_args)
{
//    diagonals
//    identity matrix, best results vectorize both inner and outer loops

    // Repeat the whole kernel 100*(iterations/LEN_2D) times.
    for (int nl = 0; nl < 100*(iterations/LEN_2D); nl++) {
        for (int i = 0; i < LEN_2D; i++) {
            // Zero column i: j selects the row, so successive stores are one
            // full row (LEN_2D elements) apart rather than contiguous.
            for (int j = 0; j < LEN_2D; j++) {
                aa[j][i] = (real_t)0.;
            }
            // Then set the diagonal element of that column to one.
            aa[i][i] = (real_t)1.;
        }
        // Called once per repetition; its definition is outside this excerpt.
        dummy();
    }
   // The real_t value aa[0][0] is converted to main's int return type.
   return aa[0][0];
}

is vectorized by icc as:
# icc-generated x86-64 code (AT&T syntax) quoted in the report.
# Shape of the listing:
#   1. one call to _intel_fast_memset over aa with value 0 and length 262144
#      (= 256 * 256 * 4 bytes),
#   2. a loop that runs twice and issues 128 scalar 4-byte stores per pass,
#      each 1028 bytes (one 1024-byte row + one 4-byte element) after the
#      previous one, i.e. walking the diagonal of aa.
# NOTE(review): the label is "min", and the listing contains neither a
# repetition loop nor a call to dummy(); presumably it was compiled from a
# slightly different variant of the C source above -- confirm.
min:
# parameter 1: %rdi
..B1.1:                         # Preds ..B1.0
                                # Execution count [5.00e-03]
        .cfi_startproc
..___tag_value_min.1:
..L2:
                                                          #36.1
        pushq     %rbp                                          #36.1
        .cfi_def_cfa_offset 16
        movq      %rsp, %rbp                                    #36.1
        .cfi_def_cfa 6, 16
        .cfi_offset 6, -16
        # Round %rsp down to a 32-byte boundary; %rbp keeps the old value for the epilogue.
        andq      $-32, %rsp                                    #36.1
        # Arguments for the call: %edi = address of aa, %esi = 0, %edx = 262144.
        # NOTE(review): presumably memset-like (dest, fill value, byte count) -- confirm.
        movl      $aa, %edi                                     #38.13
        xorl      %esi, %esi                                    #38.13
        movl      $262144, %edx                                 #38.13
        call      _intel_fast_memset                            #38.13
                                # LOE rbx r12 r13 r14 r15
..B1.2:                         # Preds ..B1.1
                                # Execution count [1.00e+00]
        # %ymm1 = 32-byte constant; %xmm0 = its upper 128 bits (lanes 4-7).
        # NOTE(review): .L_2il0floatpacket.0 is not defined in this excerpt;
        # presumably eight copies of 1.0f -- confirm.
        vmovups   .L_2il0floatpacket.0(%rip), %ymm1             #41.24
        # %rdx and %rax both start at 0; the store address base is %rax + %rdx*4.
        xorl      %edx, %edx                                    #37.9
        xorl      %eax, %eax                                    #37.9
        vextractf128 $1, %ymm1, %xmm0                           #41.13
                                # LOE rax rdx rbx r12 r13 r14 r15 xmm0 xmm1
..B1.3:                         # Preds ..B1.3 ..B1.2
                                # Execution count [2.56e+02]
        # Stores k = 43..127 of this pass, at aa + %rax + %rdx*4 + k*1028.
        # Store k takes lane (k mod 8) of the constant: lanes 0-3 from %xmm1,
        # lanes 4-7 from %xmm0 (vmovss writes lane 0 of the named register).
        vextractps $3, %xmm1, 44204+aa(%rax,%rdx,4)             #41.13
        # Save the current base offset in %rcx before %rax/%rdx are advanced below.
        lea       (%rax,%rdx,4), %rcx                           #41.13
        vmovss    %xmm0, 45232+aa(%rax,%rdx,4)                  #41.13
        vextractps $1, %xmm0, 46260+aa(%rax,%rdx,4)             #41.13
        vextractps $2, %xmm0, 47288+aa(%rax,%rdx,4)             #41.13
        vextractps $3, %xmm0, 48316+aa(%rax,%rdx,4)             #41.13
        vmovss    %xmm1, 49344+aa(%rax,%rdx,4)                  #41.13
        vextractps $1, %xmm1, 50372+aa(%rax,%rdx,4)             #41.13
        vextractps $2, %xmm1, 51400+aa(%rax,%rdx,4)             #41.13
        vextractps $3, %xmm1, 52428+aa(%rax,%rdx,4)             #41.13
        vmovss    %xmm0, 53456+aa(%rax,%rdx,4)                  #41.13
        vextractps $1, %xmm0, 54484+aa(%rax,%rdx,4)             #41.13
        vextractps $2, %xmm0, 55512+aa(%rax,%rdx,4)             #41.13
        vextractps $3, %xmm0, 56540+aa(%rax,%rdx,4)             #41.13
        vmovss    %xmm1, 57568+aa(%rax,%rdx,4)                  #41.13
        vextractps $1, %xmm1, 58596+aa(%rax,%rdx,4)             #41.13
        vextractps $2, %xmm1, 59624+aa(%rax,%rdx,4)             #41.13
        vextractps $3, %xmm1, 60652+aa(%rax,%rdx,4)             #41.13
        vmovss    %xmm0, 61680+aa(%rax,%rdx,4)                  #41.13
        vextractps $1, %xmm0, 62708+aa(%rax,%rdx,4)             #41.13
        vextractps $2, %xmm0, 63736+aa(%rax,%rdx,4)             #41.13
        vextractps $3, %xmm0, 64764+aa(%rax,%rdx,4)             #41.13
        vmovss    %xmm1, 65792+aa(%rax,%rdx,4)                  #41.13
        vextractps $1, %xmm1, 66820+aa(%rax,%rdx,4)             #41.13
        vextractps $2, %xmm1, 67848+aa(%rax,%rdx,4)             #41.13
        vextractps $3, %xmm1, 68876+aa(%rax,%rdx,4)             #41.13
        vmovss    %xmm0, 69904+aa(%rax,%rdx,4)                  #41.13
        vextractps $1, %xmm0, 70932+aa(%rax,%rdx,4)             #41.13
        vextractps $2, %xmm0, 71960+aa(%rax,%rdx,4)             #41.13
        vextractps $3, %xmm0, 72988+aa(%rax,%rdx,4)             #41.13
        vmovss    %xmm1, 74016+aa(%rax,%rdx,4)                  #41.13
        vextractps $1, %xmm1, 75044+aa(%rax,%rdx,4)             #41.13
        vextractps $2, %xmm1, 76072+aa(%rax,%rdx,4)             #41.13
        vextractps $3, %xmm1, 77100+aa(%rax,%rdx,4)             #41.13
        vmovss    %xmm0, 78128+aa(%rax,%rdx,4)                  #41.13
        vextractps $1, %xmm0, 79156+aa(%rax,%rdx,4)             #41.13
        vextractps $2, %xmm0, 80184+aa(%rax,%rdx,4)             #41.13
        vextractps $3, %xmm0, 81212+aa(%rax,%rdx,4)             #41.13
        vmovss    %xmm1, 82240+aa(%rax,%rdx,4)                  #41.13
        vextractps $1, %xmm1, 83268+aa(%rax,%rdx,4)             #41.13
        vextractps $2, %xmm1, 84296+aa(%rax,%rdx,4)             #41.13
        vextractps $3, %xmm1, 85324+aa(%rax,%rdx,4)             #41.13
        vmovss    %xmm0, 86352+aa(%rax,%rdx,4)                  #41.13
        vextractps $1, %xmm0, 87380+aa(%rax,%rdx,4)             #41.13
        vextractps $2, %xmm0, 88408+aa(%rax,%rdx,4)             #41.13
        vextractps $3, %xmm0, 89436+aa(%rax,%rdx,4)             #41.13
        vmovss    %xmm1, 90464+aa(%rax,%rdx,4)                  #41.13
        vextractps $1, %xmm1, 91492+aa(%rax,%rdx,4)             #41.13
        vextractps $2, %xmm1, 92520+aa(%rax,%rdx,4)             #41.13
        vextractps $3, %xmm1, 93548+aa(%rax,%rdx,4)             #41.13
        vmovss    %xmm0, 94576+aa(%rax,%rdx,4)                  #41.13
        vextractps $1, %xmm0, 95604+aa(%rax,%rdx,4)             #41.13
        vextractps $2, %xmm0, 96632+aa(%rax,%rdx,4)             #41.13
        vextractps $3, %xmm0, 97660+aa(%rax,%rdx,4)             #41.13
       vmovss    %xmm1, 98688+aa(%rax,%rdx,4)                  #41.13
        vextractps $1, %xmm1, 99716+aa(%rax,%rdx,4)             #41.13
        vextractps $2, %xmm1, 100744+aa(%rax,%rdx,4)            #41.13
        vextractps $3, %xmm1, 101772+aa(%rax,%rdx,4)            #41.13
        vmovss    %xmm0, 102800+aa(%rax,%rdx,4)                 #41.13
        vextractps $1, %xmm0, 103828+aa(%rax,%rdx,4)            #41.13
        vextractps $2, %xmm0, 104856+aa(%rax,%rdx,4)            #41.13
        vextractps $3, %xmm0, 105884+aa(%rax,%rdx,4)            #41.13
        vmovss    %xmm1, 106912+aa(%rax,%rdx,4)                 #41.13
        vextractps $1, %xmm1, 107940+aa(%rax,%rdx,4)            #41.13
        vextractps $2, %xmm1, 108968+aa(%rax,%rdx,4)            #41.13
        vextractps $3, %xmm1, 109996+aa(%rax,%rdx,4)            #41.13
        vmovss    %xmm0, 111024+aa(%rax,%rdx,4)                 #41.13
        vextractps $1, %xmm0, 112052+aa(%rax,%rdx,4)            #41.13
        vextractps $2, %xmm0, 113080+aa(%rax,%rdx,4)            #41.13
        vextractps $3, %xmm0, 114108+aa(%rax,%rdx,4)            #41.13
        vmovss    %xmm1, 115136+aa(%rax,%rdx,4)                 #41.13
        vextractps $1, %xmm1, 116164+aa(%rax,%rdx,4)            #41.13
        vextractps $2, %xmm1, 117192+aa(%rax,%rdx,4)            #41.13
        vextractps $3, %xmm1, 118220+aa(%rax,%rdx,4)            #41.13
        vmovss    %xmm0, 119248+aa(%rax,%rdx,4)                 #41.13
        vextractps $1, %xmm0, 120276+aa(%rax,%rdx,4)            #41.13
        vextractps $2, %xmm0, 121304+aa(%rax,%rdx,4)            #41.13
        vextractps $3, %xmm0, 122332+aa(%rax,%rdx,4)            #41.13
        vmovss    %xmm1, 123360+aa(%rax,%rdx,4)                 #41.13
        vextractps $1, %xmm1, 124388+aa(%rax,%rdx,4)            #41.13
        vextractps $2, %xmm1, 125416+aa(%rax,%rdx,4)            #41.13
        vextractps $3, %xmm1, 126444+aa(%rax,%rdx,4)            #41.13
        vmovss    %xmm0, 127472+aa(%rax,%rdx,4)                 #41.13
        vextractps $1, %xmm0, 128500+aa(%rax,%rdx,4)            #41.13
        vextractps $2, %xmm0, 129528+aa(%rax,%rdx,4)            #41.13
        vextractps $3, %xmm0, 130556+aa(%rax,%rdx,4)            #41.13
        # Advance the base by 131072 + 128*4 = 128*1028 bytes for the next pass.
        addq      $128, %rdx                                    #37.9
        addq      $131072, %rax                                 #37.9
        # Stores k = 0..42 of this pass, addressed through the base saved in
        # %rcx (the value from before the two adds above).
        vmovss    %xmm1, aa(%rcx)                               #41.13
        vextractps $1, %xmm1, 1028+aa(%rcx)                     #41.13
        vextractps $2, %xmm1, 2056+aa(%rcx)                     #41.13
        vextractps $3, %xmm1, 3084+aa(%rcx)                     #41.13
        vmovss    %xmm0, 4112+aa(%rcx)                          #41.13
        vextractps $1, %xmm0, 5140+aa(%rcx)                     #41.13
        vextractps $2, %xmm0, 6168+aa(%rcx)                     #41.13
       vextractps $3, %xmm0, 7196+aa(%rcx)                     #41.13
        vmovss    %xmm1, 8224+aa(%rcx)                          #41.13
        vextractps $1, %xmm1, 9252+aa(%rcx)                     #41.13
        vextractps $2, %xmm1, 10280+aa(%rcx)                    #41.13
        vextractps $3, %xmm1, 11308+aa(%rcx)                    #41.13
        vmovss    %xmm0, 12336+aa(%rcx)                         #41.13
        vextractps $1, %xmm0, 13364+aa(%rcx)                    #41.13
        vextractps $2, %xmm0, 14392+aa(%rcx)                    #41.13
        vextractps $3, %xmm0, 15420+aa(%rcx)                    #41.13
        vmovss    %xmm1, 16448+aa(%rcx)                         #41.13
        vextractps $1, %xmm1, 17476+aa(%rcx)                    #41.13
        vextractps $2, %xmm1, 18504+aa(%rcx)                    #41.13
        vextractps $3, %xmm1, 19532+aa(%rcx)                    #41.13
        vmovss    %xmm0, 20560+aa(%rcx)                         #41.13
        vextractps $1, %xmm0, 21588+aa(%rcx)                    #41.13
        vextractps $2, %xmm0, 22616+aa(%rcx)                    #41.13
        vextractps $3, %xmm0, 23644+aa(%rcx)                    #41.13
        vmovss    %xmm1, 24672+aa(%rcx)                         #41.13
        vextractps $1, %xmm1, 25700+aa(%rcx)                    #41.13
        vextractps $2, %xmm1, 26728+aa(%rcx)                    #41.13
        vextractps $3, %xmm1, 27756+aa(%rcx)                    #41.13
        vmovss    %xmm0, 28784+aa(%rcx)                         #41.13
        vextractps $1, %xmm0, 29812+aa(%rcx)                    #41.13
        vextractps $2, %xmm0, 30840+aa(%rcx)                    #41.13
        vextractps $3, %xmm0, 31868+aa(%rcx)                    #41.13
        vmovss    %xmm1, 32896+aa(%rcx)                         #41.13
        vextractps $1, %xmm1, 33924+aa(%rcx)                    #41.13
        vextractps $2, %xmm1, 34952+aa(%rcx)                    #41.13
        vextractps $3, %xmm1, 35980+aa(%rcx)                    #41.13
        vmovss    %xmm0, 37008+aa(%rcx)                         #41.13
        vextractps $1, %xmm0, 38036+aa(%rcx)                    #41.13
        vextractps $2, %xmm0, 39064+aa(%rcx)                    #41.13
        vextractps $3, %xmm0, 40092+aa(%rcx)                    #41.13
        vmovss    %xmm1, 41120+aa(%rcx)                         #41.13
        vextractps $1, %xmm1, 42148+aa(%rcx)                    #41.13
        vextractps $2, %xmm1, 43176+aa(%rcx)                    #41.13
        # Loop while %rdx < 256 (unsigned); %rdx steps 0 -> 128 -> 256, so two passes.
        cmpq      $256, %rdx                                    #37.9
        jb        ..B1.3        # Prob 99%                      #37.9
                                # LOE rax rdx rbx r12 r13 r14 r15 xmm0 xmm1
..B1.4:                         # Preds ..B1.3
                                # Execution count [1.00e+00]
        # Epilogue: clear upper ymm halves, undo the stack alignment via %rbp, return.
        vzeroupper                                              #43.1
        movq      %rbp, %rsp                                    #43.1
        popq      %rbp                                          #43.1
        .cfi_def_cfa 7, 8
        .cfi_restore 6
        ret                                                     #43.1

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2023-01-11 22:28 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-03-17 18:49 [Bug middle-end/99634] New: s2102 benchmarks of TSVC is vectorized better by icc than gcc hubicka at gcc dot gnu.org
2021-03-18  9:03 ` [Bug middle-end/99634] s2102 benchmarks of TSVC is vectorized better by icc than gcc, interchange is missing rguenth at gcc dot gnu.org
2023-01-11 22:28 ` hubicka at gcc dot gnu.org

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).