From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <fortran-return-50217-listarch-fortran=gcc.gnu.org@gcc.gnu.org>
Received: (qmail 64128 invoked by alias); 12 Apr 2018 15:00:59 -0000
Mailing-List: contact fortran-help@gcc.gnu.org; run by ezmlm
Precedence: bulk
List-Id: <fortran.gcc.gnu.org>
List-Subscribe: <mailto:fortran-subscribe@gcc.gnu.org>
List-Post: <mailto:fortran@gcc.gnu.org>
List-Help: <mailto:fortran-help@gcc.gnu.org>, <http://sourceware.org/lists.html#faqs>
Sender: fortran-owner@gcc.gnu.org
Received: (qmail 64116 invoked by uid 89); 12 Apr 2018 15:00:58 -0000
Authentication-Results: sourceware.org; auth=none
X-Virus-Found: No
X-Spam-SWARE-Status: No, score=-5.5 required=5.0 tests=AWL,BAYES_00,GIT_PATCH_2,HTML_MESSAGE,KAM_LAZY_DOMAIN_SECURITY,MANY_SPAN_IN_TEXT autolearn=ham version=3.3.2 spammy=permutation, H*c:alternative
X-HELO: mail3-relais-sop.national.inria.fr
Received: from mail3-relais-sop.national.inria.fr (HELO mail3-relais-sop.national.inria.fr) (192.134.164.104) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with ESMTP; Thu, 12 Apr 2018 15:00:55 +0000
Received: from mail-wr0-f174.google.com ([209.85.128.174])  by mail3-relais-sop.national.inria.fr with ESMTP/TLS/AES128-GCM-SHA256; 12 Apr 2018 17:00:42 +0200
Received: by mail-wr0-f174.google.com with SMTP id d19so5446508wre.1        for <fortran@gcc.gnu.org>; Thu, 12 Apr 2018 08:00:42 -0700 (PDT)
X-Gm-Message-State: ALQs6tBW+Xi9mOU1oYWI/BI1Fn21HqGzTrOXplRCjl+P7m5molMR5JUK	4c0qFmDMUS7Lu9ZItyjmtYKmKMp9MYO18pw36pY=
X-Google-Smtp-Source: AIpwx48k+G8nlBp4qyy0wIFvL/Pp4PRr7HsZaxTbGYwv3RolqiqQo6dU3VO1eoQQDFBvOakL0H15vyV1EvI6TElLiko=
X-Received: by 10.223.186.133 with SMTP id p5mr1085032wrg.196.1523545242456; Thu, 12 Apr 2018 08:00:42 -0700 (PDT)
MIME-Version: 1.0
Received: by 10.223.169.51 with HTTP; Thu, 12 Apr 2018 08:00:01 -0700 (PDT)
In-Reply-To: <CAFiYyc3P-vvVe3L6uRkamgEpOZQ8yE53tGpp_HphD5AJOwTNgw@mail.gmail.com>
References: <CAF1HNd+tjDQjx-NM9mDpvRn=dDSZjCcent5y=XG35jJWJE2z2A@mail.gmail.com> <CAFiYyc3P-vvVe3L6uRkamgEpOZQ8yE53tGpp_HphD5AJOwTNgw@mail.gmail.com>
From: =?UTF-8?Q?La=C3=A9rcio_LIMA_PILLA?= <laercio.lima@inria.fr>
Date: Thu, 12 Apr 2018 15:00:00 -0000
X-Gmail-Original-Message-ID: <CAF1HNdJFYK+HSYh90SA2HCGWVyORXELtWySaQBy34Q7sOpPAog@mail.gmail.com>
Message-ID: <CAF1HNdJFYK+HSYh90SA2HCGWVyORXELtWySaQBy34Q7sOpPAog@mail.gmail.com>
Subject: Re: Difference in assembly code generated (gfortran vs ifort) (optimization flags missing?)
To: Richard Biener <richard.guenther@gmail.com>
Cc: "fortran@gcc.gnu.org" <fortran@gcc.gnu.org>
Content-Type: text/plain; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
X-IsSubscribed: yes
X-SW-Source: 2018-04/txt/msg00059.txt.bz2

Thank you for the quick reply.

2018-04-12 16:00 GMT+02:00 Richard Biener <richard.guenther@gmail.com>:

> On Thu, Apr 12, 2018 at 3:55 PM, La=C3=A9rcio LIMA PILLA
> <laercio.lima@inria.fr> wrote:
> > Dear all,
> >
> > TL;DR version: I have been noticing very extreme performance differences
> > (up to a factor of 3) between ifort and gfortran.
> > As I checked the assembly code, I noticed that the compilers are using
> > different instructions (e.g., ifort uses 'vbroadcastsd').
> > Am I missing any special optimization flags (besides -march and -mtune
> > native) or is this expected?
> >
> > Original version:
> >
> > I have been working on the optimization of a Fortran 95 [and over]
> > application that makes use of several matrix-vector multiplication
> kernels.
> > As the sizes of the matrices are well-known, the original developers
> > generated different kernels for different sizes.
> > An example for size 4 is given below.
> >
> >   USE ISO_C_BINDING
> >   !...
> >   subroutine mv_mult_4_4(mat,vec,res)
> >     REAL(C_DOUBLE), INTENT(IN),  DIMENSION(4,4) :: mat
> >     REAL(C_DOUBLE), INTENT(IN),  DIMENSION(4)   :: vec
> >     REAL(C_DOUBLE), INTENT(OUT), DIMENSION(4)   :: res
> >     INTEGER(C_INT) :: iRow, iCol
> >
> >     res =3D 0.0
> >
> >     do iCol=3D1,4
> >        do iRow=3D1,4
> >           res(iRow) =3D res(iRow) + mat(iRow,iCol)*vec(iCol)
> >        end do
> >     end do
> >
> >   end subroutine mv_mult_4_4
>
> This doesn't seem complete as it doesn't compile for me...
>

Yes. My fault. When I took this part out of the code, I forgot to add the
module information. Here is a more complete version:

module example

  USE ISO_C_BINDING

contains

  subroutine mv_mult_4_4(mat,vec,res)
    REAL(C_DOUBLE), INTENT(IN),  DIMENSION(4,4) :: mat
    REAL(C_DOUBLE), INTENT(IN),  DIMENSION(4)   :: vec
    REAL(C_DOUBLE), INTENT(OUT), DIMENSION(4)   :: res
    INTEGER(C_INT) :: iRow, iCol

    res =3D 0.0

    do iCol=3D1,4
       do iRow=3D1,4
          res(iRow) =3D res(iRow) + mat(iRow,iCol)*vec(iCol)
       end do
    end do

  end subroutine mv_mult_4_4

end module example

>
> > I have been noticing very significant performance differences in my tes=
ts
> > with gfortran, ifort, and different optimizations on my local system.
> > On the special case for a 20x20 matrix, ifort provides a code that
> reduces
> > the execution time by a factor of 3 for the same optimization flags.
> > I started checking the assembly code generated by the different compile=
rs
> > and noticed some differences.
> > For the code snippet above, the assembly versions from ifort and gfortr=
an
> > are presented below.
> > We can notice that ifort is using some instructions (vbroadcastsd) that
> are
> > not used by gfortran even though I am telling the compiler the specific
> > architecture of my processor.
> > As the general users of the application use gfortran, I would like to
> know:
> >
> > 1) Is this difference in instructions used expected?
> > 2) Am I missing any additional optimization flag (besides -march and
> > -mtune) that could change that?
> > 3) Are there any directives (besides OpenMP ones) that could help in th=
is
> > case?
>
> It looks like ifort does loop vectorization on the inner loop while GCC
> most certainly unrolls that fully and vectorizes the outer loop which in
> turn
> requires all the shuffling.  You can see if -fdisable-tree-cunrolli solves
> this
> (just for debugging!).
>

I took your suggestion into account and added that flag. The result is
better code that even uses vbroadcastsd:

.file "example.f90"
.text
.p2align 4,,15
.globl __example_MOD_mv_mult_4_4
.type __example_MOD_mv_mult_4_4, @function
__example_MOD_mv_mult_4_4:
.LFB0:
.cfi_startproc
vpxor %xmm0, %xmm0, %xmm0
vbroadcastsd (%rsi), %ymm1
vmovups %xmm0, (%rdx)
vmovups %xmm0, 16(%rdx)
vmovupd (%rdi), %ymm0
vfmadd213pd (%rdx), %ymm1, %ymm0
vbroadcastsd 8(%rsi), %ymm1
vfmadd132pd 32(%rdi), %ymm0, %ymm1
vbroadcastsd 16(%rsi), %ymm0
vfmadd231pd 64(%rdi), %ymm0, %ymm1
vbroadcastsd 24(%rsi), %ymm0
vfmadd132pd 96(%rdi), %ymm1, %ymm0
vmovupd %ymm0, (%rdx)
vzeroupper
ret
.cfi_endproc
.LFE0:
.size __example_MOD_mv_mult_4_4, .-__example_MOD_mv_mult_4_4
.ident "GCC: (Ubuntu 7.2.0-1ubuntu1~16.04) 7.2.0"
.section .note.GNU-stack,"",@progbits

I also experimented with loop permutation, which also led to better
assembly:

.file "example.f90"
.text
.p2align 4,,15
.globl __example_MOD_mv_mult_4_4
.type __example_MOD_mv_mult_4_4, @function
__example_MOD_mv_mult_4_4:
.LFB0:
.cfi_startproc
vpxor %xmm0, %xmm0, %xmm0
vbroadcastsd (%rsi), %ymm3
vbroadcastsd 8(%rsi), %ymm2
vmovups %xmm0, (%rdx)
vbroadcastsd 16(%rsi), %ymm1
vmovups %xmm0, 16(%rdx)
vmovupd (%rdx), %ymm4
vfmadd132pd (%rdi), %ymm4, %ymm3
vfmadd132pd 32(%rdi), %ymm3, %ymm2
vbroadcastsd 24(%rsi), %ymm0
vfmadd132pd 64(%rdi), %ymm2, %ymm1
vfmadd132pd 96(%rdi), %ymm1, %ymm0
vmovupd %ymm0, (%rdx)
vzeroupper
ret
.cfi_endproc
.LFE0:
.size __example_MOD_mv_mult_4_4, .-__example_MOD_mv_mult_4_4
.ident "GCC: (Ubuntu 7.2.0-1ubuntu1~16.04) 7.2.0"
.section .note.GNU-stack,"",@progbits

Still, this does not seem to improve the code for the situation with a
20x20 matrix.
I will try some more things in the next few days.

Best regards,


>
> Richard.
>
> > Assembly:
> > CPU: Intel(R) Core(TM) i5-5200U CPU @ 2.20GHz
> >
> > ifort w/ -O3 -march=3Dnative -mtune=3Dnative -autodouble -S:
> > # mark_description "Intel(R) Fortran Intel(R) 64 Compiler for
> applications
> > running on Intel(R) 64, Version 17.0.3.191 Build 2017";
> > # mark_description "0404";
> > # mark_description "-O3 -march=3Dnative -mtune=3Dnative -autodouble -S";
> > # -- Begin  mv_mult_4_4_
> > .text
> > # mark_begin;
> >        .align    16,0x90
> > .globl mv_mult_4_4_
> > mv_mult_4_4_:
> > # parameter 1: %rdi
> > # parameter 2: %rsi
> > # parameter 3: %rdx
> > #...
> >         vbroadcastsd (%rsi), %ymm0                              #157.50
> >         vbroadcastsd 8(%rsi), %ymm2                             #157.50
> >         vbroadcastsd 16(%rsi), %ymm3                            #157.50
> >         vbroadcastsd 24(%rsi), %ymm4                            #157.50
> >         vmulpd    (%rdi), %ymm0, %ymm1                          #157.11
> >         vfmadd132pd 32(%rdi), %ymm1, %ymm2                      #157.11
> >         vfmadd132pd 64(%rdi), %ymm2, %ymm3                      #157.11
> >         vfmadd132pd 96(%rdi), %ymm3, %ymm4                      #157.11
> >         vmovupd   %ymm4, (%rdx)                                 #157.11
> >         vzeroupper                                              #165.3
> >         ret                                                     #165.3
> >         .align    16,0x90
> >                                 # LOE
> > .cfi_endproc
> > # mark_end;
> >
> > ---
> >
> > gfortran (GNU Fortran (Ubuntu 7.2.0-1ubuntu1~16.04) 7.2.0) w/ -O3
> > -march=3Dnative -mtune=3Dnative -fdefault-double-8 -fdefault-real-8 -S:
> > .p2align 4,,15
> > .globl __mv_mult_4_4
> > .type __mv_mult_4_4, @function
> > __mv_mult_4_4:
> > .LFB12:
> > .cfi_startproc
> > vpxor %xmm0, %xmm0, %xmm0
> > vmovups %xmm0, (%rdx)
> > vmovups %xmm0, 16(%rdx)
> > vmovupd (%rsi), %ymm0
> > vmovupd (%rdx), %ymm4
> > vpermpd $0, %ymm0, %ymm3
> > vfmadd132pd (%rdi), %ymm4, %ymm3
> > vpermpd $85, %ymm0, %ymm2
> > vfmadd132pd 32(%rdi), %ymm3, %ymm2
> > vpermpd $170, %ymm0, %ymm1
> > vpermpd $255, %ymm0, %ymm0
> > vfmadd132pd 64(%rdi), %ymm2, %ymm1
> > vfmadd132pd 96(%rdi), %ymm1, %ymm0
> > vmovupd %ymm0, (%rdx)
> > vzeroupper
> > ret
> > .cfi_endproc
> > .LFE12:
> > .size __mv_mult_4_4, .-__mv_mult_4_4
> > .p2align 4,,15
> >
> > ---
> >
> > Best regards,
> >
> > La=C3=A9rcio LIMA PILLA
> > Postdoctoral Researcher @ Inria Grenoble - Rh=C3=B4ne-Alpes, CORSE
> project-team
> > Associate Professor @ UFSC, Brazil
>

La=C3=A9rcio LIMA PILLA
Postdoctoral Researcher @ Inria Grenoble - Rh=C3=B4ne-Alpes, CORSE project-=
team
Associate Professor @ UFSC, Brazil