From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <gcc-patches-return-394631-listarch-gcc-patches=gcc.gnu.org@gcc.gnu.org>
Received: (qmail 27395 invoked by alias); 5 Apr 2015 18:25:29 -0000
Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm
Precedence: bulk
List-Id: <gcc-patches.gcc.gnu.org>
List-Archive: <http://gcc.gnu.org/ml/gcc-patches/>
List-Post: <mailto:gcc-patches@gcc.gnu.org>
List-Help: <mailto:gcc-patches-help@gcc.gnu.org>
Sender: gcc-patches-owner@gcc.gnu.org
Received: (qmail 27372 invoked by uid 89); 5 Apr 2015 18:25:28 -0000
Authentication-Results: sourceware.org; auth=none
X-Virus-Found: No
X-Spam-SWARE-Status: No, score=3.1 required=5.0 tests=AWL,BAYES_50,KAM_LAZY_DOMAIN_SECURITY,SPAM_BODY,T_RP_MATCHES_RCVD autolearn=no version=3.3.2
X-Spam-User: qpsmtpd, 2 recipients
X-HELO: nef2.ens.fr
Received: from nef2.ens.fr (HELO nef2.ens.fr) (129.199.96.40) by sourceware.org (qpsmtpd/0.93/v0.84-503-g423c35a) with ESMTP; Sun, 05 Apr 2015 18:25:26 +0000
Received: from mailhost.lps.ens.fr (tournesol.lps.ens.fr [129.199.120.1])          by nef2.ens.fr (8.13.6/1.01.28121999) with ESMTP id t35IPKcI055839          ; Sun, 5 Apr 2015 20:25:21 +0200 (CEST)
X-Envelope-To: gcc-patches@gcc.gnu.org
Received: from localhost (localhost [127.0.0.1])	by mailhost.lps.ens.fr (Postfix) with ESMTP id E2D2A105;	Sun,  5 Apr 2015 20:25:20 +0200 (CEST)
Received: from mailhost.lps.ens.fr ([127.0.0.1])	by localhost (tournesol.lps.ens.fr [127.0.0.1]) (amavisd-new, port 10024)	with ESMTP id SHRoCw8A921e; Sun,  5 Apr 2015 20:25:20 +0200 (CEST)
Received: from [192.168.1.11] (log78-1-82-242-47-10.fbx.proxad.net [82.242.47.10])	by mailhost.lps.ens.fr (Postfix) with ESMTPSA id 8B8A0F5;	Sun,  5 Apr 2015 20:25:20 +0200 (CEST)
Content-Type: text/plain; charset=windows-1252
Mime-Version: 1.0 (Mac OS X Mail 8.2 \(2070.6\))
Subject: Re: [patch, fortran, RFC] First steps towards inlining matmul
From: =?windows-1252?Q?Dominique_d=27Humi=E8res?= <dominiq@lps.ens.fr>
In-Reply-To: <552147A3.7000603@netcologne.de>
Date: Sun, 05 Apr 2015 18:25:00 -0000
Cc: gcc-patches@gcc.gnu.org, fortran@gcc.gnu.org, jvdelisle@charter.com
Content-Transfer-Encoding: quoted-printable
Message-Id: <4DA1047C-92D3-485A-9457-61655ED14681@lps.ens.fr>
References: <20150405135519.AF21A105@mailhost.lps.ens.fr> <552147A3.7000603@netcologne.de>
To: Thomas Koenig <tkoenig@netcologne.de>
X-SW-Source: 2015-04/txt/msg00171.txt.bz2

I have done some timings

(1) with the test given below, before the patch I get (last column in Gflop=
s)

[Book15] f90/bug% gfc -Ofast timing/matmul_tst_sys.f90 -framework Accelerate
[Book15] f90/bug% time a.out
 Time, MATMUL:    373.708008       373.69497100000001        4.281566850413=
9435=20=20=20=20=20
 Time, MATMUL:    172.086609       23.117919000000001        69.21038178220=
1068=20=20=20=20=20
545.374u 0.537s 6:36.93 137.5%	0+0k 0+0io 0pf+0w
[Book15] f90/bug% gfc -Ofast -fexternal-blas timing/matmul_tst_sys.f90  -fr=
amework Accelerate
[Book15] f90/bug% time a.out
 Time, MATMUL:    176.327881       23.855111999999998        67.07157778173=
5002=20=20=20=20=20
 Time, MATMUL:    182.086746       24.453551000000001        65.43017003951=
6952=20=20=20=20=20
358.059u 0.471s 0:48.43 740.2%	0+0k 0+0io 0pf+0w

after the patch

[Book15] f90/bug% time a.out
 Time, MATMUL:    392.415436       392.41728799999999        4.077292333766=
9056=20=20=20=20=20
 Time, MATMUL:    171.690399       22.905118000000002        69.85338385945=
0091=20=20=20=20=20
563.671u 0.551s 6:55.44 135.8%	0+0k 0+0io 0pf+0w
[Book15] f90/bug% gfc -Ofast -fexternal-blas timing/matmul_tst_sys.f90 -fra=
mework Accelerate
[Book15] f90/bug% time a.out
 Time, MATMUL:    392.850342       392.84190799999999        4.072885217734=
9673=20=20=20=20=20
 Time, MATMUL:    174.534821       23.784797000000001        67.26986150018=
4334=20=20=20=20=20
566.824u 0.674s 6:56.74 136.1%	0+0k 0+0io 0pf+0w

which means that -fexternal-blas should disable the inlining.

program t2=20
implicit none=20
REAL time_begin, time_end=20
integer, parameter :: n=3D2000;=20
integer(8) :: ts, te, rate8, cmax8
real(8) :: elapsed
REAL(8) :: a(n,n), b(n,n), c(n,n)=20
integer, parameter :: m =3D 100=20
integer :: i=20
call RANDOM_NUMBER(a)=20
call RANDOM_NUMBER(b)=20
call cpu_time(time_begin)=20
call SYSTEM_CLOCK (ts, rate8, cmax8)
do i =3D 1,m=20
    a(1,1) =3D a(1,1) + 0.1=20
    c =3D MATMUL(a,b)=20
enddo=20
call SYSTEM_CLOCK (te, rate8, cmax8)
call cpu_time(time_end)=20
elapsed =3D real(te-ts, kind=3D8)/real(rate8, kind=3D8)
PRINT *, 'Time, MATMUL: ',time_end-time_begin, elapsed , 2*m*real(n, kind=
=3D8)**3/(10**9*elapsed)
call cpu_time(time_begin)=20
call SYSTEM_CLOCK (ts, rate8, cmax8)
do i =3D 1,m=20
    a(1,1) =3D a(1,1) + 0.1=20
    call dgemm('n','n',n, n, n, dble(1.0), a, n, b, n, dble(0.0), c, n)=20
enddo=20
call SYSTEM_CLOCK (te, rate8, cmax8)
call cpu_time(time_end)=20
elapsed =3D real(te-ts, kind=3D8)/real(rate8, kind=3D8)
PRINT *, 'Time, MATMUL: ',time_end-time_begin, elapsed , 2*m*real(n, kind=
=3D8)**3/(10**9*elapsed)
end program=20
! http://groups.google.com/group/comp.lang.fortran/browse_thread/thread/1cb=
a8e6ce5080197

(2) The polyhedron test compiled with -fprotect-parens -Ofast -funroll-loop=
s -ftree-loop-linear -fomit-frame-pointer -fwhole-program -flto runs in ~6.=
5s. After applying the patch below, it runs in ~66s with/without the patch.

Dominique

--- induct.f90	2005-10-11 22:53:32.000000000 +0200
+++ induct_vmc.f90	2015-04-05 19:06:30.000000000 +0200
@@ -1644,18 +1644,17 @@ contains
           coil_tmp_vector(1) =3D -sin(theta)
           coil_tmp_vector(2) =3D cos(theta)
           coil_tmp_vector(3) =3D 0.0_longreal
-          coil_current_vec(1) =3D dot_product(rotate_coil(1,:),coil_tmp_ve=
ctor(:))
-          coil_current_vec(2) =3D dot_product(rotate_coil(2,:),coil_tmp_ve=
ctor(:))
-          coil_current_vec(3) =3D dot_product(rotate_coil(3,:),coil_tmp_ve=
ctor(:))
+          coil_current_vec =3D matmul(rotate_coil,coil_tmp_vector)
 !
           do j =3D 1, 9
               c_vector(3) =3D 0.5 * h_coil * z1gauss(j)
 !
 !       rotate coil vector into the global coordinate system and translate=
 it
 !
-              rot_c_vector(1) =3D dot_product(rotate_coil(1,:),c_vector(:)=
) + dx
-              rot_c_vector(2) =3D dot_product(rotate_coil(2,:),c_vector(:)=
) + dy
-              rot_c_vector(3) =3D dot_product(rotate_coil(3,:),c_vector(:)=
) + dz
+              rot_c_vector =3D matmul(rotate_coil,c_vector)
+              rot_c_vector(1) =3D rot_c_vector(1) + dx
+              rot_c_vector(2) =3D rot_c_vector(2) + dy
+              rot_c_vector(3) =3D rot_c_vector(3) + dz
 !
               do k =3D 1, 9
                   q_vector(1) =3D 0.5_longreal * a * (x2gauss(k) + 1.0_lon=
greal)
@@ -1664,9 +1663,7 @@ contains
 !
 !       rotate quad vector into the global coordinate system
 !
-                  rot_q_vector(1) =3D dot_product(rotate_quad(1,:),q_vecto=
r(:))
-                  rot_q_vector(2) =3D dot_product(rotate_quad(2,:),q_vecto=
r(:))
-                  rot_q_vector(3) =3D dot_product(rotate_quad(3,:),q_vecto=
r(:))
+                  rot_q_vector =3D matmul(rotate_quad,q_vector)
 !
 !       compute and add in quadrature term
 !
@@ -1756,18 +1753,17 @@ contains
           coil_tmp_vector(1) =3D -sin(theta)
           coil_tmp_vector(2) =3D cos(theta)
           coil_tmp_vector(3) =3D 0.0_longreal
-          coil_current_vec(1) =3D dot_product(rotate_coil(1,:),coil_tmp_ve=
ctor(:))
-          coil_current_vec(2) =3D dot_product(rotate_coil(2,:),coil_tmp_ve=
ctor(:))
-          coil_current_vec(3) =3D dot_product(rotate_coil(3,:),coil_tmp_ve=
ctor(:))
+          coil_current_vec =3D matmul(rotate_coil,coil_tmp_vector)
 !
           do j =3D 1, 9
               c_vector(3) =3D 0.5 * h_coil * z1gauss(j)
 !
 !       rotate coil vector into the global coordinate system and translate=
 it
 !
-              rot_c_vector(1) =3D dot_product(rotate_coil(1,:),c_vector(:)=
) + dx
-              rot_c_vector(2) =3D dot_product(rotate_coil(2,:),c_vector(:)=
) + dy
-              rot_c_vector(3) =3D dot_product(rotate_coil(3,:),c_vector(:)=
) + dz
+              rot_c_vector =3D matmul(rotate_coil,c_vector)
+              rot_c_vector(1) =3D rot_c_vector(1) + dx
+              rot_c_vector(2) =3D rot_c_vector(2) + dy
+              rot_c_vector(3) =3D rot_c_vector(3) + dz
 !
               do k =3D 1, 9
                   q_vector(1) =3D 0.5_longreal * a * (x2gauss(k) + 1.0_lon=
greal)
@@ -1776,9 +1772,7 @@ contains
 !
 !       rotate quad vector into the global coordinate system
 !
-                  rot_q_vector(1) =3D dot_product(rotate_quad(1,:),q_vecto=
r(:))
-                  rot_q_vector(2) =3D dot_product(rotate_quad(2,:),q_vecto=
r(:))
-                  rot_q_vector(3) =3D dot_product(rotate_quad(3,:),q_vecto=
r(:))
+                  rot_q_vector =3D matmul(rotate_quad,q_vector)
 !
 !       compute and add in quadrature term
 !
@@ -2061,18 +2055,17 @@ contains
 !
 !       compute current vector for the coil in the global coordinate system
 !
-          coil_current_vec(1) =3D dot_product(rotate_coil(1,:),coil_tmp_ve=
ctor(:))
-          coil_current_vec(2) =3D dot_product(rotate_coil(2,:),coil_tmp_ve=
ctor(:))
-          coil_current_vec(3) =3D dot_product(rotate_coil(3,:),coil_tmp_ve=
ctor(:))
+          coil_current_vec =3D matmul(rotate_coil,coil_tmp_vector)
 !
           do j =3D 1, 9
               c_vector(3) =3D 0.5 * h_coil * z1gauss(j)
 !
 !       rotate coil vector into the global coordinate system and translate=
 it
 !
-              rot_c_vector(1) =3D dot_product(rotate_coil(1,:),c_vector(:)=
) + dx
-              rot_c_vector(2) =3D dot_product(rotate_coil(2,:),c_vector(:)=
) + dy
-              rot_c_vector(3) =3D dot_product(rotate_coil(3,:),c_vector(:)=
) + dz
+              rot_c_vector =3D matmul(rotate_coil,c_vector)
+              rot_c_vector(1) =3D rot_c_vector(1) + dx
+              rot_c_vector(2) =3D rot_c_vector(2) + dy
+              rot_c_vector(3) =3D rot_c_vector(3) + dz
 !
               do k =3D 1, 9
                   q_vector(1) =3D 0.5_longreal * a * (x2gauss(k) + 1.0_lon=
greal)
@@ -2081,9 +2074,7 @@ contains
 !
 !       rotate quad vector into the global coordinate system
 !
-                  rot_q_vector(1) =3D dot_product(rotate_quad(1,:),q_vecto=
r(:))
-                  rot_q_vector(2) =3D dot_product(rotate_quad(2,:),q_vecto=
r(:))
-                  rot_q_vector(3) =3D dot_product(rotate_quad(3,:),q_vecto=
r(:))
+                  rot_q_vector =3D matmul(rotate_quad,q_vector)
 !
 !       compute and add in quadrature term
 !
@@ -2204,18 +2195,17 @@ contains
 !
 !       compute current vector for the coil in the global coordinate system
 !
-          coil_current_vec(1) =3D dot_product(rotate_coil(1,:),coil_tmp_ve=
ctor(:))
-          coil_current_vec(2) =3D dot_product(rotate_coil(2,:),coil_tmp_ve=
ctor(:))
-          coil_current_vec(3) =3D dot_product(rotate_coil(3,:),coil_tmp_ve=
ctor(:))
+          coil_current_vec =3D matmul(rotate_coil,coil_tmp_vector)
 !
           do j =3D 1, 9
               c_vector(3) =3D 0.5 * h_coil * z1gauss(j)
 !
 !       rotate coil vector into the global coordinate system and translate=
 it
 !
-              rot_c_vector(1) =3D dot_product(rotate_coil(1,:),c_vector(:)=
) + dx
-              rot_c_vector(2) =3D dot_product(rotate_coil(2,:),c_vector(:)=
) + dy
-              rot_c_vector(3) =3D dot_product(rotate_coil(3,:),c_vector(:)=
) + dz
+              rot_c_vector =3D matmul(rotate_coil,c_vector)
+              rot_c_vector(1) =3D rot_c_vector(1) + dx
+              rot_c_vector(2) =3D rot_c_vector(2) + dy
+              rot_c_vector(3) =3D rot_c_vector(3) + dz
 !
               do k =3D 1, 9
                   q_vector(1) =3D 0.5_longreal * a * (x2gauss(k) + 1.0_lon=
greal)
@@ -2224,9 +2214,7 @@ contains
 !
 !       rotate quad vector into the global coordinate system
 !
-                  rot_q_vector(1) =3D dot_product(rotate_quad(1,:),q_vecto=
r(:))
-                  rot_q_vector(2) =3D dot_product(rotate_quad(2,:),q_vecto=
r(:))
-                  rot_q_vector(3) =3D dot_product(rotate_quad(3,:),q_vecto=
r(:))
+                  rot_q_vector =3D matmul(rotate_quad,q_vector)
 !
 !       compute and add in quadrature term
 !

> Le 5 avr. 2015 =E0 16:33, Thomas Koenig <tkoenig@netcologne.de> a =E9crit=
 :
>=20
> Hi Dominique,
>=20
>> IMO the inlining of MATMUL should be restricted to small matrices (less =
than 4x4, 9x9
>> or 16x16 depending of your field!-)
>=20
> The problem with the library function we have is that it is quite
> general; it can deal with all the complexity of assumed-shape array
> arguments.  Inlining allows the compiler to take advantage of
> contiguous memory, compile-time knowledge of array sizes, etc.
> to allow for vectorization or even complete unrolling.
>=20
> Of course, if you use c=3Dmatmul(a,b) where all three are assumed-shape
> arrays, your advantage over the library function will be minimal at
> best.
>=20
> It will be interesting to see at what size calling BLAS directly
> will be better.
>=20
> 	Thomas