public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [patch, fortran] Reduce stack use in blocked matmul
@ 2017-05-05 20:35 Thomas Koenig
  2017-05-08 17:02 ` Jerry DeLisle
                   ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Thomas Koenig @ 2017-05-05 20:35 UTC (permalink / raw)
  To: fortran, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1262 bytes --]

Hello world,

the attached patch reduces the stack usage of the blocked
version of matmul for cases where we don't need the full buffer:
the temporary t1 becomes a VLA of the required size, capped at the
previous 65536 elements.

Regression-tested.  I also added a stress test (around 3 secs of
CPU time on my system); it will only run once (due to the "dg-do  run"
hack).

OK for trunk?

	Thomas

2017-05-05  Thomas Koenig  <tkoenig@gcc.gnu.org>

         PR fortran/80602
         * m4/matmul_internal.m4:  'matmul_name`:  Change
         t1 to a VLA of the required size.
         * generated/matmul_c10.c: Regenerated.
         * generated/matmul_c16.c: Regenerated.
         * generated/matmul_c4.c: Regenerated.
         * generated/matmul_c8.c: Regenerated.
         * generated/matmul_i1.c: Regenerated.
         * generated/matmul_i16.c: Regenerated.
         * generated/matmul_i2.c: Regenerated.
         * generated/matmul_i4.c: Regenerated.
         * generated/matmul_i8.c: Regenerated.
         * generated/matmul_r10.c: Regenerated.
         * generated/matmul_r16.c: Regenerated.
         * generated/matmul_r4.c: Regenerated.
         * generated/matmul_r8.c: Regenerated.

2017-05-05  Thomas Koenig  <tkoenig@gcc.gnu.org>

         PR fortran/80602
         * gfortran.dg/matmul_15.f90:  New test case.

[-- Attachment #2: p1.diff --]
[-- Type: text/x-patch, Size: 65359 bytes --]

Index: generated/matmul_c10.c
===================================================================
--- generated/matmul_c10.c	(Revision 247566)
+++ generated/matmul_c10.c	(Arbeitskopie)
@@ -286,8 +286,7 @@ matmul_c10_avx (gfc_array_c10 * const restrict ret
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_COMPLEX_10 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_COMPLEX_10 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -311,6 +310,17 @@ matmul_c10_avx (gfc_array_c10 * const restrict ret
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_COMPLEX_10 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -829,8 +839,7 @@ matmul_c10_avx2 (gfc_array_c10 * const restrict re
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_COMPLEX_10 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_COMPLEX_10 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -854,6 +863,17 @@ matmul_c10_avx2 (gfc_array_c10 * const restrict re
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_COMPLEX_10 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1372,8 +1392,7 @@ matmul_c10_avx512f (gfc_array_c10 * const restrict
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_COMPLEX_10 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_COMPLEX_10 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1397,6 +1416,17 @@ matmul_c10_avx512f (gfc_array_c10 * const restrict
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_COMPLEX_10 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1911,8 +1941,7 @@ matmul_c10_vanilla (gfc_array_c10 * const restrict
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_COMPLEX_10 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_COMPLEX_10 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1936,6 +1965,17 @@ matmul_c10_vanilla (gfc_array_c10 * const restrict
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_COMPLEX_10 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -2508,8 +2548,7 @@ matmul_c10 (gfc_array_c10 * const restrict retarra
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_COMPLEX_10 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_COMPLEX_10 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -2533,6 +2572,17 @@ matmul_c10 (gfc_array_c10 * const restrict retarra
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_COMPLEX_10 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
Index: generated/matmul_c16.c
===================================================================
--- generated/matmul_c16.c	(Revision 247566)
+++ generated/matmul_c16.c	(Arbeitskopie)
@@ -286,8 +286,7 @@ matmul_c16_avx (gfc_array_c16 * const restrict ret
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_COMPLEX_16 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_COMPLEX_16 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -311,6 +310,17 @@ matmul_c16_avx (gfc_array_c16 * const restrict ret
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_COMPLEX_16 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -829,8 +839,7 @@ matmul_c16_avx2 (gfc_array_c16 * const restrict re
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_COMPLEX_16 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_COMPLEX_16 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -854,6 +863,17 @@ matmul_c16_avx2 (gfc_array_c16 * const restrict re
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_COMPLEX_16 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1372,8 +1392,7 @@ matmul_c16_avx512f (gfc_array_c16 * const restrict
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_COMPLEX_16 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_COMPLEX_16 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1397,6 +1416,17 @@ matmul_c16_avx512f (gfc_array_c16 * const restrict
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_COMPLEX_16 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1911,8 +1941,7 @@ matmul_c16_vanilla (gfc_array_c16 * const restrict
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_COMPLEX_16 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_COMPLEX_16 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1936,6 +1965,17 @@ matmul_c16_vanilla (gfc_array_c16 * const restrict
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_COMPLEX_16 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -2508,8 +2548,7 @@ matmul_c16 (gfc_array_c16 * const restrict retarra
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_COMPLEX_16 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_COMPLEX_16 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -2533,6 +2572,17 @@ matmul_c16 (gfc_array_c16 * const restrict retarra
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_COMPLEX_16 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
Index: generated/matmul_c4.c
===================================================================
--- generated/matmul_c4.c	(Revision 247566)
+++ generated/matmul_c4.c	(Arbeitskopie)
@@ -286,8 +286,7 @@ matmul_c4_avx (gfc_array_c4 * const restrict retar
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_COMPLEX_4 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -311,6 +310,17 @@ matmul_c4_avx (gfc_array_c4 * const restrict retar
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_COMPLEX_4 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -829,8 +839,7 @@ matmul_c4_avx2 (gfc_array_c4 * const restrict reta
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_COMPLEX_4 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -854,6 +863,17 @@ matmul_c4_avx2 (gfc_array_c4 * const restrict reta
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_COMPLEX_4 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1372,8 +1392,7 @@ matmul_c4_avx512f (gfc_array_c4 * const restrict r
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_COMPLEX_4 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1397,6 +1416,17 @@ matmul_c4_avx512f (gfc_array_c4 * const restrict r
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_COMPLEX_4 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1911,8 +1941,7 @@ matmul_c4_vanilla (gfc_array_c4 * const restrict r
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_COMPLEX_4 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1936,6 +1965,17 @@ matmul_c4_vanilla (gfc_array_c4 * const restrict r
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_COMPLEX_4 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -2508,8 +2548,7 @@ matmul_c4 (gfc_array_c4 * const restrict retarray,
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_COMPLEX_4 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_COMPLEX_4 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -2533,6 +2572,17 @@ matmul_c4 (gfc_array_c4 * const restrict retarray,
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_COMPLEX_4 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
Index: generated/matmul_c8.c
===================================================================
--- generated/matmul_c8.c	(Revision 247566)
+++ generated/matmul_c8.c	(Arbeitskopie)
@@ -286,8 +286,7 @@ matmul_c8_avx (gfc_array_c8 * const restrict retar
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_COMPLEX_8 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_COMPLEX_8 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -311,6 +310,17 @@ matmul_c8_avx (gfc_array_c8 * const restrict retar
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_COMPLEX_8 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -829,8 +839,7 @@ matmul_c8_avx2 (gfc_array_c8 * const restrict reta
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_COMPLEX_8 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_COMPLEX_8 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -854,6 +863,17 @@ matmul_c8_avx2 (gfc_array_c8 * const restrict reta
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_COMPLEX_8 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1372,8 +1392,7 @@ matmul_c8_avx512f (gfc_array_c8 * const restrict r
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_COMPLEX_8 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_COMPLEX_8 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1397,6 +1416,17 @@ matmul_c8_avx512f (gfc_array_c8 * const restrict r
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_COMPLEX_8 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1911,8 +1941,7 @@ matmul_c8_vanilla (gfc_array_c8 * const restrict r
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_COMPLEX_8 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_COMPLEX_8 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1936,6 +1965,17 @@ matmul_c8_vanilla (gfc_array_c8 * const restrict r
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_COMPLEX_8 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -2508,8 +2548,7 @@ matmul_c8 (gfc_array_c8 * const restrict retarray,
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_COMPLEX_8 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_COMPLEX_8 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -2533,6 +2572,17 @@ matmul_c8 (gfc_array_c8 * const restrict retarray,
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_COMPLEX_8 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
Index: generated/matmul_i1.c
===================================================================
--- generated/matmul_i1.c	(Revision 247566)
+++ generated/matmul_i1.c	(Arbeitskopie)
@@ -286,8 +286,7 @@ matmul_i1_avx (gfc_array_i1 * const restrict retar
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_1 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -311,6 +310,17 @@ matmul_i1_avx (gfc_array_i1 * const restrict retar
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_1 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -829,8 +839,7 @@ matmul_i1_avx2 (gfc_array_i1 * const restrict reta
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_1 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -854,6 +863,17 @@ matmul_i1_avx2 (gfc_array_i1 * const restrict reta
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_1 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1372,8 +1392,7 @@ matmul_i1_avx512f (gfc_array_i1 * const restrict r
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_1 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1397,6 +1416,17 @@ matmul_i1_avx512f (gfc_array_i1 * const restrict r
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_1 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1911,8 +1941,7 @@ matmul_i1_vanilla (gfc_array_i1 * const restrict r
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_1 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1936,6 +1965,17 @@ matmul_i1_vanilla (gfc_array_i1 * const restrict r
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_1 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -2508,8 +2548,7 @@ matmul_i1 (gfc_array_i1 * const restrict retarray,
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_1 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_1 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -2533,6 +2572,17 @@ matmul_i1 (gfc_array_i1 * const restrict retarray,
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_1 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
Index: generated/matmul_i16.c
===================================================================
--- generated/matmul_i16.c	(Revision 247566)
+++ generated/matmul_i16.c	(Arbeitskopie)
@@ -286,8 +286,7 @@ matmul_i16_avx (gfc_array_i16 * const restrict ret
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_16 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -311,6 +310,17 @@ matmul_i16_avx (gfc_array_i16 * const restrict ret
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_16 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -829,8 +839,7 @@ matmul_i16_avx2 (gfc_array_i16 * const restrict re
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_16 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -854,6 +863,17 @@ matmul_i16_avx2 (gfc_array_i16 * const restrict re
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_16 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1372,8 +1392,7 @@ matmul_i16_avx512f (gfc_array_i16 * const restrict
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_16 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1397,6 +1416,17 @@ matmul_i16_avx512f (gfc_array_i16 * const restrict
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_16 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1911,8 +1941,7 @@ matmul_i16_vanilla (gfc_array_i16 * const restrict
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_16 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1936,6 +1965,17 @@ matmul_i16_vanilla (gfc_array_i16 * const restrict
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_16 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -2508,8 +2548,7 @@ matmul_i16 (gfc_array_i16 * const restrict retarra
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_16 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_16 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -2533,6 +2572,17 @@ matmul_i16 (gfc_array_i16 * const restrict retarra
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_16 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
Index: generated/matmul_i2.c
===================================================================
--- generated/matmul_i2.c	(Revision 247566)
+++ generated/matmul_i2.c	(Arbeitskopie)
@@ -286,8 +286,7 @@ matmul_i2_avx (gfc_array_i2 * const restrict retar
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_2 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -311,6 +310,17 @@ matmul_i2_avx (gfc_array_i2 * const restrict retar
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_2 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -829,8 +839,7 @@ matmul_i2_avx2 (gfc_array_i2 * const restrict reta
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_2 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -854,6 +863,17 @@ matmul_i2_avx2 (gfc_array_i2 * const restrict reta
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_2 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1372,8 +1392,7 @@ matmul_i2_avx512f (gfc_array_i2 * const restrict r
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_2 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1397,6 +1416,17 @@ matmul_i2_avx512f (gfc_array_i2 * const restrict r
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_2 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1911,8 +1941,7 @@ matmul_i2_vanilla (gfc_array_i2 * const restrict r
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_2 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1936,6 +1965,17 @@ matmul_i2_vanilla (gfc_array_i2 * const restrict r
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_2 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -2508,8 +2548,7 @@ matmul_i2 (gfc_array_i2 * const restrict retarray,
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_2 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_2 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -2533,6 +2572,17 @@ matmul_i2 (gfc_array_i2 * const restrict retarray,
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_2 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
Index: generated/matmul_i4.c
===================================================================
--- generated/matmul_i4.c	(Revision 247566)
+++ generated/matmul_i4.c	(Arbeitskopie)
@@ -286,8 +286,7 @@ matmul_i4_avx (gfc_array_i4 * const restrict retar
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_4 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -311,6 +310,17 @@ matmul_i4_avx (gfc_array_i4 * const restrict retar
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_4 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -829,8 +839,7 @@ matmul_i4_avx2 (gfc_array_i4 * const restrict reta
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_4 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -854,6 +863,17 @@ matmul_i4_avx2 (gfc_array_i4 * const restrict reta
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_4 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1372,8 +1392,7 @@ matmul_i4_avx512f (gfc_array_i4 * const restrict r
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_4 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1397,6 +1416,17 @@ matmul_i4_avx512f (gfc_array_i4 * const restrict r
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_4 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1911,8 +1941,7 @@ matmul_i4_vanilla (gfc_array_i4 * const restrict r
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_4 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1936,6 +1965,17 @@ matmul_i4_vanilla (gfc_array_i4 * const restrict r
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_4 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -2508,8 +2548,7 @@ matmul_i4 (gfc_array_i4 * const restrict retarray,
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_4 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_4 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -2533,6 +2572,17 @@ matmul_i4 (gfc_array_i4 * const restrict retarray,
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_4 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
Index: generated/matmul_i8.c
===================================================================
--- generated/matmul_i8.c	(Revision 247566)
+++ generated/matmul_i8.c	(Arbeitskopie)
@@ -286,8 +286,7 @@ matmul_i8_avx (gfc_array_i8 * const restrict retar
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_8 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -311,6 +310,17 @@ matmul_i8_avx (gfc_array_i8 * const restrict retar
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_8 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -829,8 +839,7 @@ matmul_i8_avx2 (gfc_array_i8 * const restrict reta
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_8 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -854,6 +863,17 @@ matmul_i8_avx2 (gfc_array_i8 * const restrict reta
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_8 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1372,8 +1392,7 @@ matmul_i8_avx512f (gfc_array_i8 * const restrict r
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_8 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1397,6 +1416,17 @@ matmul_i8_avx512f (gfc_array_i8 * const restrict r
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_8 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1911,8 +1941,7 @@ matmul_i8_vanilla (gfc_array_i8 * const restrict r
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_8 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1936,6 +1965,17 @@ matmul_i8_vanilla (gfc_array_i8 * const restrict r
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_8 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -2508,8 +2548,7 @@ matmul_i8 (gfc_array_i8 * const restrict retarray,
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_INTEGER_8 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_INTEGER_8 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -2533,6 +2572,17 @@ matmul_i8 (gfc_array_i8 * const restrict retarray,
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_INTEGER_8 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
Index: generated/matmul_r10.c
===================================================================
--- generated/matmul_r10.c	(Revision 247566)
+++ generated/matmul_r10.c	(Arbeitskopie)
@@ -286,8 +286,7 @@ matmul_r10_avx (gfc_array_r10 * const restrict ret
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_REAL_10 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_REAL_10 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -311,6 +310,17 @@ matmul_r10_avx (gfc_array_r10 * const restrict ret
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_REAL_10 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -829,8 +839,7 @@ matmul_r10_avx2 (gfc_array_r10 * const restrict re
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_REAL_10 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_REAL_10 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -854,6 +863,17 @@ matmul_r10_avx2 (gfc_array_r10 * const restrict re
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_REAL_10 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1372,8 +1392,7 @@ matmul_r10_avx512f (gfc_array_r10 * const restrict
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_REAL_10 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_REAL_10 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1397,6 +1416,17 @@ matmul_r10_avx512f (gfc_array_r10 * const restrict
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_REAL_10 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1911,8 +1941,7 @@ matmul_r10_vanilla (gfc_array_r10 * const restrict
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_REAL_10 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_REAL_10 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1936,6 +1965,17 @@ matmul_r10_vanilla (gfc_array_r10 * const restrict
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_REAL_10 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -2508,8 +2548,7 @@ matmul_r10 (gfc_array_r10 * const restrict retarra
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_REAL_10 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_REAL_10 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -2533,6 +2572,17 @@ matmul_r10 (gfc_array_r10 * const restrict retarra
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_REAL_10 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
Index: generated/matmul_r16.c
===================================================================
--- generated/matmul_r16.c	(Revision 247566)
+++ generated/matmul_r16.c	(Arbeitskopie)
@@ -286,8 +286,7 @@ matmul_r16_avx (gfc_array_r16 * const restrict ret
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_REAL_16 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_REAL_16 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -311,6 +310,17 @@ matmul_r16_avx (gfc_array_r16 * const restrict ret
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_REAL_16 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -829,8 +839,7 @@ matmul_r16_avx2 (gfc_array_r16 * const restrict re
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_REAL_16 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_REAL_16 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -854,6 +863,17 @@ matmul_r16_avx2 (gfc_array_r16 * const restrict re
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_REAL_16 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1372,8 +1392,7 @@ matmul_r16_avx512f (gfc_array_r16 * const restrict
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_REAL_16 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_REAL_16 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1397,6 +1416,17 @@ matmul_r16_avx512f (gfc_array_r16 * const restrict
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_REAL_16 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1911,8 +1941,7 @@ matmul_r16_vanilla (gfc_array_r16 * const restrict
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_REAL_16 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_REAL_16 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1936,6 +1965,17 @@ matmul_r16_vanilla (gfc_array_r16 * const restrict
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_REAL_16 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -2508,8 +2548,7 @@ matmul_r16 (gfc_array_r16 * const restrict retarra
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_REAL_16 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_REAL_16 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -2533,6 +2572,17 @@ matmul_r16 (gfc_array_r16 * const restrict retarra
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_REAL_16 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
Index: generated/matmul_r4.c
===================================================================
--- generated/matmul_r4.c	(Revision 247566)
+++ generated/matmul_r4.c	(Arbeitskopie)
@@ -286,8 +286,7 @@ matmul_r4_avx (gfc_array_r4 * const restrict retar
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_REAL_4 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -311,6 +310,17 @@ matmul_r4_avx (gfc_array_r4 * const restrict retar
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_REAL_4 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -829,8 +839,7 @@ matmul_r4_avx2 (gfc_array_r4 * const restrict reta
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_REAL_4 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -854,6 +863,17 @@ matmul_r4_avx2 (gfc_array_r4 * const restrict reta
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_REAL_4 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1372,8 +1392,7 @@ matmul_r4_avx512f (gfc_array_r4 * const restrict r
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_REAL_4 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1397,6 +1416,17 @@ matmul_r4_avx512f (gfc_array_r4 * const restrict r
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_REAL_4 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1911,8 +1941,7 @@ matmul_r4_vanilla (gfc_array_r4 * const restrict r
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_REAL_4 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1936,6 +1965,17 @@ matmul_r4_vanilla (gfc_array_r4 * const restrict r
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_REAL_4 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -2508,8 +2548,7 @@ matmul_r4 (gfc_array_r4 * const restrict retarray,
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_REAL_4 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_REAL_4 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -2533,6 +2572,17 @@ matmul_r4 (gfc_array_r4 * const restrict retarray,
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_REAL_4 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
Index: generated/matmul_r8.c
===================================================================
--- generated/matmul_r8.c	(Revision 247566)
+++ generated/matmul_r8.c	(Arbeitskopie)
@@ -286,8 +286,7 @@ matmul_r8_avx (gfc_array_r8 * const restrict retar
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_REAL_8 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_REAL_8 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -311,6 +310,17 @@ matmul_r8_avx (gfc_array_r8 * const restrict retar
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_REAL_8 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -829,8 +839,7 @@ matmul_r8_avx2 (gfc_array_r8 * const restrict reta
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_REAL_8 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_REAL_8 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -854,6 +863,17 @@ matmul_r8_avx2 (gfc_array_r8 * const restrict reta
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_REAL_8 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1372,8 +1392,7 @@ matmul_r8_avx512f (gfc_array_r8 * const restrict r
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_REAL_8 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_REAL_8 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1397,6 +1416,17 @@ matmul_r8_avx512f (gfc_array_r8 * const restrict r
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_REAL_8 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -1911,8 +1941,7 @@ matmul_r8_vanilla (gfc_array_r8 * const restrict r
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_REAL_8 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_REAL_8 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -1936,6 +1965,17 @@ matmul_r8_vanilla (gfc_array_r8 * const restrict r
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_REAL_8 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
@@ -2508,8 +2548,7 @@ matmul_r8 (gfc_array_r8 * const restrict retarray,
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      GFC_REAL_8 t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      GFC_REAL_8 f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -2533,6 +2572,17 @@ matmul_r8 (gfc_array_r8 * const restrict retarray,
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      GFC_REAL_8 t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)
Index: m4/matmul_internal.m4
===================================================================
--- m4/matmul_internal.m4	(Revision 247566)
+++ m4/matmul_internal.m4	(Arbeitskopie)
@@ -202,8 +202,7 @@ sinclude(`matmul_asm_'rtype_code`.m4')dnl
 		 i1, i2, i3, i4, i5, i6;
 
       /* Local variables */
-      'rtype_name` t1[65536], /* was [256][256] */
-		 f11, f12, f21, f22, f31, f32, f41, f42,
+      'rtype_name` f11, f12, f21, f22, f31, f32, f41, f42,
 		 f13, f14, f23, f24, f33, f34, f43, f44;
       index_type i, j, l, ii, jj, ll;
       index_type isec, jsec, lsec, uisec, ujsec, ulsec;
@@ -227,6 +226,17 @@ sinclude(`matmul_asm_'rtype_code`.m4')dnl
       if (m == 0 || n == 0 || k == 0)
 	return;
 
+      /* Adjust size of t1 to what is needed.  */
+      index_type t1_dim;
+      t1_dim = (a_dim1-1) * 256 + b_dim1;
+      if (t1_dim > 65536)
+	t1_dim = 65536;
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wvla"
+      'rtype_name` t1[t1_dim]; /* was [256][256] */
+#pragma GCC diagnostic pop
+
       /* Empty c first.  */
       for (j=1; j<=n; j++)
 	for (i=1; i<=m; i++)

[-- Attachment #3: matmul_15.f90 --]
[-- Type: text/x-fortran, Size: 846 bytes --]

! { dg-do  run }
! { dg-options "-finline-matmul-limit=0" }
! Stress-test the matmul blocking code with sizes close to or
! equal to powers of two.

program main
  ! Stress test for the library matmul: multiply default-real matrices
  ! for every combination of extents taken from nn.  The program
  ! produces no output; it is only expected to run to completion.
  implicit none

  ! Extents to exercise: small sizes plus each power of two up to 256
  ! together with its immediate neighbours (n-1, n, n+1).
  integer, dimension(*), parameter :: nn = &
       & [2,3,4,5, 7,8,9, 15,16,17, 31,32,33, 63,64,65, &
       &  127,128,129, 255,256,257]
  integer, parameter :: s = size(nn)

  real, dimension(:,:), allocatable :: a, b, c
  integer :: i1, i2, i3
  integer :: nx, ny, nz   ! a is nx-by-ny, b is ny-by-nz, c is nx-by-nz
  real :: sm

  ! Accumulate the sum of every product so that each result is consumed
  ! (presumably to keep the matmul call from being optimized away).
  sm = 0.0
  do i1 = 1, s
     nx = nn(i1)
     do i2 = 1, s
        ny = nn(i2)
        do i3 = 1, s
           nz = nn(i3)
           ! Fresh allocations for every shape combination.
           allocate (a(nx,ny), b(ny,nz), c(nx,nz))
           call random_number(a)
           call random_number(b)
           c = matmul(a,b)
           sm = sm + sum(c)
           deallocate(a,b,c)
        end do
     end do
  end do

end program main

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2017-05-10 16:06 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-05-05 20:35 [patch, fortran] Reduce stack use in blocked matmul Thomas Koenig
2017-05-08 17:02 ` Jerry DeLisle
2017-05-08 19:55   ` Thomas Koenig
2017-05-08 21:16     ` Jerry DeLisle
2017-05-09 12:46   ` Christophe Lyon
2017-05-09 10:49 ` Andreas Schwab
2017-05-09 17:23   ` Thomas Koenig
2017-05-10  8:11 ` Andreas Schwab
2017-05-10 15:48   ` Thomas Koenig
2017-05-10 15:56     ` Andreas Schwab
2017-05-10 15:56     ` Thomas Koenig
2017-05-10 16:28       ` Andreas Schwab

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).