From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <rguenth@sourceware.org>
Received: by sourceware.org (Postfix, from userid 1666)
	id 44E913858D35; Wed, 21 Jun 2023 08:43:36 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 44E913858D35
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org;
	s=default; t=1687337016;
	bh=tJO0asN8hz93zwBYEHQSawZcVjdOga6jxkkHkJ1pZmE=;
	h=From:To:Subject:Date:From;
	b=efHHYCebGr3iUFtFSp583gyJ4mlzttcEdF1AzmHMRlDijryWBy4WyhiheeiI2RsX8
	 ZcMuZ1hU01JGBf155Gj82H9cO5y0MlYpMvEOjS91TyuweVNoX5yNmDxRsJLxF2gzz4
	 NtevCWKnrTFeaFQnOnjDcptKlxIp7HbmqNMVqD8E=
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Type: text/plain; charset="utf-8"
From: Richard Biener <rguenth@gcc.gnu.org>
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r14-2010] [i386] Reject too large vectors for partial vector
 vectorization
X-Act-Checkin: gcc
X-Git-Author: Richard Biener <rguenther@suse.de>
X-Git-Refname: refs/heads/master
X-Git-Oldrev: 864c6471bdc6cdec6da60b66ac13e9fe3cd73fb8
X-Git-Newrev: 24c125fe47ac95f9e83f7e2bfa8594592a76368f
Message-Id: <20230621084336.44E913858D35@sourceware.org>
Date: Wed, 21 Jun 2023 08:43:36 +0000 (GMT)
List-Id: <gcc-cvs.sourceware.org>

https://gcc.gnu.org/g:24c125fe47ac95f9e83f7e2bfa8594592a76368f

commit r14-2010-g24c125fe47ac95f9e83f7e2bfa8594592a76368f
Author: Richard Biener <rguenther@suse.de>
Date:   Mon Jun 19 12:28:32 2023 +0200

    [i386] Reject too large vectors for partial vector vectorization
    
    The following works around the lack of the x86 backend making the
    vectorizer compare the costs of the different possible vector
    sizes the backend advertises through the vector_modes hook.  When
    enabling masked epilogues or main loops this means we will
    select the preferred vector mode, which is usually the largest, even
    for loops that iterate far fewer times than the vector has
    lanes.  When not using masking the vectorizer would reject any
    mode resulting in a VF bigger than the number of iterations
    but with masking the excess lanes are simply masked out.
    
    So this overloads the finish_cost function and matches for
    the problematic case, forcing a high cost to make us try a
    smaller vector size.
    
            * config/i386/i386.cc (ix86_vector_costs::finish_cost):
            Overload.  For masked main loops make sure the vectorization
            factor does not exceed the number of iterations rounded up
            to a power of two.
    
            * gcc.target/i386/vect-partial-vectors-1.c: New testcase.
            * gcc.target/i386/vect-partial-vectors-2.c: Likewise.

Diff:
---
 gcc/config/i386/i386.cc                            | 26 ++++++++++++++++++++++
 .../gcc.target/i386/vect-partial-vectors-1.c       | 13 +++++++++++
 .../gcc.target/i386/vect-partial-vectors-2.c       | 12 ++++++++++
 3 files changed, 51 insertions(+)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index b20cb86b822..32851a514a9 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -23666,6 +23666,7 @@ class ix86_vector_costs : public vector_costs
 			      stmt_vec_info stmt_info, slp_tree node,
 			      tree vectype, int misalign,
 			      vect_cost_model_location where) override;
+  void finish_cost (const vector_costs *) override;
 };
 
 /* Implement targetm.vectorize.create_costs.  */
@@ -23918,6 +23919,31 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
   return retval;
 }
 
+void
+ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
+{
+  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
+  if (loop_vinfo && !m_costing_for_scalar)
+    {
+      /* We are currently not asking the vectorizer to compare costs
+	 between different vector mode sizes.  When using predication
+	 that will end up always choosing the preferred mode size even
+	 if there's a smaller mode covering all lanes.  Test for this
+	 situation and artificially reject the larger mode attempt.
+	 ???  We currently lack masked ops for sub-SSE sized modes,
+	 so we could restrict this rejection to AVX and AVX512 modes
+	 but err on the safe side for now.  */
+      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
+	  && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+	  && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+	  && (exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ())
+	      > ceil_log2 (LOOP_VINFO_INT_NITERS (loop_vinfo))))
+	m_costs[vect_body] = INT_MAX;
+    }
+
+  vector_costs::finish_cost (scalar_costs);
+}
+
 /* Validate target specific memory model bits in VAL. */
 
 static unsigned HOST_WIDE_INT
diff --git a/gcc/testsuite/gcc.target/i386/vect-partial-vectors-1.c b/gcc/testsuite/gcc.target/i386/vect-partial-vectors-1.c
new file mode 100644
index 00000000000..3834720e8e2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-partial-vectors-1.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mavx512vl -mprefer-vector-width=512 --param vect-partial-vector-usage=1" } */
+
+void foo (int * __restrict a, int *b)
+{
+  for (int i = 0; i < 4; ++i)
+    a[i] = b[i] + 42;
+}
+
+/* We do not want to optimize this using masked AVX or AVX512
+   but unmasked SSE.  */
+/* { dg-final { scan-assembler-not "\[yz\]mm" } } */
+/* { dg-final { scan-assembler "xmm" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-partial-vectors-2.c b/gcc/testsuite/gcc.target/i386/vect-partial-vectors-2.c
new file mode 100644
index 00000000000..4ab2cbc4203
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-partial-vectors-2.c
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx512f -mavx512vl -mprefer-vector-width=512 --param vect-partial-vector-usage=1" } */
+
+void foo (int * __restrict a, int *b)
+{
+  for (int i = 0; i < 7; ++i)
+    a[i] = b[i] + 42;
+}
+
+/* We want to optimize this using masked AVX, not AVX512 or SSE.  */
+/* { dg-final { scan-assembler-not "zmm" } } */
+/* { dg-final { scan-assembler "ymm\[^\r\n\]*\{%k" } } */