[PATCH v2] i386: Disable ix86_expand_vecop_qihi2 when !TARGET_AVX512BW

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

From: Haochen Jiang <haochen.jiang@intel.com>
To: gcc-patches@gcc.gnu.org
Cc: hongtao.liu@intel.com, ubizjak@gmail.com
Subject: [PATCH v2] i386: Disable ix86_expand_vecop_qihi2 when !TARGET_AVX512BW
Date: Tue, 21 May 2024 15:13:34 +0800	[thread overview]
Message-ID: <20240521071334.1450276-1-haochen.jiang@intel.com> (raw)
In-Reply-To: <CAFULd4YnZBs+Ov2efbr4tfu2YUeO2T1qkrs+Kvh7a7LqADZPSA@mail.gmail.com>

Hi all,

This is the v2 patch to fix PR115069. The new testcase has passed.

Changes in v2:
  - Added a testcase.
  - Change the comment for the early exit.

Thx,
Haochen

Since vpermq is really slow, we should avoid using it for permutation
when vpmovwb is not available (needs AVX512BW) for ix86_expand_vecop_qihi2
and fall back to ix86_expand_vecop_qihi.

gcc/ChangeLog:

        PR target/115069
	* config/i386/i386-expand.cc (ix86_expand_vecop_qihi2):
	Do not enable the optimization when AVX512BW is not enabled.

gcc/testsuite/ChangeLog:

        PR target/115069
	* gcc.target/i386/pr115069.c: New.
---
 gcc/config/i386/i386-expand.cc           |  7 +++
 gcc/testsuite/gcc.target/i386/pr115069.c | 78 ++++++++++++++++++++++++
 2 files changed, 85 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr115069.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index a6132911e6a..f7939761879 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -24323,6 +24323,13 @@ ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
   bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
   bool uns_p = code != ASHIFTRT;
 
+  /* Without VPMOVWB (provided by AVX512BW ISA), the expansion uses the
+     generic permutation to merge the data back into the right place.  This
+     permutation results in VPERMQ, which is slow, so better fall back to
+     ix86_expand_vecop_qihi.  */
+  if (!TARGET_AVX512BW)
+    return false;
+
   if ((qimode == V16QImode && !TARGET_AVX2)
       || (qimode == V32QImode && (!TARGET_AVX512BW || !TARGET_EVEX512))
       /* There are no V64HImode instructions.  */
diff --git a/gcc/testsuite/gcc.target/i386/pr115069.c b/gcc/testsuite/gcc.target/i386/pr115069.c
new file mode 100644
index 00000000000..c4b48b602ef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr115069.c
@@ -0,0 +1,78 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2" } */
+/* { dg-final { scan-assembler-not "vpermq" } } */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+typedef int8_t  stress_vint8_t  __attribute__ ((vector_size (16)));
+
+#define OPS(a, b, c, s, v23, v3) \
+do {				\
+	a += b;			\
+	a |= b;			\
+	a -= b;			\
+	a &= ~b;		\
+	a *= c;			\
+	a = ~a;			\
+	a *= s;			\
+	a ^= c;			\
+	a <<= 1;		\
+	b >>= 1;		\
+	b += c;			\
+	a %= v23;		\
+	c /= v3;		\
+	b = b ^ c;		\
+	c = b ^ c;		\
+	b = b ^ c;		\
+} while (0)
+
+volatile uint8_t csum8_put;
+
+void stress_vecmath(void)
+{
+	const stress_vint8_t v23_8 = { 
+		0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,	
+		0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17 
+	};
+	const stress_vint8_t v3_8 = {
+		0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+		0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+	};
+	stress_vint8_t a8 = {
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+	};
+	stress_vint8_t b8 = {
+		0x01, 0x23, 0x45, 0x67, 0x89, 0xab, 0xcd, 0xef,
+		0x0f, 0x1e, 0x2d, 0x3c, 0x4b, 0x5a, 0x69, 0x78
+	};
+	stress_vint8_t c8 = {
+		0x01, 0x02, 0x03, 0x02, 0x01, 0x02, 0x03, 0x02,
+		0x03, 0x02, 0x01, 0x02, 0x03, 0x02, 0x01, 0x02
+	};
+	stress_vint8_t s8 = {
+		0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02,
+		0x01, 0x01, 0x02, 0x02, 0x01, 0x01, 0x02, 0x02,
+	};
+	const uint8_t csum8_val =  (uint8_t)0x1b;
+	int i;
+	uint8_t csum8;
+
+	for (i = 1000; i; i--) {
+		OPS(a8, b8, c8, s8, v23_8, v3_8);
+		OPS(a8, b8, c8, s8, v23_8, v3_8);
+		OPS(a8, b8, c8, s8, v23_8, v3_8);
+		OPS(a8, b8, c8, s8, v23_8, v3_8);
+		OPS(a8, b8, c8, s8, v23_8, v3_8);
+		OPS(a8, b8, c8, s8, v23_8, v3_8);
+	}
+
+	csum8 = a8[0]  ^ a8[1]  ^ a8[2]  ^ a8[3]  ^
+		a8[4]  ^ a8[5]  ^ a8[6]  ^ a8[7]  ^
+		a8[8]  ^ a8[9]  ^ a8[10] ^ a8[11] ^
+		a8[12] ^ a8[13] ^ a8[14] ^ a8[15];
+	csum8_put = csum8;
+}
-- 
2.31.1

next prev parent reply	other threads:[~2024-05-21  7:13 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-05-21  6:16 [PATCH] " Haochen Jiang
2024-05-21  6:17 ` Hongtao Liu
2024-05-21  6:35 ` Uros Bizjak
2024-05-21  7:13   ` Haochen Jiang [this message]
2024-05-21  7:21     ` [PATCH v2] " Hongtao Liu
2024-05-21  8:36       ` Jiang, Haochen
2024-05-21  9:01         ` [PATCH v3] " Haochen Jiang
2024-05-21 13:04           ` Uros Bizjak
2024-05-22  2:21             ` Jiang, Haochen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240521071334.1450276-1-haochen.jiang@intel.com \
    --to=haochen.jiang@intel.com \
    --cc=gcc-patches@gcc.gnu.org \
    --cc=hongtao.liu@intel.com \
    --cc=ubizjak@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).