public inbox for binutils@sourceware.org
 help / color / mirror / Atom feed
From: Jan Beulich <jbeulich@suse.com>
To: Binutils <binutils@sourceware.org>
Cc: "H.J. Lu" <hjl.tools@gmail.com>
Subject: [PATCH 1/4] x86: optimize pre-AVX512 {,V}PCMPEQQ with identical sources
Date: Fri, 16 Jun 2023 09:30:41 +0200	[thread overview]
Message-ID: <503caac8-8824-823a-81c2-762cba207cb6@suse.com> (raw)
In-Reply-To: <bbc351cd-fa35-a3b7-8e2b-9d2edc4ab379@suse.com>

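The {,V}PCMPEQD alternative is 1 byte shorter in many cases: the legacy
form needs one opcode byte less (0F 76 instead of 0F 38 29), and the VEX
form can use the 2-byte VEX prefix as long as the source register is not
one of the upper eight.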
---
It's not really clear whether the same would be worthwhile for AVX512
forms: some could be expressed via KXNOR* (when no masking is in effect)
or KOR* (when masking is in effect), but others cannot. Also, while in
pre-AVX512 code these patterns are likely to be used as all-ones idioms,
such use looks less likely in AVX512 code.

--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -4563,6 +4563,23 @@ optimize_encoding (void)
 	i.types[j].bitfield.disp8
 	  = fits_in_disp8 (i.op[j].disps->X_add_number);
     }
+  else if (optimize_for_space
+	   && i.tm.base_opcode == 0x29
+	   && i.tm.opcode_space == SPACE_0F38
+	   && i.operands == i.reg_operands
+	   && i.op[0].regs == i.op[1].regs
+	   && (!i.tm.opcode_modifier.vex
+	       || !(i.op[0].regs->reg_flags & RegRex))
+	   && !is_evex_encoding (&i.tm))
+    {
+      /* Optimize: -Os:
+         pcmpeqq %xmmN, %xmmN          -> pcmpeqd %xmmN, %xmmN
+         vpcmpeqq %xmmN, %xmmN, %xmmM  -> vpcmpeqd %xmmN, %xmmN, %xmmM (N < 8)
+         vpcmpeqq %ymmN, %ymmN, %ymmM  -> vpcmpeqd %ymmN, %ymmN, %ymmM (N < 8)
+       */
+      i.tm.opcode_space = SPACE_0F;
+      i.tm.base_opcode = 0x76;
+    }
 }
 
 /* Return non-zero for load instruction.  */
--- a/gas/testsuite/gas/i386/optimize-2.d
+++ b/gas/testsuite/gas/i386/optimize-2.d
@@ -161,4 +161,7 @@ Disassembly of section .text:
  +[a-f0-9]+:	62 .*	vporq  \(%eax\)\{1to2\},%xmm2,%xmm3
  +[a-f0-9]+:	62 .*	vpxord \(%eax\)\{1to4\},%xmm2,%xmm3
  +[a-f0-9]+:	62 .*	vpxorq \(%eax\)\{1to4\},%ymm2,%ymm3
+ +[a-f0-9]+:	66 .*	pcmpeqd %xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vpcmpeqd %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c5 .*	vpcmpeqd %ymm2,%ymm2,%ymm0
 #pass
--- a/gas/testsuite/gas/i386/optimize-2.s
+++ b/gas/testsuite/gas/i386/optimize-2.s
@@ -180,3 +180,7 @@ _start:
 	vporq		(%eax){1to2}, %xmm2, %xmm3
 	vpxord		(%eax){1to4}, %xmm2, %xmm3
 	vpxorq		(%eax){1to4}, %ymm2, %ymm3
+
+	pcmpeqq		%xmm2, %xmm2
+	vpcmpeqq	%xmm2, %xmm2, %xmm0
+	vpcmpeqq	%ymm2, %ymm2, %ymm0
--- a/gas/testsuite/gas/i386/optimize-2b.d
+++ b/gas/testsuite/gas/i386/optimize-2b.d
@@ -162,4 +162,7 @@ Disassembly of section .text:
  +[a-f0-9]+:	62 .*	vporq  \(%eax\)\{1to2\},%xmm2,%xmm3
  +[a-f0-9]+:	62 .*	vpxord \(%eax\)\{1to4\},%xmm2,%xmm3
  +[a-f0-9]+:	62 .*	vpxorq \(%eax\)\{1to4\},%ymm2,%ymm3
+ +[a-f0-9]+:	66 .*	pcmpeqq %xmm2,%xmm2
+ +[a-f0-9]+:	c4 .*	vpcmpeqq %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c4 .*	vpcmpeqq %ymm2,%ymm2,%ymm0
 #pass
--- a/gas/testsuite/gas/i386/x86-64-optimize-3.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-3.d
@@ -199,4 +199,10 @@ Disassembly of section .text:
  +[a-f0-9]+:	62 .*	vporq  \(%rax\)\{1to2\},%xmm2,%xmm3
  +[a-f0-9]+:	62 .*	vpxord \(%rax\)\{1to4\},%xmm2,%xmm3
  +[a-f0-9]+:	62 .*	vpxorq \(%rax\)\{1to4\},%ymm2,%ymm3
+ +[a-f0-9]+:	66 .*	pcmpeqd %xmm2,%xmm2
+ +[a-f0-9]+:	c5 .*	vpcmpeqd %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c5 .*	vpcmpeqd %ymm2,%ymm2,%ymm0
+ +[a-f0-9]+:	66 .*	pcmpeqd %xmm12,%xmm12
+ +[a-f0-9]+:	c4 .*	vpcmpeqq %xmm12,%xmm12,%xmm0
+ +[a-f0-9]+:	c4 .*	vpcmpeqq %ymm12,%ymm12,%ymm0
 #pass
--- a/gas/testsuite/gas/i386/x86-64-optimize-3.s
+++ b/gas/testsuite/gas/i386/x86-64-optimize-3.s
@@ -221,3 +221,11 @@ _start:
 	vporq		(%rax){1to2}, %xmm2, %xmm3
 	vpxord		(%rax){1to4}, %xmm2, %xmm3
 	vpxorq		(%rax){1to4}, %ymm2, %ymm3
+
+	pcmpeqq		%xmm2, %xmm2
+	vpcmpeqq	%xmm2, %xmm2, %xmm0
+	vpcmpeqq	%ymm2, %ymm2, %ymm0
+
+	pcmpeqq		%xmm12, %xmm12
+	vpcmpeqq	%xmm12, %xmm12, %xmm0
+	vpcmpeqq	%ymm12, %ymm12, %ymm0
--- a/gas/testsuite/gas/i386/x86-64-optimize-3b.d
+++ b/gas/testsuite/gas/i386/x86-64-optimize-3b.d
@@ -200,4 +200,10 @@ Disassembly of section .text:
  +[a-f0-9]+:	62 .*	vporq  \(%rax\)\{1to2\},%xmm2,%xmm3
  +[a-f0-9]+:	62 .*	vpxord \(%rax\)\{1to4\},%xmm2,%xmm3
  +[a-f0-9]+:	62 .*	vpxorq \(%rax\)\{1to4\},%ymm2,%ymm3
+ +[a-f0-9]+:	66 .*	pcmpeqq %xmm2,%xmm2
+ +[a-f0-9]+:	c4 .*	vpcmpeqq %xmm2,%xmm2,%xmm0
+ +[a-f0-9]+:	c4 .*	vpcmpeqq %ymm2,%ymm2,%ymm0
+ +[a-f0-9]+:	66 .*	pcmpeqq %xmm12,%xmm12
+ +[a-f0-9]+:	c4 .*	vpcmpeqq %xmm12,%xmm12,%xmm0
+ +[a-f0-9]+:	c4 .*	vpcmpeqq %ymm12,%ymm12,%ymm0
 #pass
--- a/opcodes/i386-opc.tbl
+++ b/opcodes/i386-opc.tbl
@@ -1363,7 +1363,7 @@ pblendvb, 0x664c, AVX, Modrm|Vex128|Spac
 pblendvb, 0x660f3810, SSE4_1, Modrm|NoSuf, { Acc|Xmmword, RegXMM|Unspecified|BaseIndex, RegXMM }
 pblendvb, 0x660f3810, SSE4_1, Modrm|NoSuf, { RegXMM|Unspecified|BaseIndex, RegXMM }
 pblendw<sse41>, 0x660f3a0e, <sse41:cpu>, Modrm|<sse41:attr>|<sse41:vvvv>|NoSuf, { Imm8|Imm8S, RegXMM|Unspecified|BaseIndex, RegXMM }
-pcmpeqq<sse41>, 0x660f3829, <sse41:cpu>, Modrm|<sse41:attr>|<sse41:vvvv>|NoSuf, { RegXMM|Unspecified|BaseIndex, RegXMM }
+pcmpeqq<sse41>, 0x660f3829, <sse41:cpu>, Modrm|<sse41:attr>|<sse41:vvvv>|NoSuf|Optimize, { RegXMM|Unspecified|BaseIndex, RegXMM }
 pextr<bw><sse41>, 0x660f3a14 | <bw:opc>, <sse41:cpu>, RegMem|<sse41:attr>|NoSuf|IgnoreSize|NoRex64, { Imm8, RegXMM, Reg32|Reg64 }
 pextr<bw><sse41>, 0x660f3a14 | <bw:opc>, <sse41:cpu>, Modrm|<sse41:attr>|NoSuf, { Imm8, RegXMM, <bw:elem>|Unspecified|BaseIndex }
 pextrd<sse41>, 0x660f3a16, <sse41:cpu>, Modrm|<sse41:attr>|NoSuf|IgnoreSize, { Imm8, RegXMM, Reg32|Unspecified|BaseIndex }
@@ -1592,7 +1592,7 @@ vpblendvb, 0x664c, AVX|AVX2, Modrm|Vex|S
 vpblendw, 0x660e, AVX|AVX2, Modrm|Vex|Space0F3A|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Imm8|Imm8S, Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
 vpcmpeq<bw>, 0x6674 | <bw:opc>, AVX|AVX2, Modrm|C|Vex|Space0F|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
 vpcmpeqd, 0x6676, AVX|AVX2, Modrm|C|Vex|Space0F|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
-vpcmpeqq, 0x6629, AVX|AVX2, Modrm|Vex|Space0F38|VexVVVV|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
+vpcmpeqq, 0x6629, AVX|AVX2, Modrm|Vex|Space0F38|VexVVVV|VexWIG|CheckOperandSize|NoSuf|Optimize, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM, RegXMM|RegYMM }
 vpcmpestri, 0x6661, AVX|No64, Modrm|Vex|Space0F3A|NoSuf, { Imm8, Unspecified|BaseIndex|RegXMM, RegXMM }
 vpcmpestri, 0x6661, AVX|x64, Modrm|Vex|Space0F3A|IgnoreSize|No_bSuf|No_wSuf|No_sSuf, { Imm8, Xmmword|Unspecified|BaseIndex|RegXMM, RegXMM }
 vpcmpestrm, 0x6660, AVX|No64, Modrm|Vex|Space0F3A|NoSuf, { Imm8, Unspecified|BaseIndex|RegXMM, RegXMM }


  reply	other threads:[~2023-06-16  7:30 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-06-16  7:29 [PATCH 0/4] x86: some more optimization plus a new pseudo insn form Jan Beulich
2023-06-16  7:30 ` Jan Beulich [this message]
2023-06-16  7:31 ` [PATCH 2/4] x86: optimize pre-AVX512 {,V}PCMPGT* with identical sources Jan Beulich
2023-06-16  7:31 ` [PATCH 3/4] x86: optimize 128-bit VPBROADCASTQ to VPUNPCKLQDQ Jan Beulich
2023-06-16  7:32 ` [PATCH 4/4] x86: provide a 128-bit VBROADCASTSD pseudo Jan Beulich
2023-06-16 16:59   ` H.J. Lu
2023-06-19  7:20     ` Jan Beulich
2023-06-20 16:07       ` H.J. Lu
2023-06-21  9:01         ` Jan Beulich

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=503caac8-8824-823a-81c2-762cba207cb6@suse.com \
    --to=jbeulich@suse.com \
    --cc=binutils@sourceware.org \
    --cc=hjl.tools@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).